# Get and read the data
import torch
import torch.nn as nn
import pandas as pd  # data handling
import All_function as func  # custom helper module (listed at the end)
torch.set_default_tensor_type(torch.FloatTensor)
train_data=pd.read_csv('Kaggle_house/train.csv')  # 1460 samples; Id, 79 features, and the SalePrice label
test_data=pd.read_csv('Kaggle_house/test.csv')  # 1459 samples; Id and the same 79 features, no label
# Concatenate the features of the training and test sets (the Id column and the label are left out)
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
# Data preprocessing
# 1. Numerical (continuous) features: standardize them, then fill missing values with 0
numeric_features=all_features.dtypes[all_features.dtypes!='object'].index
all_features[numeric_features]=all_features[numeric_features].apply(lambda x:(x-x.mean())/(x.std()))
all_features[numeric_features]=all_features[numeric_features].fillna(0)
# 2. Categorical (discrete) features: one-hot encode them, treating missing values as their own level
all_features=pd.get_dummies(all_features,dummy_na=True)  # after encoding, all_features has 331 columns
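# Illustrative sketch (not part of the original script): get_dummies(dummy_na=True) turns each
# categorical column into one indicator column per level plus a NaN indicator, which is why
# all_features grows from 79 columns to 331. The toy column below is hypothetical.
_demo = pd.get_dummies(pd.DataFrame({'MSZoning': ['RL', 'RM', None]}), dummy_na=True)
# _demo now has the columns MSZoning_RL, MSZoning_RM and MSZoning_nan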
# Split the preprocessed features back into training and test sets
num_train=train_data.shape[0]
train_features=torch.tensor(all_features[:num_train].values,dtype=torch.float)
test_features=torch.tensor(all_features[num_train:].values,dtype=torch.float)
train_labels=torch.tensor(train_data.SalePrice.values,dtype=torch.float).view(-1,1) #(1460,1)
# Training the model
loss=torch.nn.MSELoss()
def get_net(feature_num):
    net = nn.Linear(feature_num, 1)
    for param in net.parameters():
        nn.init.normal_(param, mean=0, std=0.01)
    return net
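# Quick shape check (illustrative only; _net is a throwaway name): the model is a single linear
# layer mapping the 331 preprocessed feature columns to one predicted price per sample.
_net = get_net(331)
assert _net(torch.zeros(4, 331)).shape == (4, 1)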
# The log RMSE that the competition uses to score submissions
def log_rmse(net, features, labels):
    with torch.no_grad():  # no computation graph is needed here
        # clamp predictions below 1 to 1 so that taking the log stays numerically stable
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        # Note: the factor of 2 appears to be a holdover from the MXNet L2Loss version of this code;
        # with nn.MSELoss it makes the reported value sqrt(2) times larger than the true log RMSE.
        rmse = torch.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.item()
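# For reference (hypothetical helper, not used by the original code): the competition metric
# written out directly as sqrt(mean((log(y_hat) - log(y))**2)), without going through nn.MSELoss;
# comparing it with log_rmse above makes the extra factor of 2 there easy to see.
def log_rmse_explicit(net, features, labels):
    with torch.no_grad():
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        return torch.sqrt(((clipped_preds.log() - labels.log()) ** 2).mean()).item()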
# Training loop
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    # the Adam optimization algorithm is used here
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    net = net.float()
    for epoch in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X.float()), y.float())
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
# K-fold cross-validation: used to select the model design and tune hyperparameters
def get_k_fold_data(k, i, X, y):
    # return the training and validation data needed for the i-th fold
    assert k > 1
    fold_size = X.shape[0] // k  # split the original dataset into k parts
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part  # the i-th part is the validation set; the other k-1 parts form the training set
        elif X_train is None:  # first training part encountered
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return X_train, y_train, X_valid, y_valid  # training set and validation set
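# Shape sanity check (illustrative, toy tensors only): with 10 samples and k=5, each validation
# fold holds 2 samples and the training part keeps the remaining 8.
_Xd, _yd = torch.randn(10, 3), torch.randn(10, 1)
_Xt, _yt, _Xv, _yv = get_k_fold_data(5, 0, _Xd, _yd)
assert _Xt.shape == (8, 3) and _Xv.shape == (2, 3)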
# Train K times in K-fold cross-validation and return the average training and validation errors
def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            func.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                          range(1, num_epochs + 1), valid_ls,
                          ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
# Model selection
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))
# Prediction and packaging the results
# Retrain on the full training set and predict the house prices; if the training error is close to
# the one seen in K-fold cross-validation, the result is likely to be reliable
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    func.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).detach().numpy()  # detach() cuts the predictions off from the graph so no gradients flow back
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('./submission.csv', index=False)
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
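# Quick check of the generated file (not in the original script; the two-column layout is what
# the Kaggle competition expects): submission.csv should hold 1459 rows of Id and SalePrice.
submission_check = pd.read_csv('./submission.csv')
assert list(submission_check.columns) == ['Id', 'SalePrice'] and len(submission_check) == 1459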
The All_function helper module:
import matplotlib.pyplot as plt
from IPython import display
def set_figsize(figsize=(3.5, 2.5)):
    use_svg_display()
    # set the figure size
    plt.rcParams['figure.figsize'] = figsize
def use_svg_display():
    # SVG: Scalable Vector Graphics
    """Use svg format to display plots in Jupyter."""
    display.set_matplotlib_formats('svg')
def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    set_figsize(figsize)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    if x2_vals and y2_vals:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
    plt.show()
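# Minimal usage sketch (illustrative values only), run when the module is executed directly:
if __name__ == '__main__':
    semilogy(range(1, 11), [2 ** -e for e in range(1, 11)], 'epochs', 'loss')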
Results:
fold 0, train rmse 0.239903, valid rmse 0.221246
fold 1, train rmse 0.229439, valid rmse 0.269979
fold 2, train rmse 0.231695, valid rmse 0.238354
fold 3, train rmse 0.237770, valid rmse 0.218455
fold 4, train rmse 0.230555, valid rmse 0.258525
5-fold validation: avg train rmse 0.233873, avg valid rmse 0.241312
train rmse 0.229990