欢迎您访问365答案网,请分享给你的朋友!
生活常识 学习资料

Kaggle 房价预测(House Prices)

时间:2023-05-24

"""Kaggle House Prices: linear-regression baseline with K-fold validation.

Reads ``data/train.csv`` and ``data/test.csv``, standardizes the numeric
columns, one-hot encodes the remaining ones, runs K-fold cross-validation on
a single linear layer, then trains on the full training set and writes the
predictions to ``submission.csv``.
"""
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
import matplotlib.pyplot as plt

train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
print(train_data.shape)
print(test_data.shape)
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

# Drop the first column of both tables and the last column (the label) of
# the training table, then stack them so both get identical preprocessing.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
print(all_features.shape)

# Standardize numeric features.
# select_dtypes(include='number') is used instead of ``dtypes != 'object'``
# so that string columns stored with a non-object string dtype are not
# mistaken for numeric ones.
numeric_features = all_features.select_dtypes(include='number').columns
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After standardization the column mean is 0, so missing values become 0.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the discrete features (NaN gets its own indicator column).
# An explicit float dtype is requested: newer pandas emits bool dummies,
# which would make ``.values`` an object array that torch.tensor rejects.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=np.float32)
print(all_features.shape)

# Convert to tensors.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values,
                              dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values,
                             dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1),
                            dtype=torch.float32)

loss = nn.MSELoss()
in_features = train_features.shape[1]


def get_net():
    """Return a fresh model: one linear layer from ``in_features`` to 1."""
    net = nn.Sequential(nn.Linear(in_features, 1))
    return net


# Evaluation metric.
def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float.

    Predictions are clamped to ``[1, inf)`` before taking the logarithm so
    the log is always defined and non-negative.
    """
    # Pure evaluation: no autograd graph is needed for a value that is
    # immediately turned into a Python float.
    with torch.no_grad():
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()


def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam on MSE loss and record log-RMSE per epoch.

    Args:
        net: model to optimize in place.
        train_features, train_labels: training tensors.
        test_features, test_labels: held-out tensors; pass ``None`` for
            ``test_labels`` to skip held-out evaluation.
        num_epochs: number of passes over the training data.
        learning_rate, weight_decay: forwarded to ``torch.optim.Adam``.
        batch_size: mini-batch size for the data iterator.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists. ``test_ls`` is
        empty when ``test_labels`` is ``None``.
    """
    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)
    for _ in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls


def get_k_fold_data(k, i, X, y):
    """Split ``(X, y)`` into ``k`` contiguous folds; fold ``i`` is validation.

    Each fold has ``X.shape[0] // k`` rows, so any remainder rows at the end
    of ``X`` are not used in either split.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        ValueError: if ``i`` is not a valid fold index in ``[0, k)``.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError(f'fold index i={i} out of range for k={k}')
    fold_size = X.shape[0] // k
    X_parts, y_parts = [], []
    X_valid, y_valid = None, None
    for j in range(k):
        idx = slice(fold_size * j, fold_size * (j + 1))
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_parts.append(X_part)
            y_parts.append(y_part)
    # One concatenation at the end replaces the None-sentinel accumulation.
    X_train = torch.cat(X_parts, 0)
    y_train = torch.cat(y_parts, 0)
    return X_train, y_train, X_valid, y_valid


def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run K-fold cross-validation with a fresh model per fold.

    Plots the learning curves of the first fold and prints the final
    log-RMSE of every fold.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds,
        each taken from the last epoch.
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    # NOTE(review): the original layout was lost; plt.show() is assumed to
    # run once after all folds rather than inside the loop -- confirm.
    plt.show()
    return train_l_sum / k, valid_l_sum / k


k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0.01, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')


def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set and write ``submission.csv``.

    Args:
        train_features, train_labels: full training tensors.
        test_feature: feature tensor of the test set to predict on.
        test_data: test DataFrame; must have an ``Id`` column. A
            ``SalePrice`` column holding the predictions is added to it
            in place.
        num_epochs, lr, weight_decay, batch_size: forwarded to ``train``.
    """
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    # Apply the network to the test set. The function's own parameter is
    # used here rather than the module-level ``test_features``.
    preds = net(test_feature).detach().numpy()
    # Reformat the predictions for export to Kaggle.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)


train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)

Copyright © 2016-2020 www.365daan.com All Rights Reserved. 365答案网 版权所有 备案号:

部分内容来自互联网,版权归原作者所有,如有冒犯请联系我们,我们将在三个工作时内妥善处理。