给定一组由输入 $x$ 和输出 $y$ 构成的数据集 $D = \{(x_1,y_1),(x_2,y_2),(x_3,y_3),\dots,(x_m,y_m)\}$,其中 $x_i = (x_{i1}; x_{i2}; x_{i3}; \dots; x_{id})$,$y_i \in \mathbb{R}$。
原理阐述:线性回归是通过训练学习得到一个线性模型,来最大限度地根据输入 $x$ 拟合输出 $y$,即 $\hat{y} = w x_i + b$;通过确定参数 $w$ 和 $b$,使得拟合值 $\hat{y}$ 和真实值 $y$ 的均方误差尽可能小。
# NumPy-based implementation of linear regression.
import numpy as np


def linear_loss(X, y, w, b):
    """Forward pass of the linear model plus MSE loss and its gradients.

    Args:
        X: (num_train, num_feature) data matrix.
        y: (num_train, 1) target column vector.
        w: (num_feature, 1) weight vector.
        b: scalar bias.

    Returns:
        y_hat: (num_train, 1) predictions, y_hat = X @ w + b.
        loss:  mean squared error over the batch.
        dw:    gradient of the loss w.r.t. w, shape (num_feature, 1).
        db:    gradient of the loss w.r.t. b, scalar.
    """
    num_train = X.shape[0]
    y_hat = np.dot(X, w) + b
    residual = y_hat - y
    loss = np.sum(residual ** 2) / num_train
    # Gradients of the mean of residual**2 (no 1/2 factor), matching the loss above.
    dw = np.dot(X.T, residual) / num_train
    db = np.sum(residual) / num_train
    return y_hat, loss, dw, db


def initialize_params(dims):
    """Return zero-initialized weights of shape (dims, 1) and bias 0.

    Args:
        dims: number of input features.
    """
    w = np.zeros((dims, 1))
    b = 0
    return w, b


def linear_train(X, y, learning_rate=0.01, epochs=20000, print_every=20000):
    """Fit the linear model by full-batch gradient descent.

    Args:
        X: (num_train, num_feature) training data.
        y: (num_train, 1) training targets.
        learning_rate: gradient-descent step size.
        epochs: number of gradient steps. Fixed off-by-one: the original
            `range(1, epochs)` ran only epochs - 1 iterations.
        print_every: progress-print interval (was hard-coded to 20000, so
            the default epochs never printed anything).

    Returns:
        loss_every: list of the per-epoch MSE values.
        params: dict with the fitted 'w' and 'b'.
        grads: dict with the last-step gradients 'dw' and 'db'.
    """
    loss_every = []
    w, b = initialize_params(X.shape[1])
    for i in range(1, epochs + 1):
        y_hat, loss, dw, db = linear_loss(X, y, w, b)
        w += -learning_rate * dw
        b += -learning_rate * db
        loss_every.append(loss)
        if i % print_every == 0:
            print("epoch %d loss %f" % (i, loss))
    params = {'w': w, 'b': b}
    grads = {'dw': dw, 'db': db}
    return loss_every, params, grads


def predict(X, params):
    """Predict targets for X using the fitted parameter dict {'w': ..., 'b': ...}."""
    w = params['w']
    b = params['b']
    y_pred = np.dot(X, w) + b
    return y_pred


def r2_score(y_test, y_pred):
    """Coefficient of determination: R^2 = 1 - SS_res / SS_tot."""
    y_avg = np.mean(y_test)
    ss_tot = np.sum((y_test - y_avg) ** 2)
    ss_res = np.sum((y_test - y_pred) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2


if __name__ == "__main__":
    # Demo script moved under a main guard so that importing this module no
    # longer trains a model as a side effect; running it as a script is unchanged.
    from sklearn.datasets import load_diabetes
    from sklearn.utils import shuffle

    diabetes = load_diabetes()
    data, target = diabetes.data, diabetes.target
    X, y = shuffle(data, target, random_state=13)  # shuffle reproducibly
    offset = int(X.shape[0] * 0.8)  # 80/20 train/test split point
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    y_train = y_train.reshape((-1, 1))
    y_test = y_test.reshape((-1, 1))
    print("X_train's shape:", X_train.shape)
    print("X_test's shape:", X_test.shape)
    # Train, predict on the held-out split, and report R^2.
    loss_every, params, grads = linear_train(X_train, y_train, 0.01, 200000)
    print(params)
    y_pred = predict(X_test, params)
    print(r2_score(y_test, y_pred))
# scikit-learn based implementation.
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import load_diabetes
from sklearn.utils import shuffle

# Load the diabetes dataset and shuffle it reproducibly.
diabetes = load_diabetes()
data, target = diabetes.data, diabetes.target
X, y = shuffle(data, target, random_state=13)

# 80/20 train/test split; targets reshaped to column vectors.
offset = int(X.shape[0] * 0.8)
X_train, y_train = X[:offset], y[:offset].reshape((-1, 1))
X_test, y_test = X[offset:], y[offset:].reshape((-1, 1))

# Fit ordinary least squares and predict on the held-out set.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
predict = model.predict(X_test)

# Report test-set metrics.
print("均方误差:%.2f" % mean_squared_error(y_test, predict))
print("R²:%.2f" % r2_score(y_test, predict))