1 Univariate Linear Regression
1.1 sklearn implementation (ordinary least squares)
import os
import pandas as pd
import matplotlib.pyplot as plt

current_dir = os.getcwd()
path = os.path.join(current_dir, "Salary Data.csv")

def plot_data(path):
    table = pd.read_csv(path)
    experience = table["Experience Years"]
    salary = table["Salary"]
    plt.figure(figsize=(8, 6))
    plt.scatter(experience, salary, color="blue", label="Data points")
    plt.title("Experience vs Salary")
    plt.xlabel("Experience (Years)")
    plt.ylabel("Salary")
    plt.grid(True)
    plt.legend()
    plt.show()

plot_data(path)
table = pd.read_csv(path)
y = table['Salary']
x = table[['Experience Years']]  # DataFrame, x.shape = (40, 1)
z = table['Experience Years']    # Series, z.shape = (40,)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=2529)
# x_train (28, 1), x_test (12, 1), y_train (28,), y_test (12,)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)
print(model.intercept_)  # 26596.961311068262
print(model.coef_)       # [9405.61663234]
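A prediction from this model is just intercept_ + coef_ * experience; a quick manual check using the values printed above (which depend on the random split):

# Illustrative check: reproduce model.predict by hand for 5 years of experience
years = 5
manual = model.intercept_ + model.coef_[0] * years
print(manual)  # about 73625, given the parameters printed above
print(model.predict(pd.DataFrame({"Experience Years": [years]})))  # should match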
from sklearn.metrics import mean_squared_error, r2_score
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("mse = ", mse)  # 24141421.671440993
r2 = r2_score(y_test, y_pred)
print("r2 = ", r2)    # 0.960233432146844
y_whole_pred = model.predict(x)
# x.iloc[:, 0] can also be written as x or x["Experience Years"]
plt.scatter(x.iloc[:, 0], y, color="blue", label="Data points")
plt.plot(x, y_whole_pred, color="red", linewidth=2, label='linear regression')
plt.xlabel("Experience (Years)")
plt.ylabel("Salary")
plt.legend()
plt.show()
1.2 NumPy implementation (gradient descent)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

def plot_data(path):
    table = pd.read_csv(path)
    experience = table["Experience Years"]
    salary = table["Salary"]
    plt.figure(figsize=(8, 6))
    plt.scatter(experience, salary, color="blue", label="Data points")
    plt.title("Experience vs Salary")
    plt.xlabel("Experience (Years)")
    plt.ylabel("Salary")
    plt.grid(True)
    plt.legend()
    plt.show()
class MyLinearReg:
    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.w = None
        self.b = None
        self.loss_history = []

    def fit(self, X, y):
        m, n = X.shape
        self.w = np.zeros(n)
        self.b = 0
        for epoch in range(self.epochs):
            # X (m, n) @ w (n,) -> (m,): NumPy broadcasting handles the matrix-vector product
            y_pred = X @ self.w + self.b  # y_pred (m,)
            residual = y_pred - y         # residual (m,)
            # Gradients of the cost J = (1/2m) * sum(residual**2)
            dcost_dw = (1 / m) * (X.T @ residual)
            dcost_db = np.mean(residual)
            self.w -= self.lr * dcost_dw
            self.b -= self.lr * dcost_db
            # Track the plain MSE (= 2J) for monitoring
            mean_loss = np.mean(residual ** 2)
            self.loss_history.append(mean_loss)
            if epoch % 100 == 99:
                print(f"Epoch {epoch} loss: {mean_loss}")
        print("Training finished.")
        print("Final parameters:", "Slope w=", self.w, " Bias b=", self.b)
        # Final parameters: Slope w= [9853.19132896] Bias b= 23780.770014707407

    def predict(self, X):
        return X @ self.w + self.b

    def get_params(self):
        return self.w, self.b
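Before running the class on real data, it is worth convincing ourselves that the gradient in fit is correct: it is the derivative of the half-MSE cost J = (1/2m) * sum((Xw + b - y)**2), while loss_history tracks the plain MSE (exactly 2J). A minimal finite-difference check, purely illustrative and on made-up data:

# Illustrative sketch: compare the analytic gradient against a central finite difference
rng = np.random.default_rng(0)
Xg = rng.normal(size=(5, 1))
yg = rng.normal(size=5)
w, b, eps = np.zeros(1), 0.0, 1e-6

def J(w, b):
    r = Xg @ w + b - yg
    return (r @ r) / (2 * len(yg))

analytic = (Xg.T @ (Xg @ w + b - yg)) / len(yg)
numeric = (J(w + eps, b) - J(w - eps, b)) / (2 * eps)
print(analytic, numeric)  # the two values should closely agree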
current_dir = os.getcwd()
path = os.path.join(current_dir, "Salary Data.csv")
# plot_data(path)
table = pd.read_csv(path)
x = table["Experience Years"].values  # x (40,)
y = table["Salary"].values            # y (40,)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# x_train (32,), x_test (8,), y_train (32,), y_test (8,)
x_train = x_train.reshape(-1, 1)  # the gradient descent code expects a 2-D feature matrix
x_test = x_test.reshape(-1, 1)
model = MyLinearReg()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
print("mse = ", mse)  # mse = 43053815.910611115
r2 = r2_score(y_test, y_pred)
print("r2 = ", r2)    # r2 = 0.9165907194371214
X = x.reshape(-1, 1)
y_whole_pred = model.predict(X)
plt.scatter(x, y, color="blue", label="Data points")
plt.plot(x, y_whole_pred, color="red", linewidth=2, label='linear regression')
plt.xlabel("Experience (Years)")
plt.ylabel("Salary")
plt.legend()
plt.show()
Epoch 99 loss: 111815444.20061775
Epoch 199 loss: 81534511.03025383
Epoch 299 loss: 61760636.04682423
Epoch 399 loss: 48848017.74472436
Epoch 499 loss: 40415896.49608463
Epoch 599 loss: 34909602.800390095
Epoch 699 loss: 31313915.621658318
Epoch 799 loss: 28965881.353634194
Epoch 899 loss: 27432581.973080143
Epoch 999 loss: 26431315.92580659
Training finished.
Final parameters: Slope w= [9853.19132896] Bias b= 23780.770014707407
mse = 43053815.910611115
r2 = 0.9165907194371214
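The loss log above is still falling slowly at epoch 999, and loss_history already holds the full curve; a small sketch (reusing the fitted model object from above) to visualize convergence:

# Illustrative sketch: plot the recorded per-epoch MSE
plt.figure(figsize=(8, 6))
plt.plot(model.loss_history, color="red")
plt.xlabel("Epoch")
plt.ylabel("Mean squared error")
plt.title("Gradient descent convergence")
plt.grid(True)
plt.show()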
2 Multivariate Linear Regression
2.1 sklearn implementation (ordinary least squares)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

def draw_PairwiseScatter(x, y):
    num_features = x.shape[1]
    plt.figure(figsize=(15, 10))
    for i in range(num_features):
        plt.subplot(3, 5, i + 1)  # subplot indices start at 1
        plt.scatter(x[:, i], y, marker='o', color="green", s=15, alpha=0.5)
        plt.xlabel("Feature {}".format(i + 1))
        plt.ylabel("Label")
        plt.title("Feature {} vs Target".format(i + 1))
    plt.tight_layout()
    plt.show()

def draw_real_pred(x, y, model):
    y_pred_whole = model.predict(x)
    num_features = x.shape[1]
    plt.figure(figsize=(15, 10))
    for i in range(num_features):
        plt.subplot(3, 5, i + 1)
        plt.scatter(x[:, i], y, marker='o', color="green", s=15, alpha=0.5)
        plt.scatter(x[:, i], y_pred_whole, marker="o", color="red", s=15, alpha=0.5)
        plt.xlabel("Feature {}".format(i + 1))
        plt.ylabel("Label")
        plt.title("Feature {} vs Target".format(i + 1))
    plt.tight_layout()
    plt.show()
current_dir = os.getcwd()
path = os.path.join(current_dir, "Boston.csv")
house = pd.read_csv(path)
y = house['MEDV']                 # (506,)
X = house.drop(['MEDV'], axis=1)  # (506, 13)
X = np.array(X)
y = np.array(y)
draw_PairwiseScatter(X, y)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2529)
# x_train (354, 13), x_test (152, 13), y_train (354,), y_test (152,)
# Ordinary least squares has a closed-form solution, so no gradient descent and no feature scaling is needed
# Normal equation: theta = (X^T X)^(-1) X^T y
model.fit(x_train, y_train)
print(model.intercept_)
print(model.coef_)
y_pred = model.predict(x_test)
from sklearn.metrics import mean_absolute_error, r2_score
print("mean_absolute_error(y_pred, y_test):", mean_absolute_error(y_pred, y_test))
print(model.score(x_test, y_test))  # score() reports R^2 on the test set
r2 = r2_score(y_test, y_pred)
print(r2)  # 0.6551914852365524
draw_real_pred(X, y, model)
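The normal-equation comment can be checked directly against sklearn's solution; a minimal sketch, assuming x_train and y_train from this section:

# Illustrative check: solve theta = (X^T X)^(-1) X^T y with an explicit bias column
X_b = np.hstack([np.ones((x_train.shape[0], 1)), x_train])
theta = np.linalg.pinv(X_b.T @ X_b) @ X_b.T @ y_train
print(theta[0])   # should match model.intercept_
print(theta[1:])  # should match model.coef_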
2.2 NumPy implementation (gradient descent)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

def draw_PairwiseScatter(x, y):
    num_features = x.shape[1]
    plt.figure(figsize=(15, 10))
    for i in range(num_features):
        plt.subplot(3, 5, i + 1)
        plt.scatter(x[:, i], y, marker='o', color="green", s=15, alpha=0.5)
        plt.xlabel("Feature {}".format(i + 1))
        plt.ylabel("Label")
        plt.title("Feature {} vs Target".format(i + 1))
    plt.tight_layout()
    plt.show()

def draw_real_pred(x, y, model):
    y_pred_whole = model.predict(x)
    num_features = x.shape[1]
    plt.figure(figsize=(15, 10))
    for i in range(num_features):
        plt.subplot(3, 5, i + 1)
        plt.scatter(x[:, i], y, marker='o', color="green", s=15, alpha=0.5)
        plt.scatter(x[:, i], y_pred_whole, marker='o', color="red", s=15, alpha=0.5)
        plt.xlabel("Feature {}".format(i + 1))
        plt.ylabel("Label")
        plt.title("Feature {} vs Target".format(i + 1))
    plt.tight_layout()
    plt.show()
class MultipleLinear:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.theta = None
        self.cost_history = None

    def fit(self, X, y):
        # Prepend a column of ones so theta[0] acts as the bias term
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        m, n = X.shape
        self.theta = np.zeros(n)
        self.cost_history = []
        for epoch in range(self.epochs):
            y_pred = X @ self.theta
            # Gradient of J = (1/2m) * sum((y_pred - y)**2)
            gradient = X.T @ (y_pred - y)
            self.theta -= self.learning_rate * gradient * (1 / m)
            cost = self.compute_cost(X, y)
            self.cost_history.append(cost)
            if epoch % 100 == 99:
                print(f"Epoch {epoch} cost: {cost}")
        print("Training complete")
        print(self.theta)

    def predict(self, X):
        m, n = X.shape
        X = np.hstack((np.ones((m, 1)), X))
        return X @ self.theta

    def compute_cost(self, X, y):
        # X here already includes the bias column
        m = X.shape[0]
        y_pred = X @ self.theta
        sq_errors = (y_pred - y) ** 2
        cost = 1 / (2 * m) * np.sum(sq_errors)
        return cost
current_dir = os.getcwd()
path = os.path.join(current_dir, "Boston.csv")
house = pd.read_csv(path)
y = house['MEDV']                 # (506,)
X = house.drop(['MEDV'], axis=1)  # (506, 13)
X = np.array(X)
y = np.array(y)
draw_PairwiseScatter(X, y)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# x_train (404, 13), x_test (102, 13), y_train (404,), y_test (102,)
# Gradient descent is sensitive to feature scales, so standardize before training
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)  # (404, 13)
x_test_scaled = scaler.transform(x_test)        # (102, 13)
model = MultipleLinear()
model.fit(x_train_scaled, y_train)
y_pred = model.predict(x_test_scaled)
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print("r2 = ", r2)  # r2 = 0.6543244875135051
# The model was trained on standardized inputs, so scale the full X before plotting its predictions
draw_real_pred(scaler.transform(X), y, model)
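Because training ran on standardized features x' = (x - mean) / scale, theta is expressed in scaled units; a sketch (assuming model and scaler from above) to map the parameters back to the original feature scale, comparable with section 2.1's intercept_ and coef_:

# Illustrative sketch: undo the standardization inside the fitted parameters
w_scaled, b_scaled = model.theta[1:], model.theta[0]
w_orig = w_scaled / scaler.scale_
b_orig = b_scaled - np.sum(w_scaled * scaler.mean_ / scaler.scale_)
print("intercept:", b_orig)  # comparable to sklearn's model.intercept_ in section 2.1
print("coef:", w_orig)       # comparable to sklearn's model.coef_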