Parametric vs. non-parametric models
A model like the earlier KNN makes no assumption about the form of f; models that can learn an arbitrary f from the data are called non-parametric.
Models that instead learn a fixed set of parameters are called parametric; the parametric form restricts the set of candidate functions f, which makes learning comparatively easier.
Logistic regression
Logistic function
Likelihood function
Log-likelihood function
For multi-class classification, use the softmax function
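For reference, the standard definitions behind these headings (consistent with the code below), for samples $(x_i, y_i)$ with $y_i \in \{0, 1\}$ and parameters $\theta$:

$$\sigma(z) = \frac{1}{1 + e^{-z}}, \qquad p_i = \sigma(\theta^\top x_i)$$

$$L(\theta) = \prod_{i=1}^{N} p_i^{y_i} (1 - p_i)^{1 - y_i}, \qquad \ell(\theta) = \sum_{i=1}^{N} \left[ y_i \log p_i + (1 - y_i) \log (1 - p_i) \right]$$

$$\mathrm{softmax}(z)_k = \frac{e^{z_k}}{\sum_j e^{z_j}}$$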
Key points
ROC curve
The curve traced by the true positive rate (TPR) against the false positive rate (FPR) as the classification threshold varies is called the ROC curve.
The area under the ROC curve is called the AUC.
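Here TPR and FPR are the usual confusion-matrix ratios:

$$\mathrm{TPR} = \frac{TP}{TP + FN}, \qquad \mathrm{FPR} = \frac{FP}{FP + TN}$$

and the auc function below estimates the area under the (FPR, TPR) curve directly from the sorted prediction scores.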
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
#%%
# Read the data from the source file and split it into features and labels
lines = np.loadtxt('./data/lr_dataset.csv', delimiter=',', dtype=float)
x_total = lines[:, 0:2]
y_total = lines[:, 2]
print('Dataset size:', len(x_total))
#%%
# Visualize the two classes in the dataset
pos_index = np.where(y_total == 1)
neg_index = np.where(y_total == 0)
plt.scatter(x_total[pos_index, 0], x_total[pos_index, 1], marker='o', color='coral', s=10)
plt.scatter(x_total[neg_index, 0], x_total[neg_index, 1], marker='x', color='blue', s=10)
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
#%%
# Shuffle with a fixed seed, then split 70/30 into train and test sets
np.random.seed(0)
ratio = 0.7
split = int(len(x_total) * ratio)
idx = np.random.permutation(len(x_total))
x_total = x_total[idx]
y_total = y_total[idx]
x_train, y_train = x_total[:split], y_total[:split]
x_test, y_test = x_total[split:], y_total[split:]
#%%
def acc(y_true, y_pred):
    # Fraction of predictions that match the true labels
    return np.mean(y_true == y_pred)

def auc(y_true, y_pred):
    # Sort samples by predicted score in descending order
    idx = np.argsort(y_pred)[::-1]
    y_true = y_true[idx]
    y_pred = y_pred[idx]
    tp = np.cumsum(y_true)      # cumulative true positives
    fp = np.cumsum(1 - y_true)  # cumulative false positives
    tpr = tp / tp[-1]
    fpr = fp / fp[-1]
    s = 0.0
    # Prepend the origin (0, 0) so the curve starts there
    tpr = np.concatenate([[0], tpr])
    fpr = np.concatenate([[0], fpr])
    # Integrate TPR over FPR with the rectangle rule to get the area
    for i in range(1, len(fpr)):
        s += (fpr[i] - fpr[i - 1]) * tpr[i]
    return s
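A quick sanity check of the two metric helpers (the tiny arrays below are made up for illustration and are not part of the original pipeline): a ranking that puts every positive above every negative should give AUC = 1.0.

#%%
# Sanity check: a perfect ranking gives AUC = 1.0 and accuracy = 1.0 at threshold 0.5
_y_true = np.array([0., 0., 1., 1.])
_y_score = np.array([0.1, 0.2, 0.8, 0.9])
print(acc(_y_true, _y_score >= 0.5))  # expected: 1.0
print(auc(_y_true, _y_score))         # expected: 1.0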
#%%
def logistic(z):
    return 1 / (1 + np.exp(-z))

def GD(num_steps, learning_rate, l2_coef):
    # Note: relies on the globals X, X_test, y_train, y_test defined below
    theta = np.random.normal(size=(X.shape[1],))
    train_losses = []
    test_losses = []
    train_acc = []
    test_acc = []
    train_auc = []
    test_auc = []
    for i in range(num_steps):
        pred = logistic(X @ theta)
        # Gradient of the L2-regularized negative log-likelihood
        grad = -X.T @ (y_train - pred) + l2_coef * theta
        theta -= learning_rate * grad
        train_loss = - y_train.T @ np.log(pred) \
            - (1 - y_train).T @ np.log(1 - pred) \
            + l2_coef * np.linalg.norm(theta) ** 2 / 2
        train_losses.append(train_loss / len(X))
        test_pred = logistic(X_test @ theta)
        test_loss = - y_test.T @ np.log(test_pred) \
            - (1 - y_test).T @ np.log(1 - test_pred)
        test_losses.append(test_loss / len(X_test))
        # Record the evaluation metrics, using 0.5 as the threshold
        train_acc.append(acc(y_train, pred >= 0.5))
        test_acc.append(acc(y_test, test_pred >= 0.5))
        train_auc.append(auc(y_train, pred))
        test_auc.append(auc(y_test, test_pred))
    return theta, train_losses, test_losses, \
        train_acc, test_acc, train_auc, test_auc
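For reference, GD performs plain gradient descent on the L2-regularized negative log-likelihood. Writing $p = \sigma(X\theta)$, the loop above computes

$$\mathcal{L}(\theta) = -\sum_{i} \left[ y_i \log p_i + (1 - y_i) \log (1 - p_i) \right] + \frac{\lambda}{2} \lVert \theta \rVert^2, \qquad \nabla_\theta \mathcal{L} = -X^\top (y - p) + \lambda \theta$$

which is exactly the grad expression in the code; the test loss is the same quantity without the regularization term.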
#%%
# Set the number of gradient-descent steps, the learning rate, and the L2 regularization coefficient
num_steps = 250
learning_rate = 0.002
l2_coef = 1.0
np.random.seed(0)
# Append a column of ones to x so the last entry of theta acts as the bias term
X = np.concatenate([x_train, np.ones((x_train.shape[0], 1))], axis=1)
X_test = np.concatenate([x_test, np.ones((x_test.shape[0], 1))], axis=1)
theta, train_losses, test_losses, train_acc, test_acc, \
train_auc, test_auc = GD(num_steps, learning_rate, l2_coef)
# Compute prediction accuracy on the test set
# (sigma(z) >= 0.5 exactly when z >= 0, so this thresholds the linear score at 0)
y_pred = np.where(logistic(X_test @ theta) >= 0.5, 1, 0)
final_acc = acc(y_test, y_pred)
print('Test accuracy:', final_acc)
print('Regression coefficients:', theta)
plt.figure(figsize=(13, 9))
xticks = np.arange(num_steps) + 1
#%%
# Plot the training and test loss curves
plt.subplot(221)
plt.plot(xticks, train_losses, color='blue', label='train loss')
plt.plot(xticks, test_losses, color='red', ls='--', label='test loss')
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
#%%
# Plot the training and test accuracy curves
plt.subplot(222)
plt.plot(xticks, train_acc, color='blue', label='train accuracy')
plt.plot(xticks, test_acc, color='red', ls='--', label='test accuracy')
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
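The AUC lists returned by GD are computed but never visualized in this excerpt. A minimal sketch of a third panel in the same 2x2 figure, mirroring the pattern of the two panels above (this panel is my addition, not part of the original code):

#%%
# Plot the training and test AUC curves (sketch; follows the panels above)
plt.subplot(223)
plt.plot(xticks, train_auc, color='blue', label='train AUC')
plt.plot(xticks, test_auc, color='red', ls='--', label='test AUC')
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()
plt.show()  # render the figure; drop this line if more panels follow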