1 前置知识
- 逻辑回归解决二分类问题。
- sigmoid函数:非线性函数,值域为(0,1),输出可解释为概率值
2 逻辑回归原理
3 癌症分类案例
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Breast-cancer classification with logistic regression.
# Load the dataset from the current working directory.
data = pd.read_csv('breast-cancer-wisconsin.csv')

# Basic cleaning: turn the '?' placeholders into NaN and drop every row that
# contains one. `np.nan` is used because the `np.NAN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
data = data.replace('?', np.nan).dropna()

# Features: every column except the first and the last one.
# NOTE(review): presumably the first column is a sample identifier - confirm
# against the CSV header.
x = data.iloc[:, 1:-1]
# Target: the 'Class' column.
y = data['Class']

# Split into train and test sets; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# Feature engineering (standardization): fit the scaler on the training set
# only, then apply the same transformation to the test set.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# Train the logistic-regression classifier.
LR = LogisticRegression()
LR.fit(x_train, y_train)

# Evaluation: print the predicted test labels, then the score on the test set.
y_predict = LR.predict(x_test)
print(y_predict)
acc = LR.score(x_test, y_test)
print(acc)
4 分类问题评估
4.1 混淆矩阵
4.2 精确率&召回率
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
import pandas as ps
# This section's own import binds pandas to `ps`, while the DataFrame
# construction below refers to `pd`; bind the conventional alias here so the
# snippet also runs on its own.
import pandas as pd

# Ground truth: 6 malignant ('恶性') samples followed by 4 benign ('良性') ones.
y_true = ['恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '良性', '良性', '良性', '良性']
# Predictions of two hypothetical models.
# Model A labels only 3 of the 6 malignant samples as malignant and never
# flags a benign sample.
y_predict_A = ['恶性', '恶性', '恶性', '良性', '良性', '良性', '良性', '良性', '良性', '良性']
# Model B labels all 6 malignant samples as malignant but also flags 3 benign
# samples.
y_predict_B = ['恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '恶性', '良性']
# Explicit label order used for the matrix rows and columns.
labels = ['恶性', '良性']

# Build the confusion matrices and wrap them in DataFrames so the printed
# output carries the label names on both axes.
m_A = confusion_matrix(y_true, y_predict_A, labels=labels)
m_B = confusion_matrix(y_true, y_predict_B, labels=labels)
df_A = pd.DataFrame(data=m_A, columns=labels, index=labels)
df_B = pd.DataFrame(data=m_B, columns=labels, index=labels)
print(df_A)
print(df_B)

# Accuracy of each model.
print('*' * 20)
print(accuracy_score(y_true, y_predict_A))
print(accuracy_score(y_true, y_predict_B))

# Precision, recall and F1 in that order, with malignant as the positive
# class. Each metric prints a divider, then model A, then model B.
for metric in (precision_score, recall_score, f1_score):
    print('*' * 20)
    print(metric(y_true, y_predict_A, pos_label='恶性'))
    print(metric(y_true, y_predict_B, pos_label='恶性'))
4.3 AUC指标&ROC曲线
5 电信客户流失案例
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,classification_report
# Telecom customer-churn prediction with logistic regression.
# Load the dataset from the current working directory.
data_df = pd.read_csv('churn.csv')

# Preprocessing: one-hot encode the frame.
data_df = pd.get_dummies(data_df)
# Drop the columns that became redundant after one-hot encoding.
data_df = data_df.drop(columns=['Churn_No', 'gender_Male'])
# Use 'label' as the name of the target column.
data_df = data_df.rename(columns={'Churn_Yes': 'label'})

# Class balance: print the relative frequency of each label value. The result
# is printed explicitly because a bare expression is discarded when this runs
# as a script.
print(data_df['label'].value_counts(normalize=True))

# Feature selection: keep three of the encoded columns as model inputs.
x = data_df[['Contract_Month', 'Dependents_att', 'internet_other']]
y = data_df['label']

# Train/test split: 20% held out, fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=22
)

# Train the model; class_weight='balanced' reweights the classes inversely to
# their frequency in the training labels.
LR = LogisticRegression(class_weight='balanced')
LR.fit(x_train, y_train)

# Predict hard labels for the threshold-based metrics.
y_predict = LR.predict(x_test)
print(y_predict)
# Predicted probability of the positive class (second column of
# predict_proba); ROC AUC needs a continuous score rather than hard labels.
y_score = LR.predict_proba(x_test)[:, 1]

# Each result is stored under its own name so the imported metric functions
# are not overwritten and the code can be re-run.
# Accuracy
print('*' * 20)
accuracy = accuracy_score(y_test, y_predict)
print('accuracy_score', accuracy)
# Precision
print('*' * 20)
precision = precision_score(y_test, y_predict)
print('precision_score', precision)
# Recall
print('*' * 20)
recall = recall_score(y_test, y_predict)
print('recall_score', recall)
# F1 score
print('*' * 20)
f1 = f1_score(y_test, y_predict)
print('f1_score', f1)
# AUC, computed from the positive-class probabilities
print('*' * 20)
auc_value = roc_auc_score(y_test, y_score)
print('roc_auc_score', auc_value)
# Classification report
report = classification_report(y_test, y_predict)
print(report)