机器学习入门
数据预处理:
- 将?替换为缺失值
data = data.replace(to_replace="?",value=np.nan)
- 丢掉缺失值
data.dropna(how="any)
#how=all删除全是缺失值的行和列
#haow=any删除有缺失值的行和列
- 将数据集划分成测试集和训练集
data[column_name[1:10]]是自变量x
data[column_name[10]是因变量y
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(data[column_names[1:10]],data[column_
- 查看训练集、测试集分布
y_train.value_counts()
y_test.value_counts()
- 进行标准化,上一篇说了标准化是什么。
from sklearn.preprocessing import StandardScaler
- 载入模型:逻辑回归(logistics),随机梯度参数估计(sgd)
# 逻辑回归
from sklearn.linear_model import LogisticRegression
# 随机梯度参数估计
from sklearn.linear_model import SGDClassifier
- 标准化
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)
- 初始化模型
lr = LogisticRegression()
sgdc = SGDClassifier()
- 训练模型,预测模型
# 模型训练
lr.fit(x_train,y_train)
# 预测模型
lr_y_predict = lr.predict(x_test)
sgdc.fit(x_train,y_train)
sgdc_y_predict = sgdc.predict(x_test)
- 衡量指标
from sklearn.metrics import classification_report
- 准确度
print ('Accuracy of LR Classifier:',lr.score(x_test,y_test))
print ('Accuracy of SGD Classifier:',sgdc.score(x_test,y_test))
- 生成报告
print(classification_report(y_test,lr_y_predict,target_names=['Benign','Malignant']))
print(classification_report(y_test,sgdc_y_predict,target_names=['Benign','Malignant']))