ml常见代码片段

news2024/12/23 19:04:57

常用ML代码片段

变换一列

# Derive the brand as the first whitespace-separated token of prod_name.
new_df['brand'] = new_df['prod_name'].apply(lambda x: x.split()[0])

变换2列

# Combine two columns row by row: the lambda receives one row at a time.
new_df['chip_total_sales'] = new_df.apply(lambda x: x['total_sales'] * x['is_chip'], axis = 1)
# axis=1 is the important part: it makes apply() pass rows (not columns) to the lambda.

groupby 计数,求和,取第一个值,取得rank

df_per_card = new_df.groupby('loyalty_card_no')[['total_sales', 'chip_total_sales']].sum() # per-card sum of both sales columns
df_per_card_pri = new_df.groupby('loyalty_card_no')['premium_customer'].min() # per-card minimum, used to pick a single value per card
df_per_card_pri = new_df.groupby('loyalty_card_no')['premium_customer'].count() # per-card count; NOTE: rebinds df_per_card_pri, replacing the min() result above
# Different columns can be aggregated with different methods and then merged into one DataFrame.

转换类别类型的列

def trans_one_col(df_data, col):
    """One-hot encode a categorical column of ``df_data`` in place.

    For every distinct non-null value ``v`` of ``df_data[col]`` a new 0/1
    indicator column named ``f"{col}{v}"`` is appended (most frequent value
    first, following ``value_counts()``), then the original column is
    deleted. If ``col`` is not a column of ``df_data`` nothing happens.

    Args:
        df_data: pandas DataFrame to modify in place.
        col: name of the categorical column to expand.

    Returns:
        None; ``df_data`` is mutated.
    """
    if col not in df_data.columns:
        return
    source = df_data[col]
    for value in source.value_counts().index.tolist():
        # Format the value instead of concatenating it, so non-string
        # categories (ints, bools, ...) no longer raise TypeError.
        # The vectorized comparison replaces a per-row Python lambda.
        df_data[f"{col}{value}"] = (source == value).astype(int)
    del df_data[col]

trans_one_col(df_per_card, "premium_customer")
trans_one_col(df_per_card, "lifestage")

一个使用逻辑回归,并且split的模板

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
train_set, test_set = train_test_split(df_per_card, test_size=0.2, random_state=42)
# This template treats the LAST column as the target and every other column as a feature.
X_train = train_set.iloc[:, :-1]
y_train = train_set.iloc[:, -1]
X_test = test_set.iloc[:, :-1]
y_test = test_set.iloc[:, -1]

logit = linear_model.LogisticRegression()
logit.fit(X_train, y_train)
# Hard class labels, used by the threshold-dependent metrics below.
pred = logit.predict(X_test)
# Continuous decision scores, used by ranking metrics (ROC curve / AUC).
prop_pred = logit.decision_function(X_test)

acc = accuracy_score(y_test, pred)
prec = precision_score(y_test, pred)
rec = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
# AUC is computed from the continuous scores: passing the hard labels in
# `pred` would collapse the ROC curve to a single operating point.
auc_s = roc_auc_score(y_test, prop_pred)
cmat = confusion_matrix(y_test, pred)

绘制ROC 曲线的模板

from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# ROC points are computed from the scores in prop_pred, with label 1 as the positive class.
fpr, tpr, thresholds = roc_curve(y_test, prop_pred, pos_label=1)
plt.figure(figsize = (6,4))
plt.plot(fpr, tpr, linewidth = 2)
# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0,1], [0,1], 'k--')
# NOTE(review): this is set after the curves are drawn; presumably it is meant to
# size the title and axis labels created below — confirm.
plt.rcParams['font.size'] = 12
plt.title('ROC curve for Chip Purchase Classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

在这里插入图片描述

对于绘制不同类别的分布图,单变量,hist

import matplotlib.pyplot as plt

# Overlay one histogram per customer tier for each listed feature.
class_labels = ["Mainstream_Tier", "Premium_Tier", "Budget_Tier"]
fig = plt.figure(figsize=(12, 8))

for feature in ["chip_proportion"]:
    feature_values = df_per_card[feature]
    # One series per tier, built in class_labels order so the legend entries match.
    tier_samples = [
        feature_values[df_per_card["premium_customer"] == tier]
        for tier in class_labels
    ]
    plt.hist(tier_samples, label=class_labels)
    plt.xlabel("Chip proportion")
    plt.ylabel("vvv")
    plt.legend()
    plt.title(feature)

plt.tight_layout()
plt.show()

在这里插入图片描述
sns pairplot 带上hue和reg 可以代替这个

直接绘制每个列的分布情况

data.hist(figsize=(12, 10))

判断是不是工作日

import datetime
def date_is_weekday(datestring):
    """Return 1 if a ``DD/MM/YYYY`` date is Monday-Friday, 0 if it is a weekend.

    Args:
        datestring: date text whose first three ``/``-separated fields are
            day, month and year, e.g. ``"01/12/2017"``.

    Returns:
        int: 1 for a weekday, 0 for Saturday or Sunday.
    """
    fields = datestring.split('/')
    year = int(fields[2])
    month = int(fields[1])
    day = int(fields[0])
    # weekday() numbers Monday as 0 and Sunday as 6, so 5 and 6 are the weekend.
    weekday_index = datetime.datetime(year, month, day).weekday()
    return 1 if weekday_index < 5 else 0

# Date values are expected in DD/MM/YYYY form, e.g. 01/12/2017.
data["Weekday"] = data.Date.apply(lambda x: date_is_weekday(x))

转为数值的类型

# errors="coerce" turns values that cannot be parsed as numbers into NaN instead of raising.
data["Rainfall(mm)"] = pd.to_numeric(data["Rainfall(mm)"], errors="coerce")

绘制箱图

# Box plot of a single column, drawn to spot outliers before cleaning.
ax = data.boxplot(column="Temperature (C)")  # column name
ax.set_ylabel('Temperature  before removing problem data')
plt.show()

删除偏差太大的点

# Treat negative humidity readings as missing values. A single .loc assignment
# writes into `data` itself; the chained form data[col][mask] = ... may assign
# to a temporary copy (SettingWithCopyWarning) and has no effect under pandas
# copy-on-write.
data.loc[data["Humidity (%)"] < 0, "Humidity (%)"] = np.nan

删除NA

# axis=1 with how='any' drops every COLUMN that contains at least one missing
# value (not rows); inplace=True modifies df directly.
df.dropna(how='any', axis=1, inplace=True)

使用pipeline的模板

from sklearn.pipeline import make_pipeline


# Three-step regression pipeline: median imputation -> standardization -> linear regression.
# NOTE(review): SimpleImputer, LinearRegression and mean_squared_error are not
# imported in the visible snippets — confirm they are imported elsewhere.
pipeline_step9 = Pipeline([ ('imputer', SimpleImputer(strategy="median")), 
                           ('std_scaler', StandardScaler()),
                            ('linreg', LinearRegression())
                          ])



# 80/20 train/test split with a fixed seed for reproducibility.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

print(type(train_set))

# Target is the "Rented Bike Count" column; features are the columns listed in
# `selected_columms` (sic), which must be defined before this snippet runs.
y_train = train_set["Rented Bike Count"]
X_train = train_set[selected_columms]

y_test = test_set["Rented Bike Count"]
X_test = test_set[selected_columms]

pipeline_step9.fit(X_train, y_train)
# Predict labels for training features
predictions = pipeline_step9.predict(X_train)
# Measure prediction error, for example:
# (computed on the TRAINING data, so it measures fit rather than generalization)
mse = mean_squared_error(y_train, predictions)


import math
# calculate the RMSE of the fit to the training data
rmse_train = math.sqrt(mse)


绘制真实y与预测y的散点图,并与理想拟合直线(y = x)放在一起作比较

# Only the first `subset_size` training rows are predicted and plotted.
subset_size = 200
y_train_pred = pipeline_step9.predict(X_train[:subset_size])

# Then I create a scatterplot of predicted vs actual values using your variables from the cell above
ax = sns.scatterplot(x=y_train[:subset_size], y=y_train_pred)
# A perfect solution would look like the red line
# (the actual values plotted against themselves, i.e. the line y = x)
sns.lineplot(x=y_train[:subset_size], y=y_train[:subset_size], color='red')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')

在这里插入图片描述

cross validation 探索模型的稳定性

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# preprocessed_data
# NOTE(review): the transformer is fitted on all of X_train before cross-validation,
# so if pipeline_step7 learns statistics (imputation values, scaling) each fold's
# validation rows have influenced preprocessing. Consider passing one pipeline
# (preprocessing + model) to cross_val_score instead — confirm what pipeline_step7 contains.
preprocessed_data_train_X = pipeline_step7.fit_transform(X_train)


#Linear Regression CV mean and std RMSE from the 10 folds:
lr_model = LinearRegression()
scores = cross_val_score(lr_model, preprocessed_data_train_X, y_train,
                           scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE values, so flip the sign before taking the square root.
rmse_scores = np.sqrt(-scores)
rmse_LR_mean = rmse_scores.mean()
rmse_LR_std  = rmse_scores.std()
print('Linear Regression CV Scores:') 
print(f'Mean: {rmse_LR_mean:.2f}, Std: {rmse_LR_std:.2f}\n')

GridSearch 搜参数

from sklearn.svm import SVC
# Put the pipeline with the appropriate model 
svc_pl = Pipeline(steps=[
            ('preprocessor', preproc_pl),
            ('svc', SVC(random_state=42))
        ])

# Keys are prefixed with the pipeline step name ('svc') followed by a double
# underscore. The grid has 4 * 4 * 2 = 32 parameter combinations.
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__gamma': ['scale', 'auto']
}

# Use GridSearchCV with cv=5 
# return_train_score=True also records the training scores in cv_results_.
# NOTE(review): GridSearchCV is not imported in the visible snippets — confirm it is imported elsewhere.
svc_model =  GridSearchCV(svc_pl, param_grid, cv=5, return_train_score=True)

svc_model.fit(X_train, y_train)

# Return best parameters in a dictionary
svc_best_parameters = svc_model.best_params_


# NOTE(review): this reads the KNN search (knn_model, defined elsewhere) inside the
# SVC section; it looks like a copy-paste leftover and svc_model.best_score_ may
# have been intended — confirm.
knn_best_cv_scoring = knn_model.best_score_

我们可以看搜参过程中 误差是怎么变的

# Function to check the performance of each parameter.
def pooled_var(stds):
    """Return the pooled standard deviation of equally sized groups.

    Implements ``sqrt(sum((n - 1) * s_i**2) / (k * (n - 1)))`` for ``k``
    group standard deviations ``s_i``, each assumed to come from a group of
    ``n = 5`` observations (matching the ``cv=5`` searches in this file).
    Despite the name, the result is a standard deviation, not a variance.

    Args:
        stds: array-like of per-group standard deviations (e.g. a pandas
            Series handed over by ``groupby(...).agg``).

    Returns:
        The pooled standard deviation as a float.
    """
    # https://en.wikipedia.org/wiki/Pooled_variance#Pooled_standard_deviation
    n = 5  # size of each group
    stds = np.asarray(stds, dtype=float)
    weighted_sum = np.sum((n - 1) * stds ** 2)
    # The whole product is the denominator. Without the parentheses the
    # expression divided by len(stds) and then MULTIPLIED by (n - 1),
    # inflating the result by a factor of (n - 1).
    degrees_of_freedom = len(stds) * (n - 1)
    return np.sqrt(weighted_sum / degrees_of_freedom)

# Function to create loss curves
def plot_gridSearchCV_loss_curve(cv_results, grid_params, title):
    """Plot mean train / cross-validation score against each searched parameter.

    One subplot is drawn per entry of ``grid_params``. For every value of
    that parameter, the scores of all grid candidates sharing the value are
    averaged, and a shaded band of +/- one pooled standard deviation (see
    ``pooled_var``) is drawn around each curve.

    Args:
        cv_results: ``cv_results_`` of a fitted search. It must contain the
            ``mean_train_score`` / ``std_train_score`` entries, i.e. the
            search was created with ``return_train_score=True``.
        grid_params: mapping of parameter name -> list of searched values
            (the search's ``param_grid``).
        title: text placed in front of " Validation curves" in the figure
            title.

    Returns:
        None; the figure is displayed with ``plt.show()``.
    """
    df = pd.DataFrame(cv_results)
    results = ['mean_test_score',
               'mean_train_score',
               'std_test_score',
               'std_train_score']

    # squeeze=False always returns a 2-D array of axes, so a grid with a
    # single parameter no longer fails when indexing ``axes[0]``.
    fig, axes = plt.subplots(1, len(grid_params),
                             figsize=(5 * len(grid_params), 7),
                             sharey='row', squeeze=False)
    axes = axes[0]
    axes[0].set_ylabel("Score", fontsize=25)

    lw = 2
    for idx, (param_name, param_range) in enumerate(grid_params.items()):
        # sort=False keeps the groups in order of first appearance instead of
        # sorting the keys. The rows are plotted positionally against
        # param_range, so a sorted result would mislabel any parameter whose
        # values are not listed in sorted order (e.g. ['scale', 'auto']).
        # Assumes cv_results rows follow the grid order, where each
        # parameter's values first appear in the order they were listed.
        grouped_df = df.groupby(f'param_{param_name}', sort=False)[results]\
            .agg({'mean_train_score': 'mean',
                  'mean_test_score': 'mean',
                  'std_train_score': pooled_var,
                  'std_test_score': pooled_var})

        ax = axes[idx]
        ax.set_xlabel(param_name, fontsize=30)
        ax.set_ylim(0.0, 1.1)
        ax.plot(param_range, grouped_df['mean_train_score'], label="Training score",
                color="darkorange", lw=lw)
        ax.fill_between(param_range,
                        grouped_df['mean_train_score'] - grouped_df['std_train_score'],
                        grouped_df['mean_train_score'] + grouped_df['std_train_score'],
                        alpha=0.2, color="darkorange", lw=lw)
        ax.plot(param_range, grouped_df['mean_test_score'], label="Cross-validation score",
                color="navy", lw=lw)
        ax.fill_between(param_range,
                        grouped_df['mean_test_score'] - grouped_df['std_test_score'],
                        grouped_df['mean_test_score'] + grouped_df['std_test_score'],
                        alpha=0.2, color="navy", lw=lw)

    # Every subplot draws the same two curves, so one shared legend taken
    # from the first axes is enough.
    handles, labels = axes[0].get_legend_handles_labels()
    fig.suptitle(f'{title} Validation curves', fontsize=30)
    fig.legend(handles, labels, loc=8, ncol=2, fontsize=20)

    fig.subplots_adjust(bottom=0.25, top=0.85)
    plt.show()

# Check the performance for each model (knn, dt, svc and sgd). Use plot_gridSearchCV_loss_curve() function.
plot_gridSearchCV_loss_curve(knn_model.cv_results_,knn_model.param_grid, "KNN classifier")
plot_gridSearchCV_loss_curve(dt_model.cv_results_, dt_model.param_grid, "Decision Tree classifier")
plot_gridSearchCV_loss_curve(svc_model.cv_results_,svc_model.param_grid, "SVC classifier")
plot_gridSearchCV_loss_curve(sgd_model.cv_results_,sgd_model.param_grid, "SGD classifier")

在这里插入图片描述

绘制分类决策边界

from sklearn.inspection import DecisionBoundaryDisplay

# Assign the name of the best feature obtained in step18 to the variable below. (string)
feature_one = best_four_features[0]

# Assign the name of the second best feature obtained in step18 to the variable below. (string)
feature_two = best_four_features[1]

# Assign the training dataset that you would want to use for this step to the variable below
data2d = data[[feature_one, feature_two]]

'''
Check the decumentation of DecisionBoundaryDisplay in sklearn from 
https://scikit-learn.org/stable/modules/generated/sklearn.inspection.DecisionBoundaryDisplay.html.

Use DecisionBoundaryDisplay.from_estimator(...) and assign the instance to the variable
below.

comment out the call to DecisionBoundaryDisplay.from_estimator(...) and all the ploting lines before uploading to gradescope.
'''

# Two-feature model: the shared preprocessor followed by an SGD classifier with hinge loss.
# NOTE(review): SGDClassifier is not imported in the visible snippets — confirm it is imported elsewhere.
final_model2 =  Pipeline(steps=[
            ('preprocessor', preproc_pl),
            ('classifier', SGDClassifier(random_state=42, alpha=0.01, eta0=10, learning_rate='adaptive', loss='hinge', penalty="l2"))
        ])

# The model is fitted on the two selected columns of the full `data` frame.
final_model2.fit(data2d, data.label)
disp_step19 = DecisionBoundaryDisplay.from_estimator(final_model2, data2d,
                                                     response_method="predict",
                                                     xlabel=feature_one, 
                                                    ylabel=feature_two, alpha = 0.5)

# Plotting the data points. Use this to create the scatter plot
# NOTE(review): the points come from X_train / y_train while the model above was
# fitted on `data` — confirm these refer to the same (or compatible) samples.
disp_step19.ax_.scatter(X_train[feature_one], X_train[feature_two],
                        c=y_train, edgecolor="k",
                        cmap=plt.cm.coolwarm)
plt.xlim(-0.3, 0.3)
# NOTE(review): the title says "tree" but the fitted estimator is an SGDClassifier — confirm the wording.
plt.title(f"Decision surface for tree trained on {feature_one} and {feature_two}")
plt.show()

在这里插入图片描述

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/473039.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

C语言起源、特性和发展历程

本文从ALGOL 60语言谈起,简述C语言的起源和发展历程,然后对C语言的一些特性做了探讨,最后说说C语言为什么在众多编程语言中,起到了承上启下的作用。 本文介绍以下内容: C语言的起源、C语言的发展、C语言的特性、C语言的重要…

外卖项目优化-01-redis缓存短信验证码、菜品数据

文章目录 外卖项目优化-01课程内容前言1. 环境搭建1.1 版本控制解决branch和tag命名冲突 1.2 环境准备 2. 缓存短信验证码2.1 思路分析2.2 代码改造2.3 功能测试 3. 缓存菜品信息3.1 实现思路3.2 代码改造3.2.1 查询菜品缓存3.2.2 清理菜品缓存 3.3 功能测试3.4 提交并推送代码…

AutoGPT安装教程

最近安装AutoGPT时遇到了一些问题,写下这篇文章记录一下 1 下载AutoGPT AutoGPT链接:https://github.com/Significant-Gravitas/Auto-GPT/tree/v0.2.2 下载AutoGPT 推荐下载stable 版本 2 申请openai 的api key 获取api的key,这里就不介…

【超算/先进计算学习】日报8

目录 今日已完成任务列表遇到的问题及解决方案任务完成详细笔记阶段一阶段二阶段三阶段四 对自己的表现是否满意简述下次计划其他反馈 今日已完成任务列表 超算/高性能计算总结 遇到的问题及解决方案 无 任务完成详细笔记 阶段一 在学习的第一阶段,我们首先对需要…

ChatGPT+智能家居在AWE引热议 OpenCPU成家电产业智能化降本提速引擎

作为家电行业的风向标和全球三大消费电子展之一,4月27日-30日,以“智科技、创未来”为主题的AWE 2023在上海新国际博览中心举行,本届展会展现了科技、场景等创新成果,为我们揭示家电与消费电子的发展方向。今年展馆规模扩大至14个…

【pytest里的参数化:看几个例子就够了!】

参数化多个参数:可以使用多个参数来参数化测试。例如: import pytest @pytest.mark.parametrize("x, y, expected", [(1, 2, 3), (3, 4, 7), (5, 6, 11), ]) def test_addition(x, y, expected): assert x + y == expected 参数化列表:可以…

轻叶H5营销单页,让你的营销更加清爽高效

网络营销就是营销企业品牌形象、产品信息发布、优惠促销活动,最终目的就是争抢流量和客户。现在为了吸引流量,各种营销方式、广告玩法层出不穷,成本投入大,带来的转化不一定好。今天,我们要来讲一讲H5营销单页。 H5营销…

项目管理软件project下载安装配置图文教程

目录 前言 配置安装过程 总结 前言 Project是一种计划、组织和管理任务的工具,通常用于团队协作和项目管理。它可以帮助用户创建任务列表、分配任务、设置截止日期、跟踪进度、分析数据等。Project还可以生成各种报告,如甘特图、资源使用情况、任务分…

稀疏矩阵存储格式总结

稀疏矩阵是指矩阵中的元素大部分是0的矩阵,实际问题中大规模矩阵基本上都是稀疏矩阵,很多稀疏度在90%甚至99%以上,大规模的稀疏造成了大量无效数据的计算和存储资源占用,也无法有效地载入有限内存计算。因此我们需要有高效的稀疏矩阵存储格式…

SpringCloud:ElasticSearch之数据同步

elasticsearch中的酒店数据来自于mysql数据库,因此mysql数据发生改变时,elasticsearch也必须跟着改变,这个就是elasticsearch与mysql之间的数据同步。 1.思路分析 常见的数据同步方案有三种: 同步调用、异步通知、监听binlog 1.1.同…

Nacos配置中心的详解与搭建

Namespace 简介 用于进行租户粒度的配置隔离,不同的命名空间下,可以存在相同的 Group 或 Data ID 的配置 配置Namespace 点击nacos的命名空间——点击新建命名空间 开发环境【dev】测试环境【test】正式环境【prod】 DataID 简介 Data ID 通常用于…

Node.js 下载与安装教程

文章目录 Node.js 下载Node.js 安装npm 配置配置node_path修改用户变量更换npm源为淘宝镜像全局安装基于淘宝源的cnpm Node.js 下载 1.进入nodejs官网:https://nodejs.org/en 2.单击downloads 3.此时滑动滚动条,找到并单击 previous release 4.在此页…

<C++>lesson1.C++入门上

文章目录 1. C++关键字(C++98)💚 2. 命名空间🤎 2.1 命名空间定义 2.2 命名空间的使用 3. C++输入/输出🖤 4. 缺省参数💙 4.1 缺省参数概念 4.2 缺省参数分类 5. 函数重载❤️ 5.1 函数重载的概念 5.2 C++支持函数重载的原理 6. 引用💜 6.…

Day4_Springboot集成Mybatis

上一节使用springboot框架搭建了项目,并创建了数据库user表,接下来集成mybatis对用户表实现增删改查操作~~~~ 目录 SpringBootApplication.java 创建model/entity文件夹,存放实体类 UserDao.java UserController.java 浏览器Json插件&…

Leetcode刷题日志3.0

目录 前言: 1.相对名次 2.学生出勤记录 I 3.重塑矩阵 4.分糖果 5.最长和谐子序列 6.种花问题 前言: 今天我就分享一下最近在leetcode刷到的题,希望对大家有所帮助。编程语言:Python3。好了废话不多讲了…

消息队列使用场景介绍

消息队列中间件是分布式系统中重要的组件,主要解决应用耦合,异步消息,流量削峰等问题 实现高性能,高可用,可伸缩和最终一致性架构 使用较多的消息队列有ActiveMQ,RabbitMQ,ZeroMQ,Ka…

【华中农业大学2023年十二届程序设计竞赛(同步赛)】B. 写信

文章目录 题目描述 思路 代码 题目描述 思路 错位排列(错排),可用搜索引擎查询。复杂度太高 递推式: $f[n] = (n-1)\cdot\bigl(f[n-1] + f[n-2]\bigr)$ 正解:打表!YYDS 1e9的数…

12.Hadoop练习题

1.网络问题 (1)机器联网出现问题 情况:ping一下百度,发现百度ping不通 sudo vim /etc/sysconfig/network-scripts/ifcfg-ens33 检查GATEWAY是否正确,修改过来之后保存退出,重启虚拟机 sudo systemctl re…

图论 (Java) 从入门到入土 /第一部分 图的基础-图的定义/

零.前言 图,是一种比较复杂的数据结构。和树的一个节点只和上层一个节点相连不同,在图中,任意两个节点都可能相连,且可能具有方向性,并且节点的边具有权重,因此,图被用于描述各种复杂的数据对象…

python:tkinter 生成 buttonBar 示例

tk_test1.py # -*- coding: utf-8 -*- import os import tkinter as tk from tkinter import filedialog root = tk.Tk() root.title("生成 buttonBar 示例 ") var = tk.StringVar() # 动态字符串 label = tk.Label(root, textvariable=var) listbox = tk.Listbox(root, s…