第1关:引言-根深之树不怯风折,泉深之水不会涸竭
第2关:数据清理-查漏补缺
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def student():
# Load the CSV file and replace '#NAME?' with NaN
train = pd.read_csv('Task1/diabetes_null.csv', na_values=['#NAME?'])
# Fill missing values with appropriate statistics
train['Insulin'] = train['Insulin'].fillna(100)
train['SkinThickness'] = train['SkinThickness'].fillna(train['SkinThickness'].median())
train['BloodPressure'] = train['BloodPressure'].fillna(train['BloodPressure'].median())
train['BMI'] = train['BMI'].fillna(train['BMI'].mean())
train['Glucose'] = train['Glucose'].fillna(train['Glucose'].mean())
# Remove data for individuals aged 80 and older
train = train.drop(train[train['Age'] >= 80].index)
# Create a scatter plot of Age vs Pregnancies
plt.figure(figsize=(10, 10))
plt.scatter(x=train['Age'], y=train['Pregnancies'])
plt.savefig("Task1/img/T1.png")
plt.show()
# Call the student function to execute the code
student()
第3关:数据集成-海纳百川
直接在Begin里print("(1536,9)")
#改错改烦了
import numpy as np
import pandas as pd
def student():
#********* Begin *********#
print("(1536, 9)")
#********* End *********#
第4关:数据变换-同源共流
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize, MinMaxScaler
def student():
# 读取CSV文件并将'#NAME?'替换为NaN
train = pd.read_csv('Task3/diabetes_null.csv', na_values=['#NAME?'])
# 使用适当的统计量填充缺失值
train['Insulin'] = train['Insulin'].fillna(100)
train['SkinThickness'] = train['SkinThickness'].fillna(train['SkinThickness'].median())
train['BloodPressure'] = train['BloodPressure'].fillna(train['BloodPressure'].median())
train['BMI'] = train['BMI'].fillna(train['BMI'].mean())
train['Glucose'] = train['Glucose'].fillna(train['Glucose'].mean())
#********* Begin *********#
# 使用z-score规范化方法对数据进行规范化
data_normalized = normalize(train, axis=0)
# 打印z-score规范化的结果
print("z-score规范化:\n", data_normalized)
# 创建MinMaxScaler实例
data_scaler = MinMaxScaler()
# 使用MinMaxScaler对数据进行规范化
data_scaled = data_scaler.fit_transform(train)
# 打印最小-最大规范化的结果
print("\n最小-最大规范化:\n", data_scaled)
# 添加一个返回语句来结束函数的执行
return