When I come across good code, I like to collect it.
This is a baseline for the 2024 iFLYTEK Power Demand Forecasting Challenge. You can find baselines for more competitions in the AI-and-competition repository. The official competition page: 电力需求预测挑战赛. At the time of writing, this baseline ranks relatively high on the leaderboard.
Notebook: 2024-baseline-lb-229.ipynb in the folder 202407科大讯飞电力需求预测baseline of yunsuxiaozi/AI-and-competition on GitHub.
1 What are the sin and cos functions for? Periodicity! (illustrated right below)
2 Historical (lag) features and rolling time windows;
3 Easter egg: a pure time-series model might work even better!
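On point 1: a raw day counter treats day 0 and day 7 of a weekly cycle as far apart, whereas a sin/cos encoding maps them to the same point on a circle, so the model can see the period. A tiny standalone illustration (not part of the baseline):
# Illustrative only: cyclically encode a day counter with a 7-day period.
import numpy as np
days = np.arange(15)
sin7 = np.sin(2*np.pi*days/7)
cos7 = np.cos(2*np.pi*days/7)
# days 0, 7 and 14 get (numerically almost) identical encodings: the feature repeats every 7 days
print(np.round(sin7[[0,7,14]],3), np.round(cos7[[0,7,14]],3))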
import pandas as pd  # CSV loading and dataframe manipulation
import numpy as np  # matrix operations and scientific computing
# model: LightGBM regressor plus the log_evaluation callback
from lightgbm import LGBMRegressor, log_evaluation
# KFold splits directly into k folds; StratifiedKFold additionally preserves the proportion of each class in every fold (a short contrast follows the imports)
from sklearn.model_selection import KFold
import gc  # garbage collection
import warnings  # suppress warnings that can safely be ignored
warnings.filterwarnings('ignore')  # filterwarnings() configures the warning filter, i.e. how and at what level warnings are reported
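A quick contrast of the two splitters mentioned in the import comment above (illustrative only; the baseline itself uses plain KFold because the target is continuous):
# Illustrative only: KFold ignores the label when splitting, StratifiedKFold keeps the class ratio per fold.
from sklearn.model_selection import KFold, StratifiedKFold
import numpy as np
X_demo = np.arange(20).reshape(-1,1)
y_demo = np.array([0]*16+[1]*4)  # imbalanced toy label
for name, splitter in [("KFold", KFold(n_splits=4, shuffle=True, random_state=0)),
                       ("StratifiedKFold", StratifiedKFold(n_splits=4, shuffle=True, random_state=0))]:
    ratios = [round(y_demo[va].mean(),2) for _, va in splitter.split(X_demo, y_demo)]
    print(name, "positive ratio per validation fold:", ratios)  # StratifiedKFold keeps 0.2 in every fold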
#config
class Config():
    seed=2024  # random seed
    num_folds=10  # number of folds for K-fold cross-validation
    TARGET_NAME ='target'  # label column
import random  # functions for generating random numbers
# Fix the random seeds so the model results are reproducible
def seed_everything(seed):
    np.random.seed(seed)  # numpy seed
    random.seed(seed)  # Python built-in seed
seed_everything(Config.seed)
path='/kaggle/input/kedapower/2407keda/'
train=pd.read_csv(path+"train.csv")
test=pd.read_csv(path+"test.csv")
train.head()
# Concatenate train and test first, then sort by id and dt. Sorting by id keeps each household's rows together (one id is one household); dt is sorted in descending order because dt means "days before the present".
total=pd.concat((train,test),axis=0)
#id ascending, dt descending, since dt is the number of days before the present.
total=total.sort_values(['id','dt'],ascending=[True,False])
print(f"len(total):{len(total)}")
total.head()
print("type feature")
# one-hot encode the user type
user_type=total['type'].unique()
for t in user_type:
    total[f'type_{t}']=(total['type']==t)
"""
dt:虽然dt是脱敏处理后的天数,但这里还是按照常规的day来处理。7是一周,30是一个月,90是一个季度,180是半年,365是一年,由于这些特征是人类通常表示时间的周期,用sin和cos提取特征。之后提取了差分特征,并提取了常规的dayofweek,month,year特征。
"""
print("dt feature")
for day in [7,30,90,180,365]:
total[f'sin_dt_{day}']=np.sin(2*np.pi*total['dt']/day)
total[f'cos_dt_{day}']=np.cos(2*np.pi*total['dt']/day)
for gap in [2,4,7,15,30,60]:
for col in ['sin_dt_7','cos_dt_7','sin_dt_30','cos_dt_30','sin_dt_90','cos_dt_90','sin_dt_180','cos_dt_180','sin_dt_365','cos_dt_365']:
total[f"{col}_shift{gap}"]=total.groupby(total['id'])[col].shift(gap)
total[f"{col}_gap{gap}"]=total[col]-total[f"{col}_shift{gap}"]
total.drop([f"{col}_shift{gap}"],axis=1,inplace=True)
total['dt_dayofweek']=total['dt']%7
for d in range(7):
total[f'dt_dayofweek_{d}']=(total['dt_dayofweek']==d)
total['dt_month']=total['dt']//30%12
for d in range(12):
total[f'dt_month_{d}']=(total['dt_month']==d)
total['dt_year']=total['dt']//365
# target: a lot of target-based features are extracted, fairly brute-force; they are essentially groupby statistics, read through them yourself.
print("target feature")
for gap in [10,13,20]:
    for col in ['target']:
        # lag feature: the value of target gap days earlier for the same id
        total[f"{col}_shift{gap}"]=total.groupby('id')[col].shift(gap)
        for w in [3,6,10]:
            # rolling-mean feature: mean over a w-day window, lagged by gap days, computed per id
            total[f"{col}_gap{gap}_window{w}"]=total.groupby('id')[col].transform(lambda x: x.rolling(w).mean().shift(gap))
for col in [['id'],['type'],['dt_dayofweek'],['dt_month'],
['id','type'],['id','dt_dayofweek'],['id','dt_month'],
['type','dt_dayofweek'],['type','dt_month']
]:
print(f"col:{col}")
if len(col)>1:
colname='_'.join(col)
else:
colname=col[0]
total[f"{colname}_target_min"]=total.groupby(col)['target'].transform('min')
total[f"{colname}_target_max"]=total.groupby(col)['target'].transform('max')
total[f"{colname}_target_mean"]=total.groupby(col)['target'].transform('mean')
total[f"{colname}_target_median"]=total.groupby(col)['target'].transform('median')
total[f"{colname}_target_std"]=total.groupby(col)['target'].transform('std')
total[f"{colname}_target_skew"]=total.groupby(col)['target'].transform('skew')
total[f"{colname}_target_last"]=total.groupby(col)['target'].transform('last')
total[f"{colname}_target_first"]=total.groupby(col)['target'].transform('first')
total[f"{colname}_target_nunique"]=total.groupby(col)['target'].transform('nunique')
total[f"{colname}_target_ptp"]=total[f"{colname}_target_max"]-total[f"{colname}_target_min"]
total[f"{colname}_target_max/min"]=total[f"{colname}_target_max"]/(total[f"{colname}_target_min"]+1e-15)
total[f"{colname}_target_mean/std"]=total[f"{colname}_target_mean"]/(total[f"{colname}_target_std"]+1e-15)
"""
这里也根据模型的特征重要性筛选了一些特征,特征重要性小于10就drop。这里选择dt<=500的训练数据进行训练是因为一方面shift特征的构造中会有一些缺失值,另一方面时间上距离0太远的数据可能也没什么用。
"""
# features to drop, selected from the model's feature importances
useless_cols=['cos_dt_30_gap30', 'cos_dt_30_gap60', 'type_16', 'cos_dt_7_gap7','sin_dt_30_gap30', 'type_5','sin_dt_30_gap60', 'type_0', 'type_1', 'type_10', 'type_11', 'sin_dt_7_gap7']
total.drop(useless_cols,axis=1,inplace=True)
train=total[~total['target'].isna()]
train=train[train['dt']<=500]
test=total[total['target'].isna()]
del total
gc.collect()
print(f"features_count:{len(test.columns)}")
train.head()
# Walk over every column of the dataframe and downcast its dtype to reduce memory usage
def reduce_mem_usage(df, float16_as32=True):
    # memory_usage() gives the memory of each column in bytes; sum them and convert B -> KB -> MB
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:  # iterate over the column names
        col_type = df[col].dtype  # dtype of this column
        if col_type != object and str(col_type)!='category':  # only numeric columns are handled here
            c_min,c_max = df[col].min(),df[col].max()  # min and max of this column
            if str(col_type)[:3] == 'int':  # any integer type: int8, int16, int32 or int64
                # if the values fit in the int8 range (-128 to 127), downcast to int8
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                # if the values fit in the int16 range (-32,768 to 32,767), downcast to int16
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                # if the values fit in the int32 range (-2,147,483,648 to 2,147,483,647), downcast to int32
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                # if the values fit in the int64 range, keep them as int64
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:  # floating-point columns
                # if the values fit in float16; use float32 instead when more precision is wanted
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    if float16_as32:  # choose float32 when higher precision is required
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)
                # if the values fit in the float32 range, downcast to float32
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                # otherwise keep float64
                else:
                    df[col] = df[col].astype(np.float64)
    # memory usage after the conversion
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # percentage reduction relative to the original memory usage
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
train=reduce_mem_usage(train, float16_as32=False)
test=reduce_mem_usage(test, float16_as32=False)
choose_cols=[ col for col in test.drop(['id','dt','target','dt_dayofweek','dt_month'],axis=1).columns]
def fit_and_predict(train_feats=train,test_feats=test,model=None,num_folds=10,seed=2024,name='lgb'):
X=train_feats[choose_cols].copy()
y=train_feats[Config.TARGET_NAME].copy()
test_X=test_feats[choose_cols].copy()
test_pred_pro=np.zeros((num_folds,len(test_X)))
del train_feats,test_feats
gc.collect()
    # 10-fold cross-validation
kf = KFold(n_splits=num_folds,shuffle=True,random_state=seed)
for fold, (train_index, valid_index) in (enumerate(kf.split(X))):
print(f"name {name},fold:{fold}")
X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],
callbacks=[log_evaluation(100)],
)
if fold==0:
useless_cols=[]
feature_importance=list(model.feature_importances_)
for i in range(len(choose_cols)):
if (feature_importance[i]<10):
useless_cols.append(choose_cols[i])
print(f"useless_cols:{list(set(useless_cols))}")
valid_pred=model.predict(X_valid)
test_pred_pro[fold]=model.predict(test_X)
del X_train,X_valid,y_train,y_valid
gc.collect()
test_preds=test_pred_pro.mean(axis=0)
return test_preds
lgb_params = { "boosting_type": "gbdt","objective": "regression","metric": "mse",
'random_state': 2024, 'n_estimators': 1000,
'reg_alpha': 0.1, 'reg_lambda': 10,
'colsample_bytree': 0.8, 'subsample': 0.8,
'learning_rate': 0.05, 'num_leaves': 64, 'min_child_samples': 62,
'max_bin':255, "extra_trees": True,
'device':'gpu','gpu_use_dp':True,  # GPU-only parameters; comment out this line to run in a CPU environment
}
test_preds=fit_and_predict(model=LGBMRegressor(**lgb_params),num_folds=Config.num_folds,seed=2024,name='lgb')
print(test_preds.shape)
submission=test[['id','dt']].copy()
submission['target']=test_preds
submission=submission.sort_values(['id','dt'])
submission.to_csv("yunbase.csv",index=None)
submission.head()
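On the easter egg in point 3 at the top: a pure time-series approach can also be tried on this data. The following is only a minimal hypothetical sketch, not the author's method; it assumes the same train/test layout (columns id, dt, target) and simply predicts each household's demand as the mean of its most recent k observed days (small dt = recent):
# Hypothetical naive time-series baseline, for illustration only.
import pandas as pd
train_ts = pd.read_csv(path+"train.csv")
test_ts = pd.read_csv(path+"test.csv")
k = 30  # arbitrary window length chosen for illustration
recent_mean = (train_ts.sort_values('dt')            # smallest dt (most recent) first
                       .groupby('id')['target']
                       .apply(lambda s: s.head(k).mean()))
naive = test_ts[['id','dt']].copy()
naive['target'] = naive['id'].map(recent_mean)
naive.to_csv("naive_ts_baseline.csv", index=None)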