When I come across good code, I like to collect it.
This is a baseline for the 2024 iFLYTEK Power Demand Forecasting Challenge. You can find baselines for more competitions in the AI-and-competition repository. The official competition page: 电力需求预测挑战赛. At the time of writing, this baseline ranks relatively high on the leaderboard.
Notebook: 2024-baseline-lb-229.ipynb in the folder 202407科大讯飞电力需求预测baseline of yunsuxiaozi/AI-and-competition on GitHub.
1 What are the sin and cos functions for? Periodicity! (illustrated right below)
2 Historical (lag) features and rolling time windows;
3 Easter egg: a pure time-series model might work even better!
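On point 1: a raw day counter treats day 0 and day 7 of a weekly cycle as far apart, whereas a sin/cos encoding maps them to the same point on a circle, so the model can see the period. A tiny standalone illustration (not part of the baseline):
# Illustrative only: cyclically encode a day counter with a 7-day period.
import numpy as np
days = np.arange(15)
sin7 = np.sin(2*np.pi*days/7)
cos7 = np.cos(2*np.pi*days/7)
# days 0, 7 and 14 get (numerically almost) identical encodings: the feature repeats every 7 days
print(np.round(sin7[[0,7,14]],3), np.round(cos7[[0,7,14]],3))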
import pandas as pd  # CSV loading and dataframe manipulation
import numpy as np  # matrix operations and scientific computing
# model: LightGBM regressor plus the log_evaluation callback
from lightgbm import LGBMRegressor, log_evaluation
# KFold splits directly into k folds; StratifiedKFold additionally preserves the proportion of each class in every fold (a short contrast follows the imports)
from sklearn.model_selection import KFold
import gc  # garbage collection
import warnings  # suppress warnings that can safely be ignored
warnings.filterwarnings('ignore')  # filterwarnings() configures the warning filter, i.e. how and at what level warnings are reported
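A quick contrast of the two splitters mentioned in the import comment above (illustrative only; the baseline itself uses plain KFold because the target is continuous):
# Illustrative only: KFold ignores the label when splitting, StratifiedKFold keeps the class ratio per fold.
from sklearn.model_selection import KFold, StratifiedKFold
import numpy as np
X_demo = np.arange(20).reshape(-1,1)
y_demo = np.array([0]*16+[1]*4)  # imbalanced toy label
for name, splitter in [("KFold", KFold(n_splits=4, shuffle=True, random_state=0)),
                       ("StratifiedKFold", StratifiedKFold(n_splits=4, shuffle=True, random_state=0))]:
    ratios = [round(y_demo[va].mean(),2) for _, va in splitter.split(X_demo, y_demo)]
    print(name, "positive ratio per validation fold:", ratios)  # StratifiedKFold keeps 0.2 in every fold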
#config
class Config():
    seed=2024  # random seed
    num_folds=10  # number of folds for K-fold cross-validation
    TARGET_NAME ='target'  # label column
import random  # functions for generating random numbers
# Fix the random seeds so the model results are reproducible
def seed_everything(seed):
    np.random.seed(seed)  # numpy seed
    random.seed(seed)  # Python built-in seed
seed_everything(Config.seed)
path='/kaggle/input/kedapower/2407keda/'
train=pd.read_csv(path+"train.csv")
test=pd.read_csv(path+"test.csv")
train.head()
# Concatenate train and test first, then sort by id and dt. Sorting by id keeps each household's rows together (one id is one household); dt is sorted in descending order because dt means "days before the present".
total=pd.concat((train,test),axis=0)
#id ascending, dt descending, since dt is the number of days before the present.
total=total.sort_values(['id','dt'],ascending=[True,False])
print(f"len(total):{len(total)}")
total.head()
print("type feature")
# one-hot encode the user type
user_type=total['type'].unique()
for t in user_type:
    total[f'type_{t}']=(total['type']==t)
"""
dt:虽然dt是脱敏处理后的天数,但这里还是按照常规的day来处理。7是一周,30是一个月,90是一个季度,180是半年,365是一年,由于这些特征是人类通常表示时间的周期,用sin和cos提取特征。之后提取了差分特征,并提取了常规的dayofweek,month,year特征。
"""
print("dt feature")
for day in [7,30,90,180,365]:
total[f'sin_dt_{day}']=np.sin(2*np.pi*total['dt']/day)
total[f'cos_dt_{day}']=np.cos(2*np.pi*total['dt']/day)
for gap in [2,4,7,15,30,60]:
for col in ['sin_dt_7','cos_dt_7','sin_dt_30','cos_dt_30','sin_dt_90','cos_dt_90','sin_dt_180','cos_dt_180','sin_dt_365','cos_dt_365']:
total[f"{col}_shift{gap}"]=total.groupby(total['id'])[col].shift(gap)
total[f"{col}_gap{gap}"]=total[col]-total[f"{col}_shift{gap}"]
total.drop([f"{col}_shift{gap}"],axis=1,inplace=True)
total['dt_dayofweek']=total['dt']%7
for d in range(7):
total[f'dt_dayofweek_{d}']=(total['dt_dayofweek']==d)
total['dt_month']=total['dt']//30%12
for d in range(12):
total[f'dt_month_{d}']=(total['dt_month']==d)
total['dt_year']=total['dt']//365
# target: a lot of target-based features are extracted, fairly brute-force; they are essentially groupby statistics, read through them yourself.
print("target feature")
for gap in [10,13,20]:
    for col in ['target']:
        # lag feature: the value of target gap days earlier for the same id
        total[f"{col}_shift{gap}"]=total.groupby('id')[col].shift(gap)
        for w in [3,6,10]:
            # rolling-mean feature: mean over a w-day window, lagged by gap days, computed per id
            total[f"{col}_gap{gap}_window{w}"]=total.groupby('id')[col].transform(lambda x: x.rolling(w).mean().shift(gap))
for col in [['id'],['type'],['dt_dayofweek'],['dt_month'],
['id','type'],['id','dt_dayofweek'],['id','dt_month'],
['type','dt_dayofweek'],['type','dt_month']
]:
print(f"col:{col}")
if len(col)>1:
colname='_'.join(col)
else:
colname=col[0]
total[f"{colname}_target_min"]=total.groupby(col)['target'].transform('min')
total[f"{colname}_target_max"]=total.groupby(col)['target'].transform('max')
total[f"{colname}_target_mean"]=total.groupby(col)['target'].transform('mean')
total[f"{colname}_target_median"]=total.groupby(col)['target'].transform('median')
total[f"{colname}_target_std"]=total.groupby(col)['target'].transform('std')
total[f"{colname}_target_skew"]=total.groupby(col)['target'].transform('skew')
total[f"{colname}_target_last"]=total.groupby(col)['target'].transform('last')
total[f"{colname}_target_first"]=total.groupby(col)['target'].transform('first')
total[f"{colname}_target_nunique"]=total.groupby(col)['target'].transform('nunique')
total[f"{colname}_target_ptp"]=total[f"{colname}_target_max"]-total[f"{colname}_target_min"]
total[f"{colname}_target_max/min"]=total[f"{colname}_target_max"]/(total[f"{colname}_target_min"]+1e-15)
total[f"{colname}_target_mean/std"]=total[f"{colname}_target_mean"]/(total[f"{colname}_target_std"]+1e-15)
"""
这里也根据模型的特征重要性筛选了一些特征,特征重要性小于10就drop。这里选择dt<=500的训练数据进行训练是因为一方面shift特征的构造中会有一些缺失值,另一方面时间上距离0太远的数据可能也没什么用。
"""
# features to drop, selected from the model's feature importances
useless_cols=['cos_dt_30_gap30', 'cos_dt_30_gap60', 'type_16', 'cos_dt_7_gap7','sin_dt_30_gap30', 'type_5','sin_dt_30_gap60', 'type_0', 'type_1', 'type_10', 'type_11', 'sin_dt_7_gap7']
total.drop(useless_cols,axis=1,inplace=True)
train=total[~total['target'].isna()]
train=train[train['dt']<=500]
test=total[total['target'].isna()]
del total
gc.collect()
print(f"features_count:{len(test.columns)}")
train.head()
# Walk over every column of the dataframe and downcast its dtype to reduce memory usage
def reduce_mem_usage(df, float16_as32=True):
    # memory_usage() gives the memory of each column in bytes; sum them and convert B -> KB -> MB
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:  # iterate over the column names
        col_type = df[col].dtype  # dtype of this column
        if col_type != object and str(col_type)!='category':  # only numeric columns are handled here
            c_min,c_max = df[col].min(),df[col].max()  # min and max of this column
            if str(col_type)[:3] == 'int':  # any integer type: int8, int16, int32 or int64
                # if the values fit in the int8 range (-128 to 127), downcast to int8
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                # if the values fit in the int16 range (-32,768 to 32,767), downcast to int16
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                # if the values fit in the int32 range (-2,147,483,648 to 2,147,483,647), downcast to int32
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                # if the values fit in the int64 range, keep them as int64
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:  # floating-point columns
                # if the values fit in float16; use float32 instead when more precision is wanted
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    if float16_as32:  # choose float32 when higher precision is required
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)
                # if the values fit in the float32 range, downcast to float32
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                # otherwise keep float64
                else:
                    df[col] = df[col].astype(np.float64)
    # memory usage after the conversion
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # percentage reduction relative to the original memory usage
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
train=reduce_mem_usage(train, float16_as32=False)
test=reduce_mem_usage(test, float16_as32=False)
choose_cols=[ col for col in test.drop(['id','dt','target','dt_dayofweek','dt_month'],axis=1).columns]
def fit_and_predict(train_feats=train,test_feats=test,model=None,num_folds=10,seed=2024,name='lgb'):
X=train_feats[choose_cols].copy()
y=train_feats[Config.TARGET_NAME].copy()
test_X=test_feats[choose_cols].copy()
test_pred_pro=np.zeros((num_folds,len(test_X)))
del train_feats,test_feats
gc.collect()
    # 10-fold cross-validation
kf = KFold(n_splits=num_folds,shuffle=True,random_state=seed)
for fold, (train_index, valid_index) in (enumerate(kf.split(X))):
print(f"name {name},fold:{fold}")
X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],
callbacks=[log_evaluation(100)],
)
if fold==0:
useless_cols=[]
feature_importance=list(model.feature_importances_)
for i in range(len(choose_cols)):
if (feature_importance[i]<10):
useless_cols.append(choose_cols[i])
print(f"useless_cols:{list(set(useless_cols))}")
valid_pred=model.predict(X_valid)
test_pred_pro[fold]=model.predict(test_X)
del X_train,X_valid,y_train,y_valid
gc.collect()
test_preds=test_pred_pro.mean(axis=0)
return test_preds
lgb_params = { "boosting_type": "gbdt","objective": "regression","metric": "mse",
'random_state': 2024, 'n_estimators': 1000,
'reg_alpha': 0.1, 'reg_lambda': 10,
'colsample_bytree': 0.8, 'subsample': 0.8,
'learning_rate': 0.05, 'num_leaves': 64, 'min_child_samples': 62,
'max_bin':255, "extra_trees": True,
'device':'gpu','gpu_use_dp':True,  # GPU-only parameters; comment out this line to run in a CPU environment
}
test_preds=fit_and_predict(model=LGBMRegressor(**lgb_params),num_folds=Config.num_folds,seed=2024,name='lgb')
print(test_preds.shape)
submission=test[['id','dt']].copy()
submission['target']=test_preds
submission=submission.sort_values(['id','dt'])
submission.to_csv("yunbase.csv",index=None)
submission.head()
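On the easter egg in point 3 at the top: a pure time-series approach can also be tried on this data. The following is only a minimal hypothetical sketch, not the author's method; it assumes the same train/test layout (columns id, dt, target) and simply predicts each household's demand as the mean of its most recent k observed days (small dt = recent):
# Hypothetical naive time-series baseline, for illustration only.
import pandas as pd
train_ts = pd.read_csv(path+"train.csv")
test_ts = pd.read_csv(path+"test.csv")
k = 30  # arbitrary window length chosen for illustration
recent_mean = (train_ts.sort_values('dt')            # smallest dt (most recent) first
                       .groupby('id')['target']
                       .apply(lambda s: s.head(k).mean()))
naive = test_ts[['id','dt']].copy()
naive['target'] = naive['id'].map(recent_mean)
naive.to_csv("naive_ts_baseline.csv", index=None)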