Introducing the dataset
ride_id: ride ID
rideable_type: bike type
started_at: start date and time
ended_at: end date and time
start_station_name: start station name
start_station_id: start station ID
end_station_name: end station name
end_station_id: end station ID
start_lat: starting latitude
start_lng: starting longitude
end_lat: ending latitude
end_lng: ending longitude
member_casual: member or casual rider
Hands-on walkthrough
Preparing the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the data
df1 = pd.read_csv('C:/Rider/202101-divvy-tripdata/202101-divvy-tripdata.csv',engine='python')
df2 = pd.read_csv('C:/Rider/202102-divvy-tripdata/202102-divvy-tripdata.csv',engine='python')
df3 = pd.read_csv('C:/Rider/202103-divvy-tripdata/202103-divvy-tripdata.csv',engine='python')
df4 = pd.read_csv('C:/Rider/202004-divvy-tripdata/202004-divvy-tripdata.csv',engine='python')
df5 = pd.read_csv('C:/Rider/202005-divvy-tripdata/202005-divvy-tripdata.csv',engine='python')
df6 = pd.read_csv('C:/Rider/202006-divvy-tripdata/202006-divvy-tripdata.csv',engine='python')
df7 = pd.read_csv('C:/Rider/202007-divvy-tripdata/202007-divvy-tripdata.csv',engine='python')
df8 = pd.read_csv('C:/Rider/202008-divvy-tripdata/202008-divvy-tripdata.csv',engine='python')
df9 = pd.read_csv('C:/Rider/202009-divvy-tripdata/202009-divvy-tripdata.csv',engine='python')
df10 = pd.read_csv('C:/Rider/202010-divvy-tripdata/202010-divvy-tripdata.csv',engine='python')
df11 = pd.read_csv('C:/Rider/202011-divvy-tripdata/202011-divvy-tripdata.csv',engine='python')
df12 = pd.read_csv('C:/Rider/202012-divvy-tripdata/202012-divvy-tripdata.csv',engine='python')
# Concatenate all the monthly files into one DataFrame
df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12],axis=0)
df.head()
df.shape #(3489748, 13)
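Reading twelve files one by one works, but the same result can be obtained with a loop over a glob pattern. A minimal sketch, assuming the same C:/Rider directory layout as above (the csv_paths and monthly_frames names are just for illustration):
import glob
csv_paths = sorted(glob.glob('C:/Rider/*-divvy-tripdata/*-divvy-tripdata.csv'))
monthly_frames = [pd.read_csv(path) for path in csv_paths]
df_all = pd.concat(monthly_frames, axis=0, ignore_index=True)
df_all.shape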
# Check for missing values
pd.isnull(df).sum()
'''
ride_id 0
rideable_type 0
started_at 0
ended_at 0
start_station_name 122175
start_station_id 122801
end_station_name 143242
end_station_id 143703
start_lat 0
start_lng 0
end_lat 4738
end_lng 4738
member_casual 0
dtype: int64
'''
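The absolute counts are easier to judge as percentages of the roughly 3.49 million rows; a small sketch:
# Share of missing values per column, in percent
missing_ratio = df.isnull().sum() / len(df) * 100
missing_ratio[missing_ratio > 0].round(2)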
# Summary statistics for the whole DataFrame
df.describe()
start_station_name
# How many distinct start stations are there?
df['start_station_name'].nunique() #708
# Number of rides per start station
df['start_station_name'].value_counts()
'''
Streeter Dr & Grand Ave 35363
Clark St & Elm St 32472
Lake Shore Dr & Monroe St 29918
Theater on the Lake 29888
Lake Shore Dr & North Blvd 27126
...
N Hampden Ct & W Diversey Ave 1
Stewart Ave & 63rd St (*) 1
N Damen Ave & W Wabansia St 1
N Clark St & W Elm St 1
S Michigan Ave & E 118th St 1
Name: start_station_name, Length: 708, dtype: int64
'''
# Describe a single column
df.start_station_name.describe()
'''
count 3367573
unique 708
top Streeter Dr & Grand Ave
freq 35363
Name: start_station_name, dtype: object
'''
end_station_name
# How many distinct end stations are there?
df['end_station_name'].nunique() #706
# Number of rides per end station
df['end_station_name'].value_counts()
'''
Streeter Dr & Grand Ave 37425
Clark St & Elm St 32259
Theater on the Lake 31281
Lake Shore Dr & Monroe St 29280
Lake Shore Dr & North Blvd 27611
...
Avenue L & 114th St 3
S Wentworth Ave & W 111th St 2
hubbard_test_lws 2
Kedzie Ave & 110th St 2
N Hampden Ct & W Diversey Ave 1
Name: end_station_name, Length: 706, dtype: int64
'''
df.end_station_name.describe()
'''
count 3346506
unique 706
top Streeter Dr & Grand Ave
freq 37425
Name: end_station_name, dtype: object
'''
Convert the start date to day of week
df['started_at'] = pd.DatetimeIndex(df['started_at'])
# Strip the time-of-day component, keep the date only
df["started_at"] = df["started_at"].dt.date
df['start_week']=pd.to_datetime(df['started_at'])
df['start_week'] = df['start_week'].dt.weekday
Convert the start date to month
df['start_months']=pd.to_datetime(df['started_at'])
df['start_months']=df['start_months'].dt.month
Convert the end date to day of week
df['ended_at'] = pd.DatetimeIndex(df['ended_at'])
# Strip the time-of-day component, keep the date only
df["ended_at"] = df["ended_at"].dt.date
df['end_week']=pd.to_datetime(df['ended_at'])
df['end_week'] = df['end_week'].dt.weekday
Convert the end date to month
df['end_months']=pd.to_datetime(df['ended_at'])
df['end_months']=df['end_months'].dt.month
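The same weekday and month features can also be derived in one pass per column with the .dt accessor. A sketch that would replace the steps above (it assumes started_at and ended_at still hold the raw timestamps):
starts = pd.to_datetime(df['started_at'])
ends = pd.to_datetime(df['ended_at'])
df['start_week'] = starts.dt.weekday      # 0 = Monday, 6 = Sunday
df['start_months'] = starts.dt.month
df['end_week'] = ends.dt.weekday
df['end_months'] = ends.dt.month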
Data analysis
# Column data types, null counts and memory usage
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3489748 entries, 0 to 131572
Data columns (total 17 columns):
# Column Dtype
--- ------ -----
0 ride_id object
1 rideable_type object
2 started_at object
3 ended_at object
4 start_station_name object
5 start_station_id object
6 end_station_name object
7 end_station_id object
8 start_lat float64
9 start_lng float64
10 end_lat float64
11 end_lng float64
12 member_casual object
13 start_week int64
14 start_months int64
15 end_week int64
16 end_months int64
dtypes: float64(4), int64(4), object(9)
memory usage: 479.2+ MB
'''
# Group the columns by data type
numerical_cols, categorical_cols = [], []
for col in df.columns:
    if df[col].dtype == "int64":
        numerical_cols.append(col)
    elif df[col].dtype == "float64":
        numerical_cols.append(col)
    else:
        categorical_cols.append(col)
n_numerical = len(numerical_cols)
n_categorical = len(categorical_cols)
print("Total No. of Numerical Columns : {}".format(n_numerical))
print("Total No. of Categorical Columns : {}".format(n_categorical))
'''
Total No. of Numerical Columns : 8
Total No. of Categorical Columns : 9
'''
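pandas can produce the same split directly with select_dtypes; a short sketch, shown for comparison:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(len(numerical_cols), len(categorical_cols))   # 8 9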
# Drop columns that are not needed for the rest of the analysis
df.drop(columns=['ride_id', 'started_at','ended_at','start_station_name','end_station_name'], inplace=True)
categorical_cols.remove('ride_id')
categorical_cols.remove('started_at')
categorical_cols.remove('ended_at')
categorical_cols.remove('start_station_name')
categorical_cols.remove('end_station_name')
df.head()
rideable_type
df[categorical_cols[0]].unique()
'''
array(['electric_bike', 'classic_bike', 'docked_bike'], dtype=object)
'''
# Map the string categories to numeric codes
rideable_type_mapping= {'electric_bike':0.0, 'classic_bike':1.0,'docked_bike':2.0}
df[categorical_cols[0]] = df[categorical_cols[0]].map(rideable_type_mapping)
df[categorical_cols[0]].unique()
'''
array([0., 1., 2.])
'''
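Writing the mapping by hand is fine for three categories; pd.factorize produces an equivalent integer encoding automatically. A sketch shown only for comparison (it assumes it is run on the original string column, before the .map above):
codes, labels = pd.factorize(df['rideable_type'])
print(labels)   # the category order that the integer codes refer to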
# Compute the mode of each column
mode = df.mode(axis=0)
mode
'''
rideable_type start_station_id end_station_id start_lat start_lng end_lat end_lng member_casual start_week start_months end_week end_months
0 2.0 35 35.0 41.892278 -87.612043 41.892278 -87.612043 member 5 8 5 8
'''
start_station_id
start_id = int(mode[categorical_cols[1]])
print(start_id)#35
# Fill missing values with the mode
df[categorical_cols[1]] = df[categorical_cols[1]].fillna(start_id,axis=0)
pd.isnull(df[categorical_cols[1]]).sum()#0
end_station_id
end_id = int(mode[categorical_cols[2]])
df[categorical_cols[2]] = df[categorical_cols[2]].fillna(end_id,axis=0)
pd.isnull(df[categorical_cols[2]]).sum()#0
end_lat
end_lat = float(mode[numerical_cols[2]])
df[numerical_cols[2]] = df[numerical_cols[2]].fillna(end_lat,axis=0)
pd.isnull(df[numerical_cols[2]]).sum()#0
end_lng
end_lng = float(mode[numerical_cols[3]])
df[numerical_cols[3]] = df[numerical_cols[3]].fillna(end_lng,axis=0)
pd.isnull(df[numerical_cols[3]]).sum()#0
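The four per-column fills above can also be done in a single call with a dict of fill values; a sketch using the modes already computed:
fill_values = {
    'start_station_id': start_id,
    'end_station_id': end_id,
    'end_lat': end_lat,
    'end_lng': end_lng,
}
df = df.fillna(value=fill_values)
df.isnull().sum()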
member_casual
df[categorical_cols[3]].unique()
member_casual_mapping = {'member':0.0, 'casual':1.0}
df[categorical_cols[3]] = df[categorical_cols[3]].map(member_casual_mapping)
import plotly_express as px
# rideable_type was mapped to numeric codes above, so restore readable labels
# before plotting; color_discrete_map only takes effect when color= is set
rideable_type = pd.DataFrame(df.rideable_type.value_counts()).reset_index()
rideable_type.rename(columns = {'index':'rideable_type', 'rideable_type':'count'}, inplace=True)
rideable_type['rideable_type'] = rideable_type['rideable_type'].map(
    {0.0: 'electric_bike', 1.0: 'classic_bike', 2.0: 'docked_bike'})
fig = px.pie(rideable_type, values = 'count', names='rideable_type', title = 'ride_type', hole=.3,
             color='rideable_type',
             color_discrete_map={'electric_bike':'lightcyan',
                                 'classic_bike':'cyan',
                                 'docked_bike':'royalblue'})
fig.show()
fig = px.histogram(df, x = 'start_station_id', title = 'start_station_id')
fig.show()
fig = px.histogram(df, x = 'end_station_id', title = 'end_station_id')
fig.show()
# Convert the station ID columns to numeric (any non-numeric IDs become NaN)
df[categorical_cols[2]] = pd.to_numeric(df[categorical_cols[2]],errors='coerce')
df[categorical_cols[1]] = pd.to_numeric(df[categorical_cols[1]],errors='coerce')
fig = px.pie(data_frame=df, names='member_casual', hole=0.2)
fig.update_layout({'title':{'text':"member_casual",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='start_week', hole=0.2)
fig.update_layout({'title':{'text':"start_week",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='end_week', hole=0.2)
fig.update_layout({'title':{'text':"end_week",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='start_months', hole=0.2)
fig.update_layout({'title':{'text':"start_months",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='end_months', hole=0.2)
fig.update_layout({'title':{'text':"end_months",'x':0.5}})
fig.show()
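To relate these views back to the member-vs-casual question, weekday usage can be split by rider type; a sketch (member_casual was mapped to 0 = member, 1 = casual above):
weekday_counts = df.groupby(['start_week', 'member_casual']).size().reset_index(name='rides')
weekday_counts['member_casual'] = weekday_counts['member_casual'].map({0.0: 'member', 1.0: 'casual'})
fig = px.bar(weekday_counts, x='start_week', y='rides', color='member_casual',
             barmode='group', title='Rides per weekday by rider type')
fig.show()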
Training a model
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
y_full = df.pop('member_casual')
X_full = df
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_full)
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y_full)
rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)
features = X_full.columns
feature_imps = rfc.feature_importances_  # feature importances
fig = px.bar(x=features, y=feature_imps)
fig.update_layout({'title':{'text':"Feature Importance", 'x':0.5}})
fig.show()
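Since member_casual is a binary label, a random forest classifier is arguably a more natural fit than a regressor. A quick sketch with deliberately small, purely illustrative hyperparameters so it finishes in reasonable time on roughly 3.5 million rows:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=0)
clf.fit(X_train, y_train)
print("train accuracy: {:.3f}".format(clf.score(X_train, y_train)))
print("valid accuracy: {:.3f}".format(clf.score(X_valid, y_valid)))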
# Logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=100,penalty="l2").fit(X_train,y_train)
print("模型train得分:{:.3f}".format(logreg.score(X_train,y_train)))
print("模型test得分:{:.3f}".format(logreg.score(X_valid,y_valid)))
'''
模型train得分:0.608
模型test得分:0.609
'''
pred = logreg.predict(X_valid)
logreg_mse = mean_squared_error(y_valid, pred)
logreg_rmse = np.sqrt(logreg_mse)
logreg_r2 = r2_score(y_valid, pred)
print(logreg_mse,logreg_rmse,logreg_r2)
'''
0.3913119228093261 0.6255492968658234 -0.6179405993775167
'''
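Because the target is binary, classification metrics are easier to interpret than MSE/RMSE/R²; a short sketch on the same predictions:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_valid, pred))      # share of correctly classified rides
print(confusion_matrix(y_valid, pred))    # rows: actual class, columns: predicted class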