Introducing the dataset
ride_id: ride ID
rideable_type: bike type
started_at: start date and time
ended_at: end date and time
start_station_name: start station name
start_station_id: start station ID
end_station_name: end station name
end_station_id: end station ID
start_lat: starting latitude
start_lng: starting longitude
end_lat: ending latitude
end_lng: ending longitude
member_casual: member or casual rider
Hands-on walkthrough
Preparing the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the data
df1 = pd.read_csv('C:/Rider/202101-divvy-tripdata/202101-divvy-tripdata.csv',engine='python')
df2 = pd.read_csv('C:/Rider/202102-divvy-tripdata/202102-divvy-tripdata.csv',engine='python')
df3 = pd.read_csv('C:/Rider/202103-divvy-tripdata/202103-divvy-tripdata.csv',engine='python')
df4 = pd.read_csv('C:/Rider/202004-divvy-tripdata/202004-divvy-tripdata.csv',engine='python')
df5 = pd.read_csv('C:/Rider/202005-divvy-tripdata/202005-divvy-tripdata.csv',engine='python')
df6 = pd.read_csv('C:/Rider/202006-divvy-tripdata/202006-divvy-tripdata.csv',engine='python')
df7 = pd.read_csv('C:/Rider/202007-divvy-tripdata/202007-divvy-tripdata.csv',engine='python')
df8 = pd.read_csv('C:/Rider/202008-divvy-tripdata/202008-divvy-tripdata.csv',engine='python')
df9 = pd.read_csv('C:/Rider/202009-divvy-tripdata/202009-divvy-tripdata.csv',engine='python')
df10 = pd.read_csv('C:/Rider/202010-divvy-tripdata/202010-divvy-tripdata.csv',engine='python')
df11 = pd.read_csv('C:/Rider/202011-divvy-tripdata/202011-divvy-tripdata.csv',engine='python')
df12 = pd.read_csv('C:/Rider/202012-divvy-tripdata/202012-divvy-tripdata.csv',engine='python')
# Concatenate all the monthly files into one DataFrame
df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12],axis=0)
df.head()
df.shape #(3489748, 13)
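Reading twelve files one by one works, but the same result can be obtained with a loop over a glob pattern. A minimal sketch, assuming the same C:/Rider directory layout as above (the csv_paths and monthly_frames names are just for illustration):
import glob
csv_paths = sorted(glob.glob('C:/Rider/*-divvy-tripdata/*-divvy-tripdata.csv'))
monthly_frames = [pd.read_csv(path) for path in csv_paths]
df_all = pd.concat(monthly_frames, axis=0, ignore_index=True)
df_all.shape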
# Check for missing values
pd.isnull(df).sum()
'''
ride_id 0
rideable_type 0
started_at 0
ended_at 0
start_station_name 122175
start_station_id 122801
end_station_name 143242
end_station_id 143703
start_lat 0
start_lng 0
end_lat 4738
end_lng 4738
member_casual 0
dtype: int64
'''
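The absolute counts are easier to judge as percentages of the roughly 3.49 million rows; a small sketch:
# Share of missing values per column, in percent
missing_ratio = df.isnull().sum() / len(df) * 100
missing_ratio[missing_ratio > 0].round(2)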
# Summary statistics for the whole DataFrame
df.describe()
start_station_name
# How many distinct start stations are there?
df['start_station_name'].nunique() #708
# Number of rides per start station
df['start_station_name'].value_counts()
'''
Streeter Dr & Grand Ave 35363
Clark St & Elm St 32472
Lake Shore Dr & Monroe St 29918
Theater on the Lake 29888
Lake Shore Dr & North Blvd 27126
...
N Hampden Ct & W Diversey Ave 1
Stewart Ave & 63rd St (*) 1
N Damen Ave & W Wabansia St 1
N Clark St & W Elm St 1
S Michigan Ave & E 118th St 1
Name: start_station_name, Length: 708, dtype: int64
'''
# Describe a single column
df.start_station_name.describe()
'''
count 3367573
unique 708
top Streeter Dr & Grand Ave
freq 35363
Name: start_station_name, dtype: object
'''
end_station_name
# How many distinct end stations are there?
df['end_station_name'].nunique() #706
# Number of rides per end station
df['end_station_name'].value_counts()
'''
Streeter Dr & Grand Ave 37425
Clark St & Elm St 32259
Theater on the Lake 31281
Lake Shore Dr & Monroe St 29280
Lake Shore Dr & North Blvd 27611
...
Avenue L & 114th St 3
S Wentworth Ave & W 111th St 2
hubbard_test_lws 2
Kedzie Ave & 110th St 2
N Hampden Ct & W Diversey Ave 1
Name: end_station_name, Length: 706, dtype: int64
'''
df.end_station_name.describe()
'''
count 3346506
unique 706
top Streeter Dr & Grand Ave
freq 37425
Name: end_station_name, dtype: object
'''
Convert the start date to day of week
df['started_at'] = pd.DatetimeIndex(df['started_at'])
# Strip the time-of-day component, keep the date only
df["started_at"] = df["started_at"].dt.date
df['start_week']=pd.to_datetime(df['started_at'])
df['start_week'] = df['start_week'].dt.weekday
Convert the start date to month
df['start_months']=pd.to_datetime(df['started_at'])
df['start_months']=df['start_months'].dt.month
Convert the end date to day of week
df['ended_at'] = pd.DatetimeIndex(df['ended_at'])
# Strip the time-of-day component, keep the date only
df["ended_at"] = df["ended_at"].dt.date
df['end_week']=pd.to_datetime(df['ended_at'])
df['end_week'] = df['end_week'].dt.weekday
Convert the end date to month
df['end_months']=pd.to_datetime(df['ended_at'])
df['end_months']=df['end_months'].dt.month
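The same weekday and month features can also be derived in one pass per column with the .dt accessor. A sketch that would replace the steps above (it assumes started_at and ended_at still hold the raw timestamps):
starts = pd.to_datetime(df['started_at'])
ends = pd.to_datetime(df['ended_at'])
df['start_week'] = starts.dt.weekday      # 0 = Monday, 6 = Sunday
df['start_months'] = starts.dt.month
df['end_week'] = ends.dt.weekday
df['end_months'] = ends.dt.month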
Data analysis
# Column data types, null counts and memory usage
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3489748 entries, 0 to 131572
Data columns (total 17 columns):
# Column Dtype
--- ------ -----
0 ride_id object
1 rideable_type object
2 started_at object
3 ended_at object
4 start_station_name object
5 start_station_id object
6 end_station_name object
7 end_station_id object
8 start_lat float64
9 start_lng float64
10 end_lat float64
11 end_lng float64
12 member_casual object
13 start_week int64
14 start_months int64
15 end_week int64
16 end_months int64
dtypes: float64(4), int64(4), object(9)
memory usage: 479.2+ MB
'''
# Group the columns by data type
numerical_cols, categorical_cols = [], []
for col in df.columns:
    if df[col].dtype == "int64":
        numerical_cols.append(col)
    elif df[col].dtype == "float64":
        numerical_cols.append(col)
    else:
        categorical_cols.append(col)
n_numerical = len(numerical_cols)
n_categorical = len(categorical_cols)
print("Total No. of Numerical Columns : {}".format(n_numerical))
print("Total No. of Categorical Columns : {}".format(n_categorical))
'''
Total No. of Numerical Columns : 8
Total No. of Categorical Columns : 9
'''
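pandas can produce the same split directly with select_dtypes; a short sketch, shown for comparison:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(len(numerical_cols), len(categorical_cols))   # 8 9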
# Drop columns that are not needed for the rest of the analysis
df.drop(columns=['ride_id', 'started_at','ended_at','start_station_name','end_station_name'], inplace=True)
categorical_cols.remove('ride_id')
categorical_cols.remove('started_at')
categorical_cols.remove('ended_at')
categorical_cols.remove('start_station_name')
categorical_cols.remove('end_station_name')
df.head()
rideable_type
df[categorical_cols[0]].unique()
'''
array(['electric_bike', 'classic_bike', 'docked_bike'], dtype=object)
'''
# Map the string categories to numeric codes
rideable_type_mapping= {'electric_bike':0.0, 'classic_bike':1.0,'docked_bike':2.0}
df[categorical_cols[0]] = df[categorical_cols[0]].map(rideable_type_mapping)
df[categorical_cols[0]].unique()
'''
array([0., 1., 2.])
'''
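Writing the mapping by hand is fine for three categories; pd.factorize produces an equivalent integer encoding automatically. A sketch shown only for comparison (it assumes it is run on the original string column, before the .map above):
codes, labels = pd.factorize(df['rideable_type'])
print(labels)   # the category order that the integer codes refer to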
# Compute the mode of each column
mode = df.mode(axis=0)
mode
'''
rideable_type start_station_id end_station_id start_lat start_lng end_lat end_lng member_casual start_week start_months end_week end_months
0 2.0 35 35.0 41.892278 -87.612043 41.892278 -87.612043 member 5 8 5 8
'''
start_station_id
start_id = int(mode[categorical_cols[1]])
print(start_id)#35
# Fill missing values with the mode
df[categorical_cols[1]] = df[categorical_cols[1]].fillna(start_id,axis=0)
pd.isnull(df[categorical_cols[1]]).sum()#0
end_station_id
end_id = int(mode[categorical_cols[2]])
df[categorical_cols[2]] = df[categorical_cols[2]].fillna(end_id,axis=0)
pd.isnull(df[categorical_cols[2]]).sum()#0
end_lat
end_lat = float(mode[numerical_cols[2]])
df[numerical_cols[2]] = df[numerical_cols[2]].fillna(end_lat,axis=0)
pd.isnull(df[numerical_cols[2]]).sum()#0
end_lng
end_lng = float(mode[numerical_cols[3]])
df[numerical_cols[3]] = df[numerical_cols[3]].fillna(end_lng,axis=0)
pd.isnull(df[numerical_cols[3]]).sum()#0
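The four per-column fills above can also be done in a single call with a dict of fill values; a sketch using the modes already computed:
fill_values = {
    'start_station_id': start_id,
    'end_station_id': end_id,
    'end_lat': end_lat,
    'end_lng': end_lng,
}
df = df.fillna(value=fill_values)
df.isnull().sum()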
member_casual
df[categorical_cols[3]].unique()
member_casual_mapping = {'member':0.0, 'casual':1.0}
df[categorical_cols[3]] = df[categorical_cols[3]].map(member_casual_mapping)
import plotly_express as px
# rideable_type was mapped to numeric codes above, so restore readable labels
# before plotting; color_discrete_map only takes effect when color= is set
rideable_type = pd.DataFrame(df.rideable_type.value_counts()).reset_index()
rideable_type.rename(columns = {'index':'rideable_type', 'rideable_type':'count'}, inplace=True)
rideable_type['rideable_type'] = rideable_type['rideable_type'].map(
    {0.0: 'electric_bike', 1.0: 'classic_bike', 2.0: 'docked_bike'})
fig = px.pie(rideable_type, values = 'count', names='rideable_type', title = 'ride_type', hole=.3,
             color='rideable_type',
             color_discrete_map={'electric_bike':'lightcyan',
                                 'classic_bike':'cyan',
                                 'docked_bike':'royalblue'})
fig.show()
fig = px.histogram(df, x = 'start_station_id', title = 'start_station_id')
fig.show()
fig = px.histogram(df, x = 'end_station_id', title = 'end_station_id')
fig.show()
# Convert the station ID columns to numeric (any non-numeric IDs become NaN)
df[categorical_cols[2]] = pd.to_numeric(df[categorical_cols[2]],errors='coerce')
df[categorical_cols[1]] = pd.to_numeric(df[categorical_cols[1]],errors='coerce')
fig = px.pie(data_frame=df, names='member_casual', hole=0.2)
fig.update_layout({'title':{'text':"member_casual",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='start_week', hole=0.2)
fig.update_layout({'title':{'text':"start_week",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='end_week', hole=0.2)
fig.update_layout({'title':{'text':"end_week",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='start_months', hole=0.2)
fig.update_layout({'title':{'text':"start_months",'x':0.5}})
fig.show()
fig = px.pie(data_frame=df, names='end_months', hole=0.2)
fig.update_layout({'title':{'text':"end_months",'x':0.5}})
fig.show()
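To relate these views back to the member-vs-casual question, weekday usage can be split by rider type; a sketch (member_casual was mapped to 0 = member, 1 = casual above):
weekday_counts = df.groupby(['start_week', 'member_casual']).size().reset_index(name='rides')
weekday_counts['member_casual'] = weekday_counts['member_casual'].map({0.0: 'member', 1.0: 'casual'})
fig = px.bar(weekday_counts, x='start_week', y='rides', color='member_casual',
             barmode='group', title='Rides per weekday by rider type')
fig.show()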
Training a model
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
y_full = df.pop('member_casual')
X_full = df
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_full)
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y_full)
rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)
features = X_full.columns
feature_imps = rfc.feature_importances_  # feature importances
fig = px.bar(x=features, y=feature_imps)
fig.update_layout({'title':{'text':"Feature Importance", 'x':0.5}})
fig.show()
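Since member_casual is a binary label, a random forest classifier is arguably a more natural fit than a regressor. A quick sketch with deliberately small, purely illustrative hyperparameters so it finishes in reasonable time on roughly 3.5 million rows:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=0)
clf.fit(X_train, y_train)
print("train accuracy: {:.3f}".format(clf.score(X_train, y_train)))
print("valid accuracy: {:.3f}".format(clf.score(X_valid, y_valid)))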
# Logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=100,penalty="l2").fit(X_train,y_train)
print("模型train得分:{:.3f}".format(logreg.score(X_train,y_train)))
print("模型test得分:{:.3f}".format(logreg.score(X_valid,y_valid)))
'''
模型train得分:0.608
模型test得分:0.609
'''
pred = logreg.predict(X_valid)
logreg_mse = mean_squared_error(y_valid, pred)
logreg_rmse = np.sqrt(logreg_mse)
logreg_r2 = r2_score(y_valid, pred)
print(logreg_mse,logreg_rmse,logreg_r2)
'''
0.3913119228093261 0.6255492968658234 -0.6179405993775167
'''
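Because the target is binary, classification metrics are easier to interpret than MSE/RMSE/R²; a short sketch on the same predictions:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_valid, pred))      # share of correctly classified rides
print(confusion_matrix(y_valid, pred))    # rows: actual class, columns: predicted class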