介绍数据集
fuelType:燃料类型
rating:评分
renterTripsTaken:租客已完成的出行次数
reviewCount:评论数量
location.city:位置.城市
location.country:位置.国家/地区
location.latitude:位置.纬度
location.longitude:位置.经度
location.state:位置.州
owner.id:车主id
rate.daily:每日租金
vehicle.make:车辆品牌
vehicle.model:车辆型号
vehicle.type:车辆类型
vehicle.year:车辆年份
实战演练
数据集展示
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly_express as px
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the car-rental dataset with pandas' pure-Python CSV parser.
csv_path = 'C:/CarRentalData.csv'
df = pd.read_csv(csv_path, engine='python')
# Preview the first rows.
df.head()
# Dataset dimensions as (rows, columns).
df.shape
# Recorded output: (5851, 15)
数据的可视化
fuelType
# Count listings per fuel type.  rename_axis(...).reset_index(name=...) gives
# the columns ['fuelType', 'count'] on both pandas 1.x and 2.x; renaming the
# old 'index' column no longer works on 2.x because value_counts() already
# names its result 'count'.
df_fuelType = (
    df['fuelType']
    .value_counts()
    .rename_axis('fuelType')
    .reset_index(name='count')
)
# color= is set explicitly so that color_discrete_map is matched against the
# fuel-type labels.
fig = px.pie(
    df_fuelType,
    values='count',
    names='fuelType',
    color='fuelType',
    title='Fuel Type',
    hole=.3,
    color_discrete_map={
        'ELECTRIC': 'lightcyan',
        'HYBRID': 'cyan',
        'GASOLINE': 'royalblue',
        'DIESEL': 'darkblue',
    },
)
fig.show()
rating
# Summary statistics of the rating column, then its distribution.
rating_summary = df['rating'].describe()
print("Rating Statistics:", rating_summary, sep="\n")
'''
Rating Statistics:
count 5350.000000
mean 4.920325
std 0.182425
min 1.000000
25% 4.900000
50% 5.000000
75% 5.000000
max 5.000000
Name: rating, dtype: float64
'''
fig = px.histogram(
    data_frame=df,
    x='rating',
    title='Histogram of Rental Car Rating',
)
fig.show()
renterTripsTaken
# Summary statistics of trips taken per listing, then the distribution.
trips_summary = df['renterTripsTaken'].describe()
print("Renter Trips Taken Statistics:", trips_summary, sep="\n")
'''
Renter Trips Taken Statistics:
count 5851.000000
mean 33.477354
std 41.898954
min 0.000000
25% 5.000000
50% 18.000000
75% 46.000000
max 395.000000
Name: renterTripsTaken, dtype: float64
'''
fig = px.histogram(
    data_frame=df,
    x='renterTripsTaken',
    title='Histogram of Renter Trips Taken',
)
fig.show()
reviewCount
# Summary statistics of the review count, then its distribution.
review_summary = df['reviewCount'].describe()
print("Review Count Statistics:", review_summary, sep="\n")
'''
Review Count Statistics:
count 5851.000000
mean 28.454794
std 35.136113
min 0.000000
25% 4.000000
50% 16.000000
75% 39.000000
max 321.000000
Name: reviewCount, dtype: float64
'''
fig = px.histogram(
    data_frame=df,
    x='reviewCount',
    title='Histogram of Review Count',
)
fig.show()
Car Rentals by City
import plotly.graph_objects as go
def get_average_lat_long(city, ltype, data=None):
    """Return the mean latitude or longitude of the listings in ``city``.

    Args:
        city: Value compared for equality against the ``location.city`` column.
        ltype: ``0`` selects latitude; any other value selects longitude.
        data: Optional DataFrame to read from.  When omitted, the
            module-level ``df`` is used, so existing two-argument calls
            behave as before.

    Returns:
        The mean of the selected coordinate column over the matching rows
        (NaN when no row matches, as pandas returns for an empty mean).
    """
    frame = df if data is None else data
    # Only the requested coordinate is averaged.
    column = 'location.latitude' if ltype == 0 else 'location.longitude'
    return frame.loc[frame['location.city'] == city, column].mean()
# One row per city, ordered by listing count (descending, as produced by
# value_counts).  rename_axis/reset_index(name=...) gives ['city', 'count']
# on both pandas 1.x and 2.x.
df_location = (
    df['location.city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='count')
)
# Mean coordinates per city computed in a single grouped pass instead of
# re-filtering the whole frame twice for every city.
city_centres = df.groupby('location.city')[
    ['location.latitude', 'location.longitude']
].mean()
df_location['latitude'] = df_location['city'].map(city_centres['location.latitude'])
df_location['longitude'] = df_location['city'].map(city_centres['location.longitude'])
df_location['text'] = (
    df_location['city'] + '<br>Car Rentals ' + df_location['count'].astype(str)
)
# Rank bands over the count-ordered rows; both ends are inclusive.
limits = [(0,20),(21,50),(51,150),(151,200),(201,1000),(1001,1500)]
colors = ["royalblue","orange","lightgrey","lightseagreen","red","crimson"]
scale = 0.5
fig = go.Figure()
for lim, color in zip(limits, colors):
    # +1 keeps the upper bound inclusive; a plain lim[0]:lim[1] slice drops
    # the rows at positions 20, 50, 150, 200 and 1000 from the map.
    df_sub = df_location.iloc[lim[0]:lim[1] + 1]
    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=df_sub['longitude'],
        lat=df_sub['latitude'],
        text=df_sub['text'],
        marker=dict(
            size=df_sub['count'] / scale,
            color=color,
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode='area',
        ),
        name='{0} - {1}'.format(lim[0], lim[1]),
    ))
fig.update_layout(
    title_text='Car Rentals by City',
    showlegend=True,
    geo=dict(
        scope='usa',
        landcolor='rgb(217, 217, 217)',
    ),
)
fig.show()
Car Rentals by State
# Listings per state.  rename_axis/reset_index(name=...) gives the columns
# ['state', 'count'] on both pandas 1.x and 2.x; renaming the old 'index'
# column fails on 2.x, where value_counts() already names its result 'count'.
df_state = (
    df['location.state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)
fig = go.Figure(data=go.Choropleth(
    locations=df_state['state'],          # state codes used as map locations
    z=df_state['count'].astype(float),    # value that drives the colour scale
    locationmode='USA-states',            # how `locations` entries are interpreted
    colorscale='Reds',
    colorbar_title="Number of Cars Rented",
))
fig.update_layout(
    title_text='Car Rentals by State',
    geo_scope='usa',
)
fig.show()
owner.id
# Number of listings per owner.  rename_axis/reset_index(name=...) gives the
# columns ['owner_id', 'number of rental cars'] on both pandas 1.x and 2.x;
# renaming the old 'index' column no longer works on 2.x.
df_owner = (
    df['owner.id']
    .value_counts()
    .rename_axis('owner_id')
    .reset_index(name='number of rental cars')
)
print('Total Number of Unique Rental Cars per Owner Statistics:')
print(df_owner['number of rental cars'].describe())
'''
Total Number of Unique Rental Cars per Owner Statistics:
count 3093.000000
mean 1.891691
std 2.789205
min 1.000000
25% 1.000000
50% 1.000000
75% 2.000000
max 49.000000
Name: number of rental cars, dtype: float64
'''
fig = px.histogram(
    df_owner,
    x='number of rental cars',
    title='Total Number of Unique Rental Cars per Owner',
)
fig.show()
rate.daily
# Summary statistics of the daily rental rate, then its distribution.
daily_rate_summary = df['rate.daily'].describe()
print('Daily Rate of Car Rental Statistics:', daily_rate_summary, sep='\n')
'''
Daily Rate of Car Rental Statistics:
count 5851.000000
mean 93.691506
std 96.080920
min 20.000000
25% 45.000000
50% 69.000000
75% 110.000000
max 1500.000000
Name: rate.daily, dtype: float64
'''
fig = px.histogram(
    data_frame=df,
    x='rate.daily',
    title='Daily Rate of Car Rental',
)
fig.show()
Make and Model of Top 25 Most Rented Cars
# Normalise the make spelling BEFORE grouping so both spellings of the same
# (make, model) pair collapse into one row instead of two.
normalised = df.assign(**{
    'vehicle.make': df['vehicle.make'].replace('Mercedes-benz', 'Mercedes-Benz'),
})
df_make_model = (
    normalised
    .groupby(['vehicle.make', 'vehicle.model'])
    .size()
    .reset_index(name='count')
)
# Total listings per make, broadcast back onto every (make, model) row in a
# single grouped pass rather than re-filtering the frame once per row.
df_make_model['make_count'] = (
    df_make_model.groupby('vehicle.make')['count'].transform('sum')
)
df_make_model = df_make_model.sort_values(by='make_count', ascending=False)
# NOTE(review): the title says "Top 25" but the filter is a fixed threshold of
# more than 45 listings per make - confirm that this selects 25 makes.
fig = px.bar(
    df_make_model[df_make_model['make_count'] > 45],
    x='vehicle.make',
    y='count',
    color='vehicle.model',
    title='Make and Model of Top 25 Most Rented Cars',
)
fig.update_layout(showlegend=False)
fig.show()
Vehicle Type of Rented Cars
# Listings per vehicle type.  rename_axis/reset_index(name=...) gives the
# columns ['vehicle.type', 'count'] on both pandas 1.x and 2.x; renaming the
# old 'index' column no longer works on 2.x.
df_vehicleType = (
    df['vehicle.type']
    .value_counts()
    .rename_axis('vehicle.type')
    .reset_index(name='count')
)
fig = px.pie(
    df_vehicleType,
    values='count',
    names='vehicle.type',
    title='Vehicle Type of Rented Cars',
)
fig.show()
Year of Vehicle
# Summary statistics of the vehicle model year, then its distribution.
year_summary = df['vehicle.year'].describe()
print('Vehicle Year Statistics:', year_summary, sep='\n')
'''
Vehicle Year Statistics:
count 5851.000000
mean 2015.340113
std 4.050813
min 1955.000000
25% 2014.000000
50% 2016.000000
75% 2018.000000
max 2020.000000
Name: vehicle.year, dtype: float64
'''
fig = px.histogram(
    data_frame=df,
    x='vehicle.year',
    title='Year of Vehicle',
)
fig.show()
热力图
# Correlation heatmap of the numeric columns.  The frame still contains text
# columns at this point, and DataFrame.corr() raises on them in pandas >= 2.0,
# so the numeric columns are selected explicitly first.
plt.figure(figsize=(14,7))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True)
数据预处理
# Count the missing values in each column.
df.isna().sum()
'''
fuelType 75
rating 501
renterTripsTaken 0
reviewCount 0
location.city 0
location.country 0
location.latitude 0
location.longitude 0
location.state 0
owner.id 0
rate.daily 0
vehicle.make 0
vehicle.model 0
vehicle.type 0
vehicle.year 0
dtype: int64
'''
# Per-column dtype, non-null count and memory usage of the dataset.
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5851 entries, 0 to 5850
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fuelType 5776 non-null object
1 rating 5350 non-null float64
2 renterTripsTaken 5851 non-null int64
3 reviewCount 5851 non-null int64
4 location.city 5851 non-null object
5 location.country 5851 non-null object
6 location.latitude 5851 non-null float64
7 location.longitude 5851 non-null float64
8 location.state 5851 non-null object
9 owner.id 5851 non-null int64
10 rate.daily 5851 non-null int64
11 vehicle.make 5851 non-null object
12 vehicle.model 5851 non-null object
13 vehicle.type 5851 non-null object
14 vehicle.year 5851 non-null int64
dtypes: float64(3), int64(5), object(7)
memory usage: 685.8+ KB
'''
# Distinct fuel types, including the missing marker.
fuel_column = df['fuelType']
fuel_column.unique()
'''
array(['ELECTRIC', 'HYBRID', 'GASOLINE', nan, 'DIESEL'], dtype=object)
'''
# Frequency of each fuel type (missing values are not counted).
fuel_counts = fuel_column.value_counts()
fuel_counts.reset_index()
'''
index fuelType
0 GASOLINE 4810
1 ELECTRIC 622
2 HYBRID 274
3 DIESEL 70
'''
# Fill missing fuel types with 'GASOLINE', the most frequent category in the
# value counts recorded above.
df['fuelType'] = df['fuelType'].fillna('GASOLINE')
# Most frequent rating.  Series.mode() returns a Series (several values on a
# tie), so the first entry is taken explicitly: int(Series) is deprecated in
# recent pandas, and int() would also truncate a fractional mode such as 4.9.
mode = float(df['rating'].mode().iloc[0])
mode
'''
5.0
'''
df['rating'] = df['rating'].fillna(mode)
# Re-count missing values per column after imputation.
remaining_nulls = df.isnull().sum()
remaining_nulls
'''
fuelType 0
rating 0
renterTripsTaken 0
reviewCount 0
location.city 0
location.country 0
location.latitude 0
location.longitude 0
location.state 0
owner.id 0
rate.daily 0
vehicle.make 0
vehicle.model 0
vehicle.type 0
vehicle.year 0
dtype: int64
'''
可以发现缺失值已经填补完毕,没有缺失值了
# Distinct fuel types after imputation.
df.fuelType.unique()
'''
array(['ELECTRIC', 'HYBRID', 'GASOLINE', 'DIESEL'], dtype=object)
'''
# Encode each fuel type as a float code, numbered in the order listed here.
fuelType_mapping = {
    label: float(code)
    for code, label in enumerate(['ELECTRIC', 'HYBRID', 'GASOLINE', 'DIESEL'])
}
encoded_fuel = df['fuelType'].map(fuelType_mapping)
df['fuelType'] = encoded_fuel
# Distinct state codes present in the data.
df['location.state'].unique()
'''
array(['WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
'AK', 'HI'], dtype=object)
'''
# State codes in the order they appear in the recorded unique() output.
_STATE_ORDER = [
    'WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
    'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
    'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
    'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
    'AK', 'HI',
]
# Give every state its own float code (0.0 .. 45.0).  Generating the codes
# from the list position guarantees uniqueness; the hand-numbered literal
# assigned 21.0 to both 'VA' and 'MI', merging the two states.
locationState_mapping = {
    state: float(code) for code, state in enumerate(_STATE_ORDER)
}
# Replace each state code with its numeric code from locationState_mapping.
state_codes = df['location.state'].map(locationState_mapping)
df['location.state'] = state_codes
# Distinct vehicle types present in the data.
df['vehicle.type'].unique()
'''
array(['suv', 'car', 'truck', 'minivan', 'van'], dtype=object)
'''
# Encode each vehicle type as a float code.
vehicleType_mapping = dict(
    zip(['suv', 'car', 'truck', 'minivan', 'van'], [0.0, 1.0, 2.0, 3.0, 4.0])
)
type_codes = df['vehicle.type'].map(vehicleType_mapping)
df['vehicle.type'] = type_codes
其余类别列(城市、国家、车辆品牌、车辆型号)的取值过多,不便编码,因此直接删除这些列
# Inspect the distinct makes and models, then drop the text columns that are
# not encoded.
df['vehicle.make'].unique()
df['vehicle.model'].unique()
unused_columns = ['location.city', 'location.country', 'vehicle.make', 'vehicle.model']
df = df.drop(columns=unused_columns)
检查数据
# Preview the first five rows of the processed frame.
df.head(n=5)
标准化与切分数据集
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Target: the encoded vehicle type.  pop() removes it from df, so the
# remaining columns are the features.
y = df.pop('vehicle.type')
X = df
# Split BEFORE scaling and fit the scaler on the training rows only, so the
# validation rows do not influence the scaling statistics (no leakage).
# random_state makes the split reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
# Kept for backward compatibility: all rows scaled with the training statistics.
X_scaled = scaler.transform(X)
数据训练
import sklearn
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# The target is a nominal class code (vehicle type), so a classifier is
# fitted; a regressor would treat the codes as ordered quantities.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=42)  # fixed seed for repeatable importances
rfc.fit(X_train, y_train)
# Feature importances, aligned with the feature column names.
features = X.columns
feature_imps = rfc.feature_importances_
# Bar chart of the importances (title typo "Importnace" corrected).
fig = px.bar(x=features, y=feature_imps)
fig.update_layout({'title':{'text':"Feature Importance", 'x':0.5}})
fig.show()
# Multi-class linear model: fit a linear support-vector classifier and report
# its score on the training and validation splits.
from sklearn.svm import LinearSVC
linear_svm = LinearSVC()
linear_svm.fit(X_train, y_train)
train_score = linear_svm.score(X_train, y_train)
valid_score = linear_svm.score(X_valid, y_valid)
print("模型train得分:{:.3f}".format(train_score))
print("模型test得分:{:.3f}".format(valid_score))
'''
模型train得分:0.624
模型test得分:0.643
'''
# Predict on the validation split and report classification metrics, which
# fit a nominal class target.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
pred = linear_svm.predict(X_valid)
linear_svm_acc = accuracy_score(y_valid, pred)
print("accuracy: {:.3f}".format(linear_svm_acc))
# zero_division=0 avoids warnings for classes that are never predicted.
print(classification_report(y_valid, pred, zero_division=0))
print(confusion_matrix(y_valid, pred))
# Regression-style figures kept for backward compatibility.  They treat the
# class codes as quantities, so they are not a meaningful quality measure for
# this classifier.
linear_svm_mse = mean_squared_error(y_valid, pred)
linear_svm_rmse = np.sqrt(linear_svm_mse)
linear_svm_r2 = r2_score(y_valid, pred)
print(linear_svm_mse,linear_svm_rmse,linear_svm_r2)
'''
0.5399863294600137 0.7348376211517845 -0.11486343099011109
'''