机器学习 CarRentalData数据集分析和预测

介绍数据集

fuelType：燃料类型
rating：评级
renterTripsTaken：租车者已完成的出行次数
reviewCount：评论数量
location.city：位置.城市
location.country：地点.国家/地区
location.latitude：位置.纬度
location.longitude：位置.经度
location.state：位置.州
owner.id：所有者id
rate.daily：每日费率
vehicle.make：车辆品牌（制造商）
vehicle.model：车辆型号
vehicle.type：车辆类型
vehicle.year：车辆年份

实战演练

数据集展示

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly_express as px
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Load the car-rental CSV using pandas' pure-Python parser engine.
df = pd.read_csv(
    filepath_or_buffer='C:/CarRentalData.csv',
    engine='python',
)
df.head(n=5)

# Dataset dimensions as (rows, columns).
df.shape
# Recorded output: (5851, 15)

在这里插入图片描述

数据的可视化

fuelType

# Count listings per fuel type.
# rename_axis + reset_index(name=...) yields the columns ['fuelType', 'count']
# on both pandas 1.x and 2.x.  The earlier rename of 'index'/'fuelType'
# produced two columns called 'count' on pandas >= 2.0, where value_counts()
# already names its result 'count' and its index after the source column.
df_fuelType = (
    df['fuelType']
    .value_counts()
    .rename_axis('fuelType')
    .reset_index(name='count')
)

# Donut chart of the fuel-type shares.  color is passed explicitly so that the
# per-category colours in color_discrete_map are keyed on the fuelType values
# (this mirrors the Plotly Express pie example with an explicit colour map).
fig = px.pie(df_fuelType, values='count', names='fuelType', color='fuelType',
             title='Fuel Type', hole=.3,
             color_discrete_map={'ELECTRIC':'lightcyan',
                                 'HYBRID':'cyan',
                                 'GASOLINE':'royalblue',
                                 'DIESEL':'darkblue'})

fig.show()

在这里插入图片描述

rating

# Summary statistics of the rating column, printed under a header line.
rating_summary = df['rating'].describe()
print("Rating Statistics:", rating_summary, sep="\n")

'''
Rating Statistics:
count    5350.000000
mean        4.920325
std         0.182425
min         1.000000
25%         4.900000
50%         5.000000
75%         5.000000
max         5.000000
Name: rating, dtype: float64
'''

# Distribution of the ratings.
fig = px.histogram(
    data_frame=df,
    x='rating',
    title='Histogram of Rental Car Rating',
)
fig.show()

在这里插入图片描述

renterTripsTaken

# Summary statistics of the renterTripsTaken column, printed under a header line.
trips_summary = df['renterTripsTaken'].describe()
print("Renter Trips Taken Statistics:", trips_summary, sep="\n")

'''
Renter Trips Taken Statistics:
count    5851.000000
mean       33.477354
std        41.898954
min         0.000000
25%         5.000000
50%        18.000000
75%        46.000000
max       395.000000
Name: renterTripsTaken, dtype: float64
'''

# Distribution of trips taken per listing.
fig = px.histogram(
    data_frame=df,
    x='renterTripsTaken',
    title='Histogram of Renter Trips Taken',
)
fig.show()

在这里插入图片描述

reviewCount

# Summary statistics of the reviewCount column, printed under a header line.
review_summary = df['reviewCount'].describe()
print("Review Count Statistics:", review_summary, sep="\n")

'''
Review Count Statistics:
count    5851.000000
mean       28.454794
std        35.136113
min         0.000000
25%         4.000000
50%        16.000000
75%        39.000000
max       321.000000
Name: reviewCount, dtype: float64
'''

# Distribution of review counts per listing.
fig = px.histogram(
    data_frame=df,
    x='reviewCount',
    title='Histogram of Review Count',
)
fig.show()

在这里插入图片描述

Car Rentals by City

import plotly.graph_objects as go

def get_average_lat_long(city, ltype):
    """Return the mean latitude or mean longitude of one city's listings.

    Reads the module-level ``df``.

    Args:
        city: Value compared for equality against the ``location.city`` column.
        ltype: ``0`` selects the latitude; any other value selects the longitude.

    Returns:
        The mean of the selected coordinate column over the matching rows.
    """
    coordinate = 'location.latitude' if ltype == 0 else 'location.longitude'
    in_city = df['location.city'] == city
    return df.loc[in_city, coordinate].mean()
        
# Listings per city, most frequent first (value_counts sorts descending).
# rename_axis + reset_index(name=...) gives the columns ['city', 'count'] on
# both pandas 1.x and 2.x; the earlier rename of 'index'/'location.city'
# mislabeled the columns on pandas >= 2.0.
df_location = (
    df['location.city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='count')
)

# Mean coordinates per city, computed in a single grouped pass instead of
# re-filtering the whole frame twice for every city.
city_coords = df.groupby('location.city')[['location.latitude', 'location.longitude']].mean()
df_location['latitude'] = df_location['city'].map(city_coords['location.latitude'])
df_location['longitude'] = df_location['city'].map(city_coords['location.longitude'])

# Hover text shown for each bubble.
df_location['text'] = df_location['city'] + '<br>Car Rentals ' + (df_location['count']).astype(str)
# Row-position (rank) ranges of df_location, inclusive on both ends; one trace
# and one colour per range.
limits = [(0,20),(21,50),(51,150),(151,200),(201,1000),(1001,1500)]
colors = ["royalblue","orange","lightgrey","lightseagreen","red","crimson"]
scale = 0.5
fig = go.Figure()

for (first, last), color in zip(limits, colors):
    # The ranges are inclusive, so slice up to last + 1.  A plain [first:last]
    # slice silently dropped the rows at positions 20, 50, 150, 200 and 1000.
    df_sub = df_location.iloc[first:last + 1]
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['count']/scale,
            color = color,
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(first, last)))

fig.update_layout(
        title_text = 'Car Rentals by City',
        showlegend = True,
        geo = dict(
            scope = 'usa',
            landcolor = 'rgb(217, 217, 217)',
        )
    )

fig.show()

在这里插入图片描述

Car Rentals by State

# Listings per state.  rename_axis + reset_index(name=...) gives the columns
# ['state', 'count'] on both pandas 1.x and 2.x; the earlier rename of
# 'index'/'location.state' left no 'state' column on pandas >= 2.0.
df_state = (
    df['location.state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)

fig = go.Figure(data=go.Choropleth(
    locations=df_state['state'],  # state codes used as the map locations
    z = df_state['count'].astype(float),  # values that drive the colour scale
    locationmode = 'USA-states',  # how entries in `locations` are matched to regions
    colorscale = 'Reds',
    colorbar_title = "Number of Cars Rented",
))

fig.update_layout(
    title_text = 'Car Rentals by State',
    geo_scope='usa',
)

fig.show()

在这里插入图片描述

owner.id

# Number of listings per owner.  rename_axis + reset_index(name=...) gives the
# columns ['owner_id', 'number of rental cars'] on both pandas 1.x and 2.x.
# With the earlier rename of 'index'/'owner.id', pandas >= 2.0 put the owner
# ids into 'number of rental cars', so the statistics below described ids
# rather than counts.
df_owner = (
    df['owner.id']
    .value_counts()
    .rename_axis('owner_id')
    .reset_index(name='number of rental cars')
)

print('Total Number of Unique Rental Cars per Owner Statistics:')
print(df_owner['number of rental cars'].describe())

'''
Total Number of Unique Rental Cars per Owner Statistics:
count    3093.000000
mean        1.891691
std         2.789205
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        49.000000
Name: number of rental cars, dtype: float64
'''

# Distribution of fleet size across owners.
fig = px.histogram(df_owner, x = 'number of rental cars', title='Total Number of Unique Rental Cars per Owner')
fig.show()

在这里插入图片描述

rate.daily

# Summary statistics of the rate.daily column, printed under a header line.
daily_rate_summary = df['rate.daily'].describe()
print('Daily Rate of Car Rental Statistics:', daily_rate_summary, sep='\n')

'''
Daily Rate of Car Rental Statistics:
count    5851.000000
mean       93.691506
std        96.080920
min        20.000000
25%        45.000000
50%        69.000000
75%       110.000000
max      1500.000000
Name: rate.daily, dtype: float64
'''

# Distribution of the daily rental rate.
fig = px.histogram(
    data_frame=df,
    x='rate.daily',
    title='Daily Rate of Car Rental',
)
fig.show()

在这里插入图片描述

Make and Model of Top 25 Most Rented Cars

# Listings per (make, model) pair.
df_make_model = (
    df.groupby(['vehicle.make', 'vehicle.model'])
    .size()
    .reset_index(name='count')
)
# Unify the two spellings of the same make before totalling per make.
df_make_model.replace('Mercedes-benz', 'Mercedes-Benz', inplace=True)
# Total listings per make, broadcast back onto every (make, model) row.
# A grouped transform replaces the per-row apply that re-scanned the whole
# frame for each row.
df_make_model['make_count'] = df_make_model.groupby('vehicle.make')['count'].transform('sum')
df_make_model.sort_values(by = 'make_count', ascending=False, inplace=True)

# Keep only makes with more than 45 listings in total.
# NOTE(review): the title says "Top 25"; presumably 45 was chosen so that 25
# makes remain -- confirm against the data.
popular_makes = df_make_model[df_make_model['make_count'] > 45]
fig = px.bar(popular_makes, x = 'vehicle.make', y='count', color = 'vehicle.model', title='Make and Model of Top 25 Most Rented Cars')
fig.update_layout(showlegend = False)
fig.show()

在这里插入图片描述

Vehicle Type of Rented Cars

# Listings per vehicle type.  rename_axis + reset_index(name=...) gives the
# columns ['vehicle.type', 'count'] on both pandas 1.x and 2.x; the earlier
# rename of 'index'/'vehicle.type' produced two 'count' columns on
# pandas >= 2.0.
df_vehicleType = (
    df['vehicle.type']
    .value_counts()
    .rename_axis('vehicle.type')
    .reset_index(name='count')
)

# Share of each vehicle type among the listings.
fig = px.pie(df_vehicleType, values = 'count', names='vehicle.type', title = 'Vehicle Type of Rented Cars')
fig.show()

在这里插入图片描述

Year of Vehicle

# Summary statistics of the vehicle.year column, printed under a header line.
vehicle_year_summary = df['vehicle.year'].describe()
print('Vehicle Year Statistics:', vehicle_year_summary, sep='\n')

'''
Vehicle Year Statistics:
count    5851.000000
mean     2015.340113
std         4.050813
min      1955.000000
25%      2014.000000
50%      2016.000000
75%      2018.000000
max      2020.000000
Name: vehicle.year, dtype: float64
'''

# Distribution of vehicle model years.
fig = px.histogram(
    data_frame=df,
    x='vehicle.year',
    title='Year of Vehicle',
)
fig.show()

在这里插入图片描述

热力图

# Annotated correlation heatmap of the numeric columns.
# The text columns are excluded explicitly: on pandas >= 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
plt.figure(figsize=(14,7))
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True)

在这里插入图片描述

数据预处理

# Number of missing values in each column.
df.isna().sum()

'''
fuelType               75
rating                501
renterTripsTaken        0
reviewCount             0
location.city           0
location.country        0
location.latitude       0
location.longitude      0
location.state          0
owner.id                0
rate.daily              0
vehicle.make            0
vehicle.model           0
vehicle.type            0
vehicle.year            0
dtype: int64
'''

# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5851 entries, 0 to 5850
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fuelType            5776 non-null   object 
 1   rating              5350 non-null   float64
 2   renterTripsTaken    5851 non-null   int64  
 3   reviewCount         5851 non-null   int64  
 4   location.city       5851 non-null   object 
 5   location.country    5851 non-null   object 
 6   location.latitude   5851 non-null   float64
 7   location.longitude  5851 non-null   float64
 8   location.state      5851 non-null   object 
 9   owner.id            5851 non-null   int64  
 10  rate.daily          5851 non-null   int64  
 11  vehicle.make        5851 non-null   object 
 12  vehicle.model       5851 non-null   object 
 13  vehicle.type        5851 non-null   object 
 14  vehicle.year        5851 non-null   int64  
dtypes: float64(3), int64(5), object(7)
memory usage: 685.8+ KB
'''

# Distinct values in fuelType; the recorded output below still contains nan.
df['fuelType'].unique()

'''
array(['ELECTRIC', 'HYBRID', 'GASOLINE', nan, 'DIESEL'], dtype=object)
'''

# Frequency of each fuel type as a two-column frame.
df['fuelType'].value_counts().reset_index()
'''
	index	fuelType
0	GASOLINE	4810
1	ELECTRIC	622
2	HYBRID	274
3	DIESEL	70
'''
# Fill the missing fuel types with 'GASOLINE', the most frequent value in the
# recorded counts.
df['fuelType'] = df['fuelType'].fillna(value='GASOLINE')

# Most frequent rating, used to fill the missing ratings.
# mode() returns a Series (several entries when values tie), so the first
# entry is taken explicitly.  The earlier int(Series) call fails on ties, is
# deprecated for one-element Series in recent pandas, and would truncate a
# fractional mode such as 4.9 down to 4.
mode = float(df['rating'].mode().iloc[0])
mode
'''
5.0
'''
df['rating'] = df['rating'].fillna(mode)

# Re-count the missing values per column after filling.
df.isna().sum()
'''
fuelType              0
rating                0
renterTripsTaken      0
reviewCount           0
location.city         0
location.country      0
location.latitude     0
location.longitude    0
location.state        0
owner.id              0
rate.daily            0
vehicle.make          0
vehicle.model         0
vehicle.type          0
vehicle.year          0
dtype: int64
'''

可以发现缺失值已经填补完毕，没有缺失值了

# Distinct fuel types after filling.
df['fuelType'].unique()
'''
array(['ELECTRIC', 'HYBRID', 'GASOLINE', 'DIESEL'], dtype=object)
'''
# Encode each fuel type as a float code, numbered in the order listed here.
fuel_type_order = ('ELECTRIC', 'HYBRID', 'GASOLINE', 'DIESEL')
fuelType_mapping = {fuel: float(code) for code, fuel in enumerate(fuel_type_order)}
df['fuelType'] = df['fuelType'].map(fuelType_mapping)

df['location.state'].unique()
'''
array(['WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
       'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
       'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
       'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
       'AK', 'HI'], dtype=object)
'''

# Encode each state as a distinct float code, numbered by position in this list.
# The codes are generated rather than typed by hand: the hand-written mapping
# assigned 21.0 to both 'VA' and 'MI', so the two states were indistinguishable
# after encoding and the 46 states received only 45 codes.
state_order = [
    'WA', 'NM', 'GA', 'SC', 'FL', 'TX', 'NC', 'CT', 'MA', 'ME', 'AL',
    'MT', 'TN', 'KY', 'ID', 'UT', 'MD', 'DC', 'IA', 'OH', 'CO', 'VA',
    'MI', 'NJ', 'IN', 'WI', 'KS', 'MO', 'NV', 'CA', 'LA', 'AR', 'IL',
    'MS', 'NH', 'MN', 'OK', 'NE', 'OR', 'PA', 'DE', 'AZ', 'WV', 'RI',
    'AK', 'HI',
]
locationState_mapping = {state: float(code) for code, state in enumerate(state_order)}
df['location.state'] = df['location.state'].map(locationState_mapping)

# Distinct vehicle types.
df['vehicle.type'].unique()
'''
array(['suv', 'car', 'truck', 'minivan', 'van'], dtype=object)
'''
# Encode each vehicle type as a float code, numbered in the order listed here.
vehicle_type_order = ('suv', 'car', 'truck', 'minivan', 'van')
vehicleType_mapping = {kind: float(code) for code, kind in enumerate(vehicle_type_order)}
df['vehicle.type'] = df['vehicle.type'].map(vehicleType_mapping)

其他列的值比较繁多，所以不做考虑，删掉它们就可以了

# Inspect the distinct makes and models.
df['vehicle.make'].unique()
df['vehicle.model'].unique()

# Remove the text columns that are left unencoded.
dropped_columns = ['location.city', 'location.country', 'vehicle.make', 'vehicle.model']
df = df.drop(columns=dropped_columns)

检查数据

# Preview the first rows of the frame after encoding and dropping columns.
df.head()

在这里插入图片描述

标准化与切分数据集

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target: the encoded vehicle type.  Features: every remaining column.
# pop() removes the target from df, so X (the same object) holds features only.
y = df.pop('vehicle.type')
X = df

# Split before scaling and fit the scaler on the training rows only, so that
# validation-set statistics do not leak into the features the model trains on.
# random_state pins the split so the reported scores are reproducible.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X, y, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)

# Fully scaled matrix, kept under its original name for any later use.
X_scaled = scaler.transform(X)

数据训练

import sklearn
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OrdinalEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

# vehicle.type is a categorical label whose numeric codes carry no order, so
# the importances are taken from a classifier rather than a regressor.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
# Feature importances, paired with the feature column names.
features = X.columns
feature_imps = rfc.feature_importances_

# Bar chart of the importance of each feature (title spelling corrected).
fig = px.bar(x=features, y=feature_imps)
fig.update_layout({'title':{'text':"Feature Importance", 'x':0.5}})
fig.show()

在这里插入图片描述

# Linear multi-class model: a linear SVM fitted on the training split.
from sklearn.svm import LinearSVC

linear_svm = LinearSVC()
linear_svm.fit(X_train, y_train)

# Score on each split, printed to three decimals.
train_score = linear_svm.score(X_train, y_train)
valid_score = linear_svm.score(X_valid, y_valid)
print("模型train得分:{:.3f}".format(train_score))
print("模型test得分:{:.3f}".format(valid_score))
'''
模型train得分:0.624
模型test得分:0.643
'''

# Predict the vehicle type for the validation split.
pred = linear_svm.predict(X_valid)

# Regression-style errors, kept under their original names.  They treat the
# class codes as numbers, so their magnitudes depend on the arbitrary code
# assigned to each vehicle type.
linear_svm_mse = mean_squared_error(y_valid, pred)
linear_svm_rmse = np.sqrt(linear_svm_mse)
linear_svm_r2 = r2_score(y_valid, pred)

print(linear_svm_mse,linear_svm_rmse,linear_svm_r2)
'''
0.5399863294600137 0.7348376211517845 -0.11486343099011109
'''

# The target is a nominal class label, so classification metrics are reported
# as well: overall accuracy plus per-class precision, recall and F1.
from sklearn.metrics import accuracy_score, classification_report

linear_svm_accuracy = accuracy_score(y_valid, pred)
print(linear_svm_accuracy)
print(classification_report(y_valid, pred))