在分析数据之前,我们需要剔除异常值的影响,也就是在某个分组情况下,标准差过大(标准差越大,证明情况越不稳定),如果标准差比较小,就算是最小值和最大值差的比较大,我也认为他是一个比较平稳的波动。 方差分析这个老师讲的很好:[https://www.bilibili.com/video/BV1jB4y1676T/?spm_id_from=333.788&vd_source=642d9a85cff4a726a7de10f2383987df]
Step 6: Reduce Std.
# Per-lane dispersion statistics of the pickup→POD lead time:
# std, sample size, and the 90th/10th percentiles.
agg_spec = {'lt_pu_pod': ['std', 'count',
                          lambda s: s.quantile(0.9),
                          lambda s: s.quantile(0.1)]}
grouped_data = (df_replen
                .groupby(['mot', 'country', 'priority', 'cc'])
                .agg(agg_spec)
                .reset_index())
grouped_data.columns = ['mot', 'country', 'priority', 'cc',
                        'std', 'count', 'quantile_90', 'quantile_10']
grouped_data
mot country priority cc std count quantile_90 quantile_10 0 SEA AE 40 AD NaN 1 10.360 10.360
3150 rows × 8 columns
# Flag outlier shipments (Type_ = 'Y'): within every (mot, country, priority,
# cc) lane that is both large enough (count >= 5) and unstable (std >= 8),
# mark rows whose lt_pu_pod falls outside the 10th–90th percentile band.
for group_i in grouped_data.index:
    group_mot = grouped_data.loc[group_i, 'mot']
    group_country = grouped_data.loc[group_i, 'country']
    group_priority = grouped_data.loc[group_i, 'priority']
    group_cc = grouped_data.loc[group_i, 'cc']
    group_std = grouped_data.loc[group_i, 'std']
    group_count = grouped_data.loc[group_i, 'count']
    group_quantile_90 = grouped_data.loc[group_i, 'quantile_90']
    group_quantile_10 = grouped_data.loc[group_i, 'quantile_10']
    if group_count >= 5 and group_std >= 8:
        # BUG FIX: the original filter omitted the 'cc' condition even though
        # the quantiles were computed per (mot, country, priority, cc) group,
        # so outliers were flagged across unrelated cc groups sharing the
        # same mot/country/priority.
        index_replen = df_replen[
            (df_replen['mot'] == group_mot)
            & (df_replen['country'] == group_country)
            & (df_replen['priority'] == group_priority)
            & (df_replen['cc'] == group_cc)
            & ((df_replen['lt_pu_pod'] >= group_quantile_90)
               | (df_replen['lt_pu_pod'] <= group_quantile_10))
        ].index
        # Vectorized assignment instead of the original per-row loop.
        df_replen.loc[index_replen, 'Type_'] = 'Y'
# Keep only non-outlier rows, then drop rows missing any key field.
df_replen_1 = df_replen[df_replen['Type_'].isnull()]
required_cols = ['hawb', 'ts_atd', 'cc', 'weight']
complete_mask = df_replen_1[required_cols].notnull().all(axis=1)
df_replen_1 = df_replen_1[complete_mask].reset_index(drop=True)
df_replen_1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103166 entries, 0 to 103165
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 hawb 103233 non-null object
1 mot 103233 non-null object
2 geo 103166 non-null object
3 country 103166 non-null object
4 shippingfrequency 103166 non-null object
5 isgreen 103166 non-null object
6 priority 103166 non-null object
7 forwarder 103166 non-null object
8 cc 93166 non-null object
9 dn_gr_status 103166 non-null object
10 volume 103166 non-null float64
11 weight 103166 non-null float64
12 dn_seq_qty 103166 non-null float64
13 ts_pu 103166 non-null object
14 ts_atd 103166 non-null object
15 ts_ata 103038 non-null object
16 ts_cc 102855 non-null object
17 ts_pod 103166 non-null object
18 ts_atd_pre_2_date 103166 non-null object
19 lt_pu_pod 103166 non-null float64
20 lt_pu_atd 103166 non-null float64
21 lt_atd_ata 103166 non-null float64
22 data_source 103166 non-null object
23 Type_ 0 non-null object
dtypes: float64(6), object(18)
memory usage: 18.9+ MB
# Re-compute the per-lane dispersion stats after outlier removal, for
# comparison against the pre-cleaning grouped_data.
agg_spec_1 = {'lt_pu_pod': ['std', 'count',
                            lambda s: s.quantile(0.9),
                            lambda s: s.quantile(0.1)]}
grouped_data_1 = (df_replen_1
                  .groupby(['mot', 'country', 'priority', 'cc'])
                  .agg(agg_spec_1)
                  .reset_index())
grouped_data_1.columns = ['mot', 'country', 'priority', 'cc',
                          'std', 'count', 'quantile_90', 'quantile_10']
grouped_data_1
mot country priority cc std count quantile_90 quantile_10 0 AIR ER 30 AD NaN 1 16.333 16.333
3017 rows × 8 columns
print ( 'Reduce std' , 'As is:【' , grouped_data[ 'std' ] . mean( ) , '-->' , '】To be:【' , grouped_data_1[ 'std' ] . mean( ) , '】' )
Reduce std As is:【 4.281785006069334 --> 】To be:【 2.748443864082784 】
Step 7: Research Relationships
Research the relationship between weight and lead time.
# One correlation heatmap of the numeric columns per geo.
for geo_value in df_replen_1['geo'].unique().tolist():
    geo_rows = df_replen_1[df_replen_1['geo'] == geo_value]
    plt.figure(figsize=(10, 6))
    sns.heatmap(data=geo_rows.corr(), cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
    plt.title("Correlation between volume,qty,weight and lt_pu_pod mot")
    plt.xlabel("{0}".format(geo_value))
    plt.ylabel("country")
    plt.xticks(rotation=45)
    plt.show()
# One correlation heatmap of the numeric columns per mode of transport.
for mot_value in df_replen_1['mot'].unique().tolist():
    mot_rows = df_replen_1[df_replen_1['mot'] == mot_value]
    plt.figure(figsize=(10, 6))
    sns.heatmap(data=mot_rows.corr(), cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
    plt.title("Correlation between volume,qty,weight and lt_pu_pod mot")
    plt.xlabel("{0}".format(mot_value))
    plt.ylabel("country")
    plt.xticks(rotation=45)
    plt.show()
# Scatter volume vs lt_pu_atd per transport mode, annotating each plot with
# the correlation taken from that mode's correlation matrix.
# (The original also pre-computed `corr` before the loop; that value was never
# used because it was overwritten on the first iteration — removed.)
for t, group in df_replen_1.groupby('mot'):
    plt.scatter(group['volume'], group['lt_pu_atd'], label=t)
    corr = group.corr().iloc[0, 2]
    # FIX: the title said "Weight vs Time" although volume is plotted.
    plt.title(f"Volume vs Time ({t.title()}) (Correlation: {corr:.2f})")
    plt.xlabel("volume")
    plt.ylabel("Time")
    plt.legend()
    plt.show()
# Scatter weight vs lt_pu_pod per transport mode, annotating each plot with
# the correlation taken from that mode's correlation matrix.
# (The original pre-computed `corr` before the loop; that value was never used
# because it was overwritten on the first iteration — removed.)
for t, group in df_replen_1.groupby('mot'):
    plt.scatter(group['weight'], group['lt_pu_pod'], label=t)
    corr = group.corr().iloc[0, 2]
    plt.title(f"Weight vs Time ({t.title()}) (Correlation: {corr:.2f})")
    plt.xlabel("Weight")
    plt.ylabel("Time")
    plt.legend()
    plt.show()
方差1、GEO与LT_PU_POD的关联性
方差分析
原假设:每个GEO直接与LT_PU_POD没有关联性
在单因素方差分析中,组内自由度的计算方法与样本数量有关。如果你有4个geo,假设每个geo对应一个独立的样本组别,那么组内自由度的计算如下:
组内自由度 = (样本1的观测数量 - 1) + (样本2的观测数量 - 1) + … + (样本4的观测数量 - 1)
具体计算时,需要知道每个geo对应的样本数量。假设分别为n1、n2、n3、n4,则组内自由度为 (n1 - 1) + (n2 - 1) + (n3 - 1) + (n4 - 1)。
df_replen_1. groupby( [ 'geo' ] ) . agg( { 'lt_pu_pod' : [ 'std' , 'count' , 'mean' , lambda x: x. quantile( 0.5 ) , lambda x: x. quantile( 0.9 ) , lambda x: x. quantile( 0.1 ) ] } ) . reset_index( )
geo lt_pu_pod std count mean <lambda_0> <lambda_1> <lambda_2> 0 EA 3.444 47424 8.940807 6.35 17.53 3.98 1 Brazil NaN 1 7.830000 7.83 7.83 7.83 2 MX 8.325913 22476 7.995670 6.34 9.43 3.87 3 NE 7.462291 7542 15.752510 14.32 26.35 7.39 4 PJ 5.333 6861 9.633206 8.12 17.43 5.43
from scipy.stats import f_oneway

# One-way ANOVA: is lt_pu_pod associated with geo?
geo_samples = [df_replen_1[df_replen_1['geo'] == g]['lt_pu_pod']
               for g in ('AP', 'EMEA', 'LAS', 'NA')]
f_stat, p_value = f_oneway(*geo_samples)
print("单因素方差分析结果:")
print("F统计量:", f_stat)
print("p值:", p_value)
单因素方差分析结果:
F统计量: 2676.3050920291266
p值: 0.0
当p值等于0时,通常表示在检验中观察到的差异极其显著。这意味着在假设检验中,得到的样本数据非常不可能出现,或者可以说观察到的差异非常显著,远远超过了我们预期的随机差异。
方差2、MOT与LT_PU_POD的关联性
from scipy.stats import f_oneway

# One-way ANOVA: is lt_pu_pod associated with the mode of transport?
mot_samples = [df_replen_1[df_replen_1['mot'] == m]['lt_pu_pod']
               for m in ('AIR', 'SEA', 'TRUCK')]
f_stat, p_value = f_oneway(*mot_samples)
print("单因素方差分析结果:")
print("F统计量:", f_stat)
print("p值:", p_value)
单因素方差分析结果:
F统计量: 42951.20078674416
p值: 0.0
df_replen_1. groupby( [ 'mot' ] ) . agg( { 'lt_pu_pod' : [ 'std' , 'count' , 'mean' , lambda x: x. quantile( 0.5 ) , lambda x: x. quantile( 0.9 ) , lambda x: x. quantile( 0.1 ) ] } ) . reset_index( )
mot lt_pu_pod std count mean <lambda_0> <lambda_1> <lambda_2> 0 SEA 3.3333 90039 7.913256 6.81 13.34 4.13 1 SEA 10.307048 10752 45.999 20.59 41.52 13.28 2 TRUCK 0.903336 2375 4.215651 3.99 5.01 3.40
方差3、priority与LT_PU_POD的关联性
from scipy.stats import f_oneway

# One-way ANOVA: is lt_pu_pod associated with shipment priority?
priority_samples = [df_replen_1[df_replen_1['priority'] == p]['lt_pu_pod']
                    for p in ('20', '40', '60', 'PPF')]
f_stat, p_value = f_oneway(*priority_samples)
print("单因素方差分析结果:")
print("F统计量:", f_stat)
print("p值:", p_value)
单因素方差分析结果:
F统计量: 6366.387676680081
p值: 0.0
df_replen_1. groupby( [ 'priority' ] ) . agg( { 'lt_pu_pod' : [ 'std' , 'count' , 'mean' , lambda x: x. quantile( 0.5 ) , lambda x: x. quantile( 0.9 ) , lambda x: x. quantile( 0.1 ) ] } ) . reset_index( )
priority lt_pu_pod std count mean <lambda_0> <lambda_1> <lambda_2> 0 33 5.207475 16000 8.501019 6.88 14.32 4.11 1 21 4.303070 66818 7.983886 6.84 13.38 4.08 2 81 11.294788 19865 15.009006 13.24 29.42 4.45 3 TUU 2.236996 483 5.569814 4.87 7.80 3.77
方差4、cc与LT_PU_POD的关联性
df_replen_1. groupby( [ 'cc' ] ) . agg( { 'lt_pu_pod' : [ 'std' , 'count' , 'mean' , lambda x: x. quantile( 0.5 ) , lambda x: x. quantile( 0.9 ) , lambda x: x. quantile( 0.1 ) ] } ) . reset_index( )
cc lt_pu_pod std count mean <lambda_0> <lambda_1> <lambda_2> 0 AD 7.705418 2190 10.173146 7.115 20.460 4.219 1 AN 6.972004 1994 9.171650 6.850 17.357 4.080 2 BA 14.731705 130 26.287077 21.140 48.357 13.240 3 BC 7.259393 2936 9.819053 7.120 19.810 4.140 4 BI 11.435282 1089 20.745960 18.560 37.430 8.948
from scipy.stats import f_oneway

# Boxplot of lt_pu_pod per cc category, followed by a one-way ANOVA
# across all categories.
cc_categories = df_replen_1['cc'].unique().tolist()
data_list = [df_replen_1[df_replen_1['cc'] == cat]['lt_pu_pod']
             for cat in cc_categories]
plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(cc_categories)), cc_categories, rotation=45)
plt.show()
f_stat, p_value = f_oneway(*data_list)
print("F统计量:", f_stat)
print("p值:", p_value)
F统计量: 165.67483191021196
p值: 0.0
方差5、isgreen与LT_PU_POD的关联性
df_replen_1. groupby( [ 'isgreen' ] ) . agg( { 'lt_pu_pod' : [ 'std' , 'count' , 'mean' , lambda x: x. quantile( 0.5 ) , lambda x: x. quantile( 0.9 ) , lambda x: x. quantile( 0.1 ) ] } ) . reset_index( )
isgreen lt_pu_pod std count mean <lambda_0> <lambda_1> <lambda_2> 0 GL1 6.818674 5810 10.340127 7.55 20.31 5.05 1 GL2 0.836612 6 4.210000 4.74 4.76 3.13
# Boxplot + one-way ANOVA of lt_pu_pod across isgreen categories.
green_categories = df_replen_1['isgreen'].unique().tolist()
data_list = [df_replen_1[df_replen_1['isgreen'] == cat]['lt_pu_pod']
             for cat in green_categories]
plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(green_categories)), green_categories, rotation=45)
plt.show()
f_stat, p_value = f_oneway(*data_list)
print("F统计量:", f_stat)
print("p值:", p_value)
F统计量: 49.45782385934109
p值: 1.2062238051192215e-41
方差6、shippingfrequency与LT_PU_POD的关联性
df_replen_1. groupby( [ 'shippingfrequency' ] ) . agg( { 'lt_pu_pod' : [ 'std' , 'count' , 'mean' , lambda x: x. quantile( 0.5 ) , lambda x: x. quantile( 0.9 ) , lambda x: x. quantile( 0.1 ) ] } ) . reset_index( )
shippingfrequency lt_pu_pod std count mean <lambda_0> <lambda_1> <lambda_2> 0 Bi-Weekly(Thu) 17.664727 130 24.453231 15.940 54.18 9.731 1 Bi-Weekly(Tue) 9.306314 29 36.938276 35.590 49.57 23.954 2 Bi-Weekly(Wed) 2.271108 16 4.734375 3.840 5.48 3.820 3 Daily 6.448783 35322 7.890003 6.760 10.38 4.400
from scipy.stats import f_oneway

# Boxplot + one-way ANOVA of lt_pu_pod across shipping-frequency categories.
freq_categories = df_replen_1['shippingfrequency'].unique().tolist()
data_list = [df_replen_1[df_replen_1['shippingfrequency'] == cat]['lt_pu_pod']
             for cat in freq_categories]
plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(freq_categories)), freq_categories, rotation=45)
plt.show()
f_stat, p_value = f_oneway(*data_list)
print("F统计量:", f_stat)
print("p值:", p_value)
F统计量: 2693.5635083145007
p值: 0.0
方差7、forwarder与LT_PU_POD的关联性
# Boxplot + one-way ANOVA of lt_pu_pod across forwarders.
forwarder_categories = df_replen_1['forwarder'].unique().tolist()
data_list = [df_replen_1[df_replen_1['forwarder'] == cat]['lt_pu_pod']
             for cat in forwarder_categories]
plt.figure(figsize=(25, 6))
sns.boxplot(data=data_list)
plt.xlabel('Category')
plt.ylabel('lt_pu_pod')
plt.title('Boxplot of lt_pu_pod for different categories')
plt.xticks(range(len(forwarder_categories)), forwarder_categories, rotation=45)
plt.show()
f_stat, p_value = f_oneway(*data_list)
print("F统计量:", f_stat)
print("p值:", p_value)
F统计量: 12060.165772842343
p值: 0.0
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation of volume with the other numeric columns, computed within each
# (geo, country, mot) group and pivoted into a matrix for the heatmap.
subset_cols = ['geo', 'country', 'mot', 'volume', 'lt_pu_pod']
correlation_matrix = (df_replen_1[subset_cols]
                      .groupby(['geo', 'country', 'mot'])
                      .corr()
                      .loc[:, 'volume']
                      .unstack())
plt.figure(figsize=(24, 16))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
plt.title("Correlation between volume and lt_pu_pod by geo, country, and mot")
plt.xlabel("mot")
plt.ylabel("country")
plt.xticks(rotation=45)
plt.show()
Step 8: Research climate & lt_pu_atd
# Normalize the pickup timestamp to a plain YYYY-MM-DD date string.
df_replen_2 = df_replen_1.copy()
# PERF: vectorized conversion instead of calling pd.to_datetime once per row
# inside map(); also tolerates NaT (yields NaN instead of raising).
df_replen_2['ts_pu_date'] = pd.to_datetime(df_replen_2['ts_pu']).dt.strftime('%Y-%m-%d')
df_replen_open['ts_pu_date'] = pd.to_datetime(df_replen_open['ts_pu']).dt.strftime('%Y-%m-%d')
def _bucket_count(count, suffix):
    """Map a raw day count to the coarse bucket label used for grouping.

    Produces exactly the original labels: '^== 0/1/2 …', '^<= 7 …',
    '^<= 14 …', '^> 14 …' with `suffix` in {'hldys', 'abnr'}.
    """
    if count <= 2:
        return f'^== {count} {suffix}'
    if count <= 7:
        return f'^<= 7 {suffix}'
    if count <= 14:
        return f'^<= 14 {suffix}'
    return f'^> 14 {suffix}'


# For every shipment, count holidays and abnormal-climate days between the
# (pre-2-day) pickup date and the actual departure, then bucket the counts.
for rep_index in df_replen_2.index:
    ts_pu = pd.to_datetime(df_replen_2.loc[rep_index, 'ts_atd_pre_2_date'])
    ts_atd = pd.to_datetime(df_replen_2.loc[rep_index, 'ts_atd'])
    # Fall back to pickup + 1.5 days when ATD is missing or precedes pickup.
    if not pd.isna(ts_atd) and ts_pu > ts_atd:
        ts_atd = pd.to_datetime(df_replen_2.loc[rep_index, 'ts_pu']) + datetime.timedelta(1.5)
    if pd.isna(ts_atd):
        ts_atd = pd.to_datetime(df_replen_2.loc[rep_index, 'ts_pu']) + datetime.timedelta(1.5)
    window = df_climate_workday[(df_climate_workday['Date'] <= ts_atd)
                                & (df_climate_workday['Date'] >= ts_pu)]
    holiday_count = len(window[window['weekday_cat'] == 'holiday'])
    climate_count = len(window[window['Alarm_info_cat'] == 'Abnormal climate'])
    # Replaces the original pair of six-way if-chains with one helper.
    df_replen_2.loc[rep_index, 'holiday_count'] = _bucket_count(holiday_count, 'hldys')
    df_replen_2.loc[rep_index, 'climate_count'] = _bucket_count(climate_count, 'abnr')
def _bucket_count(count, suffix):
    """Map a raw day count to the coarse bucket label used for grouping.

    Produces exactly the original labels: '^== 0/1/2 …', '^<= 7 …',
    '^<= 14 …', '^> 14 …' with `suffix` in {'hldys', 'abnr'}.
    """
    if count <= 2:
        return f'^== {count} {suffix}'
    if count <= 7:
        return f'^<= 7 {suffix}'
    if count <= 14:
        return f'^<= 14 {suffix}'
    return f'^> 14 {suffix}'


# Same holiday/abnormal-climate bucketing as above, applied to the open
# (in-transit) shipments frame.
for rep_index in df_replen_open.index:
    ts_pu = pd.to_datetime(df_replen_open.loc[rep_index, 'ts_atd_pre_2_date'])
    ts_atd = pd.to_datetime(df_replen_open.loc[rep_index, 'ts_atd'])
    # Fall back to pickup + 1.5 days when ATD is missing or precedes pickup.
    if not pd.isna(ts_atd) and ts_pu > ts_atd:
        ts_atd = pd.to_datetime(df_replen_open.loc[rep_index, 'ts_pu']) + datetime.timedelta(1.5)
    if pd.isna(ts_atd):
        ts_atd = pd.to_datetime(df_replen_open.loc[rep_index, 'ts_pu']) + datetime.timedelta(1.5)
    window = df_climate_workday[(df_climate_workday['Date'] <= ts_atd)
                                & (df_climate_workday['Date'] >= ts_pu)]
    holiday_count = len(window[window['weekday_cat'] == 'holiday'])
    climate_count = len(window[window['Alarm_info_cat'] == 'Abnormal climate'])
    df_replen_open.loc[rep_index, 'holiday_count'] = _bucket_count(holiday_count, 'hldys')
    df_replen_open.loc[rep_index, 'climate_count'] = _bucket_count(climate_count, 'abnr')
df_replen_2. groupby( [ 'holiday_count' ] ) . agg( { 'lt_pu_atd' : [ 'std' , 'count' , 'mean' , lambda x: x. quantile( 0.5 ) , lambda x: x. quantile( 0.9 ) , lambda x: x. quantile( 0.1 ) ] } ) . reset_index( )
holiday_count lt_pu_atd std count mean <lambda_0> <lambda_1> <lambda_2> 0 ^<= 7 hldys 2.053809 2204 4.793593 4.40 7.54 2.82 1 ^== 0 hldys 2.184254 52896 2.649606 2.12 5.44 0.44 2 ^== 1 hldys 2.411114 28255 2.461888 2.11 4.61 0.47 3 ^== 2 hldys 1.749583 19811 3.293560 2.90 5.13 1.75
# Per mot: boxplot + one-way ANOVA of lt_pu_pod across climate_count buckets.
for mot in df_replen_2['mot'].unique().tolist():
    mot_rows = df_replen_2[df_replen_2['mot'] == mot]
    categories = mot_rows['climate_count'].unique().tolist()
    # BUG FIX: the original selected rows by climate_count only, ignoring the
    # current mot, so every iteration analyzed (and printed) the same data —
    # visible in the identical F statistics of the original output. The
    # analogous weekday loop below does filter by mot.
    data_list = [mot_rows[mot_rows['climate_count'] == cat]['lt_pu_pod']
                 for cat in categories]
    plt.figure(figsize=(25, 6))
    sns.boxplot(data=data_list)
    plt.xlabel('Category')
    plt.ylabel('lt_pu_pod')
    plt.title('{0} Boxplot of lt_pu_pod for different categories'.format(mot))
    plt.xticks(range(len(categories)), categories, rotation=45)
    plt.show()
    f_stat, p_value = f_oneway(*data_list)
    print("F统计量:", f_stat)
    print("p值:", p_value)
F统计量: 168.19664286910245
p值: 8.91244672555718e-109
F统计量: 168.19664286910245
p值: 8.91244672555718e-109
F统计量: 168.19664286910245
p值: 8.91244672555718e-109
import matplotlib.pyplot as plt
import seaborn as sns

# Per mot: boxplot + one-way ANOVA of lt_pu_pod across holiday_count buckets.
for mot in df_replen_2['mot'].unique().tolist():
    mot_rows = df_replen_2[df_replen_2['mot'] == mot]
    categories = mot_rows['holiday_count'].unique().tolist()
    # BUG FIX: the original selected rows by holiday_count only, ignoring the
    # current mot, so each iteration mixed every transport mode into the same
    # sample (the original output printed identical F statistics for AIR and
    # SEA). The weekday loop below shows the intended mot filter.
    data_list = [mot_rows[mot_rows['holiday_count'] == cat]['lt_pu_pod']
                 for cat in categories]
    plt.figure(figsize=(25, 6))
    sns.boxplot(data=data_list)
    plt.xlabel('Category')
    plt.ylabel('lt_pu_pod')
    plt.title('{0} Boxplot of lt_pu_pod for different categories'.format(mot))
    plt.xticks(range(len(categories)), categories, rotation=45)
    plt.show()
    f_stat, p_value = f_oneway(*data_list)
    print("F统计量:", f_stat)
    print("p值:", p_value)
F统计量: 134.89417962378948
p值: 3.1763774706854116e-87
F统计量: 134.89417962378948
p值: 3.1763774706854116e-87
F统计量: 82.45295180623998
p值: 1.6609295478352026e-36
# Join each shipment's pickup date against the climate/workday calendar.
calendar_cols = df_climate_workday[['Date', 'Alarm_info_cat', 'weekday_cat', 'Week', 'weekday']]
df_replen_2['ts_pu_date'] = pd.to_datetime(df_replen_2['ts_pu_date'])
df_replen_open['ts_pu_date'] = pd.to_datetime(df_replen_open['ts_pu_date'])
df_replen_2 = (pd.merge(df_replen_2, calendar_cols,
                        left_on='ts_pu_date', right_on='Date', how='left')
               .drop('Date', axis=1)
               .rename(columns={'天气': 'climate', '分类': 'climate_category'}))
df_replen_open = (pd.merge(df_replen_open, calendar_cols,
                           left_on='ts_pu_date', right_on='Date', how='left')
                  .drop('Date', axis=1)
                  .rename(columns={'天气': 'climate', '分类': 'climate_category'}))
# NOTE(review): the lead times below are computed from df_replen_1 columns and
# assigned to the freshly merged df_replen_2, relying on both frames sharing
# the same RangeIndex — confirm the merge cannot add/drop rows.
df_replen_2['lt_pu_ata'] = (pd.to_datetime(df_replen_1['ts_ata'])
                            - pd.to_datetime(df_replen_1['ts_pu'])).astype('timedelta64[D]').astype(float)
df_replen_2['lt_pu_atd'] = (pd.to_datetime(df_replen_1['ts_atd'])
                            - pd.to_datetime(df_replen_1['ts_pu'])).astype('timedelta64[D]').astype(float)
df_climate_workday
Date Week maximum_temperature minimum_temperature climate wind_direction city is_workday is_holiday holiday_name weekday weekday_cat date_alarm Alarm_info Alarm_info_cat unic_version rank_1 0 2022-01-01 星期六 0℃ -9℃ 多云 西风 2级 上海 0 1 New year 5 holiday NaT None None 2023-05-18 16:25:25 1 1 2022-01-02 星期日 8℃ -4℃ 多云 西风 3级 上海 0 0 6 holiday NaT None None 2023-05-18 16:25:25 1 2 2022-01-03 星期一 13℃ 6℃ 晴 东南风 2级 上海 0 1 New Year shift 0 holiday NaT None None 2023-05-18 16:25:25 1 3 2022-01-04 星期二 13℃ 9℃ 多云 东北风 1级 上海 1 0 1 workday NaT None None 2023-05-18 16:25:25 1
502 rows × 17 columns
# Per mot: boxplot + one-way ANOVA of lt_pu_pod across pickup weekdays.
for mot in df_replen_2['mot'].unique().tolist():
    rows = df_replen_2[(df_replen_2['mot'] == mot)
                       & (~df_replen_2['weekday'].isnull())]
    categories = rows['weekday'].unique().tolist()
    data_list = [rows[rows['weekday'] == cat]['lt_pu_pod']
                 for cat in categories]
    plt.figure(figsize=(25, 6))
    sns.boxplot(data=data_list)
    plt.xlabel('Category')
    plt.ylabel('lt_pu_pod')
    plt.title('{0} Boxplot of lt_pu_pod for different categories'.format(mot))
    # FIX: label ticks with the same null-filtered category list used to build
    # the boxes; the original rebuilt an unfiltered list that could contain
    # NaN and mismatch the number of plotted boxes.
    plt.xticks(range(len(categories)), categories, rotation=45)
    plt.show()
    f_stat, p_value = f_oneway(*data_list)
    print("F统计量:", f_stat)
    print("p值:", p_value)
F统计量: 127.26212162797701
p值: 4.906400437665232e-156
F统计量: 326.9319114756572
p值: 0.0
F统计量: 52.61132203829418
p值: 1.6153144700352586e-42
# Pickup→ATD lead-time summary per (mot, weekday-of-week).
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'Week']).agg(
    {'lt_pu_atd': ['std', 'count', 'mean',
                   lambda x: x.quantile(0.5),
                   lambda x: x.quantile(0.9),
                   lambda x: x.quantile(0.1)]}
).reset_index()
# FIX: the second grouping key is the weekday name ('Week'); the original
# mislabeled that column 'holiday_count'.
group_mot_climate_pu_atd.columns = ['mot', 'week', 'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd
mot holiday_count std count mean quantile_5 quantile_9 quantile_1 0 AIR 星期一 1.547858 16544 1.896216 2.0 4.0 0.0 1 AIR 星期三 1.566505 15121 2.017195 2.0 4.0 0.0 2 AIR 星期二 1.751875 17681 2.023698 2.0 4.0 0.0 3 AIR 星期五 2.986730 12385 2.426968 2.0 6.0 0.0
# Pickup→ATD lead-time summary per (mot, holiday-count bucket).
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'holiday_count']).agg(
    {'lt_pu_atd': ['std', 'count', 'mean',
                   lambda x: x.quantile(0.5),
                   lambda x: x.quantile(0.9),
                   lambda x: x.quantile(0.1)]}
).reset_index()
group_mot_climate_pu_atd.columns = ['mot', 'holiday_count', 'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd
mot holiday_count std count mean quantile_5 quantile_9 quantile_1 0 AIR ^<= 7 hldys 1.945878 1761 4.040318 4.0 7.0 2.0 1 AIR ^== 0 hldys 1.994225 46645 2.012199 1.0 5.0 0.0 2 AIR ^== 1 hldys 2.298573 24303 1.870139 2.0 4.0 0.0 3 AIR ^== 2 hldys 1.453881 17330 2.536411 2.0 4.0 1.0 4 SEA ^<= 7 hldys 2.124920 443 5.316027 6.0 7.0 2.0
# Pickup→ATD lead-time summary per (mot, abnormal-climate-count bucket).
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'climate_count']).agg(
    {'lt_pu_atd': ['std', 'count', 'mean',
                   lambda x: x.quantile(0.5),
                   lambda x: x.quantile(0.9),
                   lambda x: x.quantile(0.1)]}
).reset_index()
group_mot_climate_pu_atd.columns = ['mot', 'climate_count', 'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd
mot climate_count std count mean quantile_5 quantile_9 quantile_1 0 AIR ^<= 7 abnr 1.187565 3495 1.614878 1.0 3.0 0.0 1 AIR ^== 0 abnr 2.271866 62167 2.280551 2.0 5.0 0.0 2 AIR ^== 1 abnr 1.276714 12195 1.778598 2.0 3.0 0.0 3 AIR ^== 2 abnr 1.197433 12182 1.746101 2.0 3.0 0.0
# Pickup→ATD lead-time summary per (mot, climate bucket, holiday bucket),
# shown with the slowest combinations first.
group_mot_climate_pu_atd = df_replen_2.groupby(['mot', 'climate_count', 'holiday_count']).agg(
    {'lt_pu_atd': ['std', 'count', 'mean',
                   lambda x: x.quantile(0.5),
                   lambda x: x.quantile(0.9),
                   lambda x: x.quantile(0.1)]}
).reset_index()
group_mot_climate_pu_atd.columns = ['mot', 'climate_count', 'holiday_count',
                                    'std', 'count', 'mean',
                                    'quantile_5', 'quantile_9', 'quantile_1']
group_mot_climate_pu_atd.sort_values('mean', ascending=False)
mot climate_count holiday_count std count mean quantile_5 quantile_9 quantile_1 17 SEA ^== 0 abnr ^<= 7 hldys 1.830452 272 6.000000 7.0 7.0 2.2
# Derive pickup hour-of-day and day-of-week features on both frames.
for frame in (df_replen_2, df_replen_open):
    frame['ts_pu'] = pd.to_datetime(frame['ts_pu'])
    frame['ts_pu_hour'] = frame['ts_pu'].dt.hour
    frame['dayofweek'] = frame['ts_pu'].dt.dayofweek
# Per geo: correlation heatmap of the lead times vs pickup hour and weekday.
feature_cols = ['lt_pu_pod', 'lt_pu_atd', 'ts_pu_hour', 'dayofweek']
for geo_value in df_replen_2['geo'].unique().tolist():
    geo_rows = df_replen_2[df_replen_2['geo'] == geo_value]
    plt.figure(figsize=(10, 6))
    sns.heatmap(data=geo_rows[feature_cols].corr(),
                cmap='coolwarm', annot=True, fmt=".2f", cbar=True)
    plt.title("Correlation between dayofweek,ts_pu_hour and lt_pu_pod lt_pu_atd")
    plt.xlabel("{0}".format(geo_value))
    plt.ylabel("country")
    plt.xticks(rotation=45)
    plt.show()