import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
from pylab import mpl
mpl. rcParams[ "font.sans-serif" ] = [ "HarmonyOS Sans SC" ]
% matplotlib inline
读取数据
data = pd. read_excel( "气温历史数据.xlsx" )
data. head( )
城市 日期 当日最高温 当日最低温 当日平均温 0 北京 2023-07-02 38.0 24.0 31.0 1 北京 2023-07-01 39.0 24.0 32.0 2 北京 2023-06-30 39.0 25.0 32.0 3 北京 2023-06-29 36.0 22.0 29.0 4 北京 2023-06-28 27.0 20.0 23.0
简单统计分析
data. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27900 entries, 0 to 27899
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 城市 27900 non-null object
1 日期 27900 non-null datetime64[ns]
2 当日最高温 10032 non-null float64
3 当日最低温 10032 non-null float64
4 当日平均温 27900 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 1.1+ MB
data. groupby( by= [ "城市" ] ) . apply ( lambda e: e. sort_values( "日期" ) )
城市 日期 当日最高温 当日最低温 当日平均温 城市 上饶 24609 上饶 2019-01-01 NaN NaN 2.5 24608 上饶 2019-01-02 NaN NaN 2.5 24607 上饶 2019-01-03 NaN NaN 3.8 24606 上饶 2019-01-04 NaN NaN 6.4 24605 上饶 2019-01-05 NaN NaN 6.3 ... ... ... ... ... ... ... 青岛 6170 青岛 2023-06-28 28.0 23.0 25.0 6169 青岛 2023-06-29 26.0 22.0 24.0 6168 青岛 2023-06-30 28.0 21.0 24.0 6167 青岛 2023-07-01 27.0 22.0 23.0 6166 青岛 2023-07-02 25.0 23.0 24.0
27900 rows × 5 columns
date_range = pd. date_range( start= data[ "日期" ] . min ( ) , end= data[ "日期" ] . max ( ) , freq= 'D' )
data[ "日期" ] . isin( date_range) . value_counts( )
True 27900
Name: 日期, dtype: int64
n_city = data[ "城市" ] . nunique( )
fig, ax = plt. subplots( 2 , n_city, figsize= ( 49 , 4 ) )
for i, city in enumerate ( data[ "城市" ] . unique( ) ) :
city_data = data[ data[ "城市" ] == city] . sort_values( "日期" )
ax[ 0 ] [ i] . boxplot( [ city_data[ "当日最低温" ] , city_data[ "当日平均温" ] , city_data[ "当日最高温" ] ] ,
medianprops= { 'color' : 'red' , 'linewidth' : '1.5' } ,
showmeans= True ,
meanline= True ,
flierprops= { "marker" : "o" , "markerfacecolor" : "red" , "markersize" : 10 } ,
labels= [ "dMinT" , "dMeanT" , "dMaxT" ]
)
date_interval = city_data[ "日期" ] . max ( ) - city_data[ "日期" ] . min ( )
date_interval = str ( date_interval) . split( " " ) [ 0 ]
ax[ 0 ] [ i] . set_title( str ( city) + " " + date_interval+ "days" )
plt. figure( figsize= ( 12 , 5 ) )
plt. scatter( data[ "日期" ] , data[ "当日最低温" ] )
plt. figure( figsize= ( 12 , 5 ) )
plt. scatter( data[ "日期" ] , data[ "当日平均温" ] )
plt. figure( figsize= ( 12 , 5 ) )
plt. scatter( data[ "日期" ] , data[ "当日最高温" ] )
data[ data[ "城市" ] == "长沙" ] . sort_values( "日期" ) . to_excel( "长沙.xlsx" )
data[ data[ "城市" ] == "南京" ] . sort_values( "日期" ) . to_excel( "南京.xlsx" )
data. isnull( ) . sum ( )
城市 0
日期 0
当日最高温 17868
当日最低温 17868
当日平均温 0
dtype: int64
异常值处理
q3 = data[ "当日平均温" ] . quantile( 0.75 )
q2 = data[ "当日平均温" ] . median( )
q1 = data[ "当日最高温" ] . quantile( 0.25 )
iqr = q3 - q1
ub = q3 + 1.5 * iqr
lb = q1 - 1.5 * iqr
data. loc[ data[ "当日平均温" ] > ub, [ "当日平均温" ] ] = q2
data. loc[ data[ "当日平均温" ] < lb, [ "当日平均温" ] ] = lb
plt. figure( figsize= ( 12 , 5 ) )
plt. scatter( data[ "日期" ] , data[ "当日平均温" ] )
for i, city in enumerate ( data[ "城市" ] . unique( ) ) :
city_data = data[ data[ "城市" ] == city] . sort_values( "日期" )
ax[ 1 ] [ i] . boxplot( [ city_data[ "当日最低温" ] , city_data[ "当日平均温" ] , city_data[ "当日最高温" ] ] ,
medianprops= { 'color' : 'red' , 'linewidth' : '1.5' } ,
showmeans= True ,
meanline= True ,
flierprops= { "marker" : "o" , "markerfacecolor" : "red" , "markersize" : 10 } ,
labels= [ "dMinT" , "dMeanT" , "dMaxT" ]
)
date_interval = city_data[ "日期" ] . max ( ) - city_data[ "日期" ] . min ( )
date_interval = str ( date_interval) . split( " " ) [ 0 ]
ax[ 1 ] [ i] . set_title( str ( city) + " " + date_interval+ "days" )
fig. tight_layout( pad= 0.4 , w_pad= 0 , h_pad= 0 )
fig. savefig( "城市.svg" , bbox_inches= "tight" )
fig