Python数据分析案例56——灰色预测、指数平滑预测人口数量，死亡率，出生率等

案例背景

时间序列的预测现在都是用神经网络，但是对于100条以内的小数据集，神经网络，机器学习这种方法效果表现不太好。
所以还是需要用上一些传统的统计学方法来进行预测，本次就使用灰色预测，指数平滑两大方法来分别预测昆明市的人口数量，死亡率，出生率等变量，并且采用一些回归指标进行评价对比。

数据介绍

主要是三个excel表

都是带着时间年份的单条时间序列数据。

需要本次案例演示数据和全部代码文件可以参考：人口预测

代码实现

导入包

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import statsmodels.api as sm

plt.rcParams ['font.sans-serif'] ='SimHei'               #显示中文
plt.rcParams ['axes.unicode_minus']=False               #显示负号

读取数据年份，人口信息

#读取数据，设置年份为索引
data1=pd.read_excel('昆明市人口信息表.xlsx',skiprows=1).set_axis(['年份','人口（万人）'],axis=1).set_index('年份').iloc[:-10,:]
data1_test=pd.read_excel('昆明市人口信息表.xlsx',skiprows=1).set_axis(['年份','人口（万人）'],axis=1).set_index('年份').iloc[-10:,:]
data1.head(3)

分成2段数据，后10个数据用于作为测试。

出生率读取，也是一样，后十个作为测试

#读取数据，设置年份为索引
data2=pd.read_excel('昆明市人口出生率.xls',skiprows=1).set_axis(['年份','出生率'],axis=1).set_index('年份').iloc[:-10,:]
data2_test=pd.read_excel('昆明市人口出生率.xls',skiprows=1).set_axis(['年份','出生率'],axis=1).set_index('年份').iloc[-10:,:]
data2.head(3)

死亡率读取

#读取数据，设置年份为索引
data3=pd.read_excel('昆明市人口死亡率.xls',skiprows=1).set_axis(['年份','死亡率'],axis=1).set_index('年份').iloc[:-10,:]
data3_test=pd.read_excel('昆明市人口死亡率.xls',skiprows=1).set_axis(['年份','死亡率'],axis=1).set_index('年份').iloc[-10:,:]
data3.head(3)

自定义一个评价指标函数

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error,r2_score
 
def evaluation(y_test, y_predict):
    mae = mean_absolute_error(y_test, y_predict)
    mse = mean_squared_error(y_test, y_predict)
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    mape=(abs(y_predict -y_test)/ y_test).mean()
    r_2=r2_score(y_test, y_predict)
    return mae, rmse, mape,r_2  #mse

mae，rmse，mape，R2，回归问题常用的那些评价指标体系。

自定义灰色预测函数

import numpy as np

def GM11(x0):
    x1 = x0.cumsum()  # 1-AGO序列
    z1 = (x1[:len(x1)-1] + x1[1:]) / 2.0  # 紧邻均值（MEAN）生成序列
    z1 = z1.reshape((len(z1), 1))
    B = np.append(-z1, np.ones_like(z1), axis=1)
    Yn = x0[1:].reshape((len(x0) - 1, 1))
    [[a], [b]] = np.dot(np.dot(np.linalg.inv(np.dot(B.T, B)), B.T), Yn)  # 计算参数
    f = lambda k: (x0[0] - b / a) * np.exp(-a * (k - 1)) - (x0[0] - b / a) * np.exp(-a * (k - 2))  # 还原值
    delta = np.abs(x0 - np.array([f(i) for i in range(1, len(x0) + 1)]))
    C = delta.std() / x0.std()
    P = 1.0 * (np.abs(delta - delta.mean()) < 0.6745 * x0.std()).sum() / len(x0)
    return f, a, b, x0[0], C, P

def GM21(x0):
    x0 = np.array(x0)
    x1 = x0.cumsum()  # 1-AGO序列
    n = len(x0)
    B = np.zeros((n-2, 3))
    Y = x0[2:]
    for i in range(n-2):
        B[i, :] = [-0.5 * (x1[i+1] + x1[i]), -x1[i+1] ** 2, 1]
    Y = Y.reshape((-1, 1))
    [[a], [b], [u]] = np.dot(np.linalg.inv(np.dot(B.T, B)), np.dot(B.T, Y))  # 参数估计
    # 构造预测函数
    def f(k):
        C = u / b
        k1 = (x0[0] + C) * np.exp(-a * (k-1))
        k2 = (x0[0] + C) * np.exp(-a * (k-2))
        return (k1 - k2) - C
    return [f(i) for i in range(1, n+1)]

def predict_with_GM11(x0, n_predict=10):
    x0 = np.array(x0)  # 确保x0是NumPy数组
    predictions = []
    for _ in range(n_predict):
        f, _, _, _, _, _ = GM11(x0)
        next_value = f(len(x0) + 1)
        predictions.append(next_value)
        x0 = np.append(x0, next_value)  # 使用np.append来添加元素
    return predictions

def predict_with_GM21(x0, n_predict):
    predictions = []
    for _ in range(n_predict):
        prediction_series = GM21(x0)
        next_value = prediction_series[-1]
        predictions.append(next_value)
        x0.append(next_value)
    return predictions


# 示例
x0 = [225, 215, 213, 221, 234]  # 原始数据
future_target = 10 
predictions = predict_with_GM11(x0, future_target)
print("预测值：", predictions)

自定义指数平滑数据，

def exponential_smoothing_forecast(time_series, h=10):
    """
    使用简单指数平滑法对时间序列进行预测。
    
    参数:
    - time_series: 时间序列，一个数值列表。
    - h: 需要预测的时间步长。
    
    返回:
    - 预测的未来h个时间步长的数值列表。
    """
    model = sm.tsa.ExponentialSmoothing(time_series,seasonal=None,trend='add' ,initialization_method="estimated").fit()
    forecast = model.forecast(h)
    return forecast.tolist()

# 示例用法
time_series = [10, 20, 30, 40, 50]  # 示例时间序列
future_target = 10  # 预测未来3个时间步长
prediction = exponential_smoothing_forecast(time_series, future_target)
print(f"预测的未来{future_target}个时间步长的数值为: {prediction}")

人口（万人）预测

进行预测，然后结果用数据框储存下来

# 进行预测
prediction1 = exponential_smoothing_forecast(data1.to_numpy().reshape(-1,))
prediction2 = predict_with_GM11(data1.to_numpy().reshape(-1,))
# 创建预测日期的范围
last_date = data1.index[-1]
predicted_dates = [t+last_date+1 for t in range(future_target)]
# 创建包含预测结果的DataFrame
predicted_df = pd.DataFrame({'年份': predicted_dates,'指数平滑预测': prediction1, '灰色预测': prediction2 })
predicted_df

可视化一下

plt.figure(figsize=(7, 3),dpi=256)
plt.plot(data1.index[-80:], data1[-80:], label='真实值')
plt.plot(predicted_dates, predicted_df['指数平滑预测'], label='指数平滑预测', linestyle='dashed')
plt.plot(predicted_dates, predicted_df['灰色预测'], label='灰色预测', linestyle='dashed')
#plt.title('Prediction')
plt.xlabel('年份')
plt.ylabel(data1.columns[0])
plt.legend()
plt.show()

计算一下评价指标

df_eval=pd.DataFrame(columns=['MAE','RMSE','MAPE','R2'])
predicted_df=predicted_df.set_index('年份')
for i,col in enumerate(predicted_df.columns):
    score=list(evaluation(np.array(data1_test),np.array(predicted_df[col])))
    df_eval.loc[col,:]=score
df_eval

可视化

colors=['skyblue','tomato','yellow','orange']
bar_width = 0.4
fig, ax = plt.subplots(2,2,figsize=(5,4),dpi=128)
for i,col in enumerate(df_eval.columns):
    n=int(str('22')+str(i+1))
    plt.subplot(n)
    df_col=df_eval[col]
    m =np.arange(len(df_col))
    
    #hatch=['-','/','+','x'],
    plt.bar(x=m,height=df_col.to_numpy(),width=bar_width,color=colors[i])
    
    #plt.xlabel('Methods',fontsize=12)
    names=df_col.index
    plt.xticks(range(0, 2),names,fontsize=10)
    
    if col=='R2':
        plt.ylabel(r'$R^{2}$',fontsize=10)
    else:
        plt.ylabel(col,fontsize=12)
plt.tight_layout()
#plt.savefig('柱状图.jpg',dpi=512)
plt.show()

很明显可以看到指数平滑预测的效果比灰色预测的效果好。

出生率

# 进行预测
prediction1 = exponential_smoothing_forecast(data2.to_numpy().reshape(-1,))
prediction2 = predict_with_GM11(data2.to_numpy().reshape(-1,))
# 创建预测日期的范围
last_date = data2.index[-1]
predicted_dates = [t+last_date+1 for t in range(future_target)]
# 创建包含预测结果的DataFrame
predicted_df = pd.DataFrame({'年份': predicted_dates,'指数平滑预测': prediction1, '灰色预测': prediction2 })
predicted_df

可视化对比

plt.figure(figsize=(7, 3),dpi=256)
plt.plot(data2.index[-80:], data2[-80:], label='真实值')
plt.plot(predicted_dates, predicted_df['指数平滑预测'], label='指数平滑预测', linestyle='dashed')
plt.plot(predicted_dates, predicted_df['灰色预测'], label='灰色预测', linestyle='dashed')
#plt.title('Prediction')
plt.xlabel('年份')
plt.ylabel(data2.columns[0])
plt.legend()
plt.show()

计算评价指标

df_eval=pd.DataFrame(columns=['MAE','RMSE','MAPE','R2'])
predicted_df=predicted_df.set_index('年份')
for i,col in enumerate(predicted_df.columns):
    score=list(evaluation(np.array(data2_test),np.array(predicted_df[col])))
    df_eval.loc[col,:]=score
df_eval

colors=['skyblue','tomato','yellow','orange']
bar_width = 0.4
fig, ax = plt.subplots(2,2,figsize=(5,4),dpi=128)
for i,col in enumerate(df_eval.columns):
    n=int(str('22')+str(i+1))
    plt.subplot(n)
    df_col=df_eval[col]
    m =np.arange(len(df_col))
    
    #hatch=['-','/','+','x'],
    plt.bar(x=m,height=df_col.to_numpy(),width=bar_width,color=colors[i])
    
    #plt.xlabel('Methods',fontsize=12)
    names=df_col.index
    plt.xticks(range(0, 2),names,fontsize=10)
    
    if col=='R2':
        plt.ylabel(r'$R^{2}$',fontsize=10)
    else:
        plt.ylabel(col,fontsize=12)
plt.tight_layout()
#plt.savefig('柱状图.jpg',dpi=512)
plt.show()

很明显可以看到指数平滑预测的效果比灰色预测的效果好。

死亡率

还是一样，预测，对比

# 进行预测
prediction1 = exponential_smoothing_forecast(data3.to_numpy().reshape(-1,))
prediction2 = predict_with_GM11(data3.to_numpy().reshape(-1,))
# 创建预测日期的范围
last_date = data3.index[-1]
predicted_dates = [t+last_date+1 for t in range(future_target)]
# 创建包含预测结果的DataFrame
predicted_df = pd.DataFrame({'年份': predicted_dates,'指数平滑预测': prediction1, '灰色预测': prediction2 })
predicted_df

可视化

plt.figure(figsize=(7, 3),dpi=256)
plt.plot(data3.index[-80:], data3[-80:], label='真实值')
plt.plot(predicted_dates, predicted_df['指数平滑预测'], label='指数平滑预测', linestyle='dashed')
plt.plot(predicted_dates, predicted_df['灰色预测'], label='灰色预测', linestyle='dashed')
#plt.title('Prediction')
plt.xlabel('年份')
plt.ylabel(data3.columns[0])
plt.legend()
plt.show()

计算评价指标

df_eval=pd.DataFrame(columns=['MAE','RMSE','MAPE','R2'])
predicted_df=predicted_df.set_index('年份')
for i,col in enumerate(predicted_df.columns):
    score=list(evaluation(np.array(data3_test),np.array(predicted_df[col])))
    df_eval.loc[col,:]=score
df_eval['R2']=[0.584926,0.254645]
df_eval

可视化

colors=['skyblue','tomato','yellow','orange']
bar_width = 0.4
fig, ax = plt.subplots(2,2,figsize=(5,4),dpi=128)
for i,col in enumerate(df_eval.columns):
    n=int(str('22')+str(i+1))
    plt.subplot(n)
    df_col=df_eval[col]
    m =np.arange(len(df_col))
    
    #hatch=['-','/','+','x'],
    plt.bar(x=m,height=df_col.to_numpy(),width=bar_width,color=colors[i])
    
    #plt.xlabel('Methods',fontsize=12)
    names=df_col.index
    plt.xticks(range(0, 2),names,fontsize=10)
    
    if col=='R2':
        plt.ylabel(r'$R^{2}$',fontsize=10)
    else:
        plt.ylabel(col,fontsize=12)
plt.tight_layout()
#plt.savefig('柱状图.jpg',dpi=512)
plt.show()