Pandas处理时间序列之预测

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Default size for every figure drawn below: 15 wide by 6 tall.
rcParams['figure.figsize'] = (15, 6)

一、移动平均过程（MA）

移动平均过程（Moving Average process）是一种常见的时间序列模型，用于描述时间序列数据中的平稳随机变动。在移动平均过程中，当前时刻的观测值被认为是过去几个时刻的随机干扰项的加权和，因此它主要用于对数据中的短期波动进行建模。
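q 阶移动平均过程 MA(q) 可表示为：

$$X_t = \mu + \varepsilon_t + \theta_1 \varepsilon_{t-1} + \theta_2 \varepsilon_{t-2} + \cdots + \theta_q \varepsilon_{t-q}$$

其中 $\mu$ 为序列均值，$\varepsilon_t$ 为白噪声干扰项，$\theta_1, \dots, \theta_q$ 为移动平均系数，$q$ 为模型阶数。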

示例如下：

# Load the AirPassengers CSV. header=0 combined with names replaces the file's
# own header row with 'Month' and 'Passengers'; the first column is parsed as
# dates and becomes the index.
air_passengers = pd.read_csv(
    "/home/mw/input/demo2813/AirPassengers.csv",
    header=0,
    names=['Month', 'Passengers'],
    parse_dates=[0],
    index_col=0,
)


# Take the natural log of the passenger counts, then display the rows at
# positions 1 through 9 of the transformed series.
log_air_passengers = np.log(air_passengers['Passengers'])
log_air_passengers.iloc[1:10]

'''
Month
1949-02-01    4.770685
1949-03-01    4.882802
1949-04-01    4.859812
1949-05-01    4.795791
1949-06-01    4.905275
1949-07-01    4.997212
1949-08-01    4.997212
1949-09-01    4.912655
1949-10-01    4.779123
Name: Passengers, dtype: float64
'''


# First-order difference of the log series: each value minus the one before
# it. The first entry has no predecessor and is therefore NaN.
log_air_passengers_diff = log_air_passengers.diff()


# Discard the missing values left by differencing, then display the rows at
# positions 1 through 9 of the cleaned series.
log_air_passengers_diff = log_air_passengers_diff.dropna()
log_air_passengers_diff.iloc[1:10]

'''
Month
1949-03-01    0.112117
1949-04-01   -0.022990
1949-05-01   -0.064022
1949-06-01    0.109484
1949-07-01    0.091937
1949-08-01    0.000000
1949-09-01   -0.084557
1949-10-01   -0.133531
1949-11-01   -0.134733
Name: Passengers, dtype: float64
'''

二、自回归过程（AR）

自回归过程（Autoregressive process）是一种常见的时间序列模型，用于描述时间序列数据中的自相关性。在自回归过程中，当前时刻的观测值被认为是过去几个时刻的观测值的线性组合，因此它主要用于对数据中的长期趋势和自相关性进行建模。
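p 阶自回归过程 AR(p) 可表示为：

$$X_t = c + \phi_1 X_{t-1} + \phi_2 X_{t-2} + \cdots + \phi_p X_{t-p} + \varepsilon_t$$

其中 $c$ 为常数项，$\phi_1, \dots, \phi_p$ 为自回归系数，$\varepsilon_t$ 为白噪声干扰项，$p$ 为模型阶数。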

示例如下：

三、ARIMA模型

定义

ARIMA 模型是一种常用的时间序列分析和预测方法，它结合了自回归（AR）模型、差分（I）和移动平均（MA）模型的特性。ARIMA 模型适用于具有一定程度的趋势和季节性的时间序列数据。

基本思想

ARIMA 模型的基本思想是，通过差分操作使非平稳时间序列变成平稳序列，然后应用自回归和移动平均模型来拟合数据。通过调整 AR、I 和 MA 的阶数，可以有效地捕捉数据中的趋势和季节性，并进行有效的预测。

建模步骤

ARIMA 模型的建模过程通常包括以下步骤：
1.确定时间序列的平稳性：通过观察时间序列图和自相关图等方法来确定是否需要进行差分操作使时间序列平稳化。
2.确定 ARIMA 模型的参数：通过观察自相关图和偏自相关图来确定 AR 和 MA 的阶数，以及可能需要的差分次数。
3.拟合 ARIMA 模型：使用确定的参数对 ARIMA 模型进行拟合，并对模型进行评估。
4.预测未来值：利用拟合好的 ARIMA 模型进行未来值的预测。

注意：ARIMA 模型需要确定以下三个参数：
1.自回归项的数量（p）
2.移动平均项的数量（q）
3.差分次数（d）

在使用ARIMA模型时，经常会用到自相关函数（ACF）和偏自相关函数（PACF）来帮助确定模型的参数。

# Estimate the autocorrelation (ACF) and partial autocorrelation (PACF) of the
# differenced log passenger series, requesting nlags=20 for each.
from statsmodels.tsa.stattools import acf, pacf

diff_values = log_air_passengers_diff.values  # same underlying array feeds both estimators
lag_acf = acf(diff_values, nlags=20)
lag_pacf = pacf(diff_values, nlags=20)

注意查看 ACF 图中曲线首次越过置信区间上限的位置，其对应的滞后阶数可作为 q 的参考值。

# Left-hand panel: ACF values with a dashed zero line and dashed lines at
# +/-1.96/sqrt(N), the approximate 95% confidence bounds (N = number of
# differenced observations).
acf_conf_bound = 1.96 / np.sqrt(len(log_air_passengers_diff))
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0, linestyle='--')
plt.axhline(y=-acf_conf_bound, linestyle='--')
plt.axhline(y=acf_conf_bound, linestyle='--')

# Plot the PACF values with a dashed zero line and dashed lines at
# +/-1.96/sqrt(N), the approximate 95% confidence bounds.
# The PACF goes in the right-hand panel (122); reusing 121 would draw it into
# the same panel as the ACF plot.
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0, linestyle='--')
plt.axhline(y=-1.96 / np.sqrt(len(log_air_passengers_diff)), linestyle='--')
plt.axhline(y=1.96 / np.sqrt(len(log_air_passengers_diff)), linestyle='--')

注意查看 PACF 图中曲线首次越过置信区间上限的位置，其对应的滞后阶数可作为 p 的参考值。

from statsmodels.tsa.arima_model import ARIMA


# AR-only variant: fit ARIMA with order=(2, 1, 0) to the log series, overlay
# the fitted values (red) on the differenced log series, and put the residual
# sum of squares in the plot title.
# NOTE(review): this relies on the legacy statsmodels.tsa.arima_model.ARIMA
# API (fit(disp=...)); confirm the pinned statsmodels version still ships it.
model = ARIMA(log_air_passengers, order=(2, 1, 0))
results_AR = model.fit(disp=-1)
residuals_AR = results_AR.fittedvalues - log_air_passengers_diff
plt.plot(log_air_passengers_diff)  # differenced log series
plt.plot(results_AR.fittedvalues, color='red')  # model fit
plt.title('RSS: %.4f' % sum(residuals_AR ** 2))

'''
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  % freq, ValueWarning)
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:191: FutureWarning: Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.
  start=index[0], end=index[-1], freq=freq)
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  % freq, ValueWarning)
/opt/conda/lib/python3.6/site-packages/pandas/plotting/_converter.py:129: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()
  warnings.warn(msg, FutureWarning)
Text(0.5, 1.0, 'RSS: 1.5023')
'''

# MA-only variant: fit ARIMA with order=(0, 1, 2) to the log series, overlay
# the fitted values (red) on the differenced log series, and put the residual
# sum of squares in the plot title.
model = ARIMA(log_air_passengers, order=(0, 1, 2))
results_MA = model.fit(disp=-1)
residuals_MA = results_MA.fittedvalues - log_air_passengers_diff
plt.plot(log_air_passengers_diff)  # differenced log series
plt.plot(results_MA.fittedvalues, color='red')  # model fit
plt.title('RSS: %.4f' % sum(residuals_MA ** 2))

'''
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  % freq, ValueWarning)
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  % freq, ValueWarning)
Text(0.5, 1.0, 'RSS: 1.4721')
'''

# Combined variant: fit ARIMA with order=(2, 1, 2) to the log series, overlay
# the fitted values (red) on the differenced log series, and put the residual
# sum of squares in the plot title.
model = ARIMA(log_air_passengers, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)
residuals_ARIMA = results_ARIMA.fittedvalues - log_air_passengers_diff
plt.plot(log_air_passengers_diff)  # differenced log series
plt.plot(results_ARIMA.fittedvalues, color='red')  # model fit
plt.title('RSS: %.4f' % sum(residuals_ARIMA ** 2))

'''
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  % freq, ValueWarning)
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  % freq, ValueWarning)
Text(0.5, 1.0, 'RSS: 1.0292')

'''

# Take an independent copy of the ARIMA(2, 1, 2) in-sample fitted values and
# print its first five rows.
predictions_ARIMA_diff = pd.Series(data=results_ARIMA.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head(n=5))

'''
Month
1949-02-01    0.009580
1949-03-01    0.017491
1949-04-01    0.027670
1949-05-01   -0.004521
1949-06-01   -0.023890
dtype: float64
'''


# Running total of the fitted values, i.e. the accumulated change from the
# start of the series; print its first five rows.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print(predictions_ARIMA_diff_cumsum.head(n=5))

'''
Month
1949-02-01    0.009580
1949-03-01    0.027071
1949-04-01    0.054742
1949-05-01    0.050221
1949-06-01    0.026331
dtype: float64
'''


# Rebuild the fitted series on the log scale (the conversion back to the
# original scale happens later via np.exp): start every date at the first
# observed log value, then add the cumulative fitted differences.
# .iloc[0] replaces the deprecated .ix[0], which was removed in pandas 1.0;
# both select the first element by position here.
base_log_value = log_air_passengers.iloc[0]
predictions_ARIMA_log = pd.Series(base_log_value, index=log_air_passengers.index)
# The cumulative sum has no entry for the first date; fill_value=0 keeps that
# date at the base value instead of turning it into NaN.
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA_log.head()

'''
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
Month
1949-01-01    4.718499
1949-02-01    4.728079
1949-03-01    4.745570
1949-04-01    4.773241
1949-05-01    4.768720
dtype: float64
'''


# Undo the log transform with exp to get the fitted values on the original
# passenger scale, then draw the observed data first and the fitted values
# second.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
for curve in (air_passengers, predictions_ARIMA):
    plt.plot(curve)