python高效使用方法03_pandas中的rolling.mean滚动求均值效率究竟如何？

news2025/7/4 20:42:28

在这里插入图片描述
先上图来说明pandas.rolling(n).mean()滚动求均值的方法效率其实并不是最高的，我自己尝试使用cython把滚动求均值的方法重新编译了一下，发现效率总体上是pandas的三倍以上。
总结：pandas比较合适用于普通的研究分析工作，如果用到追求高效率的生产环境中，需要考虑要不要改写具体的函数。

代码：
效率对比代码：

import numpy as np
import pandas as pd
import time
from pyecharts import options as opts
from pyecharts.charts import Bar
from cal_rolling_mean_by_cython import cal_rolling_mean_by_cython

# n_rows = 100
# n_times = 1000
# np.random.seed(1)
# df = pd.DataFrame({i: np.random.randn(n_rows) for i in ['open', 'high', 'low', 'close', 'volume']})
# df['ma_1'] = df['close'].rolling(20).mean()
# df['ma_2'] = cal_rolling_mean_by_cython(df['close'].to_numpy(), 20)
# df = df.dropna()
# a = list(df['ma_1'])
# b = list(df['ma_2'])
# if not a[0] == b[0]:
#     print(a[0], b[0])
#     print(df[['ma_1', 'ma_2']])
# assert list(df['ma_1']) == list(df['ma_2'])
# 经过验证，发现数据精度存在不同，在无数位小数点之后，存在不一样的情况，如0.22949341595210063 ！= 0.22949341595210065

def test_func_time(n_rows, n_times=1000):
    df = pd.DataFrame({i: np.random.randn(n_rows) for i in ['open', 'high', 'low', 'close', 'volume']})
    # 测试np.where的效率
    begin_time = time.perf_counter()
    for i in range(n_times):
        df['ma_1'] = df['close'].rolling(20).mean()
    end_time = time.perf_counter()
    consume_time_where = end_time - begin_time
    print(f"rolling.mean耗费的时间：{consume_time_where}")
    # 测试np.select的效率
    begin_time = time.perf_counter()
    for i in range(n_times):
        df['ma_2'] = cal_rolling_mean_by_cython(df['close'].to_numpy(), 20)
    end_time = time.perf_counter()
    consume_time_select = end_time - begin_time
    print(f"cal_rolling_mean_by_cythont耗费的时间：{consume_time_select}")
    # assert df['ma_1'].equals(df['ma_2']), "检查错误"
    return [consume_time_where, consume_time_select]


if __name__ == '__main__':
    r1 = test_func_time(n_rows=1000)
    r2 = test_func_time(n_rows=100000)
    r3 = test_func_time(n_rows=10000000)
    c = (
        Bar()
            .add_xaxis(["1千行", "十万行", "一千万行"])
            .add_yaxis("rolling_mean_by_pandas", [r1[0], r2[0], r3[0]])
            .add_yaxis("cal_rolling_mean_by_cython", [r1[1], r2[1], r3[1]])
            .reversal_axis()
            .set_series_opts(label_opts=opts.LabelOpts(position="right"))
            .set_global_opts(title_opts=opts.TitleOpts(title="耗费时间对比"))
            #.render("d:/result/夏普率耗费时间对比.html")
            .render("./rolling_mean_by_pandas和cal_rolling_mean_by_cython耗费时间对比.html")
    )

计算滚动平均的cython代码文件：cal.pyx

#cython: language_level=3
#distutils:language=c
#cython: c_string_type=unicode, c_string_encoding=utf8

import numpy as np
from numpy import NaN as nan
cimport numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(False)
cpdef cal_rolling_mean_by_cython(np.ndarray[np.double_t, ndim=1] arr,int look_back_days):
    cdef int data_len = arr.shape[0]
    cdef np.ndarray[np.double_t, ndim=1] result = np.empty(data_len)
    cdef int i
    cdef int j
    cdef double sum=0.0
    for i in range(look_back_days-1):
        result[i] = nan
    for i in range(look_back_days-1, data_len):
        sum = 0.0
        for j in range(i-look_back_days+1,i+1):
             sum += arr[j]
        result[i] = sum/look_back_days
    return result

#def

setup.py

from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np
import sys


ext = Extension(
    "cal_rolling_mean_by_cython", sources=["cal.pyx"],
    include_dirs=[np.get_include()],
    language='c'
)

setup(name="cal_rolling_mean_by_cython", ext_modules=cythonize([ext]))