1、导包
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.charts import Grid
2、导数据
# Load the raw user-behaviour log; the relative path is resolved against the working directory.
t_f_user = pd.read_csv(filepath_or_buffer="tianchi_fresh_comp_train_user.csv")
tianchi_fresh_comp_train_user_2w,包含如下字段:
字段 字段说明 提取说明
user_id 用户标识 抽样&字段脱敏
item_id 商品标识 字段脱敏
behavior_type 用户对商品的行为类型 包括浏览、收藏、加购物车、购买,对应取值分别是1、2、3、4。
user_geohash 用户位置的空间标识,可以为空 由经纬度通过保密的算法生成
item_category 商品分类标识 字段脱敏
time 行为时间 精确到小时级别
3、数据探索
# Dimensions of the raw frame as (rows, columns).
t_f_user.shape
(23291027, 6)
# Print a concise summary of the frame (column names and dtypes).
t_f_user.info()
# Descriptive statistics using pandas' default column selection.
t_f_user.describe()
查看缺失值
# Number of missing values in each column.
t_f_user.isnull().sum()
查看重复值
# Number of rows that exactly repeat an earlier row (all columns equal).
t_f_user.duplicated().sum()
7827917
4、数据预处理
去重
# Remove exact duplicate rows, keeping the last occurrence of each one.
# NOTE(review): "time" is only hour-precise per the field description, so identical rows
# may be genuine repeated actions -- confirm that dropping them is intended before
# trusting the PV counts computed further down.
t_f_user = t_f_user.drop_duplicates(keep="last")
转换时间类型
# Parse the raw time values; anything that cannot be parsed becomes NaT instead of raising.
parsed_time = pd.to_datetime(t_f_user["time"], errors="coerce")
t_f_user["time"] = parsed_time
5、统计每日PV和UV数据
# Calendar day of each event; every per-day statistic below groups on this column.
t_f_user["date_time"] = t_f_user["time"].dt.date

# behavior_type == 1 marks a browse event; PV and UV are both derived from these rows.
browse_rows = t_f_user[t_f_user["behavior_type"] == 1]
# PV: number of browse events per day.
pv_day = browse_rows.groupby(["date_time"])["behavior_type"].count()
# UV: keep one row per (user, day) pair, then count the rows of each day.
uv_day = (
    browse_rows.drop_duplicates(subset=["user_id", "date_time"])
    .groupby("date_time")["user_id"]
    .count()
)
attr = list(pv_day.index)

# Both series are charted in units of 10,000, rounded to two decimals.
pv_scaled = np.around(pv_day.values / 10000, decimals=2)
uv_scaled = np.around(uv_day.values / 10000, decimals=2)

# Secondary value axis, used by the "uv" series through yaxis_index=1.
uv_axis = opts.AxisOpts(
    name="uv",
    type_="value",
    min_=0,
    max_=1.6,
    interval=0.4,
    axislabel_opts=opts.LabelOpts(formatter="{value}万人"),
)
# Primary value axis for the "pv" series.
pv_axis = opts.AxisOpts(
    name="pv",
    type_="value",
    min_=0,
    max_=100,
    interval=20,
    axislabel_opts=opts.LabelOpts(formatter="{value}万次"),
    axistick_opts=opts.AxisTickOpts(is_show=True),
    splitline_opts=opts.SplitLineOpts(is_show=True),
)
# Category x axis with a shadow-style axis pointer.
day_axis = opts.AxisOpts(
    type_="category",
    axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
)
cross_tooltip = opts.TooltipOpts(
    is_show=True,
    trigger="axis",
    axis_pointer_type="cross",
)

# The builder methods return the chart itself, so calling them one by one is
# equivalent to a single chained expression.
pv = Line(init_opts=opts.InitOpts(width="1000px", height="500px"))
pv.add_xaxis(xaxis_data=attr)
pv.add_yaxis("pv", pv_scaled, label_opts=opts.LabelOpts(is_show=False))
pv.add_yaxis(
    series_name="uv",
    yaxis_index=1,
    y_axis=uv_scaled,
    label_opts=opts.LabelOpts(is_show=False),
)
pv.extend_axis(yaxis=uv_axis)
pv.set_global_opts(
    tooltip_opts=cross_tooltip,
    xaxis_opts=day_axis,
    yaxis_opts=pv_axis,
    title_opts=opts.TitleOpts(title="PV与UV趋势图"),
)
pv.render_notebook()
通过 PV、UV 趋势图可以发现,双十二期间 PV 和 UV 都有明显提升,并在 12 月 12 日达到顶峰。非活动期间 PV 和 UV 的日波动都不大:PV 在 42 万上下小幅波动,UV 在 1.2 万上下小幅波动。
6、pv、uv日差异分析
# Day-over-day change of PV and UV: put both daily series side by side and take the
# first difference, so the first day has no value (NaN).
pv_uv_diff = pd.concat([pv_day, uv_day], join="outer", axis=1)
pv_uv_diff.columns = ["pv_diff", "uv_diff"]
pv_uv_diff = pv_uv_diff.diff()
# Hand pyecharts plain Python lists rather than pandas Index/Series objects.
attr = pv_uv_diff.index.tolist()
pv = pv_uv_diff["pv_diff"]
uv = pv_uv_diff["uv_diff"]

# Main chart: PV change on the primary axis; the secondary axis is declared here
# so that the overlapped UV series below can attach to it.
diff = (
    Line(
        init_opts=opts.InitOpts(
            width="1000px",
            height="500px",
        )
    )
    .add_xaxis(xaxis_data=attr)
    .add_yaxis(
        # Was "新增PPV": typo, the matching axis is named "新增PV".
        "新增PV",
        pv.tolist(),
        label_opts=opts.LabelOpts(is_show=False),
    )
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="新增UV",
            type_="value",
            min_=-2000,
            max_=1600,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )
    )
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(
            is_show=True,
            trigger="axis",
            axis_pointer_type="cross",
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name="新增PV",
            type_="value",
            min_=-350000,
            max_=250000,
            interval=100000,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        title_opts=opts.TitleOpts(title="pv、uv日差异分析"),
    )
)

# Overlay chart: UV change, bound to the secondary axis.
diff2 = (
    Line()
    .add_xaxis(xaxis_data=attr)
    .add_yaxis(
        "新增uv",
        uv.tolist(),
        # The axis index must be an integer; the original passed the string '1'.
        yaxis_index=1,
        label_opts=opts.LabelOpts(is_show=False),
    )
)
t = diff.overlap(diff2)
t.render_notebook()
7、不同时期用户行为分析
def _events_per_day(behavior_code):
    """Count the rows of ``t_f_user`` with the given behavior_type for each date_time."""
    selected = t_f_user.loc[t_f_user["behavior_type"] == behavior_code]
    return selected.groupby("date_time")["behavior_type"].count()


# Per the field description: 2 = favourite, 3 = add to cart, 4 = purchase.
collect = _events_per_day(2)
shapping_cart = _events_per_day(3)
buy = _events_per_day(4)

sns.set()
# Set after sns.set() so the CJK-capable font is not reset by seaborn's defaults.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
fig, ax = plt.subplots(figsize=(10, 5))
# Draw order matches the legend entries below.
for daily_series, colour in ((collect, "g"), (shapping_cart, "r"), (buy, "b")):
    ax.plot(daily_series.index, daily_series.values, color=colour, marker="^")
ax.legend(["收藏", "添加购物车", "购买"])
ax.set_title("不同时期用户行为分析")
plt.yticks(list(range(0, 50001, 10000)))
# One tick per day, rotated so the dates do not overlap.
plt.xticks(collect.index, rotation=90)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.set_ylabel("人数")
plt.show()
8、不同时期的转化率
# Daily browse (1) and purchase (4) event counts.
is_browse = t_f_user["behavior_type"] == 1
is_purchase = t_f_user["behavior_type"] == 4
pv_day = t_f_user[is_browse].groupby(["date_time"])["behavior_type"].count()
buy_day = t_f_user[is_purchase].groupby(["date_time"])["behavior_type"].count()

# Purchases per browse event, rounded to three decimals. The division works on the raw
# value arrays, i.e. it is positional and relies on both series covering the same days.
purchase_counts = buy_day.values
browse_counts = pv_day.values
rate = np.around(purchase_counts / browse_counts, 3)
attr = buy_day.index

# Builder methods return the chart itself, so separate calls equal one chained expression.
b_rate = Line()
b_rate.add_xaxis(xaxis_data=buy_day.index.to_list())
b_rate.add_yaxis(series_name="转化率", y_axis=rate)
b_rate.set_global_opts(title_opts=opts.TitleOpts(title="不同时期的转化率"))
b_rate.render_notebook()
seaborn 绘图
# Same conversion rate as above, but index-aligned and turned into a two-column frame
# ("date_time", "behavior_type") so seaborn can address the columns by name.
rate = np.around(buy_day / pv_day, 3).reset_index()
sns.set()
fig = plt.figure(figsize=(10, 5))
# Set after sns.set() so the CJK-capable font is not reset by seaborn's defaults.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# `marker` is the Matplotlib keyword forwarded to the line; seaborn's `markers` only
# takes effect together with a `style` variable, so the original call drew no markers.
sns.lineplot(data=rate, x="date_time", y="behavior_type", marker="o")
plt.xlabel("日期")
plt.ylabel("转化率")
plt.title("不同时期转化率")
# One tick per day, rotated so the dates do not overlap.
plt.xticks(rate["date_time"], rotation=90)
plt.show()
9、选取活动数据子集和日常数据子集
由于数据中包含双十二大促期间的数据,如果整体分析用户在不同时段的行为,分析结果可能与实际情况差异较大,因此将数据拆分为活动期间和日常期间两个子集,分别进行对比分析。
# "date_time" holds datetime.date objects (it was built with .dt.date), so the promotion
# days have to be dates as well: isin() against plain strings never matches, which left
# active_data empty and daily_data equal to the whole frame.
promo_days = list(pd.to_datetime(["2014-12-11", "2014-12-12", "2014-12-13"]).date)
is_promo_day = t_f_user["date_time"].isin(promo_days)
# Promotion subset (Dec 11-13) and its complement.
active_data = t_f_user[is_promo_day]
daily_data = t_f_user[~is_promo_day]
双十二期间不同时段的用户行为分析
t_f_user["hour"] = t_f_user["time"].dt.hour
# active_data / daily_data were sliced from t_f_user before "hour" existed, so neither
# subset has the column; derive it on both from their own "time" values. Without this,
# every groupby("hour") on the subsets raises KeyError.
active_data = active_data.assign(hour=active_data["time"].dt.hour)
daily_data = daily_data.assign(hour=daily_data["time"].dt.hour)

# Align every hourly series to the full 0-23 range so all plotted arrays have the same
# length as the x axis even if some hour has no events of a given type.
all_hours = list(range(24))


def _active_hourly(behavior_code, column):
    """Count promotion-period rows of one behavior_type per hour, for hours 0-23."""
    selected = active_data[active_data["behavior_type"] == behavior_code]
    return selected.groupby("hour")[column].count().reindex(all_hours, fill_value=0)


active_collect_hour = _active_hourly(2, "behavior_type")
active_shapping_cart_hour = _active_hourly(3, "behavior_type")
active_buy_hour = _active_hourly(4, "behavior_type")
active_pv_hour = _active_hourly(1, "user_id")

# Average per day over the 3 promotion days, rounded to whole events.
active_collect_hour_m = np.round(active_collect_hour.values / 3, 0).tolist()
active_shapping_cart_hour_m = np.round(active_shapping_cart_hour.values / 3, 0).tolist()
active_buy_hour_m = np.round(active_buy_hour.values / 3, 0).tolist()
active_pv_hour_m = np.round(active_pv_hour.values / 3, 0).tolist()
attr = list(active_collect_hour.index)

plt.figure(figsize=(12, 8))
# Upper panel: browse events per hour.
ax1 = plt.subplot(2, 1, 1)
ax1.spines["top"].set_visible(False)
ax1.spines["right"].set_visible(False)
# Draw on the axes object instead of rebinding ax1 to the bar container.
ax1.bar(attr, active_pv_hour_m)
plt.title("双十二期间各时间段PV值")
plt.xticks(all_hours)
# Lower panel: favourite / add-to-cart / purchase events per hour.
ax2 = plt.subplot(2, 1, 2)
ax2.spines["top"].set_visible(False)
ax2.spines["right"].set_visible(False)
ax2.plot(attr, active_collect_hour_m, label="收藏", color="b", marker="^")
ax2.plot(attr, active_shapping_cart_hour_m, label="添加购物车", color="r", marker="^")
ax2.plot(attr, active_buy_hour_m, label="购买", color="g", marker="^")
ax2.legend(loc="best")
# This panel shows promotion-period data; the original title said "正常期间".
plt.title("双十二期间各时间段用户行为")
plt.xticks(all_hours)
plt.show()
日常期间不同时段的用户行为分析
from pyecharts.charts import Bar

# Hours shown on the x axis; every series is aligned to this range so the arrays match
# the axis even if some hour has no events of a given type.
hours = list(range(24))
# Number of distinct non-promotion days. The original divided by 3, the length of the
# promotion period, which does not give a per-day average for this subset. The max()
# guard only protects against an empty subset.
daily_day_count = max(daily_data["date_time"].nunique(), 1)


def _daily_hourly(behavior_code, column):
    """Count non-promotion rows of one behavior_type per hour, for hours 0-23."""
    selected = daily_data[daily_data["behavior_type"] == behavior_code]
    return selected.groupby("hour")[column].count().reindex(hours, fill_value=0)


daily_collect_hour = _daily_hourly(2, "behavior_type")
daily_shapping_cart_hour = _daily_hourly(3, "behavior_type")
daily_buy_hour = _daily_hourly(4, "behavior_type")
daily_pv_hour = _daily_hourly(1, "user_id")

# Per-day averages, rounded to whole events and converted to plain lists for pyecharts.
daily_collect_hour_m = np.round(daily_collect_hour.values / daily_day_count, 0).tolist()
daily_shapping_cart_hour_m = np.round(daily_shapping_cart_hour.values / daily_day_count, 0).tolist()
daily_buy_hour_m = np.round(daily_buy_hour.values / daily_day_count, 0).tolist()
daily_pv_hour_m = np.round(daily_pv_hour.values / daily_day_count, 0).tolist()

# Upper chart: average browse events per hour.
pv_m = (
    Bar()
    .add_xaxis(xaxis_data=hours)
    .add_yaxis(
        "浏览人数",
        daily_pv_hour_m,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="日常PV各时间段数据"),
    )
)
# Lower chart: average add-to-cart / favourite / purchase events per hour. Title and
# legend are pushed to 48% so they sit between the two grid cells.
daily_m = (
    Line(init_opts=opts.InitOpts(width="1000px", height="500px"))
    .add_xaxis(xaxis_data=hours)
    .add_yaxis(
        "加购人数",
        daily_shapping_cart_hour_m,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .add_yaxis(
        "收藏人数",
        daily_collect_hour_m,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .add_yaxis(
        "购买人数",
        daily_buy_hour_m,
        label_opts=opts.LabelOpts(is_show=False),
    )
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=15)),
        title_opts=opts.TitleOpts(title="日均各时段活动用户行为", pos_top="48%"),
        legend_opts=opts.LegendOpts(pos_top="48%"),
    )
)
# Stack both charts vertically in one canvas.
ggrid = (
    Grid()
    .add(pv_m, grid_opts=opts.GridOpts(pos_bottom="60%"))
    .add(daily_m, grid_opts=opts.GridOpts(pos_top="60%"))
)
ggrid.render_notebook()
不同时段的购买率
def _user_hours(frame):
    """Count distinct (user_id, date_time, hour) combinations in *frame* for each hour."""
    unique_rows = frame.drop_duplicates(['user_id', 'date_time', 'hour'])
    return unique_rows.groupby('hour')['user_id'].count()


# Promotion period: user-hours with a purchase (behavior_type 4) over all user-hours.
active_hour_buy_user_num = _user_hours(active_data[active_data["behavior_type"] == 4])
active_hour_user_num = _user_hours(active_data)
active_hour_buy_rate = np.around(
    active_hour_buy_user_num.values / active_hour_user_num.values, 3
)

# Non-promotion period: same ratio.
daily_hour_buy_user_num = _user_hours(daily_data[daily_data["behavior_type"] == 4])
daily_hour_user_num = _user_hours(daily_data)
daily_buy_rate = np.around(
    daily_hour_buy_user_num.values / daily_hour_user_num.values, 3
)

# `attr` is the x-axis list left in scope by an earlier cell.
buy_rate = Line()
buy_rate.add_xaxis(xaxis_data=attr)
buy_rate.add_yaxis("日常购买率", daily_buy_rate)
buy_rate.add_yaxis("活动购买率", active_hour_buy_rate)
buy_rate.set_global_opts(title_opts=opts.TitleOpts(title="不同时段购买率"))
buy_rate.render_notebook()
转化漏斗分析
活动期间的转化漏斗
# Funnel is not among the pyecharts imports at the top of the file; without this import
# the cell fails with NameError.
from pyecharts.charts import Funnel

# Event counts of the promotion period per behavior_type
# (1 = browse, 2 = favourite, 3 = add to cart, 4 = purchase).
active_pv_num = active_data[active_data["behavior_type"] == 1]["user_id"].count()
active_collect_num = active_data[active_data["behavior_type"] == 2]["user_id"].count()
active_cart_num = active_data[active_data["behavior_type"] == 3]["user_id"].count()
active_buy_num = active_data[active_data["behavior_type"] == 4]["user_id"].count()

attr = ["点击", "加入购物车", "收藏", "购买"]
# Each stage as a percentage of browse events, two decimals, as plain Python floats.
values = [
    float(np.around(stage_count / active_pv_num * 100, 2))
    for stage_count in (active_pv_num, active_cart_num, active_collect_num, active_buy_num)
]
data = [list(pair) for pair in zip(attr, values)]
a = (
    Funnel()
    .add(
        series_name="用户行为",
        data_pair=data,
        gap=2,
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b} : {c}%", is_show=True),
        # Was "ourside": typo for the "outside" label position.
        label_opts=opts.LabelOpts(is_show=True, position="outside"),
        itemstyle_opts=opts.ItemStyleOpts(border_color="#fff", border_width=1),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="用户转化漏斗", subtitle="活动"))
)
a.render_notebook()
日常漏斗
# Funnel is not among the pyecharts imports at the top of the file; without this import
# the cell fails with NameError.
from pyecharts.charts import Funnel

# Event counts of the non-promotion period per behavior_type
# (1 = browse, 2 = favourite, 3 = add to cart, 4 = purchase).
daily_pv_num = daily_data[daily_data["behavior_type"] == 1]["user_id"].count()
daily_collect_num = daily_data[daily_data["behavior_type"] == 2]["user_id"].count()
daily_cart_num = daily_data[daily_data["behavior_type"] == 3]["user_id"].count()
daily_buy_num = daily_data[daily_data["behavior_type"] == 4]["user_id"].count()

attr = ["点击", "加入购物车", "收藏", "购买"]
# Each stage as a percentage of browse events, two decimals, as plain Python floats.
values = [
    float(np.around(stage_count / daily_pv_num * 100, 2))
    for stage_count in (daily_pv_num, daily_cart_num, daily_collect_num, daily_buy_num)
]
data = [list(pair) for pair in zip(attr, values)]
a = (
    Funnel()
    .add(
        series_name="用户行为",
        data_pair=data,
        gap=2,
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b} : {c}%", is_show=True),
        # Was "ourside": typo for the "outside" label position.
        label_opts=opts.LabelOpts(is_show=True, position="outside"),
        itemstyle_opts=opts.ItemStyleOpts(border_color="#fff", border_width=1),
    )
    # This funnel shows the non-promotion data; the original subtitle still said "活动".
    .set_global_opts(title_opts=opts.TitleOpts(title="用户转化漏斗", subtitle="日常"))
)
a.render_notebook()
按照“点击-加入购物车-收藏-购买”这一用户行为路径,可以通过优化“点击-加入购物车”这一环节来提升购买转化率,例如对收藏、加购的用户发放优惠券,刺激用户加购和收藏,进而提升其购买欲望。