HIve数仓新零售项目
注:大家觉得博客好的话,别忘了点赞收藏呀,本人每周都会更新关于人工智能和大数据相关的内容,内容多为原创,Python Java Scala SQL 代码,CV NLP 推荐系统等,Spark Flink Kafka Hbase Hive Flume等等~写的都是纯干货,各种顶会的论文解读,一起进步。
今天继续和大家分享一下HIve数仓新零售项目
#博学谷IT学习技术支持
文章目录
- HIve数仓新零售项目
- 前言
- 一、Full join模型思路
- 二、商品主题
- 1.确定目标表
- 2.实现代码
- 总结
前言
这是一个线下真实HIve数仓的一个搭建项目,还是比较复杂的,主要和大家一起分享一下整个HIve数仓的思路。
整个项目分为:
1.ODS层
2.DWD层
3.DWB层
4.DWS层
5.DM层
6.RPT层
每一层都有每一层的知识点。我会和大家分享从数据源MySQL开始,如何搭建整个完整的项目。
一、Full join模型思路
- 先统计各自结果小表数据
- 将结果小表数据进行full join
- 通过coalesce(t1.dt,t2.dt,t3.dt) 从full join的结果表中进行数据的抽取
- 去重,将日期和goods_id重复的数据去掉
最后通过group by + sum 对数据进行去重即可
实现低维度,多指标的模型
二、商品主题
上次和大家介绍了Grouping sets 模型,主要适合于多维度,多指标的情况。今天通过Full join方法实现低维度,多指标的情况。
- 维度
日期(day)+商品
- 指标
下单次数、下单件数、下单金额、被支付次数、被支付件数、被支付金额、被退款次数、被退款件数、被退款金额、被加入购物车次数、被加入购物车件数、被收藏次数、好评数、中评数、差评数
1.确定目标表
create table yp_dws.dws_sku_daycount
(
-- 维度字段
dt STRING,
sku_id string comment 'sku_id',
sku_name string comment '商品名称',
-- 指标
order_count bigint comment '被下单次数',
order_num bigint comment '被下单件数',
order_amount decimal(38,2) comment '被下单金额',
payment_count bigint comment '被支付次数',
payment_num bigint comment '被支付件数',
payment_amount decimal(38,2) comment '被支付金额',
refund_count bigint comment '被退款次数',
refund_num bigint comment '被退款件数',
refund_amount decimal(38,2) comment '被退款金额',
cart_count bigint comment '被加入购物车次数',
cart_num bigint comment '被加入购物车件数',
favor_count bigint comment '被收藏次数',
evaluation_good_count bigint comment '好评数',
evaluation_mid_count bigint comment '中评数',
evaluation_bad_count bigint comment '差评数'
) COMMENT '每日商品行为'
--PARTITIONED BY(dt STRING)
ROW format delimited fields terminated BY '\t'
stored AS orc tblproperties ('orc.compress' = 'SNAPPY');
2.实现代码
with t0 as (
select
dt,
goods_id as sku_id,
goods_name as sku_name,
order_id,
buy_num,
total_price,
order_state,
refund_id,
evaluation_id,
geval_scores,
row_number() over (partition by order_id,goods_id) as rk
from yp_dwb.dwb_order_detail
),
-- 被下单次数,被下单件数,被下单金额
t1 as (
select
dt,
sku_id,
sku_name,
count(order_id) as order_count, -- 被下单次数
sum(buy_num) as order_num, -- 被下单件数
sum(total_price) as order_amount -- 被下单金额
from t0
where rk = 1
group by dt,sku_id,sku_name
),
-- 被支付次数,被支付件数,被支付金额
t2 as (
select
dt,
sku_id,
sku_name,
count(order_id) as payment_count, -- 被支付次数
sum(buy_num) as payment_num, -- 被支付件数
sum(total_price) as payment_amount -- 被支付金额
from t0
where rk = 1 and order_state not in (1,7)
group by dt,sku_id,sku_name
),
-- 被退款次数,被退款件数,被退款金额
t3 as(
select
dt,
sku_id,
sku_name,
count(order_id) as refund_count, -- 被退款次数
sum(buy_num) as refund_num, -- 被退款件数
sum(total_price) as refund_amount -- 被退款金额
from t0
where rk = 1 and refund_id is not null
group by dt,sku_id,sku_name
),
-- 被加入购物车次数,被加入购物车件数
-- 通过分析我们发现,之前的DWB层中并没有关联和购物车相关联的表,我们需要去跨层去DWD层去访问数据
-- 通过分析发现fact_shop_cart表没有商品名,所以需要和dim_goods商品表进行关联,获取商品名
t4 as (
select
substring(sc.create_time,1,10) as dt,
sc.goods_id as sku_id,
g.goods_name as sku_name,
count(sc.id) as cart_count, -- 被加入购物车次数
sum(sc.buy_num) as cart_num -- 被加入购物车件数
from yp_dwd.fact_shop_cart sc left join yp_dwd.dim_goods g on sc.goods_id = g.id
where sc.end_date = '9999-99-99'
group by substring(sc.create_time,1,10),goods_id,goods_name
),
-- 被收藏次数
t5 as (
select
substring(gc.create_time,1,10) as dt,
gc.goods_id as sku_id,
g.goods_name as sku_name,
count(gc.id) as favor_count -- 被收藏次数
from yp_dwd.fact_goods_collect gc left join yp_dwd.dim_goods g on gc.goods_id = g.id
where gc.end_date = '9999-99-99'
group by substring(gc.create_time,1,10),gc.goods_id,g.goods_name
),
-- 好评数 中评数,差评数
t6 as (
select
dt,
sku_id,
sku_name,
count(if(geval_scores>=9,evaluation_id,null)) as evaluation_good_count, -- 好评数
count(if(geval_scores<9 and geval_scores > 6,evaluation_id,null)) as evaluation_mid_count, -- 中评数
count(if(geval_scores <= 6,evaluation_id,null)) as evaluation_bad_count -- 差评数
from t0
where rk = 1 and evaluation_id is not null
group by dt,sku_id,sku_name
),
t7 as (
select
coalesce(t1.dt,t2.dt,t3.dt,t4.dt,t5.dt,t6.dt) as dt,
coalesce(t1.sku_id,t2.sku_id,t3.sku_id,t4.sku_id,t5.sku_id,t6.sku_id) as sku_id,
coalesce(t1.sku_name,t2.sku_name,t3.sku_name,t4.sku_name,t5.sku_name,t6.sku_name) as sku_name,
coalesce(t1.order_count,0) as order_count,
coalesce(t1.order_num,0) as order_num,
coalesce(t1.order_amount,0) as order_amount,
coalesce(t2.payment_count,0) as payment_count,
coalesce(t2.payment_num,0) as payment_num,
coalesce(t2.payment_amount,0) as payment_amount,
coalesce(t3.refund_count,0) as refund_count,
coalesce(t3.refund_num,0) as refund_num,
coalesce(t3.refund_amount,0) as refund_amount,
coalesce(t4.cart_count,0) as cart_count,
coalesce(t4.cart_num,0) as cart_num,
coalesce(t5.favor_count,0) as favor_count,
coalesce(t6.evaluation_good_count,0) as evaluation_good_count,
coalesce(t6.evaluation_mid_count,0) as evaluation_mid_count,
coalesce(t6.evaluation_bad_count,0) as evaluation_bad_count
from t1
full join t2 on t1.dt = t2.dt and t1.sku_id = t2.sku_id
full join t3 on t2.dt = t3.dt and t2.sku_id = t3.sku_id
full join t4 on t3.dt = t4.dt and t3.sku_id = t4.sku_id
full join t5 on t4.dt = t5.dt and t4.sku_id = t5.sku_id
full join t6 on t5.dt = t6.dt and t5.sku_id = t6.sku_id
)
-- 对最终的结果进行分组去重
select
dt,
sku_id,
sku_name,
sum(order_count) as order_count,
sum(order_num) as order_num,
sum(order_amount) as order_amount,
sum(payment_count) as payment_count,
sum(payment_num) as payment_num,
sum(payment_amount) as payment_amount,
sum(refund_count) as refund_count,
sum(refund_num) as refund_num,
sum(refund_amount) as refund_amount,
sum(cart_count) as cart_count,
sum(cart_num) as cart_num,
sum(favor_count) as favor_count,
sum(evaluation_good_count) as evaluation_good_count,
sum(evaluation_mid_count) as evaluation_mid_count,
sum(evaluation_bad_count) as evaluation_bad_count
from t7
group by dt,sku_id,sku_name
总结
这里介绍了HIve数仓新零售项目DWS层的构建(Full join)模型,主要针对低维度,多指标的情况。Full join模型的主要思路为
- 用with语句将dwb_order_detail表关键字段提取
- 先统计6张结果小表数据
- 将6张结果小表数据进行full join
- 从full join的结果表中进行数据的抽取
- 去重,将日期和goods_id重复的数据去掉