问题描述
现在我们有一张用户访问区域的记录表,有三个字段:user_id表示用户ID,area表示用户访问的区域,visit_time是访问时间。求用户对某一区域访问的开始和结束时间,结果如右表所示。
分析问题
101访问了上午和晚上分别访问了A,中文访问了B
102 早上9点访问了C,晚上19点也访问了C,总共访问了两次。
我们需要对同一用户访问相同地点的不同次做区分,然后聚合就好了
那我们定一个规则,如果用户换区域或者相邻两条记录间隔超过3小时,那么不算相同访问了。
准备数据
-- 删除表
drop table if exists visit;
-- 建表
CREATE TABLE visit (
user_id bigint,
area string,
report_time string
);
-- 插入数据
INSERT overwrite table visit
VALUES
(101, 'A','2023-01-01 09:00:00'),
(101, 'A','2023-01-01 10:00:00'),
(101, 'B','2023-01-01 14:00:00'),
(101, 'B','2023-01-01 15:00:00'),
(101, 'A','2023-01-01 19:00:00'),
(101, 'A','2023-01-01 20:00:00'),
(102, 'C','2023-01-01 09:00:00'),
(102, 'C','2023-01-01 10:00:00'),
(102, 'C','2023-01-01 11:00:00'),
(102, 'C','2023-01-01 19:00:00'),
(102, 'C','2023-01-01 20:00:00');
-- 查看数据
select * from visit
方案1:适用于换场+长间隔
1、取出每个用户的上一条访问记录的时间地点
select user_id
,report_time,area
,lag(area,1) over( partition by user_id order by report_time) as last_area
,lag(report_time,1) over( partition by user_id order by report_time) as last_time
from visit
2、设置一个标识列,开始为0,其他为0
select t1.*
,case
when last_area is null then 1 -- 没有访问
when last_area !=area then 1 -- 换地方
when (unix_timestamp(report_time)-
unix_timestamp(last_time))/(60*60)>=3 then 1 -- 超过3小时
when last_area = area then 0 end as num
from (
select user_id
,report_time,area
,lag(area,1) over( partition by user_id order by report_time) as last_area
,lag(report_time,1) over( partition by user_id order by report_time) as last_time
from visit
) t1
3、使用sum over累计求和,给每次访问打标,相同访问标记一样
select t1.*
,sum(case
when last_area is null then 1 -- 没有访问
when last_area !=area then 1 -- 换地方
when (unix_timestamp(report_time)-
unix_timestamp(last_time))/(60*60)>=3 then 1 -- 超过3小时
when last_area = area then 0 end)
over(partition by user_id order by report_time) as num
from (
select user_id
,report_time,area
,lag(area,1) over( partition by user_id order by report_time) as last_area
,lag(report_time,1) over( partition by user_id order by report_time) as last_time
from visit
) t1
4、找到每个用户每个区域的首次末次访问时间
with t1 as (
-- 上次访问记录
select user_id
,report_time,area
,lag(area,1) over( partition by user_id order by report_time) as last_area
,lag(report_time,1) over( partition by user_id order by report_time) as last_time
from visit
)
,t2 as (
-- 判断访问
select *
,case
when last_area is null then 1 -- 没有访问
when last_area !=area then 1 -- 换地方
when (unix_timestamp(report_time)-
unix_timestamp(last_time))/(60*60)>=3 then 1 -- 超过3小时
when last_area = area then 0 end as num
from t1
)
,t3 as (
-- sum over 打标
select user_id,area,report_time
,sum(num) over(partition by user_id order by report_time) as num
from t2
)
-- 找到每个用户每个区域的起止时间
select user_id,area
,num
,min(report_time) start_time
,max(report_time) end_time
from t3
group by user_id,area,num
-- order by user_id,start_time
实现了功能,感觉有点繁琐,抛转引玉吧!有好的方案,欢迎评论。
方案2:仅适用于换场
1、标记相同场地的不同访问。按用户编号减去按用户和场地编号,就可以找出换场次序,但是无法标记长时间间隔的,比如102上午、下午分别来了一次。
select user_id
,report_time,area
,row_number()over(partition by user_id order by report_time) as rn1 -- 按用户编码
,row_number()over(partition by user_id,area order by report_time) as rn2 -- 按用户和场地编码
-- rn1 -rn2就可以区分相同场的不同次
,row_number()over(partition by user_id order by report_time)-
row_number()over(partition by user_id,area order by report_time) as rn3
from visit
order by user_id,report_time
2、按照用户、场地和编码聚合,就可以找出起止时间了。A 是正常的,但C是不对的。
select user_id
,area
,rn3
,min(report_time) as start_time
,max(report_time) as end_time
from (
select user_id
,report_time,area
,row_number()over(partition by user_id order by report_time) as rn1 -- 按用户编码
,row_number()over(partition by user_id,area order by report_time) as rn2 -- 按用户和场地编码
-- rn1 -rn2就可以区分相同场的不同次
,row_number()over(partition by user_id order by report_time)-
row_number()over(partition by user_id,area order by report_time) as rn3
from visit
order by user_id,report_time
) t1
group by user_id
,area
,rn3