1,第一题
1)表名:t_patent_detail (专利明细表)
2)表字段:专利号(patent_id)、专利名称(patent_name)、专利类型(patent_type)、申请时间
(aplly_date)、授权时间(authorize_date)、申请人(apply_users)
3)说明:同一个专利,可以有1到多个申请人,多人之间按分号隔开。本表记录数约1万条。例
如:
4)请写出hive查询语句,各类型专利top 10申请人,以及对应的专利申请数
代码演示
--各类型专利top 10申请人,以及对应的专利申请数
select t1.apply_name `申请人`,count(*) `专利申请数`,rank() over(order by
count(*) desc) `专利数排名`
from (
--先将申请人字段炸裂
select d.*, t1.coll `apply_name`
from t_patent_detail d
lateral view explode(split(apply_users,';')) t1 as coll
) t1
group by t1.apply_name; --按照申请人分组
2,第二题
问题: 现有用户登录记录表,请查询出用户连续三天登录的所有数据记录
id dt
1 2024-04-25
1 2024-04-26
1 2024-04-27
1 2024-04-28
1 2024-04-30
1 2024-05-01
1 2024-05-02
1 2024-05-04
1 2024-05-05
2 2024-04-25
2 2024-04-28
2 2024-05-02
2 2024-05-03
2 2024-05-04
期望结果:
代码演示
--建表
create table user_log(
id int,
dt string
)
row format delimited
fields terminated by '\t';
load data local inpath '/home/hivedata/lianxu.txt' into table user_log;
--第一步:求解每行日期后面第三行的日期 lead()和 真正第三天的日期
select*,
lead(dt,2) over(partition by id order by dt) later3dt,
date_add(dt,2) true3dt
from user_log;
--第二步:判断是否连续登录三天
with t as (
select*,
lead(dt,2) over(partition by id order by dt) later3dt,
date_add(dt,2) true3dt
from user_log
) select *,if(later3dt==true3dt,1,0) num from t;
--第三步:筛选出连续登录三天的每个起始日期
with t as (
select*,
lead(dt,2) over(partition by id order by dt) later3dt,
date_add(dt,2) true3dt
from user_log
) ,t1 as (
select *,if(later3dt==true3dt,1,0) num from t
)select * from t1 where num=1;
-- 第四步:表合并求最终结果(和一个三行的表进行合并)(笛卡尔积)
with t as (
select*,
lead(dt,2) over(partition by id order by dt) later3dt,
date_add(dt,2) true3dt
from user_log
) ,t1 as (
select *,if(later3dt==true3dt,1,0) num from t
),t2 as (
select * from t1 where num=1
) select id,dt,list,date_add(dt,d.list) dt2 from t2,(select explode(array(0,1,2)) list) d;