第21题:找出恶意购买用户
-- Order operation log for question 21.
-- The RegexSerDe pattern has four capture groups separated by whitespace runs,
-- one per column in declaration order.
create table sql1_21
(
    order_id     int,
    user_id      string,
    order_status string,
    operate_time string
)
    row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
        with serdeproperties (
        'input.regex'='(\\d+)\\s+(.+?)\\s+(.+?)\\s+(.+?)'
        );

-- Load the sample rows from the local file system.
load data local inpath '/home/homedata/sql_1/sql1_21.txt'
    into table sql1_21;
order_id user_id order_status operate_time
1101 a 已支付 2023-01-01 10:00:00
1102 a 已取消 2023-01-01 10:10:00
1103 a 待支付 2023-01-01 10:20:00
1104 b 已取消 2023-01-01 10:30:00
1105 a 待确认 2023-01-01 10:50:00
1106 a 已取消 2023-01-01 11:00:00
1107 b 已取消 2023-01-01 11:40:00
1108 b 已取消 2023-01-01 11:50:00
1109 b 已支付 2023-01-01 12:00:00
1110 b 已取消 2023-01-01 12:11:00
1111 c 已取消 2023-01-01 12:20:00
1112 c 已取消 2023-01-01 12:30:00
1113 c 已取消 2023-01-01 12:55:00
1114 c 已取消 2023-01-01 13:00:00
恶意购买的用户定义是:同一个用户在任意半小时内(含边界)取消订单次数>=3次,即被视为恶意买家。——这道题的关键在于 range 窗口的使用:range 按排序列的取值范围划定窗口(这里是以秒为单位的时间跨度),而 rows 按行数划定窗口。
-- Malicious buyers: users with at least 3 cancelled orders inside any
-- 30-minute span (boundaries included).
with cancelled as (
    -- keep only cancellations and convert the timestamp to epoch seconds
    -- so that the range frame below can be expressed as a number of seconds
    select user_id,
           unix_timestamp(operate_time) as operate_ts
    from sql1_21
    where order_status = "已取消"
),
windowed as (
    -- for every cancellation, count the cancellations of the same user in the
    -- preceding 1800 seconds (30 minutes), the current row included
    select user_id,
           count(*) over (
               partition by user_id
               order by operate_ts
               range between 1800 preceding and current row
           ) as cancel_cnt
    from cancelled
)
-- one row per user that reached the threshold at least once
select user_id
from windowed
where cancel_cnt >= 3
group by user_id;
第22题:取每个保单的最大保单版本编号的证件号以及客户名称
-- Policy versions for question 22, stored as comma-delimited text.
create table sql1_22
(
    -- policy number
    x1 string,
    -- customer name
    x2 string,
    -- certificate (ID) number
    x3 string,
    -- policy version number
    x4 int
)
    row format delimited
        fields terminated by ",";

-- Sample rows: three versions of policy 01, two of 02 and one of 03.
insert into sql1_22
values ('01', 'a', '001', 3),
       ('02', 'b', '002', 2),
       ('03', 'c', '003', 1),
       ('01', 'd', '004', 2),
       ('02', 'e', '005', 1),
       ('01', 'f', '006', 1);
-- Independent aggregates under a plain group by cannot guarantee that x2 and x3
-- come from the same row, so rank the versions of each policy and keep the
-- row holding the highest version number.
with ranked as (
    select x1,
           x2,
           x3,
           row_number() over (partition by x1 order by x4 desc) as rn
    from sql1_22
)
select x1,
       x2,
       x3
from ranked
where rn = 1;
第23题:获取班级前3名,以及他们的分差
-- Student scores for question 23.
-- The RegexSerDe pattern has three capture groups separated by whitespace runs,
-- one per column in declaration order.
create table sql1_23
(
    Stu_no int,
    class  string,
    score  int
)
    row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
        with serdeproperties (
        'input.regex'='(\\d+)\\s+(.+?)\\s+(\\d+)'
        );

-- Load the sample rows from the local file system.
load data local inpath '/home/homedata/sql_1/sql1_23.txt'
    into table sql1_23;
Stu_no class score
1 1901 90
2 1901 90
3 1901 83
4 1901 60
5 1902 66
6 1902 23
7 1902 99
8 1902 67
9 1902 87
求出每班前三名(分数相同时不并列),同时求出前三名按名次排序后依次的分差
-- Top three students of every class plus the score gap to the student ranked
-- directly above.
with ranked as (
    select *,
           -- Stu_no breaks ties so students with equal scores receive the same
           -- ranks on every run instead of an arbitrary order
           row_number() over (partition by class order by score desc, Stu_no) rowNumber
    from sql1_23
)
select *,
       case
           -- the first place has nobody above it and reports its own score
           when rowNumber = 1 then score
           else score - lag(score, 1, score) over (partition by class order by rowNumber)
       end diff_score
from ranked
where rowNumber <= 3;
第24题:间隔连续问题
-- Login log for question 24: one row per user id and login date,
-- stored as space-delimited text.
create table sql1_24
(
    id int,
    dt string
)
    row format delimited
        fields terminated by ' ';

-- Load the sample rows from the local file system.
load data local inpath '/home/homedata/sql_1/sql1_24.txt'
    into table sql1_24;
id dt
1001 2021-12-12
1002 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1002 2021-12-16
1001 2021-12-19
1002 2021-12-17
1001 2021-12-20
计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。
按照分组的思想:与上一次登录日期相差不超过 2 天(即最多间隔一天)的记录划分到同一组,然后取每组中最早和最晚的登录日期,两者相减再加 1 得到该组的连续天数,最后对每个用户取最大值即可。
-- Longest login streak per user, where a gap of one missing day does not break
-- the streak (consecutive logins may be at most 2 days apart).
with gaps as (
    -- days elapsed since the previous login of the same user, NULL for the first login
    select id,
           dt,
           datediff(dt, lag(dt, 1, null) over (partition by id order by dt)) as gap_days
    from sql1_24
),
flagged as (
    -- a new streak starts at the first login or after a gap longer than 2 days
    -- NULL has to be tested with IS NULL because an equality test against NULL never holds
    select id,
           dt,
           case when gap_days is null or gap_days > 2 then 1 else 0 end as is_new_streak
    from gaps
),
grouped as (
    -- the running sum of the start flags numbers the streaks of every user
    select id,
           dt,
           sum(is_new_streak) over (partition by id order by dt) as streak_id
    from flagged
),
streaks as (
    -- plain aggregates give the true first and last day of each streak and do not
    -- depend on a window frame the way last_value does
    select id,
           streak_id,
           datediff(max(dt), min(dt)) + 1 as days
    from grouped
    group by id, streak_id
)
select id,
       max(days) as days
from streaks
group by id;
第25题:行列转换
-- Source table for the pivot exercises of question 25.
-- The RegexSerDe pattern has three capture groups separated by whitespace runs,
-- one per column in declaration order.
create table sql1_25
(
    a string,
    b string,
    c int
)
    row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
        with serdeproperties (
        'input.regex'='(.+?)\\s+(.+?)\\s+(\\d+)',
        'output.format.string'='%1$s %2$s %3$s'
        );

-- Load the sample rows from the local file system.
load data local inpath '/home/homedata/sql_1/sql1_25.txt'
    into table sql1_25;

-- Show the loaded rows.
select *
from sql1_25;
2014 B 9
2015 A 8
2014 A 10
2015 B 7
第一题:行转列问题
将上面的数据转为下面这种格式
-- Pivot: one output row per value of a, with the c values of category A and
-- category B summed into separate columns (0 when a category is absent).
select a,
       sum(case when b = "A" then c else 0 end) col_A,
       sum(case when b = "B" then c else 0 end) col_B
from sql1_25
group by a;
问题2:再转回去
-- Materialize the pivoted result so that it can be turned back into rows.
create table sql1_25_2 as
select a,
       sum(case when b = 'A' then c else 0 end) col_A,
       sum(case when b = 'B' then c else 0 end) col_B
from sql1_25
group by a;

-- Unpivot: emit one row per pivoted column and tag it with the category letter.
select a,
       'A' b,
       col_A c
from sql1_25_2
union all
select a,
       'B' b,
       col_B
from sql1_25_2;
问题3:将上面的数据转为下面的这种
创建表格
-- Score sheet for part 3 of question 25.
-- The sample data holds two scores for year 2014 and deptno B.
create table sql1_25_3
(
    year   int,
    deptno string,
    score  string
);

insert into sql1_25_3
values (2014, "B", 9),
       (2015, "A", 8),
       (2014, "A", 10),
       (2015, "B", 7),
       (2014, "B", 6);
两种写法思路是一样的,只是步骤换了一下,实际上就是一种解法不同写法
写法一:
-- Variant one: first spread the score into one column per deptno (NULL when the
-- row belongs to the other deptno), then join the collected values of each year
-- with commas.
with split_scores as (
    select year a,
           case when deptno = "A" then score end col_A,
           case when deptno = "B" then score end col_B
    from sql1_25_3
)
select a,
       concat_ws(",", collect_list(col_A)) col_A,
       concat_ws(",", collect_list(col_B)) col_B
from split_scores
group by a;
写法二:
-- Variant two: first join the scores of every (year, deptno) pair with commas,
-- then spread the joined strings into one column per deptno.
with joined_scores as (
    select year a,
           deptno,
           concat_ws(',', collect_list(score)) score_list
    from sql1_25_3
    group by year, deptno
)
select a,
       concat_ws(',', collect_list(case when deptno = 'A' then score_list end)) col_A,
       concat_ws(',', collect_list(case when deptno = 'B' then score_list end)) col_B
from joined_scores
group by a;