05 窗口函数
专用窗口函数
SQL136 每类试卷得分前3名
找到每类试卷得分的前3名,如果两人最大分数相同,选择最小分数大者,如果还相同,选择uid大者
SELECT tag,uid,ranking
FROM
(
SELECT
tag,
ER.uid,
ROW_NUMBER() OVER (PARTITION BY tag ORDER BY tag, MAX(score) DESC, MIN(score) DESC, ER.uid DESC) AS ranking
FROM exam_record AS ER JOIN examination_info AS EI USING(exam_id)
GROUP BY tag,ER.uid
) AS H
WHERE ranking<=3
SQL137 第二快/慢用时之差大于试卷时长一半的试卷
找到第二快和第二慢用时之差大于试卷时长的一半的试卷信息,按试卷ID降序排序
SELECT exam_id,duration,release_time
FROM
(
SELECT
exam_id,
duration,
release_time,
SUM(
CASE
WHEN rk_desc=2 THEN time_diff
WHEN rk_asc=2 THEN -time_diff
ELSE 0
END
) AS sum_time #第二名与倒数第二名时间差
FROM
(
SELECT
ER.exam_id,
timestampdiff(minute,start_time,submit_time) AS time_diff,
duration,
release_time,
ROW_NUMBER()OVER(PARTITION BY exam_id ORDER BY timestampdiff(minute,start_time,submit_time) DESC ) AS rk_desc, #从高往低排序
ROW_NUMBER()OVER(PARTITION BY exam_id ORDER BY timestampdiff(minute,start_time,submit_time) ASC ) AS rk_asc #从低往高排序
FROM exam_record AS ER
LEFT JOIN examination_info AS EI USING(exam_id)
WHERE submit_time IS NOT NULL
) AS t1
GROUP BY exam_id
) AS t2
WHERE sum_time*2>=duration
ORDER BY exam_id DESC
SQL138 连续两次作答试卷的最大时间窗
请计算在2021年至少有两天作答过试卷的人中,计算该年连续两次作答试卷的最大时间窗days_window,那么根据该年的历史规律他在days_window天里平均会做多少套试卷,按最大时间窗和平均做答试卷套数倒序排序
SELECT
uid,
days_window,
ROUND(days_window*ANS_CNT/LONG, 2) AS avg_exam_cnt
FROM
(
SELECT
uid,
DATEDIFF(MAX(start_time), MIN(start_time))+1 AS LONG,
COUNT(start_time) AS ANS_CNT,
MAX(DATEDIFF(next_start_time, start_time))+1 AS days_window
FROM
(
SELECT
uid,
exam_id,
start_time,
LEAD(start_time) OVER (PARTITION BY uid ORDER BY start_time) AS next_start_time
FROM exam_record
WHERE YEAR(start_time)=2021
) AS T1
GROUP BY uid
) AS T2
WHERE LONG>=2
ORDER BY days_window DESC, avg_exam_cnt DESC
LEAD( )
可以从当前行访问下一行的数据或下一行之后的行
LEAD(return_value ,offset [,default])
OVER (
[PARTITION BY partition_expression, ... ]
ORDER BY sort_expression [ASC | DESC], ...)
return_value
#基于指定偏移量的后续行的返回值
offset #
从当前行转发的行数,默认值为1
PARTITION BY #
将结果集的行分配到应用了LEAD()
函数的分区
ORDER BY #
指定应用LEAD()
函数的每个分区中行的逻辑顺序
LAG ( )
- lag :用于统计窗口内往上第n行值
- lead :用于统计窗口内往下第n行值
SQL139 近三个月未完成试卷数为0的用户完成情况
找到每个人近三个有试卷作答记录的月份中没有试卷是未完成状态的用户的试卷作答完成数,按试卷完成数和用户ID降序排名
SELECT uid, COUNT(submit_time) AS exam_complete_cn
FROM
( SELECT uid,submit_time,
DENSE_RANK() OVER (PARTITION BY uid ORDER BY DATE_FORMAT(start_time,'%Y%m') DESC ) AS ranks
FROM exam_record
) AS T1
WHERE ranks<=3
GROUP BY uid
HAVING COUNT(uid)=COUNT(submit_time)
ORDER BY exam_complete_cn DESC, uid DESC
SQL140 未完成率较高的50%用户近三个月答卷情况
请统计SQL试卷上未完成率较高的50%用户中,6级和7级用户在有试卷作答记录的近三个月中,每个月的答卷数目和完成数目。按用户ID、月份升序排序
with t as
(
select uid
from
(select *,
row_number()over(order by incomplete_rate desc) as rn,
count(1)over() as num
from
(select uid,
count(1)-count(submit_time) as incomplete_cnt,
count(1) as total_cnt,
(count(1)-count(submit_time))/count(1) as incomplete_rate
from exam_record
where exam_id in (select exam_id from examination_info where tag = 'SQL')
group by uid) AS t1
) AS t2
where rn<=(num+1)/2
and uid in (select uid from user_info where level in ('6','7') )
)
select uid,
dtime as start_month,
count(1) as total_cnt,
count(score) as complete_cnt
from
(select uid,
DATE_FORMAT(start_time,"%Y%m") as dtime,
DENSE_RANK()over(partition by uid order by DATE_FORMAT(start_time,"%y%m") desc) as rn,
score
from exam_record
where uid in (select uid from t)) AS t1
where rn<=3
group by uid,start_month
order by uid,start_month
SQL141 试卷完成数同比2020年的增长率及排名变化
请计算2021年上半年各类试卷的做完次数相比2020年上半年同期的增长率(百分比格式,保留1位小数),以及做完次数排名变化,按增长率和21年排名降序输出
SELECT
tag,
exam_cnt_20,
exam_cnt_21,
growth_rate,
exam_cnt_rank_20,
exam_cnt_rank_21,
CAST(exam_cnt_rank_21 AS SIGNED) - CAST(exam_cnt_rank_20 AS SIGNED) AS rank_delta
FROM
(
SELECT
tag,
exam_cnt_20,
exam_cnt_21,
IFNULL(CONCAT(ROUND((exam_cnt_21-exam_cnt_20)/exam_cnt_20*100,1),'%'),0) AS growth_rate,
RANK() OVER (ORDER BY exam_cnt_20 DESC ) AS exam_cnt_rank_20,
RANK() OVER (ORDER BY exam_cnt_21 DESC ) AS exam_cnt_rank_21
FROM
(
SELECT
tag,
COUNT(IF(YEAR(submit_time)=2020 AND MONTH(submit_time)<=6,submit_time,NULL)) AS exam_cnt_20,
COUNT(IF(YEAR(submit_time)=2021 AND MONTH(submit_time)<=6,submit_time,NULL)) AS exam_cnt_21
FROM exam_record JOIN examination_info USING(exam_id)
GROUP BY tag
) AS T1
) AS T2
WHERE exam_cnt_20<>0 AND exam_cnt_21<>0
ORDER BY growth_rate DESC,exam_cnt_rank_21 DESC
数字和字符串互转
1. 数字转字符串
CONCAT
SELECT CONCAT(123,'aa',456) AS str
CAST > CHAR
SELECT CAST(123 AS CHAR) AS str
2. 字符串转数字
CAST > SIGNED
SELECT CAST('123' AS SIGNED)
CONVERT
SELECT CONVERT('123', SIGNED)
聚合窗口函数
SQL142 对试卷得分做min-max归一化
min-max标准化也被称为离差标准化,是对原始数据的线性变换,使结果值映射到[0 - 1]之间
请你将用户作答高难度试卷的得分在每份试卷作答记录内执行min-max归一化后缩放到[0,100]区间,并输出用户ID、试卷ID、归一化后分数平均值;最后按照试卷ID升序、归一化分数降序输出。(注:得分区间默认为[0,100],如果某个试卷作答记录中只有一个得分,那么无需使用公式,归一化并缩放后分数仍为原分数)
SELECT
uid,
exam_id,
ROUND(AVG(new_score),0) AS avg_new_score
FROM
(SELECT
uid,exam_id,
IF(S0=S1,score,(score-S0)/(S1-S0)*100) AS new_score
FROM
(
SELECT
uid,
exam_id,
score,
MAX(score) OVER (PARTITION BY exam_id) AS S1,
MIN(score) OVER (PARTITION BY exam_id) AS S0
FROM exam_record JOIN examination_info USING(exam_id)
WHERE difficulty='hard'
) AS T1
) AS T2
WHERE new_score IS NOT NULL
GROUP BY uid,exam_id
ORDER BY exam_id,avg_new_score DESC
SQL143 每份试卷每月作答数和截止当月的作答总数
请输出每份试卷每月作答数和截止当月的作答总数
SELECT DISTINCT
exam_id,
DATE_FORMAT(start_time,'%Y%m') start_month,
count(start_time) over(
partition by exam_id,DATE_FORMAT(start_time,'%Y%m')
) month_cnt,
count(start_time) over(
partition by exam_id
order by DATE_FORMAT(start_time,'%Y%m')
) cum_exam_cnt
FROM
exam_record
数据分组累加sum() over (partition by ... order by ...)
sum(...) over( ),对所有行求和
sum(...) over( order by ... ),第一行 到 与当前行同序号行的最后一行的所有值求和
SQL144 每月及截止当月的答题情况
请输出自从有用户作答记录以来,每月的试卷作答记录中月活用户数、新增用户数、截止当月的单月最大新增用户数、截止当月的累积用户数。结果按月份升序输出
SELECT
t1.month AS start_month,
t1.mau,
IFNULL(month_add_uv, 0) AS month_add_uv,
MAX(month_add_uv) OVER(ORDER BY t1.month) AS max_month_add_uv,
SUM(month_add_uv) OVER(ORDER BY t1.month) AS cum_sum_uv
FROM (
SELECT
DATE_FORMAT(start_time, '%Y%m' ) AS month,
COUNT(DISTINCT uid) AS mau
FROM exam_record
GROUP BY month
) AS t1
LEFT JOIN (
SELECT
f_month,
COUNT(uid) AS month_add_uv
FROM (
SELECT
uid,
DATE_FORMAT(MIN(start_time), '%Y%m') AS f_month
FROM exam_record
GROUP BY uid
) AS t1
GROUP BY f_month
) AS t2 ON t1.month=t2.f_month
ORDER BY start_month
06 其他常用操作
空值处理
SQL145 统计有未完成状态的试卷的未完成数和未完成率
请统计有未完成状态的试卷的未完成数incomplete_cnt和未完成率incomplete_rate
SELECT
exam_id,
SUM(IF(submit_time IS NULL, 1, 0)) AS incomplete_cnt,
ROUND(SUM(IF(submit_time IS NULL, 1, 0))/COUNT(id),3) AS complete_rate
FROM exam_record
GROUP BY exam_id
HAVING incomplete_cnt<>0
SQL146 0级用户高难度试卷的平均用时和平均得分
请输出每个0级用户所有的高难度试卷考试平均用时和平均得分,未完成的默认试卷最大考试时长和0分处理
SELECT
uid,
ROUND(AVG(score_new),0) avg_score,
ROUND(AVG(cost_time),1) avg_time_took
FROM (
SELECT
a.uid,
IF(submit_time IS NOT NULL,score,0) AS score_new,
IF(submit_time IS NULL,b.duration,TIMESTAMPDIFF(minute,start_time,submit_time)) AS cost_time
FROM exam_record AS a
LEFT JOIN examination_info AS b ON a.exam_id=b.exam_id
LEFT JOIN user_info AS c ON a.uid=c.uid
WHERE difficulty='hard'
AND level='0'
) AS t1
GROUP BY uid
高级条件语句
SQL147 筛选限定昵称成就值活跃日期的用户
请找到昵称以『牛客』开头『号』结尾、成就值在1200~2500之间,且最近一次活跃(答题或作答试卷)在2021年9月的用户信息
SELECT
uid,
nick_name,
achievement
FROM user_info
WHERE nick_name LIKE '牛客%%号'
AND achievement BETWEEN 1200 AND 2500
AND uid IN
(
SELECT uid
FROM (
SELECT
uid,
start_time AS time
FROM exam_record
UNION ALL
SELECT uid,
submit_time AS time
FROM practice_record
) AS T
GROUP BY uid
HAVING DATE_FORMAT(MAX(time), '%Y%m') = 202109
)
SQL148 筛选昵称规则和试卷规则的作答记录
找到昵称以"牛客"+纯数字+"号"或者纯数字组成的用户对于字母c开头的试卷类别(如C,C++,c#等)的已完成的试卷ID和平均得分,按用户ID、平均分升序排序
SELECT
uid,
exam_id,
ROUND(AVG(score),0) AS avg_score
FROM exam_record
JOIN examination_info AS EI USING(exam_id)
JOIN user_info AS UI USING(uid)
WHERE (nick_name RLIKE "^牛客[0-9]+号$" OR nick_name RLIKE "^[0-9]+$")
#昵称以"牛客"+纯数字+"号"或者纯数字组成的用户
AND tag LIKE 'C%'
AND submit_time IS NOT NULL
#对于字母c开头的试卷类别(如C,C++,c#等)的已完成的试卷ID和平均得分
GROUP BY uid,exam_id
ORDER BY uid,avg_score
#按用户ID、平均分升序排序
RLIKE
后面可以跟正则表达式
1、字符^
意义:表示匹配的字符必须在最前边
例如:^A不匹配“an A”中的‘A’,但匹配“An A”中最前面的‘A’。
2、字符$
意义:与^类似,匹配最末的字符。
例如:t$不匹配“eater”中的‘t’,但匹配“eat”中的‘t’。
3、字符[0-9]
意义:字符列表,匹配列出中的任一个字符。你可以通过连字符-指出字符范围。
例如:[abc]跟[a-c]一样。它们匹配“brisket”中的‘b’和“ache”中的‘c’。
4、字符+
意义:匹配+号前面的字符1次及以上。等价于{1,}
例如:a+匹配“candy”中的‘a’和“caaaaaaandy”中的所有‘a’
SQL149 根据指定记录是否存在输出不同情况
请你筛选表中的数据,当有任意一个0级用户未完成试卷数大于2时,输出每个0级用户的试卷未完成数和未完成率(保留3位小数);若不存在这样的用户,则输出所有有作答记录的用户的这两个指标,结果按未完成率升序排序
WITH T AS (
SELECT
uid,
level,
COUNT(start_time) AS done,
SUM(IF(start_time is not null and submit_time IS NULL,1,0)) AS incomplete_cnt,
ROUND(AVG(IF(start_time is not null and submit_time IS NULL,1,0)),3) AS incomplete_rate
FROM user_info
LEFT JOIN exam_record USING(uid)
GROUP BY uid,level
)
SELECT
uid,
incomplete_cnt,
incomplete_rate
FROM T
WHERE IF((SELECT MAX(incomplete_cnt) FROM T WHERE level=0)>2,level=0,done>0)
#当有任意一个0级用户未完成试卷数大于2时,输出每个0级用户的试卷未完成数和未完成率(保留3位小数
#若不存在这样的用户,则输出所有有作答记录的用户的这两个指标
ORDER BY incomplete_rate
#结果按未完成率升序排序
SQL150 各用户等级的不同得分表现占比
为了得到用户试卷作答的定性表现,我们将试卷得分按分界点[90,75,60]分为优良中差四个得分等级(分界点划分到左区间),请统计不同用户等级的人在完成过的试卷中各得分等级占比(结果保留3位小数),未完成过试卷的用户无需输出,结果按用户等级降序、占比降序排序
WITH T AS (
SELECT
uid,
level,
score,
(CASE
WHEN score >=90 THEN '优'
WHEN score >=75 THEN '良'
WHEN score >=60 THEN '中'
ELSE '差'
END) AS score_grade,
COUNT(*) OVER (PARTITION BY level) AS total
FROM exam_record JOIN user_info USING(uid)
WHERE score IS NOT NULL
)
SELECT
level,
score_grade,
ROUND((COUNT(uid)/total),3) AS ratio
FROM T
GROUP BY level,score_grade
限量查询
SQL151 注册时间最早的三个人
请从中找到注册时间最早的3个人
SELECT
uid,
nick_name,
register_time
FROM user_info
ORDER BY register_time
LIMIT 3
SQL152 注册当天就完成了试卷的名单第三页
找到求职方向为算法工程师,且注册当天就完成了算法类试卷的人,按参加过的所有考试最高得分排名。排名榜很长,我们将采用分页展示,每页3条,现在需要你取出第3页(页码从1开始)的人的信息
SELECT
T1.uid,
level,
register_time,
max_score
FROM exam_record
JOIN user_info USING(uid)
JOIN examination_info USING(exam_id)
JOIN
(
SELECT
uid,
MAX(score) AS max_score
FROM exam_record
GROUP BY uid
) AS T1 USING(uid)
WHERE
job='算法'
#找到求职方向为算法工程师,
AND (DATE(register_time)=DATE(submit_time) AND tag='算法')
#且注册当天就完成了算法类试卷的人
ORDER BY max_score DESC
#按参加过的所有考试最高得分排名
LIMIT 6,3
#排名榜很长,我们将采用分页展示,每页3条,现在需要你取出第3页(页码从1开始)的人的信息
文本转换函数
SQL153 修复串列了的记录
录题同学有一次手误将部分记录的试题类别tag、难度、时长同时录入到了tag字段,请帮忙找出这些录错了的记录,并拆分后按正确的列类型输出
WITH T AS (
SELECT
exam_id,
tag AS TEXT
FROM examination_info
WHERE tag LIKE '%,%'
)
SELECT
exam_id,
SUBSTRING_INDEX(TEXT,',',1) AS tag,
SUBSTRING_INDEX(SUBSTRING_INDEX(TEXT,',',2),',',-1) AS difficulty,
SUBSTRING_INDEX(TEXT,',',-1) AS duration
FROM T
SUBSTRING_INDEX
SUBSTRING_INDEX(str, delim, count) 有三个参数,如果count是正数,那么就是从左往右数,第N个分隔符的左边的全部内容。如果是负数,那么就是从右边开始数,第N个分隔符右边的所有内容。
SUBSTRING_INDEX(tag, ',', 1) 可以获得左数第一个内容 tag,SUBSTRING_INDEX(tag, ',', -1) 获得右数第一个 duration
SUBSTRING_INDEX(SUBSTRING_INDEX(tag, ',', 2), ',', -1) 切分两次,获得中间位置的 difficulty
SQL154 对过长的昵称截取处理
有的用户的昵称特别长,在一些展示场景会导致样式混乱,因此需要将特别长的昵称转换一下再输出,请输出字符数大于10的用户信息,对于字符数大于13的用户输出前10个字符然后加上三个点号:『...』
SELECT
uid,
CASE WHEN CHAR_LENGTH(nick_name)<=13 THEN nick_name
ELSE CONCAT(SUBSTR(nick_name,1,10),'...') END AS nick_name
#对于字符数大于13的用户输出前10个字符然后加上三个点号:『...』
FROM user_info
WHERE CHAR_LENGTH(nick_name)>10
#输出字符数大于10的用户信息
CHAR_LENGTH
char_length(str)或character_length(str) 返回字符串 str 的字符 单位为字符
length(str) 返回字符串 str 的字节长度 单位为字节
SUBSTR
SUBSTR(str,pos,len): 从pos开始的位置,截取len个字符
substr(string ,1,3) :取string左1起,3字长 =str
substr(string, -1,3):取string右1起,3字长 往右不够3字长 =g
substr(string, -3,3):取string右3起,3字长 =ing
SUBSTR(str,pos): pos开始的位置,一直截取到最后
substr(string ,4) : 从右4位置截取到最后 =ing
SQL155 大小写混乱时的筛选统计
试卷的类别tag可能出现大小写混乱的情况,请先筛选出试卷作答数小于3的类别tag,统计将其转换为大写后对应的原本试卷作答数
如果转换后tag并没有发生变化,不输出该条结果
WITH T AS (
SELECT
tag,
COUNT(uid) AS answer_cnt
FROM exam_record
LEFT JOIN examination_info USING(exam_id)
GROUP BY tag
)
SELECT a.tag, b.answer_cnt
FROM T as a
JOIN T as b
ON UPPER(a.tag) = b.tag AND a.tag<>b.tag AND a.answer_cnt < 3;