成绩单 score
install.packages("dplyr")
library(dplyr)
install.packages("tibble")
library(tibble)
install.packages("stringr")
library(stringr)
# Build the demo transcript: nine students, each with an ID, a gender and five
# randomly generated subject scores. No seed is set, so the scores differ
# between runs.
score <- local({
  # Draw `n` scores uniformly from [80, 90] and round them to whole numbers.
  draw_scores <- function(n) round(runif(n, min = 80, max = 90), digits = 0)

  tibble(
    ID = c(
      "1222-1", "2001-0", "3321-1", "4898-0", "2782-0",
      "1002-8", "4211-0", "1023-1", "3325-1"
    ),
    gender = c(
      "female", "male", "male", "male", "female",
      "female", "male", "female", "male"
    ),
    chinese_mid_score = draw_scores(9),
    chinese_final_score = draw_scores(9),
    # Only eight English scores are drawn; the ninth student's score is NA.
    english_score = c(draw_scores(8), NA),
    match_score = draw_scores(9),
    musci_score = draw_scores(9)
  )
})
# Display the resulting table.
view(score)
在之前的系列3中,介绍了成绩单的计算,链接: https://blog.csdn.net/qq_43792777/article/details/131049634?spm=1001.2014.3001.5502
本系列继续以成绩单为例,对系列3进行改进。
之前介绍了用 rowwise + summary 组合来处理 NA,但是当数据量特别大时,会出现 bug。
下面介绍新的函数:rowSums
# Per-student sum of three selected subjects, ignoring missing scores.
# pick() takes the columns from the data flowing through the pipe instead of
# re-reading the global `score`, and the new column gets an explicit name
# rather than the full expression text that mutate() would otherwise use.
score1 <- score |>
  mutate(
    selected_total = rowSums(
      pick(musci_score, english_score, match_score),
      na.rm = TRUE
    )
  )
view(score1)
select 有个缺点:当需要计算的数据列过多时,需要一一列举,很麻烦;用 contains 会简单很多。
计算每个学生各科分数之和
# Per-student total over every column whose name contains "_score",
# ignoring missing scores.
# pick() selects from the piped data rather than the global `score`, and the
# result is stored under an explicit column name instead of the expression
# text that mutate() would otherwise generate.
score_sum <- score |>
  mutate(total_score = rowSums(pick(contains("_score")), na.rm = TRUE))
view(score_sum)
新的问题来了,我的数据命名没有规律该怎么办呢?
可以根据数据类型来计算
查看数据类型的方式:typeof
# Inspect the storage type of the ID column and of one score column.
typeof(score[["ID"]])
typeof(score[["english_score"]])
across 的应用范围比 select + contains 更广泛。
# Per-student total over every numeric column, ignoring missing scores.
# pick() is used for the column selection because calling across() without
# `.fns` is deprecated in current dplyr (pick() needs dplyr >= 1.1.0).
score_sum1 <- score |>
  mutate(total_score = rowSums(pick(where(is.numeric)), na.rm = TRUE))
view(score_sum1)
double属于numeric,可以用 where(is.numeric)
也可以用 where(is.double)
考试题目太难了,需要给学生提升一下成绩:把每科成绩都除以 0.95。
# Curve every numeric column by dividing it by 0.95. The originals are kept
# and the curved values are added as new columns prefixed with "curve_".
score_curve <- score |>
  mutate(
    across(
      .cols = where(is.numeric),
      .fns = \(x) x / 0.95,
      .names = "curve_{.col}"
    )
  )
view(score_curve)
原成绩 >= 85 分为 pass,否则为 fail
# Label every numeric column against the 85-point threshold: "pass" when the
# original score is at least 85, "fail" otherwise (NA stays NA). The labels
# are added as new columns prefixed with "pass_".
score_pass <- score |>
  mutate(
    across(
      .cols = where(is.numeric),
      .fns = \(x) if_else(x < 85, "fail", "pass"),
      .names = "pass_{.col}"
    )
  )
view(score_pass)
.cols 还可以是 everything()
# Same pass/fail labelling, but applied to every column via everything().
# This includes the character columns ID and gender, where `>= 85` is a
# comparison against a string rather than a numeric test.
score_pass_everything <- score |>
  mutate(
    across(
      .cols = everything(),
      .fns = \(x) if_else(x >= 85, "pass", "fail"),
      .names = "pass_{.col}"
    )
  )
view(score_pass_everything)
看一下结果,ID 列和 gender 列也输出了 pass 或 fail。
当然,ID 列和 gender 列是字符型,与 85 比较时,R 会把 85 转换成字符串后按字符顺序比较(并不是把这两列转换成 numeric),这样的结果没有实际意义,只需要了解该方法即可。
修改后的成绩 >= 85 分为 pass,否则为 fail
# For every numeric column add two columns:
#   curve_<col>: the score divided by 0.95
#   pass_<col>:  "pass" if the curved score is at least 85, otherwise "fail"
# Each function in the list receives the ORIGINAL column, so the curve has to
# be re-applied inside the pass check; otherwise pass/fail would be judged on
# the raw score instead of the curved one.
score_curve_pass <- score |>
  mutate(
    across(
      .cols = where(is.numeric),
      .fns = list(
        curve = \(x) x / 0.95,
        pass = \(x) if_else(x / 0.95 >= 85, "pass", "fail")
      ),
      .names = "{.fn}_{.col}"
    )
  )
view(score_curve_pass)
summarise + across:计算均值、方差、5% 和 95% 分位数
# Collapse every numeric column into four summary statistics (mean, variance,
# 5% and 95% quantiles), all computed with missing values removed.
# Output columns are named "<statistic>:<column>".
score_summary <- score |>
  summarise(
    across(
      .cols = where(is.numeric),
      .fns = list(
        mean = \(x) mean(x, na.rm = TRUE),
        var = \(x) var(x, na.rm = TRUE),
        q5 = \(x) quantile(x, probs = 0.05, na.rm = TRUE),
        q95 = \(x) quantile(x, probs = 0.95, na.rm = TRUE)
      ),
      .names = "{.fn}:{.col}"
    )
  )
view(score_summary)