summarize()分组摘要分组平均值最大最小值计数
library(nycflights13)
library(tidyverse)
summarize()可以将数据折叠成一行
如果不与group_by()一起使用,那么summarize()也没什么用
summarize(flights, delay = mean(dep_delay, na.rm = TRUE))
delay |
---|
12.63907 |
group_by()可以将分析数据集改为单个分组
by_day <- group_by(flights,year,month,day)
summarize(by_day,delay=mean(dep_delay,na.rm=TRUE))
通过summarize可以起到mutate的作用
by_dest <- group_by(flights,dest)
delay <- summarize(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE))
ggplot(data=delay,mapping=aes(x=dist,y=delay))+
geom_point(aes(size=count),alpha=1/3)+
geom_smooth(se=FALSE)
常用的摘要统计量
分组求和、求平均值、计数等等。
思路是group_by统计量,summarize(count = n())
#先选出没有取消的航班
not_cancelled <- flights %>%
filter(!is.na(dep_delay),!is.na(arr_delay))
#平均延误时间
not_cancelled %>%
group_by(year,month,day) %>%
summarize(mean=mean(dep_delay))
摘要函数 | 名称 |
---|---|
位置度量 | mean() median() |
分散程度度量 | sd() IQR() mad() |
秩的度量 | min() quantile() max() |
定位度量 | first() nth() last() |
计数 | n() |
#位置度量
not_cancelled %>%
group_by(year,month,day) %>%
summarize(
#平均延误时间
avg_delay1=mean(arr_delay),
#平均延误纠正时间 (只选择大于0
avg_delay2=mean(arr_delay[arr_delay>0])
)
year | month | day | avg_delay1 | avg_delay2 |
---|---|---|---|---|
2013 | 1 | 1 | 12.6510229 | 32.48156 |
2013 | 1 | 2 | 12.6928879 | 32.02991 |
2013 | 1 | 3 | 5.7333333 | 27.66087 |
2013 | 1 | 4 | -1.9328194 | 28.30976 |
2013 | 1 | 5 | -1.5258020 | 22.55882 |
2013 | 1 | 6 | 4.2364294 | 24.37270 |
2013 | 1 | 7 | -4.9473118 | 27.76132 |
2013 | 1 | 8 | -3.2275785 | 20.78909 |
2013 | 1 | 9 | -0.2642777 | 25.63415 |
2013 | 1 | 10 | -5.8988159 | 27.34545 |
#分散程度度量
not_cancelled %>%
group_by(dest) %>%
summarize(distace_sd=sd(distance)) %>%
arrange(desc(distace_sd))
dest | distace_sd |
---|---|
EGE | 10.542765 |
SAN | 10.350094 |
SFO | 10.216017 |
HNL | 10.004197 |
SEA | 9.977993 |
LAS | 9.907786 |
PDX | 9.873299 |
PHX | 9.862546 |
LAX | 9.657195 |
IND | 9.458066 |
CVG | 9.018212 |
SAT | 9.005084 |
#秩的度量min(x),quantile(x,0.25),max(x)
#每天最早和最晚出发的航班
not_cancelled %>%
group_by(year,month,day) %>%
summarize(
first=min(dep_time),
last=max(dep_time)
)
year | month | day | first | last |
---|---|---|---|---|
2013 | 1 | 1 | 517 | 2356 |
2013 | 1 | 2 | 42 | 2354 |
2013 | 1 | 3 | 32 | 2349 |
2013 | 1 | 4 | 25 | 2358 |
2013 | 1 | 5 | 14 | 2357 |
2013 | 1 | 6 | 16 | 2355 |
2013 | 1 | 7 | 49 | 2359 |
2013 | 1 | 8 | 454 | 2351 |
2013 | 1 | 9 | 2 | 2252 |
2013 | 1 | 10 | 3 | 2320 |
#计数(唯一值数量 n_distinct())
#哪个目的地有最多的航空公司
not_cancelled %>%
group_by(dest) %>%
summarize(carriers=n_distinct(carrier)) %>%
arrange(desc(carriers))
dest | carriers |
---|---|
ATL | 7 |
BOS | 7 |
CLT | 7 |
ORD | 7 |
TPA | 7 |
AUS | 6 |
DCA | 6 |
#只需要计数的情况
# 无需摘要统计
not_cancelled %>%
count(dest)
#逻辑值计数和比例
#多少航班是在5点前出发的
not_cancelled %>%
group_by(year,month,day) %>%
summarize(n_early=sum(dep_time<500))
#或者写为
not_cancelled %>%
group_by(year,month,day) %>%
summarize(early= dep_time[dep_time<500]) %>%
summarize(n=n())
注意区分上面的
#位置度量
not_cancelled %>%
group_by(year,month,day) %>%
summarize(
#平均延误时间
avg_delay1=mean(arr_delay),
#平均延误纠正时间 (只选择大于0
avg_delay2=mean(arr_delay[arr_delay>0])
)
#航班延误1小时的比例
#因为arr_delay>60是返回的一堆逻辑值110001求mean是比例
not_cancelled %>%
group_by(year,month,day) %>%
summarize(hour_prec=mean(arr_delay>60))
-
sum(dep_time<500): 这个是求dep_time小于500的个数,因为dep_time<500会先生成0,1的布尔列,求和就是1的数量
-
dep_time[dep_time<500] %>% summarize(n=n()):和上面一样,相当于先摘要出dep_time<500的,再计数
-
mean(arr_dalay): 对所有的arr_dalay求平均
-
mean(arr_delay[arr_delay>0]):只求arr_delay大于0的平均值
-
mean(add_delay>60),求的是arr_delay大于60的比例
-
dep_time[dep_time<500] %>% summarize(n=n()):和上面一样,相当于先摘要出dep_time<500的,再计数
-
mean(arr_dalay): 对所有的arr_dalay求平均
-
mean(arr_delay[arr_delay>0]):只求arr_delay大于0的平均值
-
mean(add_delay>60),求的是arr_delay大于60的比例
-
mean(add_delay[arr_delay>60]): 只求arr_delay大于60的平均值