数据框操作总结
sunqi
2020/7/28
概述
基于 tidyverse,这个包集合包含 dplyr、tidyr 等多个包
选取列
# Attach the tidyverse packages. Run this in a fresh R session instead of
# clearing the workspace with rm(list = ls()), which only removes objects and
# leaves loaded packages and options untouched.
library(tidyverse)
代码语言:javascript复制## -- Attaching packages ------------------------------------------------------------- tidyverse 1.3.0 --
代码语言:javascript复制## √ ggplot2 3.3.2 √ purrr 0.3.4
## √ tibble 3.0.3 √ dplyr 1.0.0
## √ tidyr 1.1.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
代码语言:javascript复制## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
代码语言:javascript复制# 建立测试数据集
my_data <- as_tibble(iris)
my_data %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
代码语言:javascript复制# 根据名字选取列
# pull函数返回的为向量
my_data %>% pull(Species)%>% head()
代码语言:javascript复制## [1] setosa setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica
代码语言:javascript复制# select
# Select columns by position (here columns 1 through 3)
my_data %>% select(1:3)%>% head()
代码语言:javascript复制## # A tibble: 6 x 3
## Sepal.Length Sepal.Width Petal.Length
## <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4
## 2 4.9 3 1.4
## 3 4.7 3.2 1.3
## 4 4.6 3.1 1.5
## 5 5 3.6 1.4
## 6 5.4 3.9 1.7
代码语言:javascript复制# 根据列名
my_data %>% select(Sepal.Length, Petal.Length)%>% head()
代码语言:javascript复制## # A tibble: 6 x 2
## Sepal.Length Petal.Length
## <dbl> <dbl>
## 1 5.1 1.4
## 2 4.9 1.4
## 3 4.7 1.3
## 4 4.6 1.5
## 5 5 1.4
## 6 5.4 1.7
代码语言:javascript复制# 列名从开始到结束
my_data %>% select(Sepal.Length:Petal.Length)%>% head()
代码语言:javascript复制## # A tibble: 6 x 3
## Sepal.Length Sepal.Width Petal.Length
## <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4
## 2 4.9 3 1.4
## 3 4.7 3.2 1.3
## 4 4.6 3.1 1.5
## 5 5 3.6 1.4
## 6 5.4 3.9 1.7
代码语言:javascript复制# 名字开头含Petal
my_data %>% select(starts_with("Petal"))%>% head()
代码语言:javascript复制## # A tibble: 6 x 2
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
代码语言:javascript复制# 名字结尾含Width
my_data %>% select(ends_with("Width")) %>% head()
代码语言:javascript复制## # A tibble: 6 x 2
## Sepal.Width Petal.Width
## <dbl> <dbl>
## 1 3.5 0.2
## 2 3 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
## 6 3.9 0.4
代码语言:javascript复制# 名字含有etal
my_data %>% select(contains("etal")) %>% head()
代码语言:javascript复制## # A tibble: 6 x 2
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
代码语言:javascript复制# 正则表达式
my_data %>% select(matches(".t.")) %>% head()
代码语言:javascript复制## # A tibble: 6 x 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
代码语言:javascript复制# 使用字符串
# all_of() selects the columns named in a character vector and errors if any
# of them is missing; it replaces the superseded one_of()
my_data %>% select(all_of(c("Sepal.Length", "Petal.Length"))) %>% head()
代码语言:javascript复制## # A tibble: 6 x 2
## Sepal.Length Petal.Length
## <dbl> <dbl>
## 1 5.1 1.4
## 2 4.9 1.4
## 3 4.7 1.3
## 4 4.6 1.5
## 5 5 1.4
## 6 5.4 1.7
代码语言:javascript复制# 根据条件选取
my_data %>% select_if(is.numeric) %>% head()
代码语言:javascript复制## # A tibble: 6 x 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
代码语言:javascript复制# 移除列,同时语法支持列号
my_data %>% select(-Sepal.Length, -Petal.Length)%>% head()
代码语言:javascript复制## # A tibble: 6 x 3
## Sepal.Width Petal.Width Species
## <dbl> <dbl> <fct>
## 1 3.5 0.2 setosa
## 2 3 0.2 setosa
## 3 3.2 0.2 setosa
## 4 3.1 0.2 setosa
## 5 3.6 0.2 setosa
## 6 3.9 0.4 setosa
代码语言:javascript复制my_data %>% select(-(Sepal.Length:Petal.Length)) %>% head()
代码语言:javascript复制## # A tibble: 6 x 2
## Petal.Width Species
## <dbl> <fct>
## 1 0.2 setosa
## 2 0.2 setosa
## 3 0.2 setosa
## 4 0.2 setosa
## 5 0.2 setosa
## 6 0.4 setosa
行选择
代码语言:javascript复制# 切片
my_data %>% slice(1:6)
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
代码语言:javascript复制# filter根据条件
my_data %>% filter(Sepal.Length > 7) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7.1 3 5.9 2.1 virginica
## 2 7.6 3 6.6 2.1 virginica
## 3 7.3 2.9 6.3 1.8 virginica
## 4 7.2 3.6 6.1 2.5 virginica
## 5 7.7 3.8 6.7 2.2 virginica
## 6 7.7 2.6 6.9 2.3 virginica
代码语言:javascript复制# 多个条件
my_data %>% filter(Sepal.Length > 6.7, Sepal.Width <= 3) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 6.8 2.8 4.8 1.4 versicolor
## 2 7.1 3 5.9 2.1 virginica
## 3 7.6 3 6.6 2.1 virginica
## 4 7.3 2.9 6.3 1.8 virginica
## 5 6.8 3 5.5 2.1 virginica
## 6 7.7 2.6 6.9 2.3 virginica
代码语言:javascript复制# 对所有列筛选
# Drop the Species column, leaving the four measurement columns
my_data2 <- my_data %>% select(-Species)
# Keep only rows where every column is greater than 2.4
my_data2 %>% filter_all(all_vars(.> 2.4)) %>% head()
代码语言:javascript复制## # A tibble: 3 x 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 6.3 3.3 6 2.5
## 2 7.2 3.6 6.1 2.5
## 3 6.7 3.3 5.7 2.5
代码语言:javascript复制# 任意一个大于
my_data2 %>% filter_all(any_vars(.> 2.4)) %>% head()
代码语言:javascript复制## # A tibble: 6 x 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
代码语言:javascript复制# filter 筛选或删除缺失值
# Small example table with missing values in the height column
friends_data <- tibble(
name = c("A", "B", "C", "D"),
age = c(27, 25, 29, 26),
height = c(180, NA, NA, 169),
married = c("yes", "yes", "no", "no")
)
# Keep only the rows whose height is missing
friends_data %>% filter(is.na(height)) %>% head()
代码语言:javascript复制## # A tibble: 2 x 4
## name age height married
## <chr> <dbl> <dbl> <chr>
## 1 B 25 NA yes
## 2 C 29 NA no
代码语言:javascript复制friends_data %>% filter(!is.na(height))%>% head()
代码语言:javascript复制## # A tibble: 2 x 4
## name age height married
## <chr> <dbl> <dbl> <chr>
## 1 A 27 180 yes
## 2 D 26 169 no
代码语言:javascript复制# 随机选取
# Fix the random seed so the sampled rows are reproducible
set.seed(1234)
# Draw 5 rows without replacement
my_data %>% sample_n(5, replace = FALSE)
代码语言:javascript复制## # A tibble: 5 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.2 3.5 1.5 0.2 setosa
## 2 5.7 2.6 3.5 1 versicolor
## 3 6.3 3.3 6 2.5 virginica
## 4 6.5 3.2 5.1 2 virginica
## 5 6.3 3.4 5.6 2.4 virginica
代码语言:javascript复制# 按照比例选取
my_data %>% sample_frac(0.05, replace = FALSE) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 6.4 2.8 5.6 2.2 virginica
## 2 6.8 3.2 5.9 2.3 virginica
## 3 7.9 3.8 6.4 2 virginica
## 4 6.2 2.9 4.3 1.3 versicolor
## 5 7.1 3 5.9 2.1 virginica
## 6 5.5 2.5 4 1.3 versicolor
代码语言:javascript复制# 按照特定的列,选取前5个
my_data %>% top_n(5, Sepal.Length)
代码语言:javascript复制## # A tibble: 5 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7.7 3.8 6.7 2.2 virginica
## 2 7.7 2.6 6.9 2.3 virginica
## 3 7.7 2.8 6.7 2 virginica
## 4 7.9 3.8 6.4 2 virginica
## 5 7.7 3 6.1 2.3 virginica
代码语言:javascript复制# 分组选取前5个
# Group by Species, then keep the rows with the 5 largest Sepal.Length
# values within each group; head() shows the first 6 rows of the result
my_data %>%
group_by(Species) %>%
top_n(5, Sepal.Length) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## # Groups: Species [2]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.8 4 1.2 0.2 setosa
## 2 5.7 4.4 1.5 0.4 setosa
## 3 5.7 3.8 1.7 0.3 setosa
## 4 5.5 4.2 1.4 0.2 setosa
## 5 5.5 3.5 1.3 0.2 setosa
## 6 7 3.2 4.7 1.4 versicolor
重复值删除
代码语言:javascript复制my_data <- as_tibble(iris)
# Drop rows whose Sepal.Width value has already appeared (base R duplicated())
my_data[!duplicated(my_data$Sepal.Width), ] %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
代码语言:javascript复制# unique函数也可以
unique(my_data) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
代码语言:javascript复制# distinct
my_data %>% distinct() %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
代码语言:javascript复制# 对单一变量去重
my_data %>% distinct(Sepal.Length, .keep_all = TRUE) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
代码语言:javascript复制# 对多个变量去重
my_data %>% distinct(Sepal.Length, Petal.Width, .keep_all = TRUE) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
对行排序
代码语言:javascript复制my_data <- as_tibble(iris)
# arrange升序排列
my_data %>% arrange(Sepal.Length) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 4.3 3 1.1 0.1 setosa
## 2 4.4 2.9 1.4 0.2 setosa
## 3 4.4 3 1.3 0.2 setosa
## 4 4.4 3.2 1.3 0.2 setosa
## 5 4.5 2.3 1.3 0.3 setosa
## 6 4.6 3.1 1.5 0.2 setosa
代码语言:javascript复制# 降序
my_data %>% arrange(desc(Sepal.Length)) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7.9 3.8 6.4 2 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 2.6 6.9 2.3 virginica
## 4 7.7 2.8 6.7 2 virginica
## 5 7.7 3 6.1 2.3 virginica
## 6 7.6 3 6.6 2.1 virginica
代码语言:javascript复制# 支持符号
arrange(my_data, -Sepal.Length) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7.9 3.8 6.4 2 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 2.6 6.9 2.3 virginica
## 4 7.7 2.8 6.7 2 virginica
## 5 7.7 3 6.1 2.3 virginica
## 6 7.6 3 6.6 2.1 virginica
代码语言:javascript复制# 多变量排序
my_data %>% arrange(Sepal.Length, Sepal.Width)
代码语言:javascript复制## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 4.3 3 1.1 0.1 setosa
## 2 4.4 2.9 1.4 0.2 setosa
## 3 4.4 3 1.3 0.2 setosa
## 4 4.4 3.2 1.3 0.2 setosa
## 5 4.5 2.3 1.3 0.3 setosa
## 6 4.6 3.1 1.5 0.2 setosa
## 7 4.6 3.2 1.4 0.2 setosa
## 8 4.6 3.4 1.4 0.3 setosa
## 9 4.6 3.6 1 0.2 setosa
## 10 4.7 3.2 1.3 0.2 setosa
## # ... with 140 more rows
代码语言:javascript复制# 注 如果存在缺失值,一般排序在最后
对列重命名
代码语言:javascript复制# 使用tidyverse
# Rename two columns with dplyr::rename(); the syntax is new_name = old_name
rename(
  my_data,
  sepal_length = Sepal.Length,
  sepal_width = Sepal.Width
)
代码语言:javascript复制## # A tibble: 150 x 5
## sepal_length sepal_width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ... with 140 more rows
代码语言:javascript复制# 使用R基础功能
names(my_data)[names(my_data) == "Sepal.Length"] <- "sepal_length"
计算新变量
代码语言:javascript复制# 使用mutate
my_data <- as_tibble(iris)
my_data %>%
mutate(sepal_by_petal_l = Sepal.Length/Petal.Length) %>% head()
代码语言:javascript复制## # A tibble: 6 x 6
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal_by_petal_l
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 5.1 3.5 1.4 0.2 setosa 3.64
## 2 4.9 3 1.4 0.2 setosa 3.5
## 3 4.7 3.2 1.3 0.2 setosa 3.62
## 4 4.6 3.1 1.5 0.2 setosa 3.07
## 5 5 3.6 1.4 0.2 setosa 3.57
## 6 5.4 3.9 1.7 0.4 setosa 3.18
代码语言:javascript复制# 使用transmute 会删除原来的列
my_data %>%
transmute(
sepal_by_petal_l = Sepal.Length/Petal.Length,
sepal_by_petal_w = Sepal.Width/Petal.Width
) %>% head()
代码语言:javascript复制## # A tibble: 6 x 2
## sepal_by_petal_l sepal_by_petal_w
## <dbl> <dbl>
## 1 3.64 17.5
## 2 3.5 15
## 3 3.62 16
## 4 3.07 15.5
## 5 3.57 18
## 6 3.18 9.75
代码语言:javascript复制# 对所有变量计算
# Species is a factor, so drop it and keep only the numeric columns
my_data2 <- my_data %>%
  select(-Species)
# Divide every column by 2.54, overwriting the columns in place.
# A purrr-style lambda replaces funs(), which is deprecated since dplyr 0.8.0.
my_data2 %>%
  mutate_all(~ . / 2.54) %>%
  head()
## # A tibble: 6 x 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 2.01 1.38 0.551 0.0787
## 2 1.93 1.18 0.551 0.0787
## 3 1.85 1.26 0.512 0.0787
## 4 1.81 1.22 0.591 0.0787
## 5 1.97 1.42 0.551 0.0787
## 6 2.13 1.54 0.669 0.157
# Keep the original columns and add new ones with a suffix: a named list of
# lambdas appends "_<name>" to each input column name.
# NOTE(review): dividing by 2.54 converts centimetres to inches, so the "cm"
# suffix looks misleading; it is kept to match the printed output.
my_data2 %>%
  mutate_all(list(cm = ~ . / 2.54)) %>%
  head()
## # A tibble: 6 x 8
## Sepal.Length Sepal.Width Petal.Length Petal.Width Sepal.Length_cm
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2 2.01
## 2 4.9 3 1.4 0.2 1.93
## 3 4.7 3.2 1.3 0.2 1.85
## 4 4.6 3.1 1.5 0.2 1.81
## 5 5 3.6 1.4 0.2 1.97
## 6 5.4 3.9 1.7 0.4 2.13
## # ... with 3 more variables: Sepal.Width_cm <dbl>, Petal.Length_cm <dbl>,
## # Petal.Width_cm <dbl>
# Apply the same transformation to selected columns only
my_data2 %>%
  mutate_at(
    c("Sepal.Length", "Petal.Width"),
    list(cm = ~ . / 2.54)
  ) %>%
  head()
代码语言:javascript复制## # A tibble: 6 x 6
## Sepal.Length Sepal.Width Petal.Length Petal.Width Sepal.Length_cm
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2 2.01
## 2 4.9 3 1.4 0.2 1.93
## 3 4.7 3.2 1.3 0.2 1.85
## 4 4.6 3.1 1.5 0.2 1.81
## 5 5 3.6 1.4 0.2 1.97
## 6 5.4 3.9 1.7 0.4 2.13
## # ... with 1 more variable: Petal.Width_cm <dbl>
代码语言:javascript复制# 转换变量类型
my_data %>% mutate_if(is.factor, as.character) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
代码语言:javascript复制# 小数点处理
my_data %>% mutate_if(is.numeric, round, digits = 0) %>% head()
代码语言:javascript复制## # A tibble: 6 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5 4 1 0 setosa
## 2 5 3 1 0 setosa
## 3 5 3 1 0 setosa
## 4 5 3 2 0 setosa
## 5 5 4 1 0 setosa
## 6 5 4 2 0 setosa
汇总表
代码语言:javascript复制my_data <- as_tibble(iris)
my_data %>%
summarise(
count = n(),
mean_sep = mean(Sepal.Length, na.rm = TRUE),
mean_pet = mean(Petal.Length, na.rm = TRUE)
)
代码语言:javascript复制## # A tibble: 1 x 3
## count mean_sep mean_pet
## <int> <dbl> <dbl>
## 1 150 5.84 3.76
代码语言:javascript复制# 分组统计
my_data %>%
group_by(Species) %>%
summarise(
count = n(),
mean_sep = mean(Sepal.Length),
mean_pet = mean(Petal.Length)
)
代码语言:javascript复制## `summarise()` ungrouping output (override with `.groups` argument)
代码语言:javascript复制## # A tibble: 3 x 4
## Species count mean_sep mean_pet
## <fct> <int> <dbl> <dbl>
## 1 setosa 50 5.01 1.46
## 2 versicolor 50 5.94 4.26
## 3 virginica 50 6.59 5.55
代码语言:javascript复制# 多变量分组
ToothGrowth %>%
group_by(supp, dose) %>%
summarise(
n = n(),
mean = mean(len),
sd = sd(len)
)
代码语言:javascript复制## `summarise()` regrouping output by 'supp' (override with `.groups` argument)
代码语言:javascript复制## # A tibble: 6 x 5
## # Groups: supp [2]
## supp dose n mean sd
## <fct> <dbl> <int> <dbl> <dbl>
## 1 OJ 0.5 10 13.2 4.46
## 2 OJ 1 10 22.7 3.91
## 3 OJ 2 10 26.1 2.66
## 4 VC 0.5 10 7.98 2.75
## 5 VC 1 10 16.8 2.52
## 6 VC 2 10 26.1 4.80
代码语言:javascript复制# 总结多个变量
my_data %>%
group_by(Species) %>%
summarise_all(mean)
代码语言:javascript复制## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46 0.246
## 2 versicolor 5.94 2.77 4.26 1.33
## 3 virginica 6.59 2.97 5.55 2.03
代码语言:javascript复制# 选定变量总结
my_data %>%
group_by(Species) %>%
summarise_at(c("Sepal.Length", "Sepal.Width"), mean, na.rm = TRUE)
代码语言:javascript复制## # A tibble: 3 x 3
## Species Sepal.Length Sepal.Width
## <fct> <dbl> <dbl>
## 1 setosa 5.01 3.43
## 2 versicolor 5.94 2.77
## 3 virginica 6.59 2.97
代码语言:javascript复制# 判定为数字,然后总结
my_data %>%
group_by(Species) %>%
summarise_if(is.numeric, mean, na.rm = TRUE)
代码语言:javascript复制## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46 0.246
## 2 versicolor 5.94 2.77 4.26 1.33
## 3 virginica 6.59 2.97 5.55 2.03
结束语
tidyverse 函数高效、代码简洁,受过专业训练的人一般都会使用;一时记不住也没关系,能记一点是一点。love & peace