「R」连接两个数据集的各种 join

2022-01-21 19:13:49 浏览数 (3)

本文来自 stack overflow 上的一个帖子

base与data.table适用

SQL版

流行的dplyr

最后看看各种操作的性能吧

data.table 就是牛批!(可惜没有tidyverse易用)

测试代码:

代码语言:javascript复制
library(microbenchmark)
library(sqldf)
library(dplyr)
library(data.table)
sapply(c("sqldf","dplyr","data.table"), packageVersion, simplify=FALSE)

n = 5e7
set.seed(108)
df1 = data.frame(x=sample(n,n-1L), y1=rnorm(n-1L))
df2 = data.frame(x=sample(n,n-1L), y2=rnorm(n-1L))
dt1 = as.data.table(df1)
dt2 = as.data.table(df2)

mb = list()
# inner join
microbenchmark(times = 1L,
               base = merge(df1, df2, by = "x"),
               sqldf = sqldf("SELECT * FROM df1 INNER JOIN df2 ON df1.x = df2.x"),
               dplyr = inner_join(df1, df2, by = "x"),
               DT = dt1[dt2, nomatch=NULL, on = "x"]) -> mb$inner

# left outer join
microbenchmark(times = 1L,
               base = merge(df1, df2, by = "x", all.x = TRUE),
               sqldf = sqldf("SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.x = df2.x"),
               dplyr = left_join(df1, df2, by = c("x"="x")),
               DT = dt2[dt1, on = "x"]) -> mb$left

# right outer join
microbenchmark(times = 1L,
               base = merge(df1, df2, by = "x", all.y = TRUE),
               sqldf = sqldf("SELECT * FROM df2 LEFT OUTER JOIN df1 ON df2.x = df1.x"),
               dplyr = right_join(df1, df2, by = "x"),
               DT = dt1[dt2, on = "x"]) -> mb$right

# full outer join
microbenchmark(times = 1L,
               base = merge(df1, df2, by = "x", all = TRUE),
               dplyr = full_join(df1, df2, by = "x"),
               DT = merge(dt1, dt2, by = "x", all = TRUE)) -> mb$full

lapply(mb, print) -> nul

0 人点赞