# 混合编程.R -- mixed dplyr / data.table programming examples
## ----echo=FALSE----------------------------------------------------------
# Set the knitr chunk default so that code chunks are not evaluated.
library(knitr)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
opts_chunk$set(eval = FALSE)
## ------------------------------------------------------------------------
# Simulated benchmark table: ten million rows, one column per basic type.
nr_of_rows <- 1e7
df <- data.frame(
  # TRUE / FALSE / NA drawn with probabilities 0.85 / 0.10 / 0.05.
  Logical = sample(
    c(TRUE, FALSE, NA),
    size = nr_of_rows,
    replace = TRUE,
    prob = c(0.85, 0.1, 0.05)
  ),
  # Integers drawn from 1..100.
  Integer = sample(seq_len(100L), size = nr_of_rows, replace = TRUE),
  # Doubles drawn from a pool of 20 values, each an integer in 1..10000
  # divided by 100.
  Real = sample(
    sample(seq_len(10000L), size = 20L) / 100,
    size = nr_of_rows,
    replace = TRUE
  ),
  # Labels of the built-in UScitiesD distance object, stored as a factor.
  Factor = factor(sample(labels(UScitiesD), size = nr_of_rows, replace = TRUE))
)
## ------------------------------------------------------------------------
library(pacman)
p_load(data.table,tidyverse)
## ------------------------------------------------------------------------
# Turn the simulated data frame into a tibble.
dt <- as_tibble(df)
rm(df)  # df is no longer needed, so remove it to save memory
## ------------------------------------------------------------------------
# Stack five copies of the tibble to get a table five times as tall.
dt5 <- bind_rows(dt, dt, dt, dt, dt)
rm(dt)  # dt is no longer needed, so remove it to save memory
## ------------------------------------------------------------------------
# Report how much memory dt5 occupies, in an automatically chosen unit.
# The printing argument is `units`; the shorter `unit` only worked through
# partial argument matching.
print(object.size(dt5), units = "auto")
## ------------------------------------------------------------------------
# dplyr version: add the sum and product of Integer and Real, then summarise
# every (Logical, Integer, Factor) combination.
# `.groups = "drop"` returns an ungrouped tibble (rather than one still
# grouped by Logical and Integer) and avoids summarise()'s regrouping message.
dt5 %>%
  mutate(sum = Integer + Real, prod = Integer * Real) %>%
  group_by(Logical, Integer, Factor) %>%
  summarise(
    n = n(),
    median = median(Real),
    sum_avg = mean(sum),
    prod_avg = mean(prod),
    .groups = "drop"
  )
## ------------------------------------------------------------------------
# Time the same dplyr pipeline.
system.time(
  dt5 %>%
    mutate(sum = Integer + Real, prod = Integer * Real) %>%
    group_by(Logical, Integer, Factor) %>%
    summarise(
      n = n(),
      median = median(Real),
      sum_avg = mean(sum),
      prod_avg = mean(prod),
      .groups = "drop"
    )
)
## ------------------------------------------------------------------------
# Mixed version of the same aggregation: the derived columns are added with
# dplyr, the grouped summary is computed by data.table, and the result is
# converted back to a tibble.
dt5 %>%
  mutate(sum = Integer + Real, prod = Integer * Real) %>%
  as.data.table() %>%
  .[
    ,
    list(
      n = .N,
      median = median(Real),
      sum_avg = mean(sum),
      prod_avg = mean(prod)
    ),
    by = list(Logical, Integer, Factor)
  ] %>%
  as_tibble()
## ------------------------------------------------------------------------
# Time the same mixed pipeline.
system.time(
  dt5 %>%
    mutate(sum = Integer + Real, prod = Integer * Real) %>%
    as.data.table() %>%
    .[
      ,
      list(
        n = .N,
        median = median(Real),
        sum_avg = mean(sum),
        prod_avg = mean(prod)
      ),
      by = list(Logical, Integer, Factor)
    ] %>%
    as_tibble()
)
## ------------------------------------------------------------------------
#' Count rows per group with data.table and return a tibble
#'
#' @param df A data frame (anything `as.data.table()` accepts).
#' @param ... Unquoted grouping columns or expressions. A single argument is
#'   passed to `by` exactly as written; several arguments are combined into
#'   one `list(...)` grouping. With no arguments the total row count is
#'   returned.
#' @return A tibble with the grouping columns followed by the count column
#'   `n`.
my_count <- function(df, ...) {
  dt <- as.data.table(df)

  # Capture the grouping arguments unevaluated. Writing `by = ...` would
  # splice every argument into the `[` call positionally, so a second
  # grouping column would not be treated as part of `by`.
  groups <- substitute(list(...))
  if (length(groups) == 1L) {
    # No grouping arguments: plain row count.
    return(as_tibble(dt[, .(n = .N)]))
  }
  if (length(groups) == 2L) {
    # Single argument: keep it exactly as the caller wrote it.
    groups <- groups[[2L]]
  }

  # Build the data.table call with the captured grouping expression in the
  # `by` slot, then evaluate it in this function's frame (where `dt` lives).
  counted <- eval(substitute(dt[, .(n = .N), by = BY], list(BY = groups)))
  as_tibble(counted)
}
## ------------------------------------------------------------------------
# Benchmark dplyr's count() against my_count() on the same grouping column,
# five runs each, reported in seconds. Each benchmarked expression also
# stores its result (`a`, `b`) so the outputs can be compared afterwards.
p_load(microbenchmark)
microbenchmark(
  a <- dt5 %>% count(Integer),
  b <- dt5 %>% my_count(Integer),
  times = 5,
  unit = "s"
)
setequal(a, b)  # check whether both approaches produce the same result