From 93b5e3a93355230dbb681b00a7caf0e1348c0fb5 Mon Sep 17 00:00:00 2001
From: Hope <32947502+hope-data-science@users.noreply.github.com>
Date: Mon, 23 Sep 2024 12:58:49 +0800
Subject: [PATCH] Add files via upload
---
doc/Introduction.R | 141 +
doc/Introduction.Rmd | 202 +
doc/Introduction.html | 4834 +++++++++++++++++
docs/404.html | 275 +-
docs/LICENSE-text.html | 124 +-
docs/LICENSE.html | 124 +-
docs/articles/Introduction.html | 808 +--
.../htmlwidgets-1.6.1/htmlwidgets.js | 901 +++
.../profvis-0.3.6.9000/profvis.css | 696 +++
.../profvis-0.3.6.9000/profvis.js | 2656 +++++++++
.../profvis-0.3.6.9000/scroll.js | 69 +
.../profvis-binding-0.3.7/profvis.js | 23 +
docs/articles/index.html | 127 +-
docs/authors.html | 158 +-
docs/bootstrap-toc.css | 60 +
docs/bootstrap-toc.js | 159 +
docs/index.html | 420 +-
docs/pkgdown.css | 208 +-
docs/pkgdown.js | 9 +-
docs/pkgdown.yml | 6 +-
docs/reference/Rplot001.png | Bin 0 -> 1011 bytes
docs/reference/arrange.html | 321 +-
docs/reference/as_fst.html | 207 +-
docs/reference/complete.html | 359 +-
docs/reference/count.html | 343 +-
docs/reference/cummean.html | 169 +-
docs/reference/distinct.html | 227 +-
docs/reference/drop_delete_na.html | 285 +-
docs/reference/dummy.html | 387 +-
docs/reference/fill.html | 297 +-
docs/reference/filter.html | 257 +-
docs/reference/fst.html | 818 ++-
docs/reference/fst_io.html | 305 +-
docs/reference/group.html | 261 +-
docs/reference/index.html | 288 +-
docs/reference/join.html | 355 +-
docs/reference/lag_lead.html | 207 +-
docs/reference/long_wide.html | 505 +-
docs/reference/mutate.html | 789 ++-
docs/reference/nest.html | 729 ++-
docs/reference/nth.html | 189 +-
docs/reference/object_size.html | 171 +-
docs/reference/pull.html | 183 +-
docs/reference/read_csv.html | 178 +-
docs/reference/reexports.html | 147 +-
docs/reference/relocate.html | 293 +-
docs/reference/replace_vars.html | 351 +-
docs/reference/rowwise.html | 243 +-
docs/reference/select.html | 433 +-
docs/reference/separate.html | 246 +-
docs/reference/slice.html | 519 +-
docs/reference/summarise.html | 259 +-
docs/reference/sys_time_print.html | 217 +-
docs/reference/tidymat.html | 257 +-
docs/reference/uncount.html | 219 +-
docs/reference/unite.html | 345 +-
docs/reference/utf8_encoding.html | 205 +-
docs/sitemap.xml | 46 +
58 files changed, 15348 insertions(+), 8262 deletions(-)
create mode 100644 doc/Introduction.R
create mode 100644 doc/Introduction.Rmd
create mode 100644 doc/Introduction.html
create mode 100644 docs/articles/Introduction_files/htmlwidgets-1.6.1/htmlwidgets.js
create mode 100644 docs/articles/Introduction_files/profvis-0.3.6.9000/profvis.css
create mode 100644 docs/articles/Introduction_files/profvis-0.3.6.9000/profvis.js
create mode 100644 docs/articles/Introduction_files/profvis-0.3.6.9000/scroll.js
create mode 100644 docs/articles/Introduction_files/profvis-binding-0.3.7/profvis.js
create mode 100644 docs/bootstrap-toc.css
create mode 100644 docs/bootstrap-toc.js
create mode 100644 docs/reference/Rplot001.png
create mode 100644 docs/sitemap.xml
diff --git a/doc/Introduction.R b/doc/Introduction.R
new file mode 100644
index 0000000..53a9e4a
--- /dev/null
+++ b/doc/Introduction.R
@@ -0,0 +1,141 @@
+## ---- include = FALSE---------------------------------------------------------
+knitr::opts_chunk$set(
+ collapse = TRUE, # show each chunk's code and its output in one block
+ comment = "#>" # prefix every printed output line with "#>"
+)
+
+## ----setup--------------------------------------------------------------------
+library(tidyft)
+
+# make copies of the iris and mtcars data sets to experiment on
+copy(iris) -> a
+copy(mtcars) -> b
+
+# classes before the conversion
+class(a)
+class(b)
+
+# fetch every object named by ls() and hand each to setDT(); invisible() suppresses printing of the returned list
+lapply(ls(),get) %>%
+ lapply(setDT) %>%
+ invisible()
+
+# classes after the conversion
+class(a)
+class(b)
+
+## -----------------------------------------------------------------------------
+rm(list = ls()) # clear the environment so that the final ls() shows only `ft`
+
+library(tidyft)
+# make a large data.frame by repeating the row indices of iris 1e4 times
+iris[rep(1:nrow(iris),1e4),] -> dt
+# expected size: 1500000 rows, 5 columns
+dim(dt)
+# convert the data.frame to an fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+# inspect the fst table built from the enlarged iris data
+ft
+summary_fst(ft)
+
+# list the variables in the environment
+ls() # only `ft` remains
+
+## -----------------------------------------------------------------------------
+ft %>%
+ slice_fst(5555:6666) # get rows 5555 to 6666
+
+## -----------------------------------------------------------------------------
+
+sys_time_print({ # report how long the enclosed pipeline takes
+ res = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width) %>% # pick three columns from the fst table
+ rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ arrange(group,sl) %>%
+ filter(sl > 5) %>%
+ distinct(sl,.keep_all = TRUE) %>% # one row per distinct sl value, keeping the other columns
+ summarise(sw = max(sw),by = group) # largest sw within each group
+})
+
+res # show the summarised result
+
+
+## -----------------------------------------------------------------------------
+
+rm(list = ls()) # start the benchmark from an empty environment
+
+library(profvis)
+library(data.table)
+library(dplyr)
+library(dtplyr)
+library(tidyft)
+
+
+# make a large data.frame by repeating the row indices of iris 1e4 times
+iris[rep(1:nrow(iris),1e4),] -> dt
+# expected size: 1500000 rows, 5 columns
+dim(dt)
+# convert the data.frame to an fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+
+profvis({ # profile the four equivalent pipelines in a single run
+
+ res1 = ft %>% # approach 1: dplyr verbs applied to the table returned by select_fst()
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw))
+
+ res2 = ft %>% # approach 2: dtplyr (lazy_dt) followed by dplyr verbs
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ lazy_dt() %>%
+ dplyr::as_tibble() %>% # NOTE(review): as_tibble() appears to collect the lazy_dt here, so the verbs below presumably run as plain dplyr on a tibble, not via dtplyr - confirm intended
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw)) %>%
+ as.data.table()
+
+ res3 = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>% # approach 3: data.table syntax
+ setDT() %>%
+ .[,.SD,.SDcols = -"Petal.Length"] %>%
+ setnames(old =c("Species","Sepal.Length","Sepal.Width"),
+ new = c("group","sl","sw")) %>%
+ setorder(group,sl) %>%
+ .[sl>5] %>% unique(by = "sl") %>%
+ .[,.(sw = max(sw)),by = group]
+
+
+ res4 = ft %>% # approach 4: tidyft verbs
+ tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ tidyft::select(-Petal.Length) %>%
+ tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ tidyft::arrange(group,sl) %>%
+ tidyft::filter(sl > 5) %>%
+ tidyft::distinct(sl,.keep_all = TRUE) %>%
+ tidyft::summarise(sw = max(sw),by = group)
+
+
+})
+
+setequal(res1,res2) # pairwise comparison of the four results
+setequal(res2,res3)
+setequal(res3,res4)
+
+
+## -----------------------------------------------------------------------------
+sessionInfo() # print the R and package versions of the current session
+
diff --git a/doc/Introduction.Rmd b/doc/Introduction.Rmd
new file mode 100644
index 0000000..dc9ff94
--- /dev/null
+++ b/doc/Introduction.Rmd
@@ -0,0 +1,202 @@
+---
+title: "Fastest data operations with least memory in tidy syntax"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Introduction}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE, # show each chunk's code and its output in one block
+ comment = "#>" # prefix every printed output line with "#>"
+)
+```
+
+## Why tidyft?
+
+Before [tidyft](https://github.com/hope-data-science/tidyft), I designed a package named [tidyfst](https://github.com/hope-data-science/tidyfst). Backed by *data.table*, it is fast and convenient. At that time, I was not very interested in modification by reference, which always caused trouble in my workflow. Therefore, I used a lot of functions that make copies so as to suppress in-place replacement. However, when it comes to big data, simply making a new copy of the original data set can be time-consuming and memory-inefficient. So I tried to write some functions using modification by reference. This ended up making many functions in the *tidyfst* package inconsistent with each other. In the end, I removed all the in-place replacement functions from *tidyfst* and built a new package instead. This is how *tidyft* came into being.
+
+## The philosophy of tidyft
+
+> You cannot step into the same river twice, for other waters are continually flowing on.
+>
+> [—— Heraclitus]{style="float:right"}
+>
+
+If you do data operations on a data.table, never use it again for further analysis, because it is no longer the data you knew before. You might never figure out what has happened and what has been changed in the process. If you really want to use it again, make a copy first using `copy()`, which might take extra time and space (that's why tidyft avoids doing this all the time).
+
+Another rule is that tidyft only deals with data.tables; raw data.frames and other formats such as tibbles will not work. If you already have lots of data.frames in the environment, try the following code.
+
+```{r setup}
+library(tidyft)
+
+# make copies of the iris and mtcars data sets to experiment on
+copy(iris) -> a
+copy(mtcars) -> b
+
+# classes before the conversion
+class(a)
+class(b)
+
+# fetch every object named by ls() and hand each to setDT(); invisible() suppresses printing of the returned list
+lapply(ls(),get) %>%
+ lapply(setDT) %>%
+ invisible()
+
+# classes after the conversion
+class(a)
+class(b)
+```
+
+One last thing: the fact that modifications are carried out in place does not mean that the results cannot be shown after the operation. The data.table package returns the result invisibly, but in tidyft the final results are always printed if possible. This does not reduce computational performance.
+
+## Working with fst
+
+tidyft would not be so powerful without [fst](https://github.com/fstpackage/fst). I first introduced this workflow in [tidyfst](https://hope-data-science.github.io/tidyfst/articles/example5_fst.html). In such a workflow, you do not have to read all the data into memory; you only import the data you need when necessary. tidyft is not so convenient for in-memory operations, but it works very well (if not best) with the fst workflow. Here are some examples.
+
+```{r}
+rm(list = ls()) # clear the environment so that the final ls() shows only `ft`
+
+library(tidyft)
+# make a large data.frame by repeating the row indices of iris 1e4 times
+iris[rep(1:nrow(iris),1e4),] -> dt
+# expected size: 1500000 rows, 5 columns
+dim(dt)
+# convert the data.frame to an fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+# inspect the fst table built from the enlarged iris data
+ft
+summary_fst(ft)
+
+# list the variables in the environment
+ls() # only `ft` remains
+```
+
+`as_fst` saves any data.frame as a ".fst" file in a temporary location and parses it back as an fst table. An fst table takes little RAM, but if you want to get any part of the data.frame, you can get it in almost no time:
+
+```{r}
+ft %>%
+ slice_fst(5555:6666) # get rows 5555 to 6666
+```
+
+
+
+Besides `slice_fst`, there are other functions for subsetting the data, such as `select_fst` and `filter_fst`. Good practice is to make subsets of the data and operate on the smallest amount of data you need. For very large data sets, you may want to test on a sample of the data first (using `slice` or `select` to get several rows or columns) before running a huge operation. Now let's do a slightly more complex manipulation. We'll use `sys_time_print` to measure the running time.
+
+```{r}
+
+sys_time_print({ # report how long the enclosed pipeline takes
+ res = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width) %>% # pick three columns from the fst table
+ rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ arrange(group,sl) %>%
+ filter(sl > 5) %>%
+ distinct(sl,.keep_all = TRUE) %>% # one row per distinct sl value, keeping the other columns
+ summarise(sw = max(sw),by = group) # largest sw within each group
+})
+
+res # show the summarised result
+
+```
+
+This should be pretty fast. When we use data in an fst table, nothing is read into memory until an "_fst"-suffixed function is called, so the tidyft functions never modify the data in the fst file or fst table. That is to say, we do not have to worry about modification by reference any more. No copies made, fastest ever.
+
+## Performance
+The fst workflow can also be used with other tools, though less efficiently. Now let's compare the performance of tidyft, data.table, dtplyr and dplyr.
+
+```{r}
+
+rm(list = ls()) # start the benchmark from an empty environment
+
+library(profvis)
+library(data.table)
+library(dplyr)
+library(dtplyr)
+library(tidyft)
+
+
+# make a large data.frame by repeating the row indices of iris 1e4 times
+iris[rep(1:nrow(iris),1e4),] -> dt
+# expected size: 1500000 rows, 5 columns
+dim(dt)
+# convert the data.frame to an fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+
+profvis({ # profile the four equivalent pipelines in a single run
+
+ res1 = ft %>% # approach 1: dplyr verbs applied to the table returned by select_fst()
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw))
+
+ res2 = ft %>% # approach 2: dtplyr (lazy_dt) followed by dplyr verbs
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ lazy_dt() %>%
+ dplyr::as_tibble() %>% # NOTE(review): as_tibble() appears to collect the lazy_dt here, so the verbs below presumably run as plain dplyr on a tibble, not via dtplyr - confirm intended
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw)) %>%
+ as.data.table()
+
+ res3 = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>% # approach 3: data.table syntax
+ setDT() %>%
+ .[,.SD,.SDcols = -"Petal.Length"] %>%
+ setnames(old =c("Species","Sepal.Length","Sepal.Width"),
+ new = c("group","sl","sw")) %>%
+ setorder(group,sl) %>%
+ .[sl>5] %>% unique(by = "sl") %>%
+ .[,.(sw = max(sw)),by = group]
+
+
+ res4 = ft %>% # approach 4: tidyft verbs
+ tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ tidyft::select(-Petal.Length) %>%
+ tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ tidyft::arrange(group,sl) %>%
+ tidyft::filter(sl > 5) %>%
+ tidyft::distinct(sl,.keep_all = TRUE) %>%
+ tidyft::summarise(sw = max(sw),by = group)
+
+
+})
+
+setequal(res1,res2) # pairwise comparison of the four results
+setequal(res2,res3)
+setequal(res3,res4)
+
+```
+
+Because tidyft is based on data.table, tidyft should not perform better than data.table if you always use data.table correctly (I do use some tricks, such as never doing column selection but deleting the unselected columns instead, which is faster and more memory efficient than using `.SDcols` in data.table). However, tidyft has a very different syntax, which might be more readable, and lots of complex data.table operations have been wrapped in it. This could sometimes save you the trouble of writing the correct code yourself. I hope all the time I devoted to this work can save some of your valuable time on data operations with big datasets.
+
+
+## Session Information
+```{r}
+sessionInfo() # print the R and package versions of the current session
+```
+
+
+
+
+
+
+
+
+
+
diff --git a/doc/Introduction.html b/doc/Introduction.html
new file mode 100644
index 0000000..e3000c5
--- /dev/null
+++ b/doc/Introduction.html
@@ -0,0 +1,4834 @@
+
+
+
+
+
Before tidyft, I’ve designed a package named tidyfst. Backed by data.table, it is fast and convenient. By then, I was not so interested in modification by reference, which always causes trouble in my workflow. Therefore, I use a lot of functions to make copies so as to suppress the in place replacement. However, when it comes to big data, simply making a new copy of the original data set could be time consuming and memory inefficient. So I tried to write some functions using the feature of modification by reference. This ends up in inconsistency of many functions in the tidyfst package. In the end, I removed all the in place replacement functions in tidyfst and build a new package instead. This is how tidyft comes into being.
+++You cannot step into the same river twice, for other waters are continually flowing on.
+
—— Heraclitus
If you do data operations on a data.table, never use it again for further analysis, because it is no longer the data you knew before. You might never figure out what has happened and what has been changed in the process. If you really want to use it again, make a copy first using copy()
, which might take extra time and space (that’s why tidyft avoid doing this all the time).
Another rule is, tidyft only deals with data.table(s), the raw data.frame and other formats such as tibble could not work. If you already have lots of data.frames in the environment, try these codes.
+library(tidyft)
+#>
+#> Life's short, use R.
+#>
+#> Attaching package: 'tidyft'
+#> The following objects are masked from 'package:stats':
+#>
+#> filter, lag
+
+# make copies
+copy(iris) -> a
+copy(mtcars) -> b
+
+# before
+class(a)
+#> [1] "data.frame"
+class(b)
+#> [1] "data.frame"
+
+# convert codes
+lapply(ls(),get) %>%
+ lapply(setDT) %>%
+ invisible()
+
+# after
+class(a)
+#> [1] "data.table" "data.frame"
+class(b)
+#> [1] "data.table" "data.frame"
One last thing, while modifications are carried out in place, doesn’t mean that the results could not be showed after operation. The data.table package would return it invisibly, but in tidyft, the final results are always printed if possible. This brings no reduction to the computation performance.
+tidyft would not be so powerful without fst. I first introduce this workflow into tidyfst. In such workflow, you do not have to read all data into memory, only import the needed data when necessary. tidyft is not so convenient for in-memory operations, but it works very well (if not best) with the fst workflow. Here we’ll make some examples.
+rm(list = ls())
+
+library(tidyft)
+# make a large data.frame
+iris[rep(1:nrow(iris),1e4),] -> dt
+# size: 1500000 rows, 5 columns
+dim(dt)
+#> [1] 1500000 5
+# save as fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+# inspect the fst table of large iris
+ft
+#> <fst file>
+#> 1500000 rows, 5 columns (dt636472463ef3.fst)
+#>
+#> [38;5;248m [39m Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> [3m[38;5;248m <double> <double> <double> <double> <factor>[39m[23m
+#> [38;5;248m1 [39m 5.1 3.5 1.4 0.2 setosa
+#> [38;5;248m2 [39m 4.9 3.0 1.4 0.2 setosa
+#> [38;5;248m3 [39m 4.7 3.2 1.3 0.2 setosa
+#> [38;5;248m4 [39m 4.6 3.1 1.5 0.2 setosa
+#> [38;5;248m5 [39m 5.0 3.6 1.4 0.2 setosa
+#> [38;5;248m-- -- -- -- -- --[39m
+#> [38;5;248m1499996[39m 6.7 3.0 5.2 2.3 virginica
+#> [38;5;248m1499997[39m 6.3 2.5 5.0 1.9 virginica
+#> [38;5;248m1499998[39m 6.5 3.0 5.2 2.0 virginica
+#> [38;5;248m1499999[39m 6.2 3.4 5.4 2.3 virginica
+#> [38;5;248m1500000[39m 5.9 3.0 5.1 1.8 virginica
+summary_fst(ft)
+#> <fst file>
+#> 1500000 rows, 5 columns (dt636472463ef3.fst)
+#>
+#> * 'Sepal.Length': double
+#> * 'Sepal.Width' : double
+#> * 'Petal.Length': double
+#> * 'Petal.Width' : double
+#> * 'Species' : factor
+
+# list the variables in the environment
+ls() # only the ft exists
+#> [1] "ft"
The as_fst
could save any data.frame as “.fst” file in temporary file and parse it back as fst table. Fst table is small in RAM, but if you want to get any part of the data.frame, you can get it in almost no time:
ft %>%
+ slice_fst(5555:6666) # get 5555 to 6666 row
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 5.0 3.6 1.4 0.2 setosa
+#> 2: 5.4 3.9 1.7 0.4 setosa
+#> 3: 4.6 3.4 1.4 0.3 setosa
+#> 4: 5.0 3.4 1.5 0.2 setosa
+#> 5: 4.4 2.9 1.4 0.2 setosa
+#> ---
+#> 1108: 5.9 3.0 4.2 1.5 versicolor
+#> 1109: 6.0 2.2 4.0 1.0 versicolor
+#> 1110: 6.1 2.9 4.7 1.4 versicolor
+#> 1111: 5.6 2.9 3.6 1.3 versicolor
+#> 1112: 6.7 3.1 4.4 1.4 versicolor
Except for slice_fst
, there are also other functions for subsetting the data, such as select_fst
,filter_fst
. Good practice is: Make subsets of the data and use the least needy data to do operations. For very large data sets, you may try to do tests on a sample of the data (using slice
or select
to get several rows or columns) first before you implement a huge operation. Now let’s do a slightly complex manipulation. We’ll use sys_time_print
to measure the running time.
+sys_time_print({
+ res = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width) %>%
+ rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ arrange(group,sl) %>%
+ filter(sl > 5) %>%
+ distinct(sl,.keep_all = TRUE) %>%
+ summarise(sw = max(sw),by = group)
+})
+#> [1] "Finished in 0.480s elapsed (0.500s cpu)"
+
+res
+#> group sw
+#> <fctr> <num>
+#> 1: setosa 4.4
+#> 2: versicolor 3.3
+#> 3: virginica 3.8
This should be pretty fast. When we use data in an fst table, nothing is read into memory until an "_fst"-suffixed function is called, so the tidyft functions never modify the data in the fst file or fst table. That is to say, we do not have to worry about modification by reference any more. No copies made, fastest ever.
+The fst workflow could also be working with other tools, though less efficient. Now let’s compare the performance of tidyft, data.table, dtplyr and dplyr.
+
+rm(list = ls())
+
+library(profvis)
+library(data.table)
+library(dplyr)
+#>
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:data.table':
+#>
+#> between, first, last
+#> The following objects are masked from 'package:tidyft':
+#>
+#> add_count, anti_join, arrange, count, cummean, distinct, filter,
+#> full_join, group_by, groups, inner_join, lag, lead, left_join,
+#> mutate, nth, pull, rename, right_join, select, select_vars,
+#> semi_join, slice, summarise, transmute, ungroup
+#> The following objects are masked from 'package:stats':
+#>
+#> filter, lag
+#> The following objects are masked from 'package:base':
+#>
+#> intersect, setdiff, setequal, union
+library(dtplyr)
+library(tidyft)
+
+
+# make a large data.frame
+iris[rep(1:nrow(iris),1e4),] -> dt
+# size: 1500000 rows, 5 columns
+dim(dt)
+#> [1] 1500000 5
+# save as fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+
+profvis({
+
+ res1 = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw))
+
+ res2 = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ lazy_dt() %>%
+ dplyr::as_tibble() %>%
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw)) %>%
+ as.data.table()
+
+ res3 = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>%
+ setDT() %>%
+ .[,.SD,.SDcols = -"Petal.Length"] %>%
+ setnames(old =c("Species","Sepal.Length","Sepal.Width"),
+ new = c("group","sl","sw")) %>%
+ setorder(group,sl) %>%
+ .[sl>5] %>% unique(by = "sl") %>%
+ .[,.(sw = max(sw)),by = group]
+
+
+ res4 = ft %>%
+ tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ tidyft::select(-Petal.Length) %>%
+ tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ tidyft::arrange(group,sl) %>%
+ tidyft::filter(sl > 5) %>%
+ tidyft::distinct(sl,.keep_all = TRUE) %>%
+ tidyft::summarise(sw = max(sw),by = group)
+
+
+})
+#> Warning: You are using a dplyr method on a raw data.table, which will call the
+#> * data frame implementation, and is likely to be inefficient.
+#> *
+#> * To suppress this message, either generate a data.table translation with
+#> * `lazy_dt()` or convert to a data frame or tibble with
+#> * `as.data.frame()`/`as_tibble()`.
+
+#> Warning: You are using a dplyr method on a raw data.table, which will call the
+#> * data frame implementation, and is likely to be inefficient.
+#> *
+#> * To suppress this message, either generate a data.table translation with
+#> * `lazy_dt()` or convert to a data frame or tibble with
+#> * `as.data.frame()`/`as_tibble()`.
+setequal(res1,res2)
+#> [1] TRUE
+setequal(res2,res3)
+#> [1] TRUE
+setequal(res3,res4)
+#> [1] TRUE
Because tidyft is based on data.table, therefore, if you always use data.table correctly, then tidyft should not perform better than data.table (I do use some tricks, by never do column selection but delete the unselected ones instead, which is faster and more memory efficient than using .SDcols
in data.table). However, tidyft has a very different syntax, which might be more readable. And lots of complex operations of data.table has been wrapped in it. This could save your day to write the correct codes sometimes. I hope all my time devoted to this work could possibly save some of your valuable time on data operations of big datasets.
sessionInfo()
+#> R version 3.6.3 (2020-02-29)
+#> Platform: x86_64-w64-mingw32/x64 (64-bit)
+#> Running under: Windows 10 x64 (build 18362)
+#>
+#> Matrix products: default
+#>
+#> locale:
+#> [1] LC_COLLATE=Chinese (Simplified)_China.936
+#> [2] LC_CTYPE=Chinese (Simplified)_China.936
+#> [3] LC_MONETARY=Chinese (Simplified)_China.936
+#> [4] LC_NUMERIC=C
+#> [5] LC_TIME=Chinese (Simplified)_China.936
+#>
+#> attached base packages:
+#> [1] stats graphics grDevices utils datasets methods base
+#>
+#> other attached packages:
+#> [1] dtplyr_1.0.1 dplyr_0.8.5 data.table_1.12.8 profvis_0.3.6
+#> [5] tidyft_0.4.5
+#>
+#> loaded via a namespace (and not attached):
+#> [1] Rcpp_1.0.3 pillar_1.4.3 compiler_3.6.3 prettyunits_1.1.1
+#> [5] remotes_2.1.1 tools_3.6.3 testthat_2.3.2 digest_0.6.25
+#> [9] pkgbuild_1.0.6 pkgload_1.0.2 jsonlite_1.6.1 tibble_2.1.3
+#> [13] memoise_1.1.0 evaluate_0.14 pkgconfig_2.0.3 rlang_0.4.5
+#> [17] cli_2.0.2 rstudioapi_0.11 parallel_3.6.3 yaml_2.2.1
+#> [21] xfun_0.12 withr_2.1.2 stringr_1.4.0 knitr_1.28
+#> [25] vctrs_0.2.4 htmlwidgets_1.5.1 desc_1.2.0 fs_1.3.1
+#> [29] devtools_2.2.2 tidyselect_1.0.0 rprojroot_1.3-2 glue_1.3.2
+#> [33] R6_2.4.1 processx_3.4.2 fansi_0.4.1 rmarkdown_2.1
+#> [37] sessioninfo_1.1.1 purrr_0.3.3 callr_3.4.2 magrittr_1.5
+#> [41] backports_1.1.5 ps_1.3.2 ellipsis_0.3.0 htmltools_0.4.0
+#> [45] usethis_1.5.1 fst_0.9.0 assertthat_0.2.1 stringi_1.4.6
+#> [49] crayon_1.3.4
vignettes/Introduction.Rmd
- Introduction.Rmd
Before tidyft, I’ve designed a package named tidyfst. Backed by data.table, it is fast and convenient. By then, I was not so interested in modification by reference, which always causes trouble in my workflow. Therefore, I use a lot of functions to make copies so as to suppress the in place replacement. However, when it comes to big data, simply making a new copy of the original data set could be time consuming and memory inefficient. So I tried to write some functions using the feature of modification by reference. This ends up in inconsistency of many functions in the tidyfst package. In the end, I removed all the in place replacement functions in tidyfst and build a new package instead. This is how tidyft comes into being.
---You cannot step into the same river twice, for other waters are continually flowing on.
-
—— Heraclitus
If you try to do data operations on any data.table(s), never use it again for futher analysis, because it is not the data you know before. And you might never figure out what have happened and what has been changed in that process. If you really want to use it again, try make a copy first using copy()
, which might take extra time and space (that’s why tidyft avoid doing this all the time).
Another rule is, tidyft only deals with data.table(s), the raw data.frame and other formats such as tibble could not work. If you already have lots of data.frames in the environment, try these codes.
-library(tidyft)
-#>
-#> Life's short, use R.
-#>
-#> Attaching package: 'tidyft'
-#> The following objects are masked from 'package:stats':
-#>
-#> filter, lag
-
-# make copies
-copy(iris) -> a
-copy(mtcars) -> b
-
-# before
-class(a)
-#> [1] "data.frame"
-class(b)
-#> [1] "data.frame"
-
-# convert codes
-lapply(ls(),get) %>%
- lapply(setDT) %>%
- invisible()
-
-# after
-class(a)
-#> [1] "data.table" "data.frame"
-class(b)
-#> [1] "data.table" "data.frame"
One last thing, while modifications are carried out in place, doesn’t mean that the results could not be showed after operation. The data.table package would return it invisibly, but in tidyft, the final results are always printed if possible. This brings no reduction to the computation performance.
-tidyft would not be so powerful without fst. I first introduce this workflow into tidyfst. In such workflow, you do not have to read all data into memory, only import the needed data when necessary. tidyft is not so convenient for in-memory operations, but it works very well (if not best) with the fst workflow. Here we’ll make some examples.
-rm(list = ls())
-
-library(tidyft)
-# make a large data.frame
-iris[rep(1:nrow(iris),1e4),] -> dt
-# size: 1500000 rows, 5 columns
-dim(dt)
-#> [1] 1500000 5
-# save as fst table
-as_fst(dt) -> ft
-# remove the data.frame from RAM
-rm(dt)
-
-# inspect the fst table of large iris
-ft
-#> <fst file>
-#> 1500000 rows, 5 columns (dt3fc013b51325.fst)
-#>
-#> [38;5;248m [39m Sepal.Length Sepal.Width Petal.Length Petal.Width Species
-#> [3m[38;5;248m <double> <double> <double> <double> <factor>[39m[23m
-#> [38;5;248m1 [39m 5.1 3.5 1.4 0.2 setosa
-#> [38;5;248m2 [39m 4.9 3.0 1.4 0.2 setosa
-#> [38;5;248m3 [39m 4.7 3.2 1.3 0.2 setosa
-#> [38;5;248m4 [39m 4.6 3.1 1.5 0.2 setosa
-#> [38;5;248m5 [39m 5.0 3.6 1.4 0.2 setosa
-#> [38;5;248m-- -- -- -- -- --[39m
-#> [38;5;248m1499996[39m 6.7 3.0 5.2 2.3 virginica
-#> [38;5;248m1499997[39m 6.3 2.5 5.0 1.9 virginica
-#> [38;5;248m1499998[39m 6.5 3.0 5.2 2.0 virginica
-#> [38;5;248m1499999[39m 6.2 3.4 5.4 2.3 virginica
-#> [38;5;248m1500000[39m 5.9 3.0 5.1 1.8 virginica
-summary_fst(ft)
-#> <fst file>
-#> 1500000 rows, 5 columns (dt3fc013b51325.fst)
-#>
-#> * 'Sepal.Length': double
-#> * 'Sepal.Width' : double
-#> * 'Petal.Length': double
-#> * 'Petal.Width' : double
-#> * 'Species' : factor
-
-# list the variables in the environment
-ls() # only the ft exists
-#> [1] "ft"
The as_fst
could save any data.frame as “.fst” file in temporary file and parse it back as fst table. Fst table is small in RAM, but if you want to get any part of the data.frame, you can get it in almost no time:
ft %>%
- slice_fst(5555:6666) # get 5555 to 6666 row
-#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
-#> <num> <num> <num> <num> <fctr>
-#> 1: 5.0 3.6 1.4 0.2 setosa
-#> 2: 5.4 3.9 1.7 0.4 setosa
-#> 3: 4.6 3.4 1.4 0.3 setosa
-#> 4: 5.0 3.4 1.5 0.2 setosa
-#> 5: 4.4 2.9 1.4 0.2 setosa
-#> ---
-#> 1108: 5.9 3.0 4.2 1.5 versicolor
-#> 1109: 6.0 2.2 4.0 1.0 versicolor
-#> 1110: 6.1 2.9 4.7 1.4 versicolor
-#> 1111: 5.6 2.9 3.6 1.3 versicolor
-#> 1112: 6.7 3.1 4.4 1.4 versicolor
Except for slice_fst
, there are also other functions for subsetting the data, such as select_fst
,filter_fst
. Good practice is: Make subsets of the data and use the least needy data to do operations. For very large data sets, you may try to do tests on a sample of the data (using slice
or select
to get several rows or columns) first before you implement a huge operation. Now let’s do a slightly complex manipulation. We’ll use sys_time_print
to measure the running time.
-sys_time_print({
- res = ft %>%
- select_fst(Species,Sepal.Length,Sepal.Width) %>%
- rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
- arrange(group,sl) %>%
- filter(sl > 5) %>%
- distinct(sl,.keep_all = TRUE) %>%
- summarise(sw = max(sw),by = group)
-})
-#> [1] "Finished in 0.470s elapsed (0.500s cpu)"
-
-res
-#> group sw
-#> <fctr> <num>
-#> 1: setosa 4.4
-#> 2: versicolor 3.3
-#> 3: virginica 3.8
This should be pretty fast. Becasue when we use the data in fst table, we never get them until using the "_fst" suffix functions, so the tidyft functions never modify the data in the fst file or fst table. That is to say, we do not have to worry about the modification by reference any more. No copies made, fastest ever.
-The fst workflow could also be working with other tools, though less efficient. Now let’s compare the performance of tidyft, data.table, dtplyr and dplyr.
-
-rm(list = ls())
-
-library(profvis)
-library(data.table)
-library(dplyr)
-#>
-#> Attaching package: 'dplyr'
-#> The following objects are masked from 'package:data.table':
-#>
-#> between, first, last
-#> The following objects are masked from 'package:tidyft':
-#>
-#> add_count, anti_join, arrange, count, cummean, distinct, filter,
-#> full_join, group_by, groups, inner_join, lag, lead, left_join,
-#> mutate, nth, pull, rename, right_join, select, select_vars,
-#> semi_join, slice, summarise, transmute, ungroup
-#> The following objects are masked from 'package:stats':
-#>
-#> filter, lag
-#> The following objects are masked from 'package:base':
-#>
-#> intersect, setdiff, setequal, union
-library(dtplyr)
-library(tidyft)
-
-
-# make a large data.frame
-iris[rep(1:nrow(iris),1e4),] -> dt
-# size: 1500000 rows, 5 columns
-dim(dt)
-#> [1] 1500000 5
-# save as fst table
-as_fst(dt) -> ft
-# remove the data.frame from RAM
-rm(dt)
-
-
-profvis({
-
- res1 = ft %>%
- select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
- dplyr::select(-Petal.Length) %>%
- dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
- dplyr::arrange(group,sl) %>%
- dplyr::filter(sl > 5) %>%
- dplyr::distinct(sl,.keep_all = TRUE) %>%
- dplyr::group_by(group) %>%
- dplyr::summarise(sw = max(sw))
-
- res2 = ft %>%
- select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
- lazy_dt() %>%
- dplyr::select(-Petal.Length) %>%
- dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
- dplyr::arrange(group,sl) %>%
- dplyr::filter(sl > 5) %>%
- dplyr::distinct(sl,.keep_all = TRUE) %>%
- dplyr::group_by(group) %>%
- dplyr::summarise(sw = max(sw)) %>%
- as.data.table()
-
- res3 = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>%
- setDT() %>%
- .[,.SD,.SDcols = -"Petal.Length"] %>%
- setnames(old =c("Species","Sepal.Length","Sepal.Width"),
- new = c("group","sl","sw")) %>%
- setorder(group,sl) %>%
- .[sl>5] %>% unique(by = "sl") %>%
- .[,.(sw = max(sw)),by = group]
-
-
- res4 = ft %>%
- tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
- tidyft::select(-Petal.Length) %>%
- tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
- tidyft::arrange(group,sl) %>%
- tidyft::filter(sl > 5) %>%
- tidyft::distinct(sl,.keep_all = TRUE) %>%
- tidyft::summarise(sw = max(sw),by = group)
-
-
-})
-#> Warning: You are using a dplyr method on a raw data.table, which will call the
-#> * data frame implementation, and is likely to be inefficient.
-#> *
-#> * To suppress this message, either generate a data.table translation with
-#> * `lazy_dt()` or convert to a data frame or tibble with
-#> * `as.data.frame()`/`as_tibble()`.
-
-#> Warning: You are using a dplyr method on a raw data.table, which will call the
-#> * data frame implementation, and is likely to be inefficient.
-#> *
-#> * To suppress this message, either generate a data.table translation with
-#> * `lazy_dt()` or convert to a data frame or tibble with
-#> * `as.data.frame()`/`as_tibble()`.
-setequal(res1,res2)
-#> [1] TRUE
-setequal(res2,res3)
-#> [1] TRUE
-setequal(res3,res4)
-#> [1] TRUE
Because tidyft is based on data.table, therefore, if you always use data.table correctly, then tidyft should not perform better than data.table (I do use some tricks, by never do column selection but delete the unselected ones instead, which is faster and more memory efficient than using .SDcols
in data.table). However, tidyft has a very different syntax, which might be more readable. And lots of complex operations of data.table has been wrapped in it. This could save your day to write the correct codes sometimes. I hope all my time devoted to this work could possibly save some of your valuable time on data operations of big datasets.
sessionInfo()
-#> R version 3.6.3 (2020-02-29)
-#> Platform: x86_64-w64-mingw32/x64 (64-bit)
-#> Running under: Windows 10 x64 (build 18362)
-#>
-#> Matrix products: default
-#>
-#> locale:
-#> [1] LC_COLLATE=Chinese (Simplified)_China.936
-#> [2] LC_CTYPE=Chinese (Simplified)_China.936
-#> [3] LC_MONETARY=Chinese (Simplified)_China.936
-#> [4] LC_NUMERIC=C
-#> [5] LC_TIME=Chinese (Simplified)_China.936
-#>
-#> attached base packages:
-#> [1] stats graphics grDevices utils datasets methods base
-#>
-#> other attached packages:
-#> [1] dtplyr_1.0.1 dplyr_0.8.5 data.table_1.12.8 profvis_0.3.6
-#> [5] tidyft_0.4.5
-#>
-#> loaded via a namespace (and not attached):
-#> [1] Rcpp_1.0.3 rstudioapi_0.11 knitr_1.28 magrittr_1.5
-#> [5] MASS_7.3-51.5 tidyselect_1.0.0 R6_2.4.1 rlang_0.4.5
-#> [9] stringr_1.4.0 tools_3.6.3 parallel_3.6.3 xfun_0.12
-#> [13] htmltools_0.4.0 yaml_2.2.1 assertthat_0.2.1 digest_0.6.25
-#> [17] rprojroot_1.3-2 tibble_2.1.3 pkgdown_1.4.1 crayon_1.3.4
-#> [21] purrr_0.3.3 vctrs_0.2.4 htmlwidgets_1.5.1 fs_1.3.1
-#> [25] fst_0.9.0 glue_1.3.2 memoise_1.1.0 evaluate_0.14
-#> [29] rmarkdown_2.1 stringi_1.4.6 pillar_1.4.3 compiler_3.6.3
-#> [33] desc_1.2.0 backports_1.1.5 jsonlite_1.6.1 pkgconfig_2.0.3
Developed by Tian-Yuan Huang.
-Site built with pkgdown 1.4.1.
-vignettes/Introduction.Rmd
+ Introduction.Rmd
Before tidyft, I’ve +designed a package named tidyfst. Backed +by data.table, it is fast and convenient. By then, I was not so +interested in modification by reference, which always causes trouble in +my workflow. Therefore, I use a lot of functions to make copies so as to +suppress the in place replacement. However, when it comes to big data, +simply making a new copy of the original data set could be time +consuming and memory inefficient. So I tried to write some functions +using the feature of modification by reference. This ends up in +inconsistency of many functions in the tidyfst package. In the +end, I removed all the in place replacement functions in +tidyfst and build a new package instead. This is how +tidyft comes into being.
+++You cannot step into the same river twice, for other waters are +continually flowing on.
+
—— +Heraclitus
If you try to do data operations on any data.table(s), never use it
+again for further analysis, because it is not the data you know before.
+And you might never figure out what has happened and what has been
+changed in that process. If you really want to use it again, try make a
+copy first using copy()
, which might take extra time and
+space (that’s why tidyft avoids doing this all the time).
Another rule is, tidyft only deals with data.table(s), the raw +data.frame and other formats such as tibble could not work. If you +already have lots of data.frames in the environment, try these +codes.
+
+library(tidyft)
+#>
+#> Life's short, use R.
+#>
+#> Attaching package: 'tidyft'
+#> The following objects are masked from 'package:stats':
+#>
+#> filter, lag
+
+# make copies
+copy(iris) -> a
+copy(mtcars) -> b
+
+# before
+class(a)
+#> [1] "data.frame"
+class(b)
+#> [1] "data.frame"
+
+# convert codes
+lapply(ls(),get) %>%
+ lapply(setDT) %>%
+ invisible()
+
+# after
+class(a)
+#> [1] "data.table" "data.frame"
+class(b)
+#> [1] "data.table" "data.frame"
One last thing: although modifications are carried out in place, this doesn’t +mean that the results cannot be shown after the operation. The +data.table package would return them invisibly, but in tidyft, the final +results are always printed if possible. This brings no reduction in +computation performance.
+tidyft would not be so powerful without fst. I first introduce this +workflow into tidyfst. +In such workflow, you do not have to read all data into memory, only +import the needed data when necessary. tidyft is not so convenient for +in-memory operations, but it works very well (if not best) with the fst +workflow. Here we’ll make some examples.
+
+rm(list = ls())
+
+library(tidyft)
+# make a large data.frame
+iris[rep(1:nrow(iris),1e4),] -> dt
+# size: 1500000 rows, 5 columns
+dim(dt)
+#> [1] 1500000 5
+# save as fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+# inspect the fst table of large iris
+ft
+#> <fst file>
+#> 1500000 rows, 5 columns (dt7d0852606272.fst)
+#>
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <double> <double> <double> <double> <factor>
+#> 1 5.1 3.5 1.4 0.2 setosa
+#> 2 4.9 3.0 1.4 0.2 setosa
+#> 3 4.7 3.2 1.3 0.2 setosa
+#> 4 4.6 3.1 1.5 0.2 setosa
+#> 5 5.0 3.6 1.4 0.2 setosa
+#> -- -- -- -- -- --
+#> 1499996 6.7 3.0 5.2 2.3 virginica
+#> 1499997 6.3 2.5 5.0 1.9 virginica
+#> 1499998 6.5 3.0 5.2 2.0 virginica
+#> 1499999 6.2 3.4 5.4 2.3 virginica
+#> 1500000 5.9 3.0 5.1 1.8 virginica
+summary_fst(ft)
+#> <fst file>
+#> 1500000 rows, 5 columns (dt7d0852606272.fst)
+#>
+#> * 'Sepal.Length': double
+#> * 'Sepal.Width' : double
+#> * 'Petal.Length': double
+#> * 'Petal.Width' : double
+#> * 'Species' : factor
+
+# list the variables in the environment
+ls() # only the ft exists
+#> [1] "ft"
The as_fst
could save any data.frame as “.fst” file in
+temporary file and parse it back as fst table. Fst table is small in
+RAM, but if you want to get any part of the data.frame, you can get it
+in almost no time:
+ft %>%
+ slice_fst(5555:6666) # get 5555 to 6666 row
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 5.0 3.6 1.4 0.2 setosa
+#> 2: 5.4 3.9 1.7 0.4 setosa
+#> 3: 4.6 3.4 1.4 0.3 setosa
+#> 4: 5.0 3.4 1.5 0.2 setosa
+#> 5: 4.4 2.9 1.4 0.2 setosa
+#> ---
+#> 1108: 5.9 3.0 4.2 1.5 versicolor
+#> 1109: 6.0 2.2 4.0 1.0 versicolor
+#> 1110: 6.1 2.9 4.7 1.4 versicolor
+#> 1111: 5.6 2.9 3.6 1.3 versicolor
+#> 1112: 6.7 3.1 4.4 1.4 versicolor
Except for slice_fst
, there are also other functions for
+subsetting the data, such as
+select_fst
,filter_fst
. Good practice is: Make
+subsets of the data and use the least needy data to do operations. For
+very large data sets, you may try to do tests on a sample of the data
+(using slice
or select
to get several rows or
+columns) first before you implement a huge operation. Now let’s do a
+slightly complex manipulation. We’ll use sys_time_print
to
+measure the running time.
+
+sys_time_print({
+ res = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width) %>%
+ rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ arrange(group,sl) %>%
+ filter(sl > 5) %>%
+ distinct(sl,.keep_all = TRUE) %>%
+ summarise(sw = max(sw),by = group)
+})
+#> [1] "Finished in 0.310s elapsed (0.140s cpu)"
+
+res
+#> group sw
+#> <fctr> <num>
+#> 1: setosa 4.4
+#> 2: versicolor 3.3
+#> 3: virginica 3.8
This should be pretty fast. Because when we use the data in fst +table, we never get them until using the “_fst” suffix functions, so the +tidyft functions never modify the data in the fst file or fst table. +That is to say, we do not have to worry about the modification by +reference any more. No copies made, fastest ever.
+The fst workflow could also be working with other tools, though less +efficient. Now let’s compare the performance of tidyft, data.table, +dtplyr and dplyr.
+
+
+rm(list = ls())
+
+library(data.table)
+library(dplyr)
+#>
+#> Attaching package: 'dplyr'
+#> The following objects are masked from 'package:data.table':
+#>
+#> between, first, last
+#> The following objects are masked from 'package:tidyft':
+#>
+#> add_count, anti_join, arrange, count, cummean, distinct, filter,
+#> full_join, group_by, groups, inner_join, lag, lead, left_join,
+#> mutate, nth, pull, relocate, rename, right_join, select,
+#> select_vars, semi_join, slice, slice_head, slice_max, slice_min,
+#> slice_sample, slice_tail, summarise, transmute, ungroup
+#> The following objects are masked from 'package:stats':
+#>
+#> filter, lag
+#> The following objects are masked from 'package:base':
+#>
+#> intersect, setdiff, setequal, union
+library(dtplyr)
+library(tidyft)
+
+
+# make a large data.frame
+iris[rep(1:nrow(iris),1e4),] -> dt
+# size: 1500000 rows, 5 columns
+dim(dt)
+#> [1] 1500000 5
+# save as fst table
+as_fst(dt) -> ft
+# remove the data.frame from RAM
+rm(dt)
+
+
+bench::mark(
+
+ dplyr = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw)),
+
+ dtplyr = ft %>%
+ select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ lazy_dt() %>%
+ dplyr::select(-Petal.Length) %>%
+ dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ dplyr::arrange(group,sl) %>%
+ dplyr::filter(sl > 5) %>%
+ dplyr::distinct(sl,.keep_all = TRUE) %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarise(sw = max(sw)) %>%
+ as.data.table(),
+
+ data.table = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>%
+ setDT() %>%
+ .[,.SD,.SDcols = -"Petal.Length"] %>%
+ setnames(old =c("Species","Sepal.Length","Sepal.Width"),
+ new = c("group","sl","sw")) %>%
+ setorder(group,sl) %>%
+ .[sl>5] %>% unique(by = "sl") %>%
+ .[,.(sw = max(sw)),by = group],
+
+
+ tidyft = ft %>%
+ tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
+ tidyft::select(-Petal.Length) %>%
+ tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
+ tidyft::arrange(group,sl) %>%
+ tidyft::filter(sl > 5) %>%
+ tidyft::distinct(sl,.keep_all = TRUE) %>%
+ tidyft::summarise(sw = max(sw),by = group),
+
+ check = setequal
+)
+#> Warning: Some expressions had a GC in every iteration; so filtering is
+#> disabled.
+#> # A tibble: 4 × 6
+#> expression min median `itr/sec` mem_alloc `gc/sec`
+#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
+#> 1 dplyr 106.8ms 146ms 7.08 179MB 19.5
+#> 2 dtplyr 183ms 186ms 5.39 200MB 10.8
+#> 3 data.table 93.3ms 112ms 8.74 130MB 12.2
+#> 4 tidyft 92.8ms 97ms 9.50 102MB 12.7
Because tidyft is based on data.table, therefore, if you always use
+data.table correctly, then tidyft should not perform better than
+data.table (I do use some tricks, by never doing column selection but
+delete the unselected ones instead, which is faster and more memory
+efficient than using .SDcols
in data.table). However,
+tidyft has a very different syntax, which might be more readable. And
+lots of complex operations of data.table have been wrapped in it. This
+could save your day to write the correct codes sometimes. I hope all my
+time devoted to this work could possibly save some of your valuable time
+on data operations of big datasets.
+sessionInfo()
+#> R version 4.4.1 (2024-06-14 ucrt)
+#> Platform: x86_64-w64-mingw32/x64
+#> Running under: Windows 11 x64 (build 22631)
+#>
+#> Matrix products: default
+#>
+#>
+#> locale:
+#> [1] LC_COLLATE=Chinese (Simplified)_China.utf8
+#> [2] LC_CTYPE=Chinese (Simplified)_China.utf8
+#> [3] LC_MONETARY=Chinese (Simplified)_China.utf8
+#> [4] LC_NUMERIC=C
+#> [5] LC_TIME=Chinese (Simplified)_China.utf8
+#>
+#> time zone: Asia/Shanghai
+#> tzcode source: internal
+#>
+#> attached base packages:
+#> [1] stats graphics grDevices utils datasets methods base
+#>
+#> other attached packages:
+#> [1] dtplyr_1.3.1 dplyr_1.1.4 data.table_1.16.0 fstcore_0.9.18
+#> [5] tidyft_0.9.20
+#>
+#> loaded via a namespace (and not attached):
+#> [1] jsonlite_1.8.8 compiler_4.4.1 crayon_1.5.3 tidyselect_1.2.1
+#> [5] Rcpp_1.0.13 stringr_1.5.1 parallel_4.4.1 jquerylib_0.1.4
+#> [9] systemfonts_1.1.0 textshaping_0.4.0 yaml_2.3.9 fastmap_1.2.0
+#> [13] R6_2.5.1 generics_0.1.3 knitr_1.48 htmlwidgets_1.6.4
+#> [17] tibble_3.2.1 desc_1.4.3 bslib_0.7.0 pillar_1.9.0
+#> [21] rlang_1.1.4 utf8_1.2.4 cachem_1.1.0 stringi_1.8.4
+#> [25] xfun_0.46 fs_1.6.4 sass_0.4.9 cli_3.6.3
+#> [29] withr_3.0.1 pkgdown_2.1.0 magrittr_2.0.3 digest_0.6.36
+#> [33] rstudioapi_0.16.0 fst_0.9.8 lifecycle_1.0.4 vctrs_0.6.5
+#> [37] bench_1.1.3 evaluate_0.24.0 glue_1.7.0 ragg_1.3.2
+#> [41] profmem_0.6.0 fansi_1.0.6 rmarkdown_2.27 tools_4.4.1
+#> [45] pkgconfig_2.0.3 htmltools_0.5.8.1
Developed by Tian-Yuan Huang.
+Site built with pkgdown 2.1.0.
+Label | " + escapeHTML(label) + " |
Called from | " + escapeHTML(ref) + " |
Total time | " + (d.endTime - d.startTime) + "ms |
Memory | " + + roundOneDecimal(d.sumMemDealloc) + " / " + roundOneDecimal(d.sumMemAlloc) + + " MB |
Agg. total time | " + vis.aggLabelTimes[label] + "ms |
Call stack depth | " + d.depth + " |
DESCRIPTION
+ Huang T (2024). +tidyft: Fast and Memory Efficient Data Operations in Tidy Syntax. +R package version 0.9.20, https://hope-data-science.github.io/tidyft/, https://github.com/hope-data-science/tidyft. +
+@Manual{, + title = {tidyft: Fast and Memory Efficient Data Operations in Tidy Syntax}, + author = {Tian-Yuan Huang}, + year = {2024}, + note = {R package version 0.9.20, https://hope-data-science.github.io/tidyft/}, + url = {https://github.com/hope-data-science/tidyft}, +}
tidyft is an extension of data.table. Using modification by reference whenever possible, this toolkit is designed for big data analysis in high-performance desktop or laptop computers. The syntax of the package is similar or identical to tidyverse. It is user friendly, memory efficient and time saving. For more information, check its ancestor package tidyfst.
-This design is best for big data manipulation on out of memory data using facilities provided by fst. In such ways, you can handle the most quantity of data in the least time and space on your computer.
-This is a basic example which shows you how to solve a common problem:
-library(tidyft)
-
-# get first 5 rows of iris
-as.data.table(iris)[1:5] -> a
-#show
-a
-#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
-#> 1: 5.1 3.5 1.4 0.2 setosa
-#> 2: 4.9 3.0 1.4 0.2 setosa
-#> 3: 4.7 3.2 1.3 0.2 setosa
-#> 4: 4.6 3.1 1.5 0.2 setosa
-#> 5: 5.0 3.6 1.4 0.2 setosa
-
-# if you select
-a %>% select(1:3)
-#> Sepal.Length Sepal.Width Petal.Length
-#> 1: 5.1 3.5 1.4
-#> 2: 4.9 3.0 1.4
-#> 3: 4.7 3.2 1.3
-#> 4: 4.6 3.1 1.5
-#> 5: 5.0 3.6 1.4
-
-# you lose the unselected columns forever
-a
-#> Sepal.Length Sepal.Width Petal.Length
-#> 1: 5.1 3.5 1.4
-#> 2: 4.9 3.0 1.4
-#> 3: 4.7 3.2 1.3
-#> 4: 4.6 3.1 1.5
-#> 5: 5.0 3.6 1.4
If you still want to keep the original data, use copy()
to make a copy beforehand.
See vignettes.
-
-
-rm(list = ls())
-
-library(profvis)
-library(dplyr)
-library(tidyft)
-as.data.frame(starwars) -> starwars
-starwars[sample.int(1:nrow(starwars),1e6,replace = T),] -> starwars
-copy(starwars) -> dat1
-copy(starwars) -> dat2
-copy(starwars) -> dat3
-
-profvis({
- dat1 %>%
- dplyr::as_tibble() %>%
- dplyr::select(name, dplyr::ends_with("color")) %>%
- dplyr::arrange(hair_color,skin_color,eye_color) -> a
-
- setorder(setDT(dat2)[,.SD,.SDcols = patterns("name|color$")],
- hair_color,skin_color,eye_color) -> b
-
- dat3 %>%
- tidyft::setDT() %>%
- tidyft::select("name|color$") %>%
- tidyft::arrange(hair_color,skin_color,eye_color) -> c
-
-
-})
-
-all.equal(a,b)
-#> [1] TRUE
-all.equal(b,c)
-#> [1] TRUE
-
-
-tidyft is an extension of data.table. Using modification by reference whenever possible, this toolkit is designed for big data analysis in high-performance desktop or laptop computers. The syntax of the package is similar or identical to tidyverse. It is user friendly, memory efficient and time saving. For more information, check its ancestor package tidyfst.
+This design is best for big data manipulation on out of memory data using facilities provided by fst. In such ways, you can handle the most quantity of data in the least time and space on your computer.
+You can install the released version of tidyft via:
+
+install.packages("tidyft")
This is a basic example which shows you how to solve a common problem:
+
+library(tidyft)
+
+# get first 5 rows of iris
+as.data.table(iris)[1:5] -> a
+#show
+a
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> 1: 5.1 3.5 1.4 0.2 setosa
+#> 2: 4.9 3.0 1.4 0.2 setosa
+#> 3: 4.7 3.2 1.3 0.2 setosa
+#> 4: 4.6 3.1 1.5 0.2 setosa
+#> 5: 5.0 3.6 1.4 0.2 setosa
+
+# if you select
+a %>% select(1:3)
+#> Sepal.Length Sepal.Width Petal.Length
+#> 1: 5.1 3.5 1.4
+#> 2: 4.9 3.0 1.4
+#> 3: 4.7 3.2 1.3
+#> 4: 4.6 3.1 1.5
+#> 5: 5.0 3.6 1.4
+
+# you lose the unselected columns forever
+a
+#> Sepal.Length Sepal.Width Petal.Length
+#> 1: 5.1 3.5 1.4
+#> 2: 4.9 3.0 1.4
+#> 3: 4.7 3.2 1.3
+#> 4: 4.6 3.1 1.5
+#> 5: 5.0 3.6 1.4
If you still want to keep the original data, use copy()
to make a copy beforehand.
See vignettes.
+rm(list = ls())
+
+library(profvis)
+library(dplyr)
+library(tidyft)
+as.data.frame(starwars) -> starwars
+starwars[sample.int(nrow(starwars),1e6,replace = TRUE),] -> starwars
+copy(starwars) -> dat1
+copy(starwars) -> dat2
+copy(starwars) -> dat3
+
+profvis({
+ dat1 %>%
+ dplyr::as_tibble() %>%
+ dplyr::select(name, dplyr::ends_with("color")) %>%
+ dplyr::arrange(hair_color,skin_color,eye_color) -> a
+
+ setorder(setDT(dat2)[,.SD,.SDcols = patterns("name|color$")],
+ hair_color,skin_color,eye_color) -> b
+
+ dat3 %>%
+ tidyft::setDT() %>%
+ tidyft::select("name|color$") %>%
+ tidyft::arrange(hair_color,skin_color,eye_color) -> c
+
+})
+
+all.equal(a,b)
+#> [1] TRUE
+all.equal(b,c)
+#> [1] TRUE
+
+Developed by Tian-Yuan Huang.
-Site built with pkgdown 1.4.1.
-Developed by Tian-Yuan Huang.
+Site built with pkgdown 2.1.0.
+Analogous function for arrange
in dplyr.
arrange(.data, ..., cols = NULL, order = 1L)- -
.data | -data.frame |
-
---|---|
... | -Arrange by what group? Minus symbol means arrange by -descending order. |
-
cols | -For
+
+
+
|
-
order | -For |
-
...
. Defaults to NULL
.
- A data.table
-For set_arrange
only. An integer vector with only possible
+values of 1 and -1, corresponding to ascending and descending order.
+Defaults to 1.
A data.table
+Once arranged, the order of entries would be changed forever.
--#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 4.3 3.0 1.1 0.1 setosa -#> 2: 4.4 2.9 1.4 0.2 setosa -#> 3: 4.4 3.0 1.3 0.2 setosa -#> 4: 4.4 3.2 1.3 0.2 setosa -#> 5: 4.5 2.3 1.3 0.3 setosa -#> --- -#> 146: 7.7 3.8 6.7 2.2 virginica -#> 147: 7.7 2.6 6.9 2.3 virginica -#> 148: 7.7 2.8 6.7 2.0 virginica -#> 149: 7.7 3.0 6.1 2.3 virginica -#> 150: 7.9 3.8 6.4 2.0 virginicaa#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 4.3 3.0 1.1 0.1 setosa -#> 2: 4.4 2.9 1.4 0.2 setosa -#> 3: 4.4 3.0 1.3 0.2 setosa -#> 4: 4.4 3.2 1.3 0.2 setosa -#> 5: 4.5 2.3 1.3 0.3 setosa -#> --- -#> 146: 7.7 3.8 6.7 2.2 virginica -#> 147: 7.7 2.6 6.9 2.3 virginica -#> 148: 7.7 2.8 6.7 2.0 virginica -#> 149: 7.7 3.0 6.1 2.3 virginica -#> 150: 7.9 3.8 6.4 2.0 virginica#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 5.0 2.0 3.5 1.0 versicolor -#> 2: 6.0 2.2 4.0 1.0 versicolor -#> 3: 6.2 2.2 4.5 1.5 versicolor -#> 4: 6.0 2.2 5.0 1.5 virginica -#> 5: 4.5 2.3 1.3 0.3 setosa -#> --- -#> 146: 5.4 3.9 1.7 0.4 setosa -#> 147: 5.8 4.0 1.2 0.2 setosa -#> 148: 5.2 4.1 1.5 0.1 setosa -#> 149: 5.5 4.2 1.4 0.2 setosa -#> 150: 5.7 4.4 1.5 0.4 setosaa#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 5.0 2.0 3.5 1.0 versicolor -#> 2: 6.0 2.2 4.0 1.0 versicolor -#> 3: 6.2 2.2 4.5 1.5 versicolor -#> 4: 6.0 2.2 5.0 1.5 virginica -#> 5: 4.5 2.3 1.3 0.3 setosa -#> --- -#> 146: 5.4 3.9 1.7 0.4 setosa -#> 147: 5.8 4.0 1.2 0.2 setosa -#> 148: 5.2 4.1 1.5 0.1 setosa -#> 149: 5.5 4.2 1.4 0.2 setosa -#> 150: 5.7 4.4 1.5 0.4 setosa-
+a = as.data.table(iris)
+a %>% arrange(Sepal.Length)
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 4.3 3.0 1.1 0.1 setosa
+#> 2: 4.4 2.9 1.4 0.2 setosa
+#> 3: 4.4 3.0 1.3 0.2 setosa
+#> 4: 4.4 3.2 1.3 0.2 setosa
+#> 5: 4.5 2.3 1.3 0.3 setosa
+#> ---
+#> 146: 7.7 3.8 6.7 2.2 virginica
+#> 147: 7.7 2.6 6.9 2.3 virginica
+#> 148: 7.7 2.8 6.7 2.0 virginica
+#> 149: 7.7 3.0 6.1 2.3 virginica
+#> 150: 7.9 3.8 6.4 2.0 virginica
+a
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 4.3 3.0 1.1 0.1 setosa
+#> 2: 4.4 2.9 1.4 0.2 setosa
+#> 3: 4.4 3.0 1.3 0.2 setosa
+#> 4: 4.4 3.2 1.3 0.2 setosa
+#> 5: 4.5 2.3 1.3 0.3 setosa
+#> ---
+#> 146: 7.7 3.8 6.7 2.2 virginica
+#> 147: 7.7 2.6 6.9 2.3 virginica
+#> 148: 7.7 2.8 6.7 2.0 virginica
+#> 149: 7.7 3.0 6.1 2.3 virginica
+#> 150: 7.9 3.8 6.4 2.0 virginica
+a %>% arrange(cols = c("Sepal.Width","Petal.Length"))
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 5.0 2.0 3.5 1.0 versicolor
+#> 2: 6.0 2.2 4.0 1.0 versicolor
+#> 3: 6.2 2.2 4.5 1.5 versicolor
+#> 4: 6.0 2.2 5.0 1.5 virginica
+#> 5: 4.5 2.3 1.3 0.3 setosa
+#> ---
+#> 146: 5.4 3.9 1.7 0.4 setosa
+#> 147: 5.8 4.0 1.2 0.2 setosa
+#> 148: 5.2 4.1 1.5 0.1 setosa
+#> 149: 5.5 4.2 1.4 0.2 setosa
+#> 150: 5.7 4.4 1.5 0.4 setosa
+a
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 5.0 2.0 3.5 1.0 versicolor
+#> 2: 6.0 2.2 4.0 1.0 versicolor
+#> 3: 6.2 2.2 4.5 1.5 versicolor
+#> 4: 6.0 2.2 5.0 1.5 virginica
+#> 5: 4.5 2.3 1.3 0.3 setosa
+#> ---
+#> 146: 5.4 3.9 1.7 0.4 setosa
+#> 147: 5.8 4.0 1.2 0.2 setosa
+#> 148: 5.2 4.1 1.5 0.1 setosa
+#> 149: 5.5 4.2 1.4 0.2 setosa
+#> 150: 5.7 4.4 1.5 0.4 setosa
+
+
as_fst(.data)+
as_fst(.data)
.data | -A data.frame |
-
---|
An object of class fst_table
A data.frame
--# \donttest{ - iris %>% - as_fst() -> iris_fst - iris_fst#> <fst file> -#> 150 rows, 5 columns (.a5445415a58.fst) -#> -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <double> <double> <double> <double> <factor> -#> 1 5.1 3.5 1.4 0.2 setosa -#> 2 4.9 3.0 1.4 0.2 setosa -#> 3 4.7 3.2 1.3 0.2 setosa -#> 4 4.6 3.1 1.5 0.2 setosa -#> 5 5.0 3.6 1.4 0.2 setosa -#> -- -- -- -- -- -- -#> 146 6.7 3.0 5.2 2.3 virginica -#> 147 6.3 2.5 5.0 1.9 virginica -#> 148 6.5 3.0 5.2 2.0 virginica -#> 149 6.2 3.4 5.4 2.3 virginica -#> 150 5.9 3.0 5.1 1.8 virginica# } -
An object of class fst_table
+# \donttest{
+ iris %>%
+ as_fst() -> iris_fst
+ iris_fst
+#> <fst file>
+#> 150 rows, 5 columns (.27606c92540e.fst)
+#>
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <double> <double> <double> <double> <factor>
+#> 1 5.1 3.5 1.4 0.2 setosa
+#> 2 4.9 3.0 1.4 0.2 setosa
+#> 3 4.7 3.2 1.3 0.2 setosa
+#> 4 4.6 3.1 1.5 0.2 setosa
+#> 5 5.0 3.6 1.4 0.2 setosa
+#> -- -- -- -- -- --
+#> 146 6.7 3.0 5.2 2.3 virginica
+#> 147 6.3 2.5 5.0 1.9 virginica
+#> 148 6.5 3.0 5.2 2.0 virginica
+#> 149 6.2 3.4 5.4 2.3 virginica
+#> 150 5.9 3.0 5.1 1.8 virginica
+# }
+
R/complete.R
+ Source: R/complete.R
complete.Rd
complete
function in tidyr.
complete(.data, ..., fill = NA)+
complete(.data, ..., fill = NA)
.data | -data.frame |
-
---|---|
... | -Specification of columns to expand.The selection of columns is
-supported by the flexible
|
-
fill | -Atomic value to fill into the missing cell, default uses |
-
data.table
-Atomic value to fill into the missing cell, default uses NA
.
data.table
+When the provided columns with additional data are of different lengths, all the unique combinations would be returned. This operation should be used only on unique entries, and it will always return the unique entries.
If you supply fill parameter, these values will also replace existing explicit missing values in the data set.
--df <- data.table( - group = c(1:2, 1), - item_id = c(1:2, 2), - item_name = c("a", "b", "b"), - value1 = 1:3, - value2 = 4:6 -) - -df %>% complete(item_id,item_name)#> Key: <item_id, item_name> -#> item_id item_name group value1 value2 -#> <num> <char> <num> <int> <int> -#> 1: 1 a 1 1 4 -#> 2: 1 b NA NA NA -#> 3: 2 a NA NA NA -#> 4: 2 b 2 2 5 -#> 5: 2 b 1 3 6df %>% complete(item_id,item_name,fill = 0)#> Key: <item_id, item_name> -#> item_id item_name group value1 value2 -#> <num> <char> <num> <int> <int> -#> 1: 1 a 1 1 4 -#> 2: 1 b 0 0 0 -#> 3: 2 a 0 0 0 -#> 4: 2 b 2 2 5 -#> 5: 2 b 1 3 6df %>% complete("item")#> Key: <item_id, item_name> -#> item_id item_name group value1 value2 -#> <num> <char> <num> <int> <int> -#> 1: 1 a 1 1 4 -#> 2: 1 b NA NA NA -#> 3: 2 a NA NA NA -#> 4: 2 b 2 2 5 -#> 5: 2 b 1 3 6df %>% complete(item_id=1:3)#> Key: <item_id> -#> item_id group item_name value1 value2 -#> <int> <num> <char> <int> <int> -#> 1: 1 1 a 1 4 -#> 2: 2 2 b 2 5 -#> 3: 2 1 b 3 6 -#> 4: 3 NA <NA> NA NAdf %>% complete(item_id=1:3,group=1:2)#> Key: <item_id, group> -#> item_id group item_name value1 value2 -#> <int> <int> <char> <int> <int> -#> 1: 1 1 a 1 4 -#> 2: 1 2 <NA> NA NA -#> 3: 2 1 b 3 6 -#> 4: 2 2 b 2 5 -#> 5: 3 1 <NA> NA NA -#> 6: 3 2 <NA> NA NA#> Key: <item_id, group, item_name> -#> item_id group item_name value1 value2 -#> <int> <int> <char> <int> <int> -#> 1: 1 1 a 1 4 -#> 2: 1 1 b NA NA -#> 3: 1 1 c NA NA -#> 4: 1 2 a NA NA -#> 5: 1 2 b NA NA -#> 6: 1 2 c NA NA -#> 7: 1 3 a NA NA -#> 8: 1 3 b NA NA -#> 9: 1 3 c NA NA -#> 10: 2 1 a NA NA -#> 11: 2 1 b 3 6 -#> 12: 2 1 c NA NA -#> 13: 2 2 a NA NA -#> 14: 2 2 b 2 5 -#> 15: 2 2 c NA NA -#> 16: 2 3 a NA NA -#> 17: 2 3 b NA NA -#> 18: 2 3 c NA NA -#> 19: 3 1 a NA NA -#> 20: 3 1 b NA NA -#> 21: 3 1 c NA NA -#> 22: 3 2 a NA NA -#> 23: 3 2 b NA NA -#> 24: 3 2 c NA NA -#> 25: 3 3 a NA NA -#> 26: 3 3 b NA NA -#> 27: 3 3 c NA NA -#> item_id group item_name value1 value2-
df <- data.table(
+ group = c(1:2, 1),
+ item_id = c(1:2, 2),
+ item_name = c("a", "b", "b"),
+ value1 = 1:3,
+ value2 = 4:6
+)
+
+df %>% complete(item_id,item_name)
+#> Key: <item_id, item_name>
+#> item_id item_name group value1 value2
+#> <num> <char> <num> <int> <int>
+#> 1: 1 a 1 1 4
+#> 2: 1 b NA NA NA
+#> 3: 2 a NA NA NA
+#> 4: 2 b 2 2 5
+#> 5: 2 b 1 3 6
+df %>% complete(item_id,item_name,fill = 0)
+#> Key: <item_id, item_name>
+#> item_id item_name group value1 value2
+#> <num> <char> <num> <int> <int>
+#> 1: 1 a 1 1 4
+#> 2: 1 b 0 0 0
+#> 3: 2 a 0 0 0
+#> 4: 2 b 2 2 5
+#> 5: 2 b 1 3 6
+df %>% complete("item")
+#> Key: <item_id, item_name>
+#> item_id item_name group value1 value2
+#> <num> <char> <num> <int> <int>
+#> 1: 1 a 1 1 4
+#> 2: 1 b NA NA NA
+#> 3: 2 a NA NA NA
+#> 4: 2 b 2 2 5
+#> 5: 2 b 1 3 6
+df %>% complete(item_id=1:3)
+#> Key: <item_id>
+#> item_id group item_name value1 value2
+#> <int> <num> <char> <int> <int>
+#> 1: 1 1 a 1 4
+#> 2: 2 2 b 2 5
+#> 3: 2 1 b 3 6
+#> 4: 3 NA <NA> NA NA
+df %>% complete(item_id=1:3,group=1:2)
+#> Key: <item_id, group>
+#> item_id group item_name value1 value2
+#> <int> <int> <char> <int> <int>
+#> 1: 1 1 a 1 4
+#> 2: 1 2 <NA> NA NA
+#> 3: 2 1 b 3 6
+#> 4: 2 2 b 2 5
+#> 5: 3 1 <NA> NA NA
+#> 6: 3 2 <NA> NA NA
+df %>% complete(item_id=1:3,group=1:3,item_name=c("a","b","c"))
+#> Key: <item_id, group, item_name>
+#> item_id group item_name value1 value2
+#> <int> <int> <char> <int> <int>
+#> 1: 1 1 a 1 4
+#> 2: 1 1 b NA NA
+#> 3: 1 1 c NA NA
+#> 4: 1 2 a NA NA
+#> 5: 1 2 b NA NA
+#> 6: 1 2 c NA NA
+#> 7: 1 3 a NA NA
+#> 8: 1 3 b NA NA
+#> 9: 1 3 c NA NA
+#> 10: 2 1 a NA NA
+#> 11: 2 1 b 3 6
+#> 12: 2 1 c NA NA
+#> 13: 2 2 a NA NA
+#> 14: 2 2 b 2 5
+#> 15: 2 2 c NA NA
+#> 16: 2 3 a NA NA
+#> 17: 2 3 b NA NA
+#> 18: 2 3 c NA NA
+#> 19: 3 1 a NA NA
+#> 20: 3 1 b NA NA
+#> 21: 3 1 c NA NA
+#> 22: 3 2 a NA NA
+#> 23: 3 2 b NA NA
+#> 24: 3 2 c NA NA
+#> 25: 3 3 a NA NA
+#> 26: 3 3 b NA NA
+#> 27: 3 3 c NA NA
+#> item_id group item_name value1 value2
+
+
Analogous function for count
and add_count
in dplyr.
count(.data, ..., sort = FALSE, name = "n") - -add_count(.data, ..., name = "n")- -
.data | -data.table |
-
---|---|
... | -variables to group by. |
-
sort | -logical. If TRUE result will be sorted in desending order by resulting variable. |
-
name | -character. Name of resulting variable. Default uses "n". |
-
count(.data, ..., sort = FALSE, name = "n")
+
+add_count(.data, ..., name = "n")
data.table
--#> cyl n -#> <num> <int> -#> 1: 6 7 -#> 2: 4 11 -#> 3: 8 14count(a,cyl,sort = TRUE)#> cyl n -#> <num> <int> -#> 1: 8 14 -#> 2: 4 11 -#> 3: 6 7a#> mpg cyl disp hp drat wt qsec vs am gear carb -#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> -#> 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 -#> 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 -#> 3: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 -#> 4: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 -#> 5: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 -#> 6: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 -#> 7: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 -#> 8: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 -#> 9: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 -#> 10: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 -#> 11: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 -#> 12: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 -#> 13: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 -#> 14: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 -#> 15: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 -#> 16: 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 -#> 17: 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 -#> 18: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 -#> 19: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 -#> 20: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 -#> 21: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 -#> 22: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 -#> 23: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 -#> 24: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 -#> 25: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 -#> 26: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 -#> 27: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 -#> 28: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 -#> 29: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 -#> 30: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 -#> 31: 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 -#> 32: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 -#> mpg cyl disp hp drat wt qsec vs am gear carb#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species N -#> <num> <num> <num> <num> <fctr> <int> -#> 1: 
5.1 3.5 1.4 0.2 setosa 50 -#> 2: 4.9 3.0 1.4 0.2 setosa 50 -#> 3: 4.7 3.2 1.3 0.2 setosa 50 -#> 4: 4.6 3.1 1.5 0.2 setosa 50 -#> 5: 5.0 3.6 1.4 0.2 setosa 50 -#> --- -#> 146: 6.7 3.0 5.2 2.3 virginica 50 -#> 147: 6.3 2.5 5.0 1.9 virginica 50 -#> 148: 6.5 3.0 5.2 2.0 virginica 50 -#> 149: 6.2 3.4 5.4 2.3 virginica 50 -#> 150: 5.9 3.0 5.1 1.8 virginica 50b#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species N -#> <num> <num> <num> <num> <fctr> <int> -#> 1: 5.1 3.5 1.4 0.2 setosa 50 -#> 2: 4.9 3.0 1.4 0.2 setosa 50 -#> 3: 4.7 3.2 1.3 0.2 setosa 50 -#> 4: 4.6 3.1 1.5 0.2 setosa 50 -#> 5: 5.0 3.6 1.4 0.2 setosa 50 -#> --- -#> 146: 6.7 3.0 5.2 2.3 virginica 50 -#> 147: 6.3 2.5 5.0 1.9 virginica 50 -#> 148: 6.5 3.0 5.2 2.0 virginica 50 -#> 149: 6.2 3.4 5.4 2.3 virginica 50 -#> 150: 5.9 3.0 5.1 1.8 virginica 50
data.table
+a = as.data.table(mtcars)
+count(a,cyl)
+#> cyl n
+#> <num> <int>
+#> 1: 6 7
+#> 2: 4 11
+#> 3: 8 14
+count(a,cyl,sort = TRUE)
+#> cyl n
+#> <num> <int>
+#> 1: 8 14
+#> 2: 4 11
+#> 3: 6 7
+a
+#> mpg cyl disp hp drat wt qsec vs am gear carb
+#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
+#> 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
+#> 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
+#> 3: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
+#> 4: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
+#> 5: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
+#> 6: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
+#> 7: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
+#> 8: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
+#> 9: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
+#> 10: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
+#> 11: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
+#> 12: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
+#> 13: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
+#> 14: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
+#> 15: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
+#> 16: 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
+#> 17: 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
+#> 18: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
+#> 19: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
+#> 20: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
+#> 21: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
+#> 22: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
+#> 23: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
+#> 24: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
+#> 25: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
+#> 26: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
+#> 27: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
+#> 28: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
+#> 29: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
+#> 30: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
+#> 31: 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
+#> 32: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
+#> mpg cyl disp hp drat wt qsec vs am gear carb
+
+b = as.data.table(iris)
+b %>% add_count(Species,name = "N")
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species N
+#> <num> <num> <num> <num> <fctr> <int>
+#> 1: 5.1 3.5 1.4 0.2 setosa 50
+#> 2: 4.9 3.0 1.4 0.2 setosa 50
+#> 3: 4.7 3.2 1.3 0.2 setosa 50
+#> 4: 4.6 3.1 1.5 0.2 setosa 50
+#> 5: 5.0 3.6 1.4 0.2 setosa 50
+#> ---
+#> 146: 6.7 3.0 5.2 2.3 virginica 50
+#> 147: 6.3 2.5 5.0 1.9 virginica 50
+#> 148: 6.5 3.0 5.2 2.0 virginica 50
+#> 149: 6.2 3.4 5.4 2.3 virginica 50
+#> 150: 5.9 3.0 5.1 1.8 virginica 50
+b
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species N
+#> <num> <num> <num> <num> <fctr> <int>
+#> 1: 5.1 3.5 1.4 0.2 setosa 50
+#> 2: 4.9 3.0 1.4 0.2 setosa 50
+#> 3: 4.7 3.2 1.3 0.2 setosa 50
+#> 4: 4.6 3.1 1.5 0.2 setosa 50
+#> 5: 5.0 3.6 1.4 0.2 setosa 50
+#> ---
+#> 146: 6.7 3.0 5.2 2.3 virginica 50
+#> 147: 6.3 2.5 5.0 1.9 virginica 50
+#> 148: 6.5 3.0 5.2 2.0 virginica 50
+#> 149: 6.2 3.4 5.4 2.3 virginica 50
+#> 150: 5.9 3.0 5.1 1.8 virginica 50
+
Returns a vector whose elements are the cumulative mean of the elements of the argument.
cummean(x)+
cummean(x)
x | -a numeric or complex object, -or an object that can be coerced to one of these. |
-
---|
A numeric vector
+cummean(1:10)
+#> [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
+
+
Analogous function for distinct
in dplyr
distinct(.data, ..., .keep_all = FALSE)- -
.data | -data.table |
-
---|---|
... | -Optional variables to use when determining uniqueness. +
+
+
+
|
-
.keep_all | -If |
-
If TRUE
, keep all variables in data.table. If a combination of ... is not distinct,
+this keeps the first row of values.
data.table
--#> Species -#> <fctr> -#> 1: setosa -#> 2: versicolor -#> 3: virginicab %>% distinct(cyl,vs,.keep_all = TRUE)#> mpg cyl disp hp drat wt qsec vs am gear carb -#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> -#> 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 -#> 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 -#> 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 -#> 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 -#> 5: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2- -
+ a = as.data.table(iris)
+ b = as.data.table(mtcars)
+ a %>% distinct(Species)
+#> Species
+#> <fctr>
+#> 1: setosa
+#> 2: versicolor
+#> 3: virginica
+ b %>% distinct(cyl,vs,.keep_all = TRUE)
+#> mpg cyl disp hp drat wt qsec vs am gear carb
+#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
+#> 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
+#> 2: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
+#> 3: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
+#> 4: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
+#> 5: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
+
+
+
R/drop_delete_na.R
+ Source: R/drop_delete_na.R
drop_delete_na.Rd
delete_na
deletes rows or columns with too many NAs.
drop_na(.data, ...) - -delete_na(.data, MARGIN, n)- -
.data | -A data.table |
-
---|---|
... | -Colunms to be dropped or deleted. |
-
MARGIN | -1 or 2. 1 for deleting rows, 2 for deleting columns. |
-
n | -If number (proportion) of NAs is larger than or equal to "n", -the columns/rows would be deleted. When smaller than 1, use as proportion. -When larger or equal to 1, use as number. |
-
drop_na(.data, ...)
+
+delete_na(.data, MARGIN, n)
A data.table
--#> x y z -#> <num> <num> <lgcl> -#> 1: 1 NA NA -#> 2: 2 NA NA -#> 3: NA 4 NA -#> 4: 3 5 NAx %>% delete_na(2,0.75)#> x y -#> <num> <num> -#> 1: 1 NA -#> 2: 2 NA -#> 3: NA 4 -#> 4: 3 5#> x -#> <num> -#> 1: 1 -#> 2: 2 -#> 3: NA -#> 4: 3#> Null data.table (0 rows and 0 cols)#> x -#> <num> -#> 1: 1 -#> 2: 2 -#> 3: NA -#> 4: 3#> x y z -#> <num> <num> <lgcl> -#> 1: 3 5 NA#> x y z -#> <num> <num> <lgcl> -#> 1: 3 5 NA- -
A data.table
Columns to be dropped or deleted.
1 or 2. 1 for deleting rows, 2 for deleting columns.
If number (proportion) of NAs is larger than or equal to "n", +the columns/rows would be deleted. When smaller than 1, use as proportion. +When larger or equal to 1, use as number.
A data.table
+x = data.table(x = c(1, 2, NA, 3), y = c(NA, NA, 4, 5),z = rep(NA,4))
+x
+#> x y z
+#> <num> <num> <lgcl>
+#> 1: 1 NA NA
+#> 2: 2 NA NA
+#> 3: NA 4 NA
+#> 4: 3 5 NA
+x %>% delete_na(2,0.75)
+#> x y
+#> <num> <num>
+#> 1: 1 NA
+#> 2: 2 NA
+#> 3: NA 4
+#> 4: 3 5
+
+x = data.table(x = c(1, 2, NA, 3), y = c(NA, NA, 4, 5),z = rep(NA,4))
+x %>% delete_na(2,0.5)
+#> x
+#> <num>
+#> 1: 1
+#> 2: 2
+#> 3: NA
+#> 4: 3
+
+x = data.table(x = c(1, 2, NA, 3), y = c(NA, NA, 4, 5),z = rep(NA,4))
+x %>% delete_na(2,0.24)
+#> Null data.table (0 rows and 0 cols)
+
+x = data.table(x = c(1, 2, NA, 3), y = c(NA, NA, 4, 5),z = rep(NA,4))
+x %>% delete_na(2,2)
+#> x
+#> <num>
+#> 1: 1
+#> 2: 2
+#> 3: NA
+#> 4: 3
+
+x = data.table(x = c(1, 2, NA, 3), y = c(NA, NA, 4, 5),z = rep(NA,4))
+x %>% delete_na(1,0.6)
+#> x y z
+#> <num> <num> <lgcl>
+#> 1: 3 5 NA
+x = data.table(x = c(1, 2, NA, 3), y = c(NA, NA, 4, 5),z = rep(NA,4))
+x %>% delete_na(1,2)
+#> x y z
+#> <num> <num> <lgcl>
+#> 1: 3 5 NA
+
+
+
dummy(.data, ..., longname = TRUE)+
dummy(.data, ..., longname = TRUE)
.data | -data.frame |
-
---|---|
... | -Columns you want to create dummy variables from. -Very flexible, find in the examples. |
-
longname | -logical. Should the output column labeled with the
-original column name? Default uses |
-
data.table
+If no columns are provided, the original data frame will be returned.
This function is inspired by fastDummies package, but provides
-simple and precise usage, whereas fastDummies::dummy_cols
provides more
+simple and precise usage, whereas fastDummies::dummy_cols
provides more
features for statistical usage.
-#> Key: <Sepal.Length, Sepal.Width, Petal.Length, Petal.Width> -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa -#> <num> <num> <num> <num> <num> -#> 1: 4.3 3.0 1.1 0.1 1 -#> 2: 4.4 2.9 1.4 0.2 1 -#> 3: 4.4 3.0 1.3 0.2 1 -#> 4: 4.4 3.2 1.3 0.2 1 -#> 5: 4.5 2.3 1.3 0.3 1 -#> --- -#> 146: 7.7 2.6 6.9 2.3 0 -#> 147: 7.7 2.8 6.7 2.0 0 -#> 148: 7.7 3.0 6.1 2.3 0 -#> 149: 7.7 3.8 6.7 2.2 0 -#> 150: 7.9 3.8 6.4 2.0 0 -#> Species_versicolor Species_virginica -#> <num> <num> -#> 1: 0 0 -#> 2: 0 0 -#> 3: 0 0 -#> 4: 0 0 -#> 5: 0 0 -#> --- -#> 146: 0 1 -#> 147: 0 1 -#> 148: 0 1 -#> 149: 0 1 -#> 150: 0 1iris %>% dummy(Species,longname = FALSE)#> Key: <Sepal.Length, Sepal.Width, Petal.Length, Petal.Width> -#> Sepal.Length Sepal.Width Petal.Length Petal.Width setosa versicolor -#> <num> <num> <num> <num> <num> <num> -#> 1: 4.3 3.0 1.1 0.1 1 0 -#> 2: 4.4 2.9 1.4 0.2 1 0 -#> 3: 4.4 3.0 1.3 0.2 1 0 -#> 4: 4.4 3.2 1.3 0.2 1 0 -#> 5: 4.5 2.3 1.3 0.3 1 0 -#> --- -#> 146: 7.7 2.6 6.9 2.3 0 0 -#> 147: 7.7 2.8 6.7 2.0 0 0 -#> 148: 7.7 3.0 6.1 2.3 0 0 -#> 149: 7.7 3.8 6.7 2.2 0 0 -#> 150: 7.9 3.8 6.4 2.0 0 0 -#> virginica -#> <num> -#> 1: 0 -#> 2: 0 -#> 3: 0 -#> 4: 0 -#> 5: 0 -#> --- -#> 146: 1 -#> 147: 1 -#> 148: 1 -#> 149: 1 -#> 150: 1#> Key: <mpg, cyl, disp, hp, drat, wt, qsec, gear, carb, vs_0, vs_1> -#> mpg cyl disp hp drat wt qsec gear carb vs_0 vs_1 am_0 -#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> -#> 1: 18.1 6 225 105 2.76 3.460 20.22 3 1 0 1 1 -#> 2: 18.7 8 360 175 3.15 3.440 17.02 3 2 1 0 1 -#> 3: 21.0 6 160 110 3.90 2.620 16.46 4 4 1 0 0 -#> 4: 21.0 6 160 110 3.90 2.875 17.02 4 4 1 0 0 -#> 5: 21.4 6 258 110 3.08 3.215 19.44 3 1 0 1 1 -#> 6: 22.8 4 108 93 3.85 2.320 18.61 4 1 0 1 0 -#> am_1 -#> <num> -#> 1: 0 -#> 2: 0 -#> 3: 1 -#> 4: 1 -#> 5: 0 -#> 6: 1#> Key: <mpg, disp, hp, drat, wt, qsec, vs, am, carb, cyl_4, cyl_6, cyl_8> -#> mpg disp hp drat wt qsec vs am carb cyl_4 cyl_6 cyl_8 -#> <num> <num> <num> <num> <num> <num> <num> 
<num> <num> <num> <num> <num> -#> 1: 18.1 225 105 2.76 3.460 20.22 1 0 1 0 1 0 -#> 2: 18.7 360 175 3.15 3.440 17.02 0 0 2 0 0 1 -#> 3: 21.0 160 110 3.90 2.620 16.46 0 1 4 0 1 0 -#> 4: 21.0 160 110 3.90 2.875 17.02 0 1 4 0 1 0 -#> 5: 21.4 258 110 3.08 3.215 19.44 1 0 1 0 1 0 -#> 6: 22.8 108 93 3.85 2.320 18.61 1 1 1 1 0 0 -#> gear_3 gear_4 -#> <num> <num> -#> 1: 1 0 -#> 2: 1 0 -#> 3: 0 1 -#> 4: 0 1 -#> 5: 1 0 -#> 6: 0 1-
dummy_cols
+iris = as.data.table(iris)
+iris %>% dummy(Species)
+#> Key: <Sepal.Length, Sepal.Width, Petal.Length, Petal.Width>
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
+#> <num> <num> <num> <num> <num>
+#> 1: 4.3 3.0 1.1 0.1 1
+#> 2: 4.4 2.9 1.4 0.2 1
+#> 3: 4.4 3.0 1.3 0.2 1
+#> 4: 4.4 3.2 1.3 0.2 1
+#> 5: 4.5 2.3 1.3 0.3 1
+#> ---
+#> 146: 7.7 2.6 6.9 2.3 0
+#> 147: 7.7 2.8 6.7 2.0 0
+#> 148: 7.7 3.0 6.1 2.3 0
+#> 149: 7.7 3.8 6.7 2.2 0
+#> 150: 7.9 3.8 6.4 2.0 0
+#> Species_versicolor Species_virginica
+#> <num> <num>
+#> 1: 0 0
+#> 2: 0 0
+#> 3: 0 0
+#> 4: 0 0
+#> 5: 0 0
+#> ---
+#> 146: 0 1
+#> 147: 0 1
+#> 148: 0 1
+#> 149: 0 1
+#> 150: 0 1
+iris %>% dummy(Species,longname = FALSE)
+#> Key: <Sepal.Length, Sepal.Width, Petal.Length, Petal.Width>
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width setosa versicolor
+#> <num> <num> <num> <num> <num> <num>
+#> 1: 4.3 3.0 1.1 0.1 1 0
+#> 2: 4.4 2.9 1.4 0.2 1 0
+#> 3: 4.4 3.0 1.3 0.2 1 0
+#> 4: 4.4 3.2 1.3 0.2 1 0
+#> 5: 4.5 2.3 1.3 0.3 1 0
+#> ---
+#> 146: 7.7 2.6 6.9 2.3 0 0
+#> 147: 7.7 2.8 6.7 2.0 0 0
+#> 148: 7.7 3.0 6.1 2.3 0 0
+#> 149: 7.7 3.8 6.7 2.2 0 0
+#> 150: 7.9 3.8 6.4 2.0 0 0
+#> virginica
+#> <num>
+#> 1: 0
+#> 2: 0
+#> 3: 0
+#> 4: 0
+#> 5: 0
+#> ---
+#> 146: 1
+#> 147: 1
+#> 148: 1
+#> 149: 1
+#> 150: 1
+
+mtcars = as.data.table(mtcars)
+mtcars %>% head() %>% dummy(vs,am)
+#> Key: <mpg, cyl, disp, hp, drat, wt, qsec, gear, carb, vs_0, vs_1>
+#> mpg cyl disp hp drat wt qsec gear carb vs_0 vs_1 am_0
+#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
+#> 1: 18.1 6 225 105 2.76 3.460 20.22 3 1 0 1 1
+#> 2: 18.7 8 360 175 3.15 3.440 17.02 3 2 1 0 1
+#> 3: 21.0 6 160 110 3.90 2.620 16.46 4 4 1 0 0
+#> 4: 21.0 6 160 110 3.90 2.875 17.02 4 4 1 0 0
+#> 5: 21.4 6 258 110 3.08 3.215 19.44 3 1 0 1 1
+#> 6: 22.8 4 108 93 3.85 2.320 18.61 4 1 0 1 0
+#> am_1
+#> <num>
+#> 1: 0
+#> 2: 0
+#> 3: 1
+#> 4: 1
+#> 5: 0
+#> 6: 1
+mtcars %>% head() %>% dummy("cyl|gear")
+#> Key: <mpg, disp, hp, drat, wt, qsec, vs, am, carb, cyl_4, cyl_6, cyl_8>
+#> mpg disp hp drat wt qsec vs am carb cyl_4 cyl_6 cyl_8
+#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
+#> 1: 18.1 225 105 2.76 3.460 20.22 1 0 1 0 1 0
+#> 2: 18.7 360 175 3.15 3.440 17.02 0 0 2 0 0 1
+#> 3: 21.0 160 110 3.90 2.620 16.46 0 1 4 0 1 0
+#> 4: 21.0 160 110 3.90 2.875 17.02 0 1 4 0 1 0
+#> 5: 21.4 258 110 3.08 3.215 19.44 1 0 1 0 1 0
+#> 6: 22.8 108 93 3.85 2.320 18.61 1 1 1 1 0 0
+#> gear_3 gear_4
+#> <num> <num>
+#> 1: 1 0
+#> 2: 1 0
+#> 3: 0 1
+#> 4: 0 1
+#> 5: 1 0
+#> 6: 0 1
+
+
Fills missing values in selected columns using the next or previous entry.
fill(.data, ..., direction = "down") - -shift_fill(x, direction = "down")- -
.data | -A data.table |
-
---|---|
... | -A selection of columns. |
-
direction | -Direction in which to fill missing values. -Currently either "down" (the default), "up". |
-
x | -A vector. |
-
fill(.data, ..., direction = "down")
+
+shift_fill(x, direction = "down")
A filled data.table
-A filled data.table
+fill
is filling data.table's columns,
shift_fill
is filling any vectors.
-#> Month Year -#> <int> <num> -#> 1: 1 2000 -#> 2: 2 NA -#> 3: 3 NA -#> 4: 4 NA -#> 5: 5 NA -#> 6: 6 NA -#> 7: 7 NA -#> 8: 8 NA -#> 9: 9 NA -#> 10: 10 NA -#> 11: 11 NA -#> 12: 12 2001df %>% fill(Year)#> Month Year -#> <int> <num> -#> 1: 1 2000 -#> 2: 2 2000 -#> 3: 3 2000 -#> 4: 4 2000 -#> 5: 5 2000 -#> 6: 6 2000 -#> 7: 7 2000 -#> 8: 8 2000 -#> 9: 9 2000 -#> 10: 10 2000 -#> 11: 11 2000 -#> 12: 12 2001-df <- data.table(Month = 1:12, Year = c(2000, rep(NA, 10),2001)) -df %>% fill(Year,direction = "up")#> Month Year -#> <int> <num> -#> 1: 1 2000 -#> 2: 2 2001 -#> 3: 3 2001 -#> 4: 4 2001 -#> 5: 5 2001 -#> 6: 6 2001 -#> 7: 7 2001 -#> 8: 8 2001 -#> 9: 9 2001 -#> 10: 10 2001 -#> 11: 11 2001 -#> 12: 12 2001-
+df <- data.table(Month = 1:12, Year = c(2000, rep(NA, 10),2001))
+df
+#> Month Year
+#> <int> <num>
+#> 1: 1 2000
+#> 2: 2 NA
+#> 3: 3 NA
+#> 4: 4 NA
+#> 5: 5 NA
+#> 6: 6 NA
+#> 7: 7 NA
+#> 8: 8 NA
+#> 9: 9 NA
+#> 10: 10 NA
+#> 11: 11 NA
+#> 12: 12 2001
+df %>% fill(Year)
+#> Month Year
+#> <int> <num>
+#> 1: 1 2000
+#> 2: 2 2000
+#> 3: 3 2000
+#> 4: 4 2000
+#> 5: 5 2000
+#> 6: 6 2000
+#> 7: 7 2000
+#> 8: 8 2000
+#> 9: 9 2000
+#> 10: 10 2000
+#> 11: 11 2000
+#> 12: 12 2001
+
+df <- data.table(Month = 1:12, Year = c(2000, rep(NA, 10),2001))
+df %>% fill(Year,direction = "up")
+#> Month Year
+#> <int> <num>
+#> 1: 1 2000
+#> 2: 2 2001
+#> 3: 3 2001
+#> 4: 4 2001
+#> 5: 5 2001
+#> 6: 6 2001
+#> 7: 7 2001
+#> 8: 8 2001
+#> 9: 9 2001
+#> 10: 10 2001
+#> 11: 11 2001
+#> 12: 12 2001
+
+
Analogous function for filter
in dplyr.
filter(.data, ...)+
filter(.data, ...)
.data | -data.frame |
-
---|---|
... | -List of variables or name-value pairs of summary/modifications -functions. |
-
A data.table
+Currently data.table is not able to delete rows by reference,
-https://github.com/Rdatatable/data.table/issues/635
-https://stackoverflow.com/questions/10790204/how-to-delete-a-row-by-reference-in-data-table
--#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 7.1 3.0 5.9 2.1 virginica -#> 2: 7.6 3.0 6.6 2.1 virginica -#> 3: 7.3 2.9 6.3 1.8 virginica -#> 4: 7.2 3.6 6.1 2.5 virginica -#> 5: 7.7 3.8 6.7 2.2 virginica -#> 6: 7.7 2.6 6.9 2.3 virginica -#> 7: 7.7 2.8 6.7 2.0 virginica -#> 8: 7.2 3.2 6.0 1.8 virginica -#> 9: 7.2 3.0 5.8 1.6 virginica -#> 10: 7.4 2.8 6.1 1.9 virginica -#> 11: 7.9 3.8 6.4 2.0 virginica -#> 12: 7.7 3.0 6.1 2.3 virginicairis %>% filter(Sepal.Length > 7,Sepal.Width > 3)#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 7.2 3.6 6.1 2.5 virginica -#> 2: 7.7 3.8 6.7 2.2 virginica -#> 3: 7.2 3.2 6.0 1.8 virginica -#> 4: 7.9 3.8 6.4 2.0 virginicairis %>% filter(Sepal.Length > 7 & Sepal.Width > 3)#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 7.2 3.6 6.1 2.5 virginica -#> 2: 7.7 3.8 6.7 2.2 virginica -#> 3: 7.2 3.2 6.0 1.8 virginica -#> 4: 7.9 3.8 6.4 2.0 virginica#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 7.9 3.8 6.4 2 virginica
https://github.com/Rdatatable/data.table/issues/635
+https://stackoverflow.com/questions/10790204/how-to-delete-a-row-by-reference-in-data-table
+iris = as.data.table(iris)
+iris %>% filter(Sepal.Length > 7)
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 7.1 3.0 5.9 2.1 virginica
+#> 2: 7.6 3.0 6.6 2.1 virginica
+#> 3: 7.3 2.9 6.3 1.8 virginica
+#> 4: 7.2 3.6 6.1 2.5 virginica
+#> 5: 7.7 3.8 6.7 2.2 virginica
+#> 6: 7.7 2.6 6.9 2.3 virginica
+#> 7: 7.7 2.8 6.7 2.0 virginica
+#> 8: 7.2 3.2 6.0 1.8 virginica
+#> 9: 7.2 3.0 5.8 1.6 virginica
+#> 10: 7.4 2.8 6.1 1.9 virginica
+#> 11: 7.9 3.8 6.4 2.0 virginica
+#> 12: 7.7 3.0 6.1 2.3 virginica
+iris %>% filter(Sepal.Length > 7,Sepal.Width > 3)
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 7.2 3.6 6.1 2.5 virginica
+#> 2: 7.7 3.8 6.7 2.2 virginica
+#> 3: 7.2 3.2 6.0 1.8 virginica
+#> 4: 7.9 3.8 6.4 2.0 virginica
+iris %>% filter(Sepal.Length > 7 & Sepal.Width > 3)
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 7.2 3.6 6.1 2.5 virginica
+#> 2: 7.7 3.8 6.7 2.2 virginica
+#> 3: 7.2 3.2 6.0 1.8 virginica
+#> 4: 7.9 3.8 6.4 2.0 virginica
+iris %>% filter(Sepal.Length == max(Sepal.Length))
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 7.9 3.8 6.4 2 virginica
+
An API for reading fst file as data.table.
parse_fst(path) +++ ++parse_fst(path) + +slice_fst(ft, row_no) + +select_fst(ft, ...) + +filter_fst(ft, ...) + +summary_fst(ft)
++Arguments
+ -slice_fst(ft, row_no) +
- path
+- -select_fst(ft, ...) -filter_fst(ft, ...) +
path to fst file
- ft
+- -summary_fst(ft) -
An object of class fst_table, returned by
parse_fst
Arguments
--
+- - -path -- path to fst file
- -ft -- An object of class fst_table, returned by
parse_fst
- -row_no -- An integer vector (Positive)
- -... -- The filter conditions
- row_no
+- -
An integer vector (Positive)
Value
+- ...
+- + +
The filter conditions
++Value
parse_fst
returns a fst_table class.-
select_fst
andfilter_fst
returns a data.table.Details
- ++- +Details
-
summary_fst
could provide some basic information about the fst table.See also
- - - -Examples
--- # write the file first - path = tempfile(fileext = ".fst") - fst::write_fst(iris,path) - # parse the file but not reading it - parse_fst(path) -> ft - - ft#> <fst file> -#> 150 rows, 5 columns (filea5434c14328.fst) -#> -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <double> <double> <double> <double> <factor> -#> 1 5.1 3.5 1.4 0.2 setosa -#> 2 4.9 3.0 1.4 0.2 setosa -#> 3 4.7 3.2 1.3 0.2 setosa -#> 4 4.6 3.1 1.5 0.2 setosa -#> 5 5.0 3.6 1.4 0.2 setosa -#> -- -- -- -- -- -- -#> 146 6.7 3.0 5.2 2.3 virginica -#> 147 6.3 2.5 5.0 1.9 virginica -#> 148 6.5 3.0 5.2 2.0 virginica -#> 149 6.2 3.4 5.4 2.3 virginica -#> 150 5.9 3.0 5.1 1.8 virginica- class(ft)#> [1] "fst_table"#> $Sepal.Length -#> [1] "numeric" -#> -#> $Sepal.Width -#> [1] "numeric" -#> -#> $Petal.Length -#> [1] "numeric" -#> -#> $Petal.Width -#> [1] "numeric" -#> -#> $Species -#> [1] "factor" -#>names(ft)#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"dim(ft)#> [1] 150 5summary_fst(ft)#> <fst file> -#> 150 rows, 5 columns (filea5434c14328.fst) -#> -#> * 'Sepal.Length': double -#> * 'Sepal.Width' : double -#> * 'Petal.Length': double -#> * 'Petal.Width' : double -#> * 'Species' : factor- # get the data by query - ft %>% slice_fst(1:3)#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 5.1 3.5 1.4 0.2 setosa -#> 2: 4.9 3.0 1.4 0.2 setosa -#> 3: 4.7 3.2 1.3 0.2 setosa#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 5.1 3.5 1.4 0.2 setosa -#> 2: 4.7 3.2 1.3 0.2 setosa- ft %>% select_fst(Sepal.Length)#> Sepal.Length -#> <num> -#> 1: 5.1 -#> 2: 4.9 -#> 3: 4.7 -#> 4: 4.6 -#> 5: 5.0 -#> --- -#> 146: 6.7 -#> 147: 6.3 -#> 148: 6.5 -#> 149: 6.2 -#> 150: 5.9ft %>% select_fst(Sepal.Length,Sepal.Width)#> Sepal.Length Sepal.Width -#> <num> <num> -#> 1: 5.1 3.5 -#> 2: 4.9 3.0 -#> 3: 4.7 3.2 -#> 4: 4.6 3.1 -#> 5: 5.0 3.6 -#> --- -#> 146: 6.7 3.0 -#> 147: 6.3 2.5 -#> 148: 
6.5 3.0 -#> 149: 6.2 3.4 -#> 150: 5.9 3.0ft %>% select_fst("Sepal.Length")#> Sepal.Length -#> <num> -#> 1: 5.1 -#> 2: 4.9 -#> 3: 4.7 -#> 4: 4.6 -#> 5: 5.0 -#> --- -#> 146: 6.7 -#> 147: 6.3 -#> 148: 6.5 -#> 149: 6.2 -#> 150: 5.9ft %>% select_fst(1:3)#> Sepal.Length Sepal.Width Petal.Length -#> <num> <num> <num> -#> 1: 5.1 3.5 1.4 -#> 2: 4.9 3.0 1.4 -#> 3: 4.7 3.2 1.3 -#> 4: 4.6 3.1 1.5 -#> 5: 5.0 3.6 1.4 -#> --- -#> 146: 6.7 3.0 5.2 -#> 147: 6.3 2.5 5.0 -#> 148: 6.5 3.0 5.2 -#> 149: 6.2 3.4 5.4 -#> 150: 5.9 3.0 5.1ft %>% select_fst(1,3)#> Sepal.Length Petal.Length -#> <num> <num> -#> 1: 5.1 1.4 -#> 2: 4.9 1.4 -#> 3: 4.7 1.3 -#> 4: 4.6 1.5 -#> 5: 5.0 1.4 -#> --- -#> 146: 6.7 5.2 -#> 147: 6.3 5.0 -#> 148: 6.5 5.2 -#> 149: 6.2 5.4 -#> 150: 5.9 5.1ft %>% select_fst("Se")#> Sepal.Length Sepal.Width -#> <num> <num> -#> 1: 5.1 3.5 -#> 2: 4.9 3.0 -#> 3: 4.7 3.2 -#> 4: 4.6 3.1 -#> 5: 5.0 3.6 -#> --- -#> 146: 6.7 3.0 -#> 147: 6.3 2.5 -#> 148: 6.5 3.0 -#> 149: 6.2 3.4 -#> 150: 5.9 3.0- # return a warning with message - # \donttest{ - ft %>% select_fst("nothing")#> Warning: No matched columns,try other patterns. 
Names of the `fst_table` are listed.#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"# } - - ft %>% select_fst("Se|Sp")#> Sepal.Length Sepal.Width Species -#> <num> <num> <fctr> -#> 1: 5.1 3.5 setosa -#> 2: 4.9 3.0 setosa -#> 3: 4.7 3.2 setosa -#> 4: 4.6 3.1 setosa -#> 5: 5.0 3.6 setosa -#> --- -#> 146: 6.7 3.0 virginica -#> 147: 6.3 2.5 virginica -#> 148: 6.5 3.0 virginica -#> 149: 6.2 3.4 virginica -#> 150: 5.9 3.0 virginica#> Sepal.Width Petal.Length -#> <num> <num> -#> 1: 3.5 1.4 -#> 2: 3.0 1.4 -#> 3: 3.2 1.3 -#> 4: 3.1 1.5 -#> 5: 3.6 1.4 -#> --- -#> 146: 3.0 5.2 -#> 147: 2.5 5.0 -#> 148: 3.0 5.2 -#> 149: 3.4 5.4 -#> 150: 3.0 5.1- ft %>% filter_fst(Sepal.Width > 3)#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 5.1 3.5 1.4 0.2 setosa -#> 2: 4.7 3.2 1.3 0.2 setosa -#> 3: 4.6 3.1 1.5 0.2 setosa -#> 4: 5.0 3.6 1.4 0.2 setosa -#> 5: 5.4 3.9 1.7 0.4 setosa -#> 6: 4.6 3.4 1.4 0.3 setosa -#> 7: 5.0 3.4 1.5 0.2 setosa -#> 8: 4.9 3.1 1.5 0.1 setosa -#> 9: 5.4 3.7 1.5 0.2 setosa -#> 10: 4.8 3.4 1.6 0.2 setosa -#> 11: 5.8 4.0 1.2 0.2 setosa -#> 12: 5.7 4.4 1.5 0.4 setosa -#> 13: 5.4 3.9 1.3 0.4 setosa -#> 14: 5.1 3.5 1.4 0.3 setosa -#> 15: 5.7 3.8 1.7 0.3 setosa -#> 16: 5.1 3.8 1.5 0.3 setosa -#> 17: 5.4 3.4 1.7 0.2 setosa -#> 18: 5.1 3.7 1.5 0.4 setosa -#> 19: 4.6 3.6 1.0 0.2 setosa -#> 20: 5.1 3.3 1.7 0.5 setosa -#> 21: 4.8 3.4 1.9 0.2 setosa -#> 22: 5.0 3.4 1.6 0.4 setosa -#> 23: 5.2 3.5 1.5 0.2 setosa -#> 24: 5.2 3.4 1.4 0.2 setosa -#> 25: 4.7 3.2 1.6 0.2 setosa -#> 26: 4.8 3.1 1.6 0.2 setosa -#> 27: 5.4 3.4 1.5 0.4 setosa -#> 28: 5.2 4.1 1.5 0.1 setosa -#> 29: 5.5 4.2 1.4 0.2 setosa -#> 30: 4.9 3.1 1.5 0.2 setosa -#> 31: 5.0 3.2 1.2 0.2 setosa -#> 32: 5.5 3.5 1.3 0.2 setosa -#> 33: 4.9 3.6 1.4 0.1 setosa -#> 34: 5.1 3.4 1.5 0.2 setosa -#> 35: 5.0 3.5 1.3 0.3 setosa -#> 36: 4.4 3.2 1.3 0.2 setosa -#> 37: 5.0 3.5 1.6 0.6 setosa -#> 38: 5.1 3.8 1.9 0.4 setosa -#> 39: 5.1 3.8 1.6 0.2 setosa 
-#> 40: 4.6 3.2 1.4 0.2 setosa -#> 41: 5.3 3.7 1.5 0.2 setosa -#> 42: 5.0 3.3 1.4 0.2 setosa -#> 43: 7.0 3.2 4.7 1.4 versicolor -#> 44: 6.4 3.2 4.5 1.5 versicolor -#> 45: 6.9 3.1 4.9 1.5 versicolor -#> 46: 6.3 3.3 4.7 1.6 versicolor -#> 47: 6.7 3.1 4.4 1.4 versicolor -#> 48: 5.9 3.2 4.8 1.8 versicolor -#> 49: 6.0 3.4 4.5 1.6 versicolor -#> 50: 6.7 3.1 4.7 1.5 versicolor -#> 51: 6.3 3.3 6.0 2.5 virginica -#> 52: 7.2 3.6 6.1 2.5 virginica -#> 53: 6.5 3.2 5.1 2.0 virginica -#> 54: 6.4 3.2 5.3 2.3 virginica -#> 55: 7.7 3.8 6.7 2.2 virginica -#> 56: 6.9 3.2 5.7 2.3 virginica -#> 57: 6.7 3.3 5.7 2.1 virginica -#> 58: 7.2 3.2 6.0 1.8 virginica -#> 59: 7.9 3.8 6.4 2.0 virginica -#> 60: 6.3 3.4 5.6 2.4 virginica -#> 61: 6.4 3.1 5.5 1.8 virginica -#> 62: 6.9 3.1 5.4 2.1 virginica -#> 63: 6.7 3.1 5.6 2.4 virginica -#> 64: 6.9 3.1 5.1 2.3 virginica -#> 65: 6.8 3.2 5.9 2.3 virginica -#> 66: 6.7 3.3 5.7 2.5 virginica -#> 67: 6.2 3.4 5.4 2.3 virginica -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Speciesft %>% filter_fst(Sepal.Length > 6 , Species == "virginica")#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 6.3 3.3 6.0 2.5 virginica -#> 2: 7.1 3.0 5.9 2.1 virginica -#> 3: 6.3 2.9 5.6 1.8 virginica -#> 4: 6.5 3.0 5.8 2.2 virginica -#> 5: 7.6 3.0 6.6 2.1 virginica -#> 6: 7.3 2.9 6.3 1.8 virginica -#> 7: 6.7 2.5 5.8 1.8 virginica -#> 8: 7.2 3.6 6.1 2.5 virginica -#> 9: 6.5 3.2 5.1 2.0 virginica -#> 10: 6.4 2.7 5.3 1.9 virginica -#> 11: 6.8 3.0 5.5 2.1 virginica -#> 12: 6.4 3.2 5.3 2.3 virginica -#> 13: 6.5 3.0 5.5 1.8 virginica -#> 14: 7.7 3.8 6.7 2.2 virginica -#> 15: 7.7 2.6 6.9 2.3 virginica -#> 16: 6.9 3.2 5.7 2.3 virginica -#> 17: 7.7 2.8 6.7 2.0 virginica -#> 18: 6.3 2.7 4.9 1.8 virginica -#> 19: 6.7 3.3 5.7 2.1 virginica -#> 20: 7.2 3.2 6.0 1.8 virginica -#> 21: 6.2 2.8 4.8 1.8 virginica -#> 22: 6.1 3.0 4.9 1.8 virginica -#> 23: 6.4 2.8 5.6 2.1 virginica -#> 24: 7.2 3.0 5.8 1.6 virginica -#> 25: 7.4 2.8 6.1 1.9 
virginica -#> 26: 7.9 3.8 6.4 2.0 virginica -#> 27: 6.4 2.8 5.6 2.2 virginica -#> 28: 6.3 2.8 5.1 1.5 virginica -#> 29: 6.1 2.6 5.6 1.4 virginica -#> 30: 7.7 3.0 6.1 2.3 virginica -#> 31: 6.3 3.4 5.6 2.4 virginica -#> 32: 6.4 3.1 5.5 1.8 virginica -#> 33: 6.9 3.1 5.4 2.1 virginica -#> 34: 6.7 3.1 5.6 2.4 virginica -#> 35: 6.9 3.1 5.1 2.3 virginica -#> 36: 6.8 3.2 5.9 2.3 virginica -#> 37: 6.7 3.3 5.7 2.5 virginica -#> 38: 6.7 3.0 5.2 2.3 virginica -#> 39: 6.3 2.5 5.0 1.9 virginica -#> 40: 6.5 3.0 5.2 2.0 virginica -#> 41: 6.2 3.4 5.4 2.3 virginica -#> Sepal.Length Sepal.Width Petal.Length Petal.Width Speciesft %>% filter_fst(Sepal.Length > 6 & Species == "virginica" & Sepal.Width < 3)#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 6.3 2.9 5.6 1.8 virginica -#> 2: 7.3 2.9 6.3 1.8 virginica -#> 3: 6.7 2.5 5.8 1.8 virginica -#> 4: 6.4 2.7 5.3 1.9 virginica -#> 5: 7.7 2.6 6.9 2.3 virginica -#> 6: 7.7 2.8 6.7 2.0 virginica -#> 7: 6.3 2.7 4.9 1.8 virginica -#> 8: 6.2 2.8 4.8 1.8 virginica -#> 9: 6.4 2.8 5.6 2.1 virginica -#> 10: 7.4 2.8 6.1 1.9 virginica -#> 11: 6.4 2.8 5.6 2.2 virginica -#> 12: 6.3 2.8 5.1 1.5 virginica -#> 13: 6.1 2.6 5.6 1.4 virginica -#> 14: 6.3 2.5 5.0 1.9 virginica-++See also
+ ++Examples
+++ # write the file first + path = tempfile(fileext = ".fst") + fst::write_fst(iris,path) + # parse the file but not reading it + parse_fst(path) -> ft + + ft +#> <fst file> +#> 150 rows, 5 columns (file27604466d41.fst) +#> +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> <double> <double> <double> <double> <factor> +#> 1 5.1 3.5 1.4 0.2 setosa +#> 2 4.9 3.0 1.4 0.2 setosa +#> 3 4.7 3.2 1.3 0.2 setosa +#> 4 4.6 3.1 1.5 0.2 setosa +#> 5 5.0 3.6 1.4 0.2 setosa +#> -- -- -- -- -- -- +#> 146 6.7 3.0 5.2 2.3 virginica +#> 147 6.3 2.5 5.0 1.9 virginica +#> 148 6.5 3.0 5.2 2.0 virginica +#> 149 6.2 3.4 5.4 2.3 virginica +#> 150 5.9 3.0 5.1 1.8 virginica + + class(ft) +#> [1] "fst_table" + lapply(ft,class) +#> $Sepal.Length +#> [1] "numeric" +#> +#> $Sepal.Width +#> [1] "numeric" +#> +#> $Petal.Length +#> [1] "numeric" +#> +#> $Petal.Width +#> [1] "numeric" +#> +#> $Species +#> [1] "factor" +#> + names(ft) +#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" + dim(ft) +#> [1] 150 5 + summary_fst(ft) +#> <fst file> +#> 150 rows, 5 columns (file27604466d41.fst) +#> +#> * 'Sepal.Length': double +#> * 'Sepal.Width' : double +#> * 'Petal.Length': double +#> * 'Petal.Width' : double +#> * 'Species' : factor + + # get the data by query + ft %>% slice_fst(1:3) +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> <num> <num> <num> <num> <fctr> +#> 1: 5.1 3.5 1.4 0.2 setosa +#> 2: 4.9 3.0 1.4 0.2 setosa +#> 3: 4.7 3.2 1.3 0.2 setosa + ft %>% slice_fst(c(1,3)) +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> <num> <num> <num> <num> <fctr> +#> 1: 5.1 3.5 1.4 0.2 setosa +#> 2: 4.7 3.2 1.3 0.2 setosa + + ft %>% select_fst(Sepal.Length) +#> Sepal.Length +#> <num> +#> 1: 5.1 +#> 2: 4.9 +#> 3: 4.7 +#> 4: 4.6 +#> 5: 5.0 +#> --- +#> 146: 6.7 +#> 147: 6.3 +#> 148: 6.5 +#> 149: 6.2 +#> 150: 5.9 + ft %>% select_fst(Sepal.Length,Sepal.Width) +#> Sepal.Length Sepal.Width +#> <num> <num> +#> 1: 5.1 3.5 +#> 2: 4.9 3.0 +#> 3: 4.7 
3.2 +#> 4: 4.6 3.1 +#> 5: 5.0 3.6 +#> --- +#> 146: 6.7 3.0 +#> 147: 6.3 2.5 +#> 148: 6.5 3.0 +#> 149: 6.2 3.4 +#> 150: 5.9 3.0 + ft %>% select_fst("Sepal.Length") +#> Sepal.Length +#> <num> +#> 1: 5.1 +#> 2: 4.9 +#> 3: 4.7 +#> 4: 4.6 +#> 5: 5.0 +#> --- +#> 146: 6.7 +#> 147: 6.3 +#> 148: 6.5 +#> 149: 6.2 +#> 150: 5.9 + ft %>% select_fst(1:3) +#> Sepal.Length Sepal.Width Petal.Length +#> <num> <num> <num> +#> 1: 5.1 3.5 1.4 +#> 2: 4.9 3.0 1.4 +#> 3: 4.7 3.2 1.3 +#> 4: 4.6 3.1 1.5 +#> 5: 5.0 3.6 1.4 +#> --- +#> 146: 6.7 3.0 5.2 +#> 147: 6.3 2.5 5.0 +#> 148: 6.5 3.0 5.2 +#> 149: 6.2 3.4 5.4 +#> 150: 5.9 3.0 5.1 + ft %>% select_fst(1,3) +#> Sepal.Length Petal.Length +#> <num> <num> +#> 1: 5.1 1.4 +#> 2: 4.9 1.4 +#> 3: 4.7 1.3 +#> 4: 4.6 1.5 +#> 5: 5.0 1.4 +#> --- +#> 146: 6.7 5.2 +#> 147: 6.3 5.0 +#> 148: 6.5 5.2 +#> 149: 6.2 5.4 +#> 150: 5.9 5.1 + ft %>% select_fst("Se") +#> Sepal.Length Sepal.Width +#> <num> <num> +#> 1: 5.1 3.5 +#> 2: 4.9 3.0 +#> 3: 4.7 3.2 +#> 4: 4.6 3.1 +#> 5: 5.0 3.6 +#> --- +#> 146: 6.7 3.0 +#> 147: 6.3 2.5 +#> 148: 6.5 3.0 +#> 149: 6.2 3.4 +#> 150: 5.9 3.0 + + # return a warning with message + # \donttest{ + ft %>% select_fst("nothing") +#> Warning: No matched columns,try other patterns. Names of the `fst_table` are listed. 
+#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" + # } + + ft %>% select_fst("Se|Sp") +#> Sepal.Length Sepal.Width Species +#> <num> <num> <fctr> +#> 1: 5.1 3.5 setosa +#> 2: 4.9 3.0 setosa +#> 3: 4.7 3.2 setosa +#> 4: 4.6 3.1 setosa +#> 5: 5.0 3.6 setosa +#> --- +#> 146: 6.7 3.0 virginica +#> 147: 6.3 2.5 virginica +#> 148: 6.5 3.0 virginica +#> 149: 6.2 3.4 virginica +#> 150: 5.9 3.0 virginica + ft %>% select_fst(cols = names(iris)[2:3]) +#> Sepal.Width Petal.Length +#> <num> <num> +#> 1: 3.5 1.4 +#> 2: 3.0 1.4 +#> 3: 3.2 1.3 +#> 4: 3.1 1.5 +#> 5: 3.6 1.4 +#> --- +#> 146: 3.0 5.2 +#> 147: 2.5 5.0 +#> 148: 3.0 5.2 +#> 149: 3.4 5.4 +#> 150: 3.0 5.1 + + ft %>% filter_fst(Sepal.Width > 3) +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> <num> <num> <num> <num> <fctr> +#> 1: 5.1 3.5 1.4 0.2 setosa +#> 2: 4.7 3.2 1.3 0.2 setosa +#> 3: 4.6 3.1 1.5 0.2 setosa +#> 4: 5.0 3.6 1.4 0.2 setosa +#> 5: 5.4 3.9 1.7 0.4 setosa +#> 6: 4.6 3.4 1.4 0.3 setosa +#> 7: 5.0 3.4 1.5 0.2 setosa +#> 8: 4.9 3.1 1.5 0.1 setosa +#> 9: 5.4 3.7 1.5 0.2 setosa +#> 10: 4.8 3.4 1.6 0.2 setosa +#> 11: 5.8 4.0 1.2 0.2 setosa +#> 12: 5.7 4.4 1.5 0.4 setosa +#> 13: 5.4 3.9 1.3 0.4 setosa +#> 14: 5.1 3.5 1.4 0.3 setosa +#> 15: 5.7 3.8 1.7 0.3 setosa +#> 16: 5.1 3.8 1.5 0.3 setosa +#> 17: 5.4 3.4 1.7 0.2 setosa +#> 18: 5.1 3.7 1.5 0.4 setosa +#> 19: 4.6 3.6 1.0 0.2 setosa +#> 20: 5.1 3.3 1.7 0.5 setosa +#> 21: 4.8 3.4 1.9 0.2 setosa +#> 22: 5.0 3.4 1.6 0.4 setosa +#> 23: 5.2 3.5 1.5 0.2 setosa +#> 24: 5.2 3.4 1.4 0.2 setosa +#> 25: 4.7 3.2 1.6 0.2 setosa +#> 26: 4.8 3.1 1.6 0.2 setosa +#> 27: 5.4 3.4 1.5 0.4 setosa +#> 28: 5.2 4.1 1.5 0.1 setosa +#> 29: 5.5 4.2 1.4 0.2 setosa +#> 30: 4.9 3.1 1.5 0.2 setosa +#> 31: 5.0 3.2 1.2 0.2 setosa +#> 32: 5.5 3.5 1.3 0.2 setosa +#> 33: 4.9 3.6 1.4 0.1 setosa +#> 34: 5.1 3.4 1.5 0.2 setosa +#> 35: 5.0 3.5 1.3 0.3 setosa +#> 36: 4.4 3.2 1.3 0.2 setosa +#> 37: 5.0 3.5 1.6 0.6 setosa +#> 38: 5.1 3.8 1.9 0.4 setosa +#> 39: 
5.1 3.8 1.6 0.2 setosa +#> 40: 4.6 3.2 1.4 0.2 setosa +#> 41: 5.3 3.7 1.5 0.2 setosa +#> 42: 5.0 3.3 1.4 0.2 setosa +#> 43: 7.0 3.2 4.7 1.4 versicolor +#> 44: 6.4 3.2 4.5 1.5 versicolor +#> 45: 6.9 3.1 4.9 1.5 versicolor +#> 46: 6.3 3.3 4.7 1.6 versicolor +#> 47: 6.7 3.1 4.4 1.4 versicolor +#> 48: 5.9 3.2 4.8 1.8 versicolor +#> 49: 6.0 3.4 4.5 1.6 versicolor +#> 50: 6.7 3.1 4.7 1.5 versicolor +#> 51: 6.3 3.3 6.0 2.5 virginica +#> 52: 7.2 3.6 6.1 2.5 virginica +#> 53: 6.5 3.2 5.1 2.0 virginica +#> 54: 6.4 3.2 5.3 2.3 virginica +#> 55: 7.7 3.8 6.7 2.2 virginica +#> 56: 6.9 3.2 5.7 2.3 virginica +#> 57: 6.7 3.3 5.7 2.1 virginica +#> 58: 7.2 3.2 6.0 1.8 virginica +#> 59: 7.9 3.8 6.4 2.0 virginica +#> 60: 6.3 3.4 5.6 2.4 virginica +#> 61: 6.4 3.1 5.5 1.8 virginica +#> 62: 6.9 3.1 5.4 2.1 virginica +#> 63: 6.7 3.1 5.6 2.4 virginica +#> 64: 6.9 3.1 5.1 2.3 virginica +#> 65: 6.8 3.2 5.9 2.3 virginica +#> 66: 6.7 3.3 5.7 2.5 virginica +#> 67: 6.2 3.4 5.4 2.3 virginica +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species + ft %>% filter_fst(Sepal.Length > 6 , Species == "virginica") +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> <num> <num> <num> <num> <fctr> +#> 1: 6.3 3.3 6.0 2.5 virginica +#> 2: 7.1 3.0 5.9 2.1 virginica +#> 3: 6.3 2.9 5.6 1.8 virginica +#> 4: 6.5 3.0 5.8 2.2 virginica +#> 5: 7.6 3.0 6.6 2.1 virginica +#> 6: 7.3 2.9 6.3 1.8 virginica +#> 7: 6.7 2.5 5.8 1.8 virginica +#> 8: 7.2 3.6 6.1 2.5 virginica +#> 9: 6.5 3.2 5.1 2.0 virginica +#> 10: 6.4 2.7 5.3 1.9 virginica +#> 11: 6.8 3.0 5.5 2.1 virginica +#> 12: 6.4 3.2 5.3 2.3 virginica +#> 13: 6.5 3.0 5.5 1.8 virginica +#> 14: 7.7 3.8 6.7 2.2 virginica +#> 15: 7.7 2.6 6.9 2.3 virginica +#> 16: 6.9 3.2 5.7 2.3 virginica +#> 17: 7.7 2.8 6.7 2.0 virginica +#> 18: 6.3 2.7 4.9 1.8 virginica +#> 19: 6.7 3.3 5.7 2.1 virginica +#> 20: 7.2 3.2 6.0 1.8 virginica +#> 21: 6.2 2.8 4.8 1.8 virginica +#> 22: 6.1 3.0 4.9 1.8 virginica +#> 23: 6.4 2.8 5.6 2.1 virginica +#> 24: 7.2 3.0 5.8 1.6 
virginica +#> 25: 7.4 2.8 6.1 1.9 virginica +#> 26: 7.9 3.8 6.4 2.0 virginica +#> 27: 6.4 2.8 5.6 2.2 virginica +#> 28: 6.3 2.8 5.1 1.5 virginica +#> 29: 6.1 2.6 5.6 1.4 virginica +#> 30: 7.7 3.0 6.1 2.3 virginica +#> 31: 6.3 3.4 5.6 2.4 virginica +#> 32: 6.4 3.1 5.5 1.8 virginica +#> 33: 6.9 3.1 5.4 2.1 virginica +#> 34: 6.7 3.1 5.6 2.4 virginica +#> 35: 6.9 3.1 5.1 2.3 virginica +#> 36: 6.8 3.2 5.9 2.3 virginica +#> 37: 6.7 3.3 5.7 2.5 virginica +#> 38: 6.7 3.0 5.2 2.3 virginica +#> 39: 6.3 2.5 5.0 1.9 virginica +#> 40: 6.5 3.0 5.2 2.0 virginica +#> 41: 6.2 3.4 5.4 2.3 virginica +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species + ft %>% filter_fst(Sepal.Length > 6 & Species == "virginica" & Sepal.Width < 3) +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> <num> <num> <num> <num> <fctr> +#> 1: 6.3 2.9 5.6 1.8 virginica +#> 2: 7.3 2.9 6.3 1.8 virginica +#> 3: 6.7 2.5 5.8 1.8 virginica +#> 4: 6.4 2.7 5.3 1.9 virginica +#> 5: 7.7 2.6 6.9 2.3 virginica +#> 6: 7.7 2.8 6.7 2.0 virginica +#> 7: 6.3 2.7 4.9 1.8 virginica +#> 8: 6.2 2.8 4.8 1.8 virginica +#> 9: 6.4 2.8 5.6 2.1 virginica +#> 10: 7.4 2.8 6.1 1.9 virginica +#> 11: 6.4 2.8 5.6 2.2 virginica +#> 12: 6.3 2.8 5.1 1.5 virginica +#> 13: 6.1 2.6 5.6 1.4 virginica +#> 14: 6.3 2.5 5.0 1.9 virginica + +
Wrapper for read_fst
and write_fst
+
Wrapper for read_fst
and write_fst
from fst, but with different defaults. For data import, always return a data.table.
For data export, always compress the data to the smallest size.
export_fst(x, path, compress = 100, uniform_encoding = TRUE) - -import_fst( - path, - columns = NULL, - from = 1, - to = NULL, - as.data.table = TRUE, - old_format = FALSE -)- -
data.table
package to be installed.
+
- must be FALSE, the old fst file format is deprecated and can only be read and +converted with fst package versions 0.8.0 to 0.8.10.
`import_fst` returns a data.table with the selected columns and rows. `export_fst` writes `x` to a `fst` file and invisibly returns `x` (so you can use this function in a pipeline).
---# \donttest{ -export_fst(iris,"iris_fst_test.fst") -iris_dt = import_fst("iris_fst_test.fst") -iris_dt#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species -#> <num> <num> <num> <num> <fctr> -#> 1: 5.1 3.5 1.4 0.2 setosa -#> 2: 4.9 3.0 1.4 0.2 setosa -#> 3: 4.7 3.2 1.3 0.2 setosa -#> 4: 4.6 3.1 1.5 0.2 setosa -#> 5: 5.0 3.6 1.4 0.2 setosa -#> --- -#> 146: 6.7 3.0 5.2 2.3 virginica -#> 147: 6.3 2.5 5.0 1.9 virginica -#> 148: 6.5 3.0 5.2 2.0 virginica -#> 149: 6.2 3.4 5.4 2.3 virginica -#> 150: 5.9 3.0 5.1 1.8 virginica
+# \donttest{
+export_fst(iris,"iris_fst_test.fst")
+iris_dt = import_fst("iris_fst_test.fst")
+iris_dt
+#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+#> <num> <num> <num> <num> <fctr>
+#> 1: 5.1 3.5 1.4 0.2 setosa
+#> 2: 4.9 3.0 1.4 0.2 setosa
+#> 3: 4.7 3.2 1.3 0.2 setosa
+#> 4: 4.6 3.1 1.5 0.2 setosa
+#> 5: 5.0 3.6 1.4 0.2 setosa
+#> ---
+#> 146: 6.7 3.0 5.2 2.3 virginica
+#> 147: 6.3 2.5 5.0 1.9 virginica
+#> 148: 6.5 3.0 5.2 2.0 virginica
+#> 149: 6.2 3.4 5.4 2.3 virginica
+#> 150: 5.9 3.0 5.1 1.8 virginica
+unlink("iris_fst_test.fst")
+# }
+
+