##########################################################################
# Jose Cajide - @jrcajide
# Master Data Science: Downloading data
##########################################################################
# http://stat-computing.org/dataexpo/2009/the-data.html
# Download flights data
# The flights data is a well-known data source representing 123 million flights over 22 years.
# It consumes roughly 12 GiB of storage as uncompressed CSV, split into yearly files.
# Install any of the required packages that are missing locally
list.of.packages <- c("R.utils", "rvest", "stringr", "foreach", "doParallel")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages)) install.packages(new.packages)
# Downloading data with R -------------------------------------------------
if (!file.exists("downloads")) {
  dir.create("downloads")
}
tmp_dir <- tempdir()
tmp_file <- file.path(tmp_dir, '2007.csv')
download.file('http://stat-computing.org/dataexpo/2009/2007.csv.bz2', tmp_file, mode = "wb") # explicit binary mode
library(R.utils) # for bunzip2
bunzip2(tmp_file, "downloads/2007.csv", remove = FALSE, skip = TRUE)
# Checks
file.info(tmp_file) # Downloaded file
utils:::format.object_size(file.info("downloads/2007.csv")$size, "auto") # Uncompressed file size
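# Optional sanity check (a sketch): peek at the first few rows of the
# uncompressed CSV to confirm it reads back correctly; nothing is assumed
# here beyond the file having a header row.
head(read.csv("downloads/2007.csv", nrows = 5))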
# Web Scraping ------------------------------------------------------------
library(rvest) # for read_html, html_*, ...
library(stringr) # for str_*
page <- read_html("http://stat-computing.org/dataexpo/2009/the-data.html")
(all_links <- html_nodes(page, "a"))               # every <a> tag on the page
(linked_resources <- html_attr(all_links, "href")) # their href attributes
(linked_bz2_files <- str_subset(linked_resources, "\\.bz2")) # keep only the .bz2 files
(bz2_files_links <- paste0("http://stat-computing.org/dataexpo/2009/", linked_bz2_files))
(bz2_files_links <- tail(bz2_files_links, 2)) # Keep only the last two links
(num_files <- length(bz2_files_links))
# Custom download function: fetches one .bz2 link into the temp dir and
# uncompresses it into downloads/
download_flights_datasets <- function (link) {
  cat(link, "\n")
  this_file_link <- link
  this_file_name <- str_extract(basename(this_file_link), "^.{0,8}") # e.g. "2007.csv" from "2007.csv.bz2"
  this_tmp_file <- file.path(tmp_dir, this_file_name)
  download.file(this_file_link, this_tmp_file, mode = "wb")
  bunzip2(this_tmp_file, file.path('downloads', this_file_name), remove = FALSE, skip = TRUE)
}
# Testing download_flights_datasets
(link <- bz2_files_links[1])
download_flights_datasets(link)
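# Quick check (a sketch): downloads/ should now contain the uncompressed
# CSV for the link we just fetched.
list.files("downloads")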
##########################################################################
# Exercise: download all the links
# using download_flights_datasets()
##########################################################################
# Sol. 1:
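# One possible solution (a sketch): a plain sequential loop over the links.
# Left commented out because the parallel version below fetches the same files.
# for (link in bz2_files_links) {
#   download_flights_datasets(link)
# }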
# Sol. 2:
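# Another possible solution (a sketch): the same loop expressed with lapply.
# invisible(lapply(bz2_files_links, download_flights_datasets))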
# Downloading all files in parallel
library("foreach")    # for foreach
library("doParallel") # for makeCluster, registerDoParallel
detectCores()
cl <- makeCluster(detectCores() - 1) # create a cluster with all but one of the available cores
registerDoParallel(cl)               # register the cluster with foreach
# tmp_dir is only referenced inside download_flights_datasets, not in the
# loop body, so export it explicitly to be safe on every worker.
res <- foreach(i = 1:num_files,
               .packages = c("R.utils", "stringr"),
               .export = "tmp_dir") %dopar% {
  this_file_link <- bz2_files_links[i]
  download_flights_datasets(this_file_link)
}
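# Release the workers once the downloads finish and list what we got.
stopCluster(cl)
list.files("downloads")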