-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_scraping.R
162 lines (117 loc) · 4.57 KB
/
data_scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
##################################
# Assignment 3: Data Science Course
# Kyle Ott & Cornelius Schneider
# 12 December 2014
##################################
## This do file scrapes the data from the Finnish Newspaper Website
## It takes a while to run this code, so we only do it once and save the result
## We would prefer to be able to run this as a code chunk but it simply takes
## too long to knit (45+ minutes), even when caching...
# Load packages
library(httr)
library(dplyr)
library(XML)
library(stringr)
library(devtools)
library(rsdmx)
library(knitr)
library(tidyr)
library(reshape2)
# set your local directory
setwd("/Users/Kyle/Dropbox/!Fall_2014/Collab_Data/Final_Project/")
# 2013 data
tables2013 = data.frame()
for (i in 1:30){
# URL with the ta table
URL_temp2013 <- paste0('http://www.taloussanomat.fi/verotiedot/2013/suurituloisimmat/?n=', i)
if (i==1) { tables2013 <- URL_temp2013 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
tables2013 <- tables2013[[1]] }
else if (i!=1){
# Gather content and parse all tables #
table_temp2013 <- URL_temp2013 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
# Identify correct table
# names(table) # the table does not have an ID
# select first table with taxData
tables_df_temp2013 <- table_temp2013[[1]]
tables2013 <- rbind(tables2013, tables_df_temp2013)
}
#end loop
}
tables2013$year <- 2013
# 2012 data
tables2012 = data.frame()
for (i in 1:30){
# URL with the medals table
URL_temp2012 <- paste0('http://www.taloussanomat.fi/verotiedot/2012/suurituloisimmat/?n=', i)
if (i==1) { tables2012 <- URL_temp2012 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
tables2012 <- tables2012[[1]] }
else if (i!=1){
#### Gather content and parse all tables ####
table_temp2012 <- URL_temp2012 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
# Identify correct table
# names(table) # the table does not have an ID
# select first table with taxData
tables_df_temp2012 <- table_temp2012[[1]]
tables2012 <- rbind(tables2012, tables_df_temp2012)
}
##end loop
}
tables2012$year <- 2012
## 2011 data
tables2011 = data.frame()
#note that for some years they don't have complete obs (15,000), which is why the loop is only up to 28 for the following year
for (i in 1:28){
# URL with the medals table
URL_temp2011 <- paste0('http://www.taloussanomat.fi/verotiedot/2011/suurituloisimmat/?n=', i)
if (i==1) { tables2011 <- URL_temp2011 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
tables2011 <- tables2011[[1]] }
else if (i!=1){
#### Gather content and parse all tables ####
table_temp2011 <- URL_temp2011 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
# Identify correct table
# names(table) # the table does not have an ID
# select first table with taxData
tables_df_temp2011 <- table_temp2011[[1]]
tables2011 <- rbind(tables2011, tables_df_temp2011)
}
#end loop
}
tables2011$year <- 2011
# 2010 data
tables2010 = data.frame()
for (i in 1:29){
# URL with the medals table
URL_temp2010 <- paste0('http://www.taloussanomat.fi/verotiedot/2010/suurituloisimmat/?n=', i)
if (i==1) { tables2010 <- URL_temp2010 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
tables2010 <- tables2010[[1]] }
else if (i!=1){
#### Gather content and parse all tables ####
table_temp2010 <- URL_temp2010 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
# select first table with taxData
tables_df_temp2010 <- table_temp2010[[1]]
tables2010 <- rbind(tables2010, tables_df_temp2010)
}
##end loop
}
tables2010$year <- 2010
## 2009 data
tables2009 = data.frame()
for (i in 1:25){
# URL with the medals table
URL_temp2009 <- paste0('http://www.taloussanomat.fi/verotiedot/2009/suurituloisimmat/?n=', i)
if (i==1) { tables2009 <- URL_temp2009 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
tables2009 <- tables2009[[1]] }
else if (i!=1){
#### Gather content and parse all tables ####
table_temp2009 <- URL_temp2009 %>% GET() %>% content(as = 'parsed') %>% readHTMLTable()
# select first table with taxData
tables_df_temp2009 <- table_temp2009[[1]]
tables2009 <- rbind(tables2009, tables_df_temp2009)
}
##end loop
}
tables2009$year <- 2009
#appending scraped tables
all <- rbind(tables2009, tables2010, tables2011, tables2012, tables2013)
# this dataset is used to start loop_best2.R
save(all, file = all.RData")