-
Notifications
You must be signed in to change notification settings - Fork 0
/
day_nine_scraping.R
86 lines (58 loc) · 1.79 KB
/
day_nine_scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
library(rvest)
# Scrape one table from the Wikipedia "Academy Award for Best Picture" page.
# The page is downloaded and parsed a single time; the resulting table is
# stored in `result`, printed, and (interactively) opened in the viewer.
url_data <- "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"
# Absolute XPath ending in table[12].
# NOTE(review): this path depends on the page's current layout -- confirm it
# still points at the intended table.
xpath <- "/html/body/div[2]/div/div[3]/main/div[3]/div[3]/div[1]/table[12]"
result <- url_data %>%
  read_html() %>%
  html_element(xpath = xpath) %>%
  html_table()
result
# View() opens a data viewer window, so only call it in interactive sessions.
if (interactive()) {
  View(result)
}
# Scrape one table from the Wikipedia list of countries with Burger King
# franchises. The page is fetched once; the table is stored in `bk_list`
# and printed.
bk_url <- "https://en.wikipedia.org/wiki/List_of_countries_with_Burger_King_franchises"
# Absolute XPath ending in table[3].
# NOTE(review): layout-dependent -- confirm it still matches.
bk <- "/html/body/div[2]/div/div[3]/main/div[3]/div[3]/div[1]/table[3]"
# `bk2` previously held a byte-identical copy of `bk`; keep the name as an
# alias. NOTE(review): if a different table was intended, change this path.
bk2 <- bk
bk_list <- bk_url %>%
  read_html() %>%
  html_element(xpath = bk) %>%
  html_table()
bk_list
# Extract text from a New York Times article page.
nyt <- "https://www.nytimes.com/2023/10/19/style/attersee-isabel-wilkinson-schor.html"
# CSS selector for the element whose text is extracted.
nyt_css <- ".meteredContent"
# Fetch the page once and keep the text in `dresses`.
# The selector is passed as the variable `nyt_css`; the quoted string
# "nyt_css" would be treated as a tag name and match nothing.
dresses <- nyt %>%
  read_html() %>%
  html_element(css = nyt_css) %>%
  html_text()
dresses
# Split the scraped article text into pieces and show them as a data frame.
library(stringr)
# Split on every occurrence of "/The New York Times". With simplify = FALSE
# the result is a list holding one character vector per input string.
dressesB <- str_split(
  dresses,
  "/The New York Times",
  n = Inf,
  simplify = FALSE
)
# `dresses` is a single string, so take the first (only) list element and
# give the column an explicit name instead of an auto-generated one.
dressesC <- data.frame(text = dressesB[[1]])
# View() opens a data viewer window, so only call it in interactive sessions.
if (interactive()) {
  View(dressesC)
}
# Pull image information from the same New York Times article.
# The page is downloaded once and reused for both queries.
nyt_page <- read_html(nyt)
# `src` attribute of the first <img> element matched by the XPath.
imgsrc <- nyt_page %>%
  html_element(xpath = "//*/img") %>%
  html_attr("src")
imgsrc
# Text content of every <picture> element on the page.
nyt_page %>%
  html_elements(css = "picture") %>%
  html_text()
# Print one table from the Wikipedia article on The Game Awards.
# `B` is the page URL and `A` is the absolute XPath (ending in table[2]) of
# the table to extract.
B <- "https://en.wikipedia.org/wiki/The_Game_Awards"
A <- "/html/body/div[2]/div/div[3]/main/div[3]/div[3]/div[1]/table[2]"
html_table(html_element(read_html(B), xpath = A))
library(rvest)
# Print one table from the Wikipedia Lady Gaga discography article.
# `H` is the page URL; `G` is the absolute XPath (ending in table[2]) used to
# pick the table out of the parsed page.
H <- "https://en.wikipedia.org/wiki/Lady_Gaga_discography"
G <- "/html/body/div[2]/div/div[3]/main/div[3]/div[3]/div[1]/table[2]"
read_html(H) %>%
  html_element(xpath = G) %>%
  html_table()