-
Notifications
You must be signed in to change notification settings - Fork 0
/
R爬蟲與前處理.html
498 lines (456 loc) · 55.3 KB
/
R爬蟲與前處理.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
<!DOCTYPE html>
<html lang="zh-TW">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<meta name="mobile-web-app-capable" content="yes">
<title>
R爬蟲與前處理 - HackMD
</title>
<link rel="icon" type="image/png" href="https://hackmd.io/favicon.png">
<link rel="apple-touch-icon" href="https://hackmd.io/apple-touch-icon.png">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha256-916EbMg70RQy9LHiGkXzG8hSg9EdNy97GazNG/aiY1w=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" integrity="sha256-eZrrJcwDc/3uDhsdt61sL2oOBY362qM3lon1gyExkL0=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/ionicons/2.0.1/css/ionicons.min.css" integrity="sha256-3iu9jgsy9TpTwXKb7bNQzqWekRX7pPK+2OLj3R922fo=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/octicons/3.5.0/octicons.min.css" integrity="sha256-QiWfLIsCT02Sdwkogf6YMiQlj4NE84MKkzEMkZnMGdg=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.5.1/themes/prism.min.css" integrity="sha256-vtR0hSWRc3Tb26iuN2oZHt3KRUomwTufNIf5/4oeCyg=" crossorigin="anonymous" />
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@hackmd/emojify.js@2.1.0/dist/css/basic/emojify.min.css" integrity="sha256-UOrvMOsSDSrW6szVLe8ZDZezBxh5IoIfgTwdNDgTjiU=" crossorigin="anonymous" />
<style>
@import url(https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,500,500i|Source+Code+Pro:300,400,500|Source+Sans+Pro:300,300i,400,400i,600,600i|Source+Serif+Pro&subset=latin-ext);.hljs{background:#fff;color:#333;display:block;overflow-x:auto;padding:.5em}.hljs-comment,.hljs-meta{color:#969896}.hljs-emphasis,.hljs-quote,.hljs-string,.hljs-strong,.hljs-template-variable,.hljs-variable{color:#df5000}.hljs-keyword,.hljs-selector-tag,.hljs-type{color:#a71d5d}.hljs-attribute,.hljs-bullet,.hljs-literal,.hljs-number,.hljs-symbol{color:#0086b3}.hljs-built_in,.hljs-builtin-name{color:#005cc5}.hljs-name,.hljs-section{color:#63a35c}.hljs-tag{color:#333}.hljs-attr,.hljs-selector-attr,.hljs-selector-class,.hljs-selector-id,.hljs-selector-pseudo,.hljs-title{color:#795da3}.hljs-addition{background-color:#eaffea;color:#55a532}.hljs-deletion{background-color:#ffecec;color:#bd2c00}.hljs-link{text-decoration:underline}.markdown-body{word-wrap:break-word;font-size:16px;line-height:1.5}.markdown-body:after,.markdown-body:before{content:"";display:table}.markdown-body:after{clear:both}.markdown-body>:first-child{margin-top:0!important}.markdown-body>:last-child{margin-bottom:0!important}.markdown-body a:not([href]){color:inherit;text-decoration:none}.markdown-body .absent{color:#c00}.markdown-body .anchor{float:left;line-height:1;margin-left:-20px;padding-right:4px}.markdown-body .anchor:focus{outline:none}.markdown-body blockquote,.markdown-body dl,.markdown-body ol,.markdown-body p,.markdown-body pre,.markdown-body table,.markdown-body ul{margin-bottom:16px;margin-top:0}.markdown-body hr{background-color:#e7e7e7;border:0;height:.25em;margin:24px 0;padding:0}.markdown-body blockquote{border-left:.25em solid #ddd;color:#777;font-size:16px;padding:0 1em}.markdown-body blockquote>:first-child{margin-top:0}.markdown-body blockquote>:last-child{margin-bottom:0}.markdown-body kbd,.popover kbd{background-color:#fcfcfc;border:1px solid;border-color:#ccc #ccc 
#bbb;border-radius:3px;box-shadow:inset 0 -1px 0 #bbb;color:#555;display:inline-block;font-size:11px;line-height:10px;padding:3px 5px;vertical-align:middle}.markdown-body .loweralpha{list-style-type:lower-alpha}.markdown-body h1,.markdown-body h2,.markdown-body h3,.markdown-body h4,.markdown-body h5,.markdown-body h6{font-weight:600;line-height:1.25;margin-bottom:16px;margin-top:24px}.markdown-body h1 .octicon-link,.markdown-body h2 .octicon-link,.markdown-body h3 .octicon-link,.markdown-body h4 .octicon-link,.markdown-body h5 .octicon-link,.markdown-body h6 .octicon-link{color:#000;vertical-align:middle;visibility:hidden}.markdown-body h1:hover .anchor,.markdown-body h2:hover .anchor,.markdown-body h3:hover .anchor,.markdown-body h4:hover .anchor,.markdown-body h5:hover .anchor,.markdown-body h6:hover .anchor{text-decoration:none}.markdown-body h1:hover .anchor .octicon-link,.markdown-body h2:hover .anchor .octicon-link,.markdown-body h3:hover .anchor .octicon-link,.markdown-body h4:hover .anchor .octicon-link,.markdown-body h5:hover .anchor .octicon-link,.markdown-body h6:hover .anchor .octicon-link{visibility:visible}.markdown-body h1 code,.markdown-body h1 tt,.markdown-body h2 code,.markdown-body h2 tt,.markdown-body h3 code,.markdown-body h3 tt,.markdown-body h4 code,.markdown-body h4 tt,.markdown-body h5 code,.markdown-body h5 tt,.markdown-body h6 code,.markdown-body h6 tt{font-size:inherit}.markdown-body h1{font-size:2em}.markdown-body h1,.markdown-body h2{border-bottom:1px solid #eee;padding-bottom:.3em}.markdown-body h2{font-size:1.5em}.markdown-body h3{font-size:1.25em}.markdown-body h4{font-size:1em}.markdown-body h5{font-size:.875em}.markdown-body h6{color:#777;font-size:.85em}.markdown-body ol,.markdown-body ul{padding-left:2em}.markdown-body ol.no-list,.markdown-body ul.no-list{list-style-type:none;padding:0}.markdown-body ol ol,.markdown-body ol ul,.markdown-body ul ol,.markdown-body ul ul{margin-bottom:0;margin-top:0}.markdown-body 
li>p{margin-top:16px}.markdown-body li+li{padding-top:.25em}.markdown-body dl{padding:0}.markdown-body dl dt{font-size:1em;font-style:italic;font-weight:700;margin-top:16px;padding:0}.markdown-body dl dd{margin-bottom:16px;padding:0 16px}.markdown-body table{display:block;overflow:auto;width:100%;word-break:normal;word-break:keep-all}.markdown-body table th{font-weight:700}.markdown-body table td,.markdown-body table th{border:1px solid #ddd;padding:6px 13px}.markdown-body table tr{background-color:#fff;border-top:1px solid #ccc}.markdown-body table tr:nth-child(2n){background-color:#f8f8f8}.markdown-body img{background-color:#fff;box-sizing:initial;max-width:100%}.markdown-body img[align=right]{padding-left:20px}.markdown-body img[align=left]{padding-right:20px}.markdown-body .emoji{background-color:initial;max-width:none;vertical-align:text-top}.markdown-body span.frame{display:block;overflow:hidden}.markdown-body span.frame>span{border:1px solid #ddd;display:block;float:left;margin:13px 0 0;overflow:hidden;padding:7px;width:auto}.markdown-body span.frame span img{display:block;float:left}.markdown-body span.frame span span{clear:both;color:#333;display:block;padding:5px 0 0}.markdown-body span.align-center{clear:both;display:block;overflow:hidden}.markdown-body span.align-center>span{display:block;margin:13px auto 0;overflow:hidden;text-align:center}.markdown-body span.align-center span img{margin:0 auto;text-align:center}.markdown-body span.align-right{clear:both;display:block;overflow:hidden}.markdown-body span.align-right>span{display:block;margin:13px 0 0;overflow:hidden;text-align:right}.markdown-body span.align-right span img{margin:0;text-align:right}.markdown-body span.float-left{display:block;float:left;margin-right:13px;overflow:hidden}.markdown-body span.float-left span{margin:13px 0 0}.markdown-body span.float-right{display:block;float:right;margin-left:13px;overflow:hidden}.markdown-body span.float-right>span{display:block;margin:13px auto 
0;overflow:hidden;text-align:right}.markdown-body code,.markdown-body tt{background-color:#0000000a;border-radius:3px;font-size:85%;margin:0;padding:.2em 0}.markdown-body code:after,.markdown-body code:before,.markdown-body tt:after,.markdown-body tt:before{content:"\00a0";letter-spacing:-.2em}.markdown-body code br,.markdown-body tt br{display:none}.markdown-body del code{text-decoration:inherit}.markdown-body pre{word-wrap:normal}.markdown-body pre>code{background:#0000;border:0;font-size:100%;margin:0;padding:0;white-space:pre;word-break:normal}.markdown-body .highlight{margin-bottom:16px}.markdown-body .highlight pre{margin-bottom:0;word-break:normal}.markdown-body .highlight pre,.markdown-body pre{background-color:#f7f7f7;border-radius:3px;font-size:85%;line-height:1.45;overflow:auto;padding:16px}.markdown-body pre code,.markdown-body pre tt{word-wrap:normal;background-color:initial;border:0;display:inline;line-height:inherit;margin:0;max-width:auto;overflow:visible;padding:0}.markdown-body pre code:after,.markdown-body pre code:before,.markdown-body pre tt:after,.markdown-body pre tt:before{content:normal}.markdown-body .csv-data td,.markdown-body .csv-data th{font-size:12px;line-height:1;overflow:hidden;padding:5px;text-align:left;white-space:nowrap}.markdown-body .csv-data .blob-line-num{background:#fff;border:0;padding:10px 8px 9px;text-align:right}.markdown-body .csv-data tr{border-top:0}.markdown-body .csv-data th{background:#f8f8f8;border-top:0;font-weight:700}.news .alert .markdown-body blockquote{border:0;padding:0 0 0 40px}.activity-tab .news .alert .commits,.activity-tab .news .markdown-body blockquote{padding-left:0}.task-list-item{list-style-type:none}.task-list-item label{font-weight:400}.task-list-item.enabled label{cursor:pointer}.task-list-item+.task-list-item{margin-top:3px}.task-list-item-checkbox{cursor:default!important;float:left;margin:.31em 0 .2em 
-1.3em!important;vertical-align:middle}.markdown-body{max-width:758px;overflow:visible!important;padding-bottom:40px;padding-top:40px;position:relative}.markdown-body .emoji{vertical-align:top}.markdown-body pre{border:inherit!important}.markdown-body code{color:inherit!important}.markdown-body pre code .wrapper{display:-moz-inline-flex;display:-ms-inline-flex;display:-o-inline-flex;display:inline-flex}.markdown-body pre code .gutter{float:left;overflow:hidden;-webkit-user-select:none;user-select:none}.markdown-body pre code .gutter.linenumber{border-right:3px solid #6ce26c!important;box-sizing:initial;color:#afafaf!important;cursor:default;display:inline-block;min-width:20px;padding:0 8px 0 0;position:relative;text-align:right;z-index:4}.markdown-body pre code .gutter.linenumber>span:before{content:attr(data-linenumber)}.markdown-body pre code .code{float:left;margin:0 0 0 16px}.markdown-body .gist .line-numbers{border-bottom:none;border-left:none;border-top:none}.markdown-body .gist .line-data{border:none}.markdown-body .gist table{border-collapse:inherit!important;border-spacing:0}.markdown-body code[data-gist-id]{background:none;padding:0}.markdown-body code[data-gist-id]:after,.markdown-body code[data-gist-id]:before{content:""}.markdown-body code[data-gist-id] .blob-num{border:unset}.markdown-body code[data-gist-id] table{margin-bottom:unset;overflow:unset}.markdown-body code[data-gist-id] table tr{background:unset}.markdown-body[dir=rtl] pre{direction:ltr}.markdown-body[dir=rtl] code{direction:ltr;unicode-bidi:embed}.markdown-body .alert>p:last-child{margin-bottom:0}.markdown-body pre.abc,.markdown-body pre.flow-chart,.markdown-body pre.graphviz,.markdown-body pre.mermaid,.markdown-body pre.sequence-diagram,.markdown-body pre.vega{background-color:inherit;border-radius:0;overflow:visible;text-align:center;white-space:inherit}.markdown-body pre.abc>code,.markdown-body pre.flow-chart>code,.markdown-body pre.graphviz>code,.markdown-body 
pre.mermaid>code,.markdown-body pre.sequence-diagram>code,.markdown-body pre.vega>code{text-align:left}.markdown-body pre.abc>svg,.markdown-body pre.flow-chart>svg,.markdown-body pre.graphviz>svg,.markdown-body pre.mermaid>svg,.markdown-body pre.sequence-diagram>svg,.markdown-body pre.vega>svg{height:100%;max-width:100%}.markdown-body pre>code.wrap{word-wrap:break-word;white-space:pre-wrap;white-space:-moz-pre-wrap;white-space:-pre-wrap;white-space:-o-pre-wrap}.markdown-body .alert>p:last-child,.markdown-body .alert>ul:last-child{margin-bottom:0}.markdown-body summary{display:list-item}.markdown-body summary:focus{outline:none}.markdown-body details summary{cursor:pointer}.markdown-body details:not([open])>:not(summary){display:none}.markdown-body figure{margin:1em 40px}.markdown-body .mark,.markdown-body mark{background-color:#fff1a7}.vimeo,.youtube{background-color:#000;background-position:50%;background-repeat:no-repeat;background-size:contain;cursor:pointer;display:table;overflow:hidden;text-align:center}.vimeo,.youtube{position:relative;width:100%}.youtube{padding-bottom:56.25%}.vimeo img{object-fit:contain;width:100%;z-index:0}.youtube img{object-fit:cover;z-index:0}.vimeo iframe,.youtube iframe,.youtube img{height:100%;left:0;position:absolute;top:0;width:100%}.vimeo iframe,.youtube iframe{vertical-align:middle;z-index:1}.vimeo .icon,.youtube .icon{color:#fff;height:auto;left:50%;opacity:.3;position:absolute;top:50%;transform:translate(-50%,-50%);transition:opacity .2s;width:auto;z-index:0}.vimeo:hover .icon,.youtube:hover .icon{opacity:.6;transition:opacity .2s}.slideshare .inner,.speakerdeck .inner{position:relative;width:100%}.slideshare .inner iframe,.speakerdeck .inner iframe{bottom:0;height:100%;left:0;position:absolute;right:0;top:0;width:100%}.figma{display:table;padding-bottom:56.25%;position:relative;width:100%}.figma iframe{border:1px solid 
#eee;bottom:0;height:100%;left:0;position:absolute;right:0;top:0;width:100%}.markmap-container{height:300px}.markmap-container>svg{height:100%;width:100%}.MJX_Assistive_MathML{display:none}#MathJax_Message{z-index:1000!important}.ui-infobar{color:#777;margin:25px auto -25px;max-width:760px;position:relative;z-index:2}.toc .invisable-node{list-style-type:none}.ui-toc{bottom:20px;position:fixed;z-index:998}.ui-toc.both-mode{margin-left:8px}.ui-toc.both-mode .ui-toc-label{border-bottom-left-radius:0;border-top-left-radius:0;height:40px;padding:10px 4px}.ui-toc-label{background-color:#e6e6e6;border:none;color:#868686;transition:opacity .2s}.ui-toc .open .ui-toc-label{color:#fff;opacity:1;transition:opacity .2s}.ui-toc-label:focus{background-color:#ccc;color:#000;opacity:.3}.ui-toc-label:hover{background-color:#ccc;opacity:1;transition:opacity .2s}.ui-toc-dropdown{margin-bottom:20px;margin-top:20px;max-height:70vh;max-width:45vw;overflow:auto;padding-left:10px;padding-right:10px;text-align:inherit;width:25vw}.ui-toc-dropdown>.toc{max-height:calc(70vh - 100px);overflow:auto}.ui-toc-dropdown[dir=rtl] .nav{letter-spacing:.0029em;padding-right:0}.ui-toc-dropdown a{overflow:hidden;text-overflow:ellipsis;white-space:pre}.ui-toc-dropdown .nav>li>a{color:#767676;display:block;font-size:13px;font-weight:500;padding:4px 20px}.ui-toc-dropdown .nav>li:first-child:last-child>ul,.ui-toc-dropdown .toc.expand ul{display:block}.ui-toc-dropdown .nav>li>a:focus,.ui-toc-dropdown .nav>li>a:hover{background-color:initial;border-left:1px solid #000;color:#000;padding-left:19px;text-decoration:none}.ui-toc-dropdown[dir=rtl] .nav>li>a:focus,.ui-toc-dropdown[dir=rtl] .nav>li>a:hover{border-left:none;border-right:1px solid #000;padding-right:19px}.ui-toc-dropdown .nav>.active:focus>a,.ui-toc-dropdown .nav>.active:hover>a,.ui-toc-dropdown .nav>.active>a{background-color:initial;border-left:2px solid #000;color:#000;font-weight:700;padding-left:18px}.ui-toc-dropdown[dir=rtl] 
.nav>.active:focus>a,.ui-toc-dropdown[dir=rtl] .nav>.active:hover>a,.ui-toc-dropdown[dir=rtl] .nav>.active>a{border-left:none;border-right:2px solid #000;padding-right:18px}.ui-toc-dropdown .nav .nav{display:none;padding-bottom:10px}.ui-toc-dropdown .nav>.active>ul{display:block}.ui-toc-dropdown .nav .nav>li>a{font-size:12px;font-weight:400;padding-bottom:1px;padding-left:30px;padding-top:1px}.ui-toc-dropdown[dir=rtl] .nav .nav>li>a{padding-right:30px}.ui-toc-dropdown .nav .nav>li>ul>li>a{font-size:12px;font-weight:400;padding-bottom:1px;padding-left:40px;padding-top:1px}.ui-toc-dropdown[dir=rtl] .nav .nav>li>ul>li>a{padding-right:40px}.ui-toc-dropdown .nav .nav>li>a:focus,.ui-toc-dropdown .nav .nav>li>a:hover{padding-left:29px}.ui-toc-dropdown[dir=rtl] .nav .nav>li>a:focus,.ui-toc-dropdown[dir=rtl] .nav .nav>li>a:hover{padding-right:29px}.ui-toc-dropdown .nav .nav>li>ul>li>a:focus,.ui-toc-dropdown .nav .nav>li>ul>li>a:hover{padding-left:39px}.ui-toc-dropdown[dir=rtl] .nav .nav>li>ul>li>a:focus,.ui-toc-dropdown[dir=rtl] .nav .nav>li>ul>li>a:hover{padding-right:39px}.ui-toc-dropdown .nav .nav>.active:focus>a,.ui-toc-dropdown .nav .nav>.active:hover>a,.ui-toc-dropdown .nav .nav>.active>a{font-weight:500;padding-left:28px}.ui-toc-dropdown[dir=rtl] .nav .nav>.active:focus>a,.ui-toc-dropdown[dir=rtl] .nav .nav>.active:hover>a,.ui-toc-dropdown[dir=rtl] .nav .nav>.active>a{padding-right:28px}.ui-toc-dropdown .nav .nav>.active>.nav>.active:focus>a,.ui-toc-dropdown .nav .nav>.active>.nav>.active:hover>a,.ui-toc-dropdown .nav .nav>.active>.nav>.active>a{font-weight:500;padding-left:38px}.ui-toc-dropdown[dir=rtl] .nav .nav>.active>.nav>.active:focus>a,.ui-toc-dropdown[dir=rtl] .nav .nav>.active>.nav>.active:hover>a,.ui-toc-dropdown[dir=rtl] .nav .nav>.active>.nav>.active>a{padding-right:38px}.markdown-body{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica Neue,Helvetica,Roboto,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol}html[lang^=ja] 
.markdown-body{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica Neue,Helvetica,Roboto,Arial,Hiragino Kaku Gothic Pro,ヒラギノ角ゴ Pro W3,Osaka,Meiryo,メイリオ,MS Gothic,MS ゴシック,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol}html[lang=zh-tw] .markdown-body{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica Neue,Helvetica,Roboto,Arial,PingFang TC,Microsoft JhengHei,微軟正黑,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol}html[lang=zh-cn] .markdown-body{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica Neue,Helvetica,Roboto,Arial,PingFang SC,Microsoft YaHei,微软雅黑,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol}html .markdown-body[lang^=ja]{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica Neue,Helvetica,Roboto,Arial,Hiragino Kaku Gothic Pro,ヒラギノ角ゴ Pro W3,Osaka,Meiryo,メイリオ,MS Gothic,MS ゴシック,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol}html .markdown-body[lang=zh-tw]{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica Neue,Helvetica,Roboto,Arial,PingFang TC,Microsoft JhengHei,微軟正黑,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol}html .markdown-body[lang=zh-cn]{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica Neue,Helvetica,Roboto,Arial,PingFang SC,Microsoft YaHei,微软雅黑,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol}html[lang^=ja] .ui-toc-dropdown{font-family:Source Sans Pro,Helvetica,Arial,Meiryo UI,MS PGothic,MS Pゴシック,sans-serif}html[lang=zh-tw] .ui-toc-dropdown{font-family:Source Sans Pro,Helvetica,Arial,Microsoft JhengHei UI,微軟正黑UI,sans-serif}html[lang=zh-cn] .ui-toc-dropdown{font-family:Source Sans Pro,Helvetica,Arial,Microsoft YaHei UI,微软雅黑UI,sans-serif}html .ui-toc-dropdown[lang^=ja]{font-family:Source Sans Pro,Helvetica,Arial,Meiryo UI,MS PGothic,MS Pゴシック,sans-serif}html .ui-toc-dropdown[lang=zh-tw]{font-family:Source Sans Pro,Helvetica,Arial,Microsoft JhengHei UI,微軟正黑UI,sans-serif}html 
.ui-toc-dropdown[lang=zh-cn]{font-family:Source Sans Pro,Helvetica,Arial,Microsoft YaHei UI,微软雅黑UI,sans-serif}.ui-affix-toc{max-height:70vh;max-width:15vw;overflow:auto;position:fixed;top:0}.back-to-top,.expand-toggle,.go-to-bottom{color:#999;display:block;font-size:12px;font-weight:500;margin-left:10px;margin-top:10px;padding:4px 10px}.back-to-top:focus,.back-to-top:hover,.expand-toggle:focus,.expand-toggle:hover,.go-to-bottom:focus,.go-to-bottom:hover{color:#563d7c;text-decoration:none}.back-to-top,.go-to-bottom{margin-top:0}.ui-user-icon{background-position:50%;background-repeat:no-repeat;background-size:cover;border-radius:50%;display:block;height:20px;margin-bottom:2px;margin-right:5px;margin-top:2px;width:20px}.ui-user-icon.small{display:inline-block;height:18px;margin:0 0 .2em;vertical-align:middle;width:18px}.ui-infobar>small>span{line-height:22px}.ui-infobar>small .dropdown{display:inline-block}.ui-infobar>small .dropdown a:focus,.ui-infobar>small .dropdown a:hover{text-decoration:none}.ui-more-info{color:#888;cursor:pointer;vertical-align:middle}.ui-more-info .fa{font-size:16px}.ui-connectedGithub,.ui-published-note{color:#888}.ui-connectedGithub{line-height:23px;white-space:nowrap}.ui-connectedGithub a.file-path{color:#888;padding-left:22px;text-decoration:none}.ui-connectedGithub a.file-path:active,.ui-connectedGithub a.file-path:hover{color:#888;text-decoration:underline}.ui-connectedGithub .fa{font-size:20px}.ui-published-note .fa{font-size:20px;vertical-align:top}.unselectable{-webkit-user-select:none;-o-user-select:none;user-select:none}.selectable{-webkit-user-select:text;-o-user-select:text;user-select:text}.inline-spoiler-section{cursor:pointer}.inline-spoiler-section .spoiler-text{background-color:#333;border-radius:2px}.inline-spoiler-section .spoiler-text>*{opacity:0}.inline-spoiler-section 
.spoiler-img{filter:blur(10px)}.inline-spoiler-section.raw{background-color:#333;border-radius:2px}.inline-spoiler-section.raw>*{opacity:0}.inline-spoiler-section.unveil{cursor:auto}.inline-spoiler-section.unveil .spoiler-text{background-color:#3333331a}.inline-spoiler-section.unveil .spoiler-text>*{opacity:1}.inline-spoiler-section.unveil .spoiler-img{filter:none}@media print{blockquote,div,img,pre,table{page-break-inside:avoid!important}a[href]:after{font-size:12px!important}}.markdown-body.slides{color:#222;position:relative;z-index:1}.markdown-body.slides:before{background-color:currentColor;bottom:0;box-shadow:0 0 0 50vw;content:"";display:block;left:0;position:absolute;right:0;top:0;z-index:-1}.markdown-body.slides section[data-markdown]{background-color:#fff;margin-bottom:1.5em;position:relative;text-align:center}.markdown-body.slides section[data-markdown] code{text-align:left}.markdown-body.slides section[data-markdown]:before{content:"";display:block;padding-bottom:56.23%}.markdown-body.slides section[data-markdown]>div:first-child{left:1em;max-height:100%;overflow:hidden;position:absolute;right:1em;top:50%;transform:translateY(-50%)}.markdown-body.slides section[data-markdown]>ul{display:inline-block}.markdown-body.slides>section>section+section:after{border:3px solid #777;content:"";height:1.5em;position:absolute;right:1em;top:-1.5em}.site-ui-font{font-family:Source Sans Pro,Helvetica,Arial,sans-serif}html[lang^=ja] .site-ui-font{font-family:Source Sans Pro,Helvetica,Arial,Hiragino Kaku Gothic Pro,ヒラギノ角ゴ Pro W3,Osaka,Meiryo,メイリオ,MS Gothic,MS ゴシック,sans-serif}html[lang=zh-tw] .site-ui-font{font-family:Source Sans Pro,Helvetica,Arial,PingFang TC,Microsoft JhengHei,微軟正黑,sans-serif}html[lang=zh-cn] .site-ui-font{font-family:Source Sans Pro,Helvetica,Arial,PingFang SC,Microsoft 
YaHei,微软雅黑,sans-serif}body{font-smoothing:subpixel-antialiased!important;-webkit-font-smoothing:subpixel-antialiased!important;-moz-osx-font-smoothing:auto!important;-webkit-overflow-scrolling:touch;font-family:Source Sans Pro,Helvetica,Arial,sans-serif;letter-spacing:.025em}html[lang^=ja] body{font-family:Source Sans Pro,Helvetica,Arial,Hiragino Kaku Gothic Pro,ヒラギノ角ゴ Pro W3,Osaka,Meiryo,メイリオ,MS Gothic,MS ゴシック,sans-serif}html[lang=zh-tw] body{font-family:Source Sans Pro,Helvetica,Arial,PingFang TC,Microsoft JhengHei,微軟正黑,sans-serif}html[lang=zh-cn] body{font-family:Source Sans Pro,Helvetica,Arial,PingFang SC,Microsoft YaHei,微软雅黑,sans-serif}abbr[title]{border-bottom:none;text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted}abbr[data-original-title],abbr[title]{cursor:help}body.modal-open{overflow-y:auto;padding-right:0!important}svg{text-shadow:none}
</style>
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv.min.js" integrity="sha256-3Jy/GbSLrg0o9y5Z5n1uw0qxZECH7C6OQpVBgNFYa0g=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/respond.js/1.4.2/respond.min.js" integrity="sha256-g6iAfvZp+nDQ2TdTR/VVKJf3bGro4ub5fvWSWVRi2NE=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/es5-shim/4.5.9/es5-shim.min.js" integrity="sha256-8E4Is26QH0bD52WoQpcB+R/tcWQtpzlCojrybUd7Mxo=" crossorigin="anonymous"></script>
<![endif]-->
</head>
<body>
<div id="doc" class="markdown-body container-fluid comment-inner comment-enabled" data-hard-breaks="true" style=""><h1 id="R爬蟲與前處理" data-id="R爬蟲與前處理" style=""><a class="anchor hidden-xs" href="#R爬蟲與前處理" title="R爬蟲與前處理"><span class="octicon octicon-link"></span></a><span>R爬蟲與前處理</span></h1><h2 id="一、R爬蟲" data-id="一、R爬蟲" style=""><a class="anchor hidden-xs" href="#一、R爬蟲" title="一、R爬蟲"><span class="octicon octicon-link"></span></a><span>一、R爬蟲</span></h2><blockquote>
<p><span>學習資源:</span><br>
<a href="http://www.hmwu.idv.tw/web/R/G01-hmwu_R-Crawler.pdf" target="_blank" rel="noopener"><span>http://www.hmwu.idv.tw/web/R/G01-hmwu_R-Crawler.pdf</span></a><br>
<a href="https://blog.gtwang.org/r/rvest-web-scraping-with-r/" target="_blank" rel="noopener"><span>https://blog.gtwang.org/r/rvest-web-scraping-with-r/</span></a></p>
</blockquote><h3 id="1-1-rvest套件簡介" data-id="1-1-rvest套件簡介" style=""><a class="anchor hidden-xs" href="#1-1-rvest套件簡介" title="1-1-rvest套件簡介"><span class="octicon octicon-link"></span></a><span>1-1 rvest套件簡介</span></h3><p><span>在rvest中,可以使用CSS選擇器或XPath表達式來定位HTML元素,進而擷取其中的文本內容或其他資訊。以下是一些示範程式碼:</span></p><pre><code>#使用CSS選擇器擷取文本內容:
library(rvest)
# Download the Baidu home page and print the text of its title element.
url <- "https://www.baidu.com"
page <- read_html(url)
title <- html_text(html_node(page, "title"))
print(title)
</code></pre><p><span>上述程式碼中,html_node() 函數用於定位HTML元素,其中的 “title” 就是CSS選擇器,表示標題元素。html_text() 函數則用於擷取元素的文本內容。</span></p><pre><code>#使用XPath表達式擷取文本內容:
library(rvest)
# Extract the main page heading of a Wikipedia article with an XPath expression.
# NOTE(review): on Wikipedia the page heading is the element with id "firstHeading",
# which sits outside "mw-content-text"; an XPath rooted at "mw-content-text" is
# expected to match nothing. Confirm against the live page layout.
url <- "https://en.wikipedia.org/wiki/R_(programming_language)"
page <- read_html(url)
# html_text() is applied to the heading element itself, so text wrapped in child
# elements of the heading is included as well.
title <- page %>% html_nodes(xpath = '//*[@id="firstHeading"]') %>% html_text()
print(title)
</code></pre><p><span>上述程式碼中,html_nodes() 函數用於定位HTML元素,其中的 xpath 參數就是XPath表達式,表示標題元素的路徑。注意,XPath表達式是用單引號引起來的字串。同樣地,html_text() 函數用於擷取元素的文本內容。</span></p><pre><code>#其他擷取文本內容的程式碼:
library(rvest)
library(stringr) # provides str_match(); rvest alone does not make it available
# Read the whole page as text and extract the first IPv4-like pattern with a regular expression.
url <- "https://www.taiwanstat.com/realtime/power/"
page <- read_html(url)
text <- page %>% html_text()
pattern <- "(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"
# str_match() returns a character matrix: column 1 is the full match and column 2 the
# first capture group; row 1 belongs to the single input string. The value is NA when
# the pattern does not occur in the text.
ip_address <- str_match(text, pattern)[1, 2]
print(ip_address)
</code></pre><p><span>上述程式碼中,html_text() 函數用於擷取整個HTML文件的文本內容,然後使用正則表達式來匹配IP地址的模式,最後使用 str_match() 函數來擷取匹配到的IP地址。</span></p><pre><code>library(rvest) # 爬蟲相關套件
library(httr) # 爬蟲相關套件
library(dplyr) # 資料處理套件
library(lubridate) # 處理日期變數的套件
library(stringr) # 處理字串的套件
</code></pre><h3 id="1-2-網頁原始碼說明" data-id="1-2-網頁原始碼說明" style=""><a class="anchor hidden-xs" href="#1-2-網頁原始碼說明" title="1-2-網頁原始碼說明"><span class="octicon octicon-link"></span></a><span>1-2 網頁原始碼說明</span></h3><p><span>HTML(HyperText Markup Language)是一種標記語言,用於描述網頁的結構和內容。HTML文件通常由標記(tag)、屬性(attribute)、元素(element)和內容(content)等組成。以下是HTML的基本組成和結構:</span></p><ul>
<li><span>標記(tag):HTML標記是由一對符號(&lt;&gt;)組成的,通常包括開始標記(opening tag)和結束標記(closing tag)。</span><br>
<span>例如, &lt;p&gt;是一個開始標記,表示一個段落的開始,&lt;/p&gt;是一個結束標記,表示一個段落的結束。</span></li>
<li><span>屬性(attribute):HTML標記可以包含一個或多個屬性,用於指定元素的額外訊息,如ID、Class、Style等。屬性通常位於開始標記中,例如&lt;img src="image.jpg" alt="description"&gt;,其中src和alt就是屬性。</span></li>
<li><span>元素(element):HTML元素是由標記和屬性組成的,用於描述文檔的結構和內容。例如,&lt;p&gt;和&lt;/p&gt;構成一個段落元素,用於包含一段文本內容。</span></li>
<li><span>內容(content):HTML元素的內容是指出現在開始標記和結束標記之間的文本內容或其他元素。例如,&lt;p&gt;Hello, world!&lt;/p&gt;中的Hello, world!就是段落元素的內容。</span></li>
</ul><p><span>HTML文件的結構可以描述為樹形結構,也就是DOM(Document Object Model)。HTML文件的根節點是&lt;html&gt;元素,該元素包含兩個子元素:&lt;head&gt;和&lt;body&gt;。&lt;head&gt;元素用於包含網頁的元訊息,如標題、引用的CSS和JavaScript等。&lt;body&gt;元素用於包含網頁的主要內容,如文字、圖片、表格等。在&lt;body&gt;元素中,可以包含多個子元素,如&lt;p&gt;、&lt;img&gt;、&lt;table&gt;等,這些子元素也可以再包含子元素。通過DOM模型,可以對HTML文件的各個元素進行操作和訪問,實現動態效果。</span></p><p><img src="https://i.imgur.com/qSyMAjI.png" alt="" loading="lazy"><br>
<span>(來源:</span><a href="https://blog.51cto.com/u_15441270/4674923" target="_blank" rel="noopener"><span>https://blog.51cto.com/u_15441270/4674923</span></a><span>)</span></p><blockquote>
<p><span>舉例來說:</span><br>
<span>想爬取網頁上看到的某個標題,必須找到那個標題在網頁原始碼的「地址」,例如下方案例標題的</span><br>
<span>【CSS選擇器】 #page-top > div > div:nth-child(2) > div > div > article > header > h1</span><br>
<span>【XPath】 //*[@id="page-top"]/div/div[2]/div/div/article/header/h1</span><br>
<span>【outerHTML】</span><code>&lt;h1 class="article-title"&gt;王世堅給賴清德難看?公開喊「不接受誠信條款」&lt;/h1&gt;</code></p>
</blockquote><p><img src="https://i.imgur.com/TOlOFrA.png" alt="" loading="lazy"></p><h4 id="因此我們爬蟲的步驟應該如下" data-id="因此我們爬蟲的步驟應該如下"><a class="anchor hidden-xs" href="#因此我們爬蟲的步驟應該如下" title="因此我們爬蟲的步驟應該如下"><span class="octicon octicon-link"></span></a><span>因此我們爬蟲的步驟應該如下:</span></h4><ul>
<li><span>STEP1:檢視網頁結構</span></li>
</ul><p><img src="https://i.imgur.com/IkAoajw.png" alt="" loading="lazy"><br>
<em><span>(圖片來源:</span><a href="https://medium.com/@troy801125/web-%E4%BB%8B%E7%B4%B9-47c667043d72" target="_blank" rel="noopener"><span>https://medium.com/@troy801125/web-介紹-47c667043d72</span></a><span>)</span></em></p><p><span>有很多方法可以把網頁原始碼弄下來,有這些選項:</span><br>
<img src="https://i.imgur.com/mkqAgnM.png" alt="" loading="lazy"></p><ul>
<li><span>STEP2:運用R的一些套件把網頁html抓下來</span><br>
<span>例如rvest、RSelenium,可以讓網頁上出現的東東,在你的RStudio上呈現</span></li>
</ul><p><span>可能遭遇的困難:</span><br>
<span>(1) 網頁標籤沒有規律</span><br>
<span>(2) 遇到需要登入或有網頁偏好設定才能訪問的網頁(解決方式:需要帶著cookie)</span><br>
<span>(3) 網頁會偵測到有人大量爬取資料而封鎖對方(解決方式:切換IP位址、設定Request Headers、設定User-Agent、</span><br>
<span>設定延遲時間、使用Headless Browser)</span></p><blockquote>
<p><span>這就是各個網站可能有的反爬蟲機制</span><br>
<a href="https://steam.oxxostudio.tw/category/python/spider/crack-spider.html" target="_blank" rel="noopener"><span>https://steam.oxxostudio.tw/category/python/spider/crack-spider.html</span></a></p>
</blockquote><ul>
<li><span>STEP3:以迴圈處理,爬取大量網頁</span></li>
<li><span>STEP4:存放入儲存格</span></li>
</ul><h3 id="1-3-實戰演練國防部即時軍事" data-id="1-3-實戰演練國防部即時軍事" style=""><a class="anchor hidden-xs" href="#1-3-實戰演練國防部即時軍事" title="1-3-實戰演練國防部即時軍事"><span class="octicon octicon-link"></span></a><span>1-3 實戰演練:國防部即時軍事</span></h3><pre><code># 下載 國防部即時軍事 首頁
# Fetch the page by handing the address straight to read_html()
page.source <- read_html("https://www.mnd.gov.tw/PublishTable.aspx?Types=%E5%8D%B3%E6%99%82%E8%BB%8D%E4%BA%8B%E5%8B%95%E6%85%8B&title=%E5%9C%8B%E9%98%B2%E6%B6%88%E6%81%AF")
# Equivalent form: store the address in a variable first
url <- "https://www.mnd.gov.tw/PublishTable.aspx?Types=%E5%8D%B3%E6%99%82%E8%BB%8D%E4%BA%8B%E5%8B%95%E6%85%8B&title=%E5%9C%8B%E9%98%B2%E6%B6%88%E6%81%AF"
page.source <- read_html(url)
page.source
# Select a headline by element id, then select the headlines by class
title <- page.source %>% html_nodes("#ctl00_ContentPlaceHolder1_RtTable_ctl01_LinkDetail")
title %>% html_text()
title <- page.source %>% html_nodes(".w-90")
title %>% html_text()
# Select the dates shown with the headlines
title.date <- page.source %>% html_nodes(".w-10")
title.date %>% html_text()
# The links cannot be retrieved this way; inspecting the site shows it is a dynamic page
title.href <- title %>% html_attr("href")
title.href
</code></pre><blockquote>
<p><span>遇到動態網頁,可以利用其他套件,模擬人類點擊滑動網頁的動作</span><br>
<span>譬如用RSelenium:</span><a href="https://hackmd.io/@LHB-0222/RSelenium" target="_blank" rel="noopener"><span>https://hackmd.io/@LHB-0222/RSelenium</span></a><br>
<span>其他動態網頁案例:</span><a href="http://jhsjk.people.cn/" target="_blank" rel="noopener"><span>http://jhsjk.people.cn/</span></a></p>
</blockquote><h3 id="1-4-實戰演練:爬取財經新聞" data-id="1-4-實戰演練:爬取財經新聞" style=""><a class="anchor hidden-xs" href="#1-4-實戰演練:爬取財經新聞" title="1-4-實戰演練:爬取財經新聞"><span class="octicon octicon-link"></span></a><span>1-4 實戰演練:爬取財經新聞</span></h3><blockquote>
<p><span>引用:</span><a href="http://rstudio-pubs-static.s3.amazonaws.com/500333_ee4bf68174584d30b6d9a1596e21c6ac.html" target="_blank" rel="noopener"><span>http://rstudio-pubs-static.s3.amazonaws.com/500333_ee4bf68174584d30b6d9a1596e21c6ac.html</span></a></p>
</blockquote><pre><code>url <- "http://blog.moneydj.com/news/"
# Download and parse the front page, then print it for a quick look.
doc <- read_html(url, encoding = "UTF-8")
doc
# Every headline is an <a> inside an element carrying both title classes;
# select those anchors once and reuse them for the text and the link.
title.anchors <- html_nodes(html_nodes(doc, ".entry-title.mh-loop-title"), "a")
# Headline text
header <- html_text(title.anchors)
head(header, n = 1L) # sanity check: first headline
# Link to each article page
link <- html_attr(title.anchors, "href")
# Publication time shown next to each headline
d <- html_text(html_nodes(doc, "span.mh-meta-date.updated"))
d
head(link, n = 1L) # sanity check: first link
# Start with an empty object; the text of every article is appended to it.
article.all <- c()
# seq_along() yields an empty sequence when no links were found, whereas
# 1:length(link) would iterate over c(1, 0) and fail inside read_html().
for (i in seq_along(link)) {
  doc.a <- read_html(link[i])
  # Join the text of all <p> nodes in the article body into one string.
  article <- doc.a %>%
    html_nodes("div.entry-content.mh-clearfix") %>%
    html_nodes("p") %>%
    html_text() %>%
    str_c(collapse = "")
  article.all <- append(article.all, article)
}
# Convert to a data.frame so the result is easy to analyse later.
df <- data.frame(title = header, content = article.all)
# Store the publication times as a plain character column; wrapping them in
# as.data.frame() would nest a whole data frame inside the column.
df$pubtime <- d
# Open the viewer last so that it shows the pubtime column as well.
View(df)
</code></pre><h3 id="1-5-練習:爬取中時新聞" data-id="1-5-練習:爬取中時新聞" style=""><a class="anchor hidden-xs" href="#1-5-練習:爬取中時新聞" title="1-5-練習:爬取中時新聞"><span class="octicon octicon-link"></span></a><span>1-5 練習:爬取中時新聞</span></h3><pre><code>url = "https://www.chinatimes.com/search/%E7%8E%8B%E4%B8%96%E5%A0%85?page=1&chdtv"
#首先取得第一頁的所有新聞連結
links_data <- read_html(url) %>% html_nodes(".title > a") %>% html_attr('href')
#試了好多次html_nodes 例如.title
#body > div.wrapper > div > div.column-wrapper.clear-fix > div > section > div.item-list.article-list > ul > li:nth-child(2) > div > div > div.col > h3
#最終找到.title > a 最符合想要的結果
</code></pre><p><img src="https://i.imgur.com/SWNJsVj.png" alt="" loading="lazy"></p><pre><code>#做出三個資料框框,儲存時間
# Accumulators for the body text, headline and timestamp of each article.
article.all <- c()
header.all <- c()
time.all <- c()
# seq_along() skips the loop when the search page returned no links;
# 1:length(links_data) would iterate over c(1, 0) and fail inside read_html().
for (i in seq_along(links_data)) {
  doc.a <- read_html(links_data[i])
  # Body text: all <p> nodes inside .article-body, joined into one string
  article <- doc.a %>%
    html_nodes(".article-body") %>%
    html_nodes("p") %>%
    html_text() %>%
    str_c(collapse = "")
  article.all <- append(article.all, article)
  # Headline
  header <- doc.a %>%
    html_nodes(".article-title") %>%
    html_text() %>%
    str_c(collapse = "")
  header.all <- append(header.all, header)
  # Timestamp text taken from every .date node, joined into one string
  tim <- doc.a %>%
    html_nodes(".date") %>%
    html_text() %>%
    str_c(collapse = "")
  time.all <- append(time.all, tim)
}
# One row per article.
df <- data.frame(title = header.all, pubtime = time.all, content = article.all)
View(df)
</code></pre><p><strong><span>上面只是取得一頁的資料,如果要獲取所有頁面的資料呢?</span></strong></p><p><span>首先觀察一下每個網頁的連結</span></p><blockquote>
<p><a href="https://www.chinatimes.com/search/%E7%8E%8B%E4%B8%96%E5%A0%85?page=86&chdtv" target="_blank" rel="noopener"><span>https://www.chinatimes.com/search/王世堅?page=86&chdtv</span></a></p>
</blockquote><p><span>是不是只改了page的頁數?</span><br>
<span>所以我們要創造一個更大的迴圈,把每一頁都跑一遍</span></p><pre><code>url = "https://www.chinatimes.com/search/%E7%8E%8B%E4%B8%96%E5%A0%85?page=1&chdtv"
library(glue)
# glue() substitutes the value of w into the {w} placeholder,
# so the 86 page URLs do not have to be typed by hand.
article.all <- c()
header.all <- c()
time.all <- c()
for (w in 1:86) { # outer loop: one pass per search-result page
  url <- glue("https://www.chinatimes.com/search/%E7%8E%8B%E4%B8%96%E5%A0%85?page={w}&chdtv")
  links_data <- read_html(url) %>% html_nodes(".title > a") %>% html_attr('href')
  # Inner loop: one pass per article linked from this page.
  # seq_along() simply skips a page that yields no links; 1:length(links_data)
  # would iterate over c(1, 0) there and abort the whole crawl.
  for (i in seq_along(links_data)) {
    doc.a <- read_html(links_data[i])
    # Body text: all <p> nodes inside .article-body, joined into one string
    article <- doc.a %>%
      html_nodes(".article-body") %>%
      html_nodes("p") %>%
      html_text() %>%
      str_c(collapse = "")
    article.all <- append(article.all, article)
    # Headline
    header <- doc.a %>%
      html_nodes(".article-title") %>%
      html_text() %>%
      str_c(collapse = "")
    header.all <- append(header.all, header)
    # Timestamp text taken from every .date node, joined into one string
    tim <- doc.a %>%
      html_nodes(".date") %>%
      html_text() %>%
      str_c(collapse = "")
    time.all <- append(time.all, tim)
  }
}
# One row per article across all pages.
df <- data.frame(title = header.all, pubtime = time.all, content = article.all)
View(df)
</code></pre><h2 id="二、前處理" data-id="二、前處理" style=""><a class="anchor hidden-xs" href="#二、前處理" title="二、前處理"><span class="octicon octicon-link"></span></a><span>二、前處理</span></h2><h3 id="2-1-整理格式" data-id="2-1-整理格式" style=""><a class="anchor hidden-xs" href="#2-1-整理格式" title="2-1-整理格式"><span class="octicon octicon-link"></span></a><span>2-1 整理格式</span></h3><p><span>這是我們剛剛爬的資料,有811筆,因為爬取時總會爬到一些不乾淨,或是資料格式怪怪的內容,我們要進行前處理。</span><br>
<img src="https://i.imgur.com/UG9yoaJ.png" alt="" loading="lazy"></p><p><span>想想看這份資料目前有甚麼問題?</span></p><ol>
<li><span>裡面爬到跟王世堅沒關係的文章</span></li>
<li><span>時間格式超級怪</span></li>
<li><span>有沒有重複的?</span></li>
</ol><p><span>請大家試著用前幾周教過的,處理資料的方式,刪掉title裡沒有王世堅的文章。</span></p><pre><code>List <- grep("王世堅",df$title)
data <- df[c(List),]
View(data)
</code></pre><p><span>請大家試著留下pubtime中真正的時間,可以回去爬蟲階段,看看網頁的構造</span><br>
<img src="https://i.imgur.com/lBR6CH0.png" alt="" loading="lazy"></p><pre><code>data$pubtime <- substr(data$pubtime, start = 1, stop = 10)
</code></pre><p><img src="https://i.imgur.com/iGCMhdj.png" alt="" loading="lazy"></p><pre><code>#刪除重複
# duplicated() returns one logical per row; TRUE marks a row that repeats an earlier one.
duplicated(data)
# Inspect the repeated rows. The trailing comma selects rows; without it the
# logical vector is treated as a column selector and the call fails.
data[duplicated(data), ]
# Inspect the rows that are left once exact duplicates are dropped.
data[!duplicated(data), ]
# Keep only the first occurrence of each article body, and assign the result
# back so the duplicates are actually removed from data.
data <- data[!duplicated(data$content), ]
</code></pre><h3 id="2-2-抓出我們想要的資料" data-id="2-2-抓出我們想要的資料" style=""><a class="anchor hidden-xs" href="#2-2-抓出我們想要的資料" title="2-2-抓出我們想要的資料"><span class="octicon octicon-link"></span></a><span>2-2 抓出我們想要的資料</span></h3><p><span>假設今天我想知道王世堅跟柯文哲之間都發生什麼,送過甚麼禮物,講過甚麼話,我們應該怎麼篩選出王世堅跟柯文哲的資料?</span></p><p><span>可以直接簡單粗暴的蒐柯文哲嗎?只要有柯文哲三個字就能找到所有關於柯文哲的新聞嗎?</span></p><pre><code>List <- grep("柯文哲",df$content)
data <- df[c(List),]
View(data)
</code></pre><p><span>以研究範圍的設定為準。例如當初我們搜中時,就是以王世堅來搜,不是用鬼娃恰吉之類的綽號,因為新聞平台在每篇新聞中,肯定會提到一次王世堅本名,不然有些民眾會看不懂。</span><br>
<span>但如果我們想做的是王世堅跟柯文哲之間的互動狀況,王世堅很有可能叫柯文哲柯P、現代施琅之類的有的沒的,所以在搜的時候,可以多試幾個關鍵詞。</span></p><blockquote>
<p><span>通常我們會把指涉同一個人的不同稱呼,合併為一個名詞,例如習近平、習近平總書記,合併為習近平(避免後續斷詞步驟時,呈現出來的詞頻不夠集中)</span><br>
<span>其它案例:蔡英文、蔡英文總統、小英總統,三個都是指涉蔡英文,但有親暱之分</span></p>
</blockquote><h2 id="三、練習:爬百度上的習近平搜尋結果" data-id="三、練習:爬百度上的習近平搜尋結果" style=""><a class="anchor hidden-xs" href="#三、練習:爬百度上的習近平搜尋結果" title="三、練習:爬百度上的習近平搜尋結果"><span class="octicon octicon-link"></span></a><span>三、練習:爬百度上的習近平搜尋結果</span></h2><ol>
<li><span>爬蟲</span><br>
<span>請從這個網站</span><br>
<span>url <- "</span><a href="https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E7%BF%92%E8%BF%91%E5%B9%B3" target="_blank" rel="noopener"><span>https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=習近平</span></a><span>"</span></li>
</ol><p><span>爬這一頁搜尋結果的標題,以及搜尋結果的摘要(這頁就好,不需用到迴圈)</span><br>
<span>會長大概這樣(上課時的搜尋結果會與這個圖不同)</span><br>
<img src="https://i.imgur.com/Kg0VAj0.png" alt="" loading="lazy"></p><ol start="2">
<li><span>清資料</span><br>
<span>請清除所有重複資料</span><br>
<span>把習近平主席、習近平總書記等變形詞,替換成習近平</span></li>
</ol><pre><code>webtitle <- read_html(url)%>% html_nodes(".news-title_1YtI1")%>%
html_text()
abstract <- read_html(url)%>% html_nodes(" span.c-font-normal.c-color-text ")%>%
html_text()
d <- data.frame(title = webtitle, abs = abstract)
View(d)
</code></pre><h2 id="四、回家作業" data-id="四、回家作業" style=""><a class="anchor hidden-xs" href="#四、回家作業" title="四、回家作業"><span class="octicon octicon-link"></span></a><span>四、回家作業</span></h2><p><span>把剛剛的百度搜尋習近平,加上迴圈,爬10頁的搜尋結果。</span><br>
<span>一樣爬標題連結就好,無須點進標題內文爬蟲。</span></p><blockquote>
<p><span>hint:這是第三頁與第四頁的網址,觀察一下差異,再利用不同之處,加上迴圈處理</span><br>
<a href="https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E7%BF%92%E8%BF%91%E5%B9%B3&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=20" target="_blank" rel="noopener"><span>https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=習近平&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=20</span></a><br>
<a href="https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E7%BF%92%E8%BF%91%E5%B9%B3&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=30" target="_blank" rel="noopener"><span>https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=習近平&x_bfe_rqs=03E80&x_bfe_tjscore=0.100000&tngroupname=organic_news&newVideo=12&goods_entry_switch=1&rsv_dl=news_b_pn&pn=30</span></a></p>
</blockquote></div>
<div class="ui-toc dropup unselectable hidden-print" style="display:none;">
<div class="pull-right dropdown">
<a id="tocLabel" class="ui-toc-label btn btn-default" data-toggle="dropdown" href="#" role="button" aria-haspopup="true" aria-expanded="false" title="Table of content">
<i class="fa fa-bars"></i>
</a>
<ul id="ui-toc" class="ui-toc-dropdown dropdown-menu" aria-labelledby="tocLabel">
<div class="toc"><ul class="nav">
<li class=""><a href="#R爬蟲與前處理" title="R爬蟲與前處理">R爬蟲與前處理</a><ul class="nav">
<li class=""><a href="#一、R爬蟲" title="一、R爬蟲">一、R爬蟲</a><ul class="nav">
<li class=""><a href="#1-1-rvest套件簡介" title="1-1 rvest套件簡介">1-1 rvest套件簡介</a></li>
<li><a href="#1-2-網頁原始碼說明" title="1-2 網頁原始碼說明">1-2 網頁原始碼說明</a></li>
<li><a href="#1-3-實戰演練國防部即時軍事" title="1-3 實戰演練:國防部即時軍事">1-3 實戰演練:國防部即時軍事</a></li>
<li><a href="#1-4-實戰演練:爬取財經新聞" title="1-4 實戰演練:爬取財經新聞">1-4 實戰演練:爬取財經新聞</a></li>
<li><a href="#1-5-練習:爬取中時新聞" title="1-5 練習:爬取中時新聞">1-5 練習:爬取中時新聞</a></li>
</ul>
</li>
<li><a href="#二、前處理" title="二、前處理">二、前處理</a><ul class="nav">
<li><a href="#2-1-整理格式" title="2-1 整理格式">2-1 整理格式</a></li>
<li><a href="#2-2-抓出我們想要的資料" title="2-2 抓出我們想要的資料">2-2 抓出我們想要的資料</a></li>
</ul>
</li>
<li><a href="#三、練習:爬百度上的習近平搜尋結果" title="三、練習:爬百度上的習近平搜尋結果">三、練習:爬百度上的習近平搜尋結果</a></li>
<li><a href="#四、回家作業" title="四、回家作業">四、回家作業</a></li>
</ul>
</li>
</ul>
</div><div class="toc-menu"><a class="expand-toggle" href="#">全部展開</a><a class="back-to-top" href="#">回到頂部</a><a class="go-to-bottom" href="#">移至底部</a></div>
</ul>
</div>
</div>
<div id="ui-toc-affix" class="ui-affix-toc ui-toc-dropdown unselectable hidden-print" data-spy="affix" style="top:17px;display:none;">
<div class="toc"><ul class="nav">
<li class=""><a href="#R爬蟲與前處理" title="R爬蟲與前處理">R爬蟲與前處理</a><ul class="nav">
<li class=""><a href="#一、R爬蟲" title="一、R爬蟲">一、R爬蟲</a><ul class="nav">
<li class=""><a href="#1-1-rvest套件簡介" title="1-1 rvest套件簡介">1-1 rvest套件簡介</a></li>
<li><a href="#1-2-網頁原始碼說明" title="1-2 網頁原始碼說明">1-2 網頁原始碼說明</a></li>
<li><a href="#1-3-實戰演練國防部即時軍事" title="1-3 實戰演練:國防部即時軍事">1-3 實戰演練:國防部即時軍事</a></li>
<li><a href="#1-4-實戰演練:爬取財經新聞" title="1-4 實戰演練:爬取財經新聞">1-4 實戰演練:爬取財經新聞</a></li>
<li><a href="#1-5-練習:爬取中時新聞" title="1-5 練習:爬取中時新聞">1-5 練習:爬取中時新聞</a></li>
</ul>
</li>
<li><a href="#二、前處理" title="二、前處理">二、前處理</a><ul class="nav">
<li><a href="#2-1-整理格式" title="2-1 整理格式">2-1 整理格式</a></li>
<li><a href="#2-2-抓出我們想要的資料" title="2-2 抓出我們想要的資料">2-2 抓出我們想要的資料</a></li>
</ul>
</li>
<li><a href="#三、練習:爬百度上的習近平搜尋結果" title="三、練習:爬百度上的習近平搜尋結果">三、練習:爬百度上的習近平搜尋結果</a></li>
<li><a href="#四、回家作業" title="四、回家作業">四、回家作業</a></li>
</ul>
</li>
</ul>
</div><div class="toc-menu"><a class="expand-toggle" href="#">全部展開</a><a class="back-to-top" href="#">回到頂部</a><a class="go-to-bottom" href="#">移至底部</a></div>
</div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.1.1/jquery.min.js" integrity="sha256-hVVnYaiADRTO2PzUGmuLJr8BLUSjGIZsDYGmIJLv2b8=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha256-U5ZEeKfGNOja007MMD3YBI0A3OSZOQbeG6z2f2Y0hu8=" crossorigin="anonymous" defer></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/gist-embed/2.6.0/gist-embed.min.js" integrity="sha256-KyF2D6xPIJUW5sUDSs93vWyZm+1RzIpKCexxElmxl8g=" crossorigin="anonymous" defer></script>
<script>
// Rendered note container; windowResize() measures it to position the TOC.
var markdown = $(".markdown-body");

/**
 * Make every in-page link (href starting with "#") scroll smoothly to its
 * target instead of jumping.
 *
 * The target is looked up by id with getElementById rather than by passing
 * the hash to $() as a selector: heading ids on this page contain CJK
 * characters, punctuation and leading digits, and the hash read from a link
 * may be percent-encoded, so the raw hash is not a usable CSS selector and
 * the lookup could throw inside the click handler.
 */
function smoothHashScroll() {
  // Return the element a hash such as "#some-id" refers to, or null.
  function findHashTarget(hash) {
    if (!hash || hash.length < 2) return null;
    var id = hash.slice(1);
    try {
      id = decodeURIComponent(id);
    } catch (err) {
      // Malformed escape sequence: fall back to the raw id.
    }
    return document.getElementById(id);
  }

  $("a[href^='#']").each(function () {
    // Links whose href is just "#" have an empty hash and are left alone.
    if (!this.hash) return;
    $(this).on('click', function (e) {
      // store hash
      var hash = this.hash;
      var target = findHashTarget(hash);
      // Unknown target: keep the browser's default behaviour.
      if (!target) return;
      // prevent default anchor click behavior
      e.preventDefault();
      // animate
      $('body, html').stop(true, true).animate({
        scrollTop: $(target).offset().top
      }, 100, "linear", function () {
        // when done, add hash to url (what the default click would have done)
        window.location.hash = hash;
      });
    });
  });
}
smoothHashScroll();
// Shared jQuery handles for the two table-of-contents variants:
// the dropdown TOC (.ui-toc) and the side TOC (.ui-affix-toc).
var toc = $('.ui-toc');
var tocAffix = $('.ui-affix-toc');
var tocDropdown = $('.ui-toc-dropdown');
// Clicks inside a TOC dropdown must not bubble up to ancestor handlers.
tocDropdown.on('click', function (evt) {
  evt.stopPropagation();
});
// Whether the side TOC is the one being shown; windowResize() updates this.
var enoughForAffixToc = true;
/**
 * (Re)initialise Bootstrap scrollspy on <body> and show whichever TOC
 * variant matches the current value of enoughForAffixToc.
 */
function generateScrollspy() {
  var $body = $(document.body);
  $body.scrollspy({
    target: ''
  });
  $body.scrollspy('refresh');
  if (!enoughForAffixToc) {
    // Dropdown TOC is shown instead of the side TOC.
    tocAffix.hide();
    toc.show();
  } else {
    // Side TOC is shown instead of the dropdown TOC.
    toc.hide();
    tocAffix.show();
  }
  // Fire a scroll event on <body> after the TOC swap.
  $body.trigger('scroll');
}
/**
 * Reposition both TOC variants relative to the note container and, when the
 * available side margin crosses the 133px threshold, switch between them.
 */
function windowResize() {
  // Dropdown TOC: align with the right edge of the note's content box.
  var contentPaddingRight = parseFloat(markdown.css('padding-right'));
  var rightOffset = $(window).width() - (markdown.offset().left + markdown.outerWidth() - contentPaddingRight);
  toc.css('right', rightOffset + 'px');

  // Side TOC: needs at least 133px of margin beside the note
  // (per the original comment: iPad or wider devices).
  var sideMargin = (markdown.parent().outerWidth() - markdown.outerWidth()) / 2;
  var wideEnough = sideMargin >= 133;
  if (wideEnough) {
    // Half of the difference between the side TOC's outer and inner width.
    var affixInset = (tocAffix.outerWidth() - tocAffix.width()) / 2;
    var leftOffset = markdown.offset().left + markdown.outerWidth() - affixInset;
    tocAffix.css('left', leftOffset + 'px');
  }

  // Only rebuild scrollspy when the chosen TOC variant actually changes.
  if (wideEnough != enoughForAffixToc) {
    enoughForAffixToc = wideEnough;
    generateScrollspy();
  }
}
// Recompute the TOC placement whenever the viewport size changes.
$(window).on('resize', function () {
  windowResize();
});
// Once the DOM is ready, do an initial layout pass and set up scrollspy.
$(function () {
  windowResize();
  generateScrollspy();
});
/** Clear the fragment part of the current URL. */
function removeHash() {
  var loc = window.location;
  loc.hash = '';
}
// TOC menu links: "back to top" and "go to bottom".
var backtotop = $('.back-to-top');
var gotobottom = $('.go-to-bottom');

backtotop.on('click', function (evt) {
  // Handle the click entirely here: no navigation, no bubbling.
  evt.preventDefault();
  evt.stopPropagation();
  if (scrollToTop) {
    scrollToTop();
  }
  removeHash();
});

gotobottom.on('click', function (evt) {
  evt.preventDefault();
  evt.stopPropagation();
  if (scrollToBottom) {
    scrollToBottom();
  }
  removeHash();
});

// TOC expand/collapse toggle; the TOC starts collapsed.
var toggle = $('.expand-toggle');
var tocExpand = false;

// Apply the initial (collapsed) state before any click happens.
checkExpandToggle();

toggle.on('click', function (evt) {
  evt.preventDefault();
  evt.stopPropagation();
  // Flip the state, then re-apply it to the TOC and the toggle label.
  tocExpand = !tocExpand;
  checkExpandToggle();
});
/**
 * Apply the current value of tocExpand to the page: add or remove the
 * 'expand' class on every TOC inside a dropdown and update the toggle label.
 *
 * The labels are written in Traditional Chinese to match the label already
 * present in the markup (全部展開) and the sibling menu links (回到頂部 /
 * 移至底部). This function runs once on load, so English labels here would
 * replace the Chinese one as soon as the page opens.
 */
function checkExpandToggle () {
  var $tocs = $('.ui-toc-dropdown .toc');
  var $toggles = $('.expand-toggle');
  if (tocExpand) {
    $tocs.addClass('expand');
    $toggles.text('全部收合');
  } else {
    $tocs.removeClass('expand');
    $toggles.text('全部展開');
  }
}
/** Animate the page to the very top over 100ms, cancelling any running scroll animation. */
function scrollToTop() {
  var $page = $('body, html');
  var destination = { scrollTop: 0 };
  $page.stop(true, true).animate(destination, 100, "linear");
}
/** Animate the page to the bottom (the body's scrollHeight) over 100ms, cancelling any running scroll animation. */
function scrollToBottom() {
  var $page = $('body, html');
  var destination = { scrollTop: document.body.scrollHeight };
  $page.stop(true, true).animate(destination, 100, "linear");
}
</script>
</body>
</html>