-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path《中国历代人名大辞典》
70 lines (59 loc) · 1.67 KB
/
《中国历代人名大辞典》
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from pyvirtualdisplay import Display
from selenium import webdriver
from bs4 import BeautifulSoup as BS
import time, re, os, sys, urllib
from random import randint
# Directory that receives the saved pages and the failure log.
OUTPUT_DIR = "/home/tian/ldrm"
# Anchor whose text is compared with the requested page number to decide
# whether the browser is really showing that page.
CURRENT_PAGE_XPATH = "//a[@class='p_curpage']"
# Number of times a page is requested before it is given up and logged.
MAX_ATTEMPTS = 10
# Seconds waited after the first request; attempt k waits k * BASE_DELAY.
BASE_DELAY = 2
# (filename prefix, URL up to and including "p=", number of pages).
BOOKS = (
    ("A", "http://art.tze.cn/Refbook/book.aspx?bi=m.20080202-m300-w001-008&p=", 2959),
    ("B", "http://art.tze.cn/Refbook/book.aspx?bi=m.20080128-m300-w001-045&p=", 2492),
)

# NOTE: every print below is called with a single parenthesised argument so
# that it behaves identically as a Python 2 print statement and as the
# Python 3 print function.


def page_file_path(prefix, page, out_dir=OUTPUT_DIR):
    """Return the path of the HTML file for *page* (1-based) of a book.

    The file name is ``<prefix>_<page zero-padded to 4 digits>.html``.
    """
    return os.path.join(out_dir, prefix + "_" + format(page, '04d') + ".html")


def load_page(browser, url, page, max_attempts=MAX_ATTEMPTS, base_delay=BASE_DELAY):
    """Request *url* until the browser shows *page* as the current page.

    After every request the function sleeps ``base_delay * attempt`` seconds
    (2, 4, 6, ... with the defaults) and then compares the text of the
    current-page anchor with ``str(page)``.

    Returns True as soon as the texts match, False after *max_attempts*
    unsuccessful requests.  A page on which the anchor is missing counts as
    an unsuccessful request instead of raising.
    """
    expected = str(page)
    for attempt in range(1, max_attempts + 1):
        print(url)
        browser.get(url)
        time.sleep(base_delay * attempt)
        # The plural finder returns an empty list when nothing matches, so a
        # half-loaded page leads to a retry rather than an exception.
        markers = browser.find_elements_by_xpath(CURRENT_PAGE_XPATH)
        if markers and markers[0].text == expected:
            print(markers[0].text + " : " + expected)
            return True
        print("wait " + str(attempt) + " times!")
    return False


def save_book(browser, prefix, base_url, pages, out_dir=OUTPUT_DIR,
              max_attempts=MAX_ATTEMPTS, base_delay=BASE_DELAY):
    """Save pages 1..*pages* of one book as HTML files in *out_dir*.

    Each page that loads is written UTF-8 encoded to the path given by
    page_file_path().  For a page that never loads, the path it would have
    been written to is appended as one line to ``<out_dir>/log``.
    """
    for page in range(1, pages + 1):
        target = page_file_path(prefix, page, out_dir)
        if load_page(browser, base_url + str(page), page, max_attempts, base_delay):
            html = browser.page_source.encode('utf8', 'replace')
            # Binary mode: the data is already encoded bytes.
            with open(target, "wb") as f:
                f.write(html)
        else:
            with open(os.path.join(out_dir, "log"), "a") as f:
                f.write(target + "\n")


def main():
    """Start a virtual display and Firefox, then save every book in BOOKS."""
    display = Display(visible=0, size=(1920, 1080)).start()
    try:
        browser = webdriver.Firefox()
        try:
            for prefix, base_url, pages in BOOKS:
                save_book(browser, prefix, base_url, pages)
        finally:
            # Always shut the browser down, even when a page request raises.
            browser.quit()
    finally:
        display.stop()


if __name__ == "__main__":
    main()