-
Notifications
You must be signed in to change notification settings - Fork 0
/
wiki_info_box.py
89 lines (66 loc) · 2.11 KB
/
wiki_info_box.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# TODO:
# - convert list tags to arrays
# - split values on "\n" into arrays
from lxml import html
import sys
import requests
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def get_table_rows(url):
    """Fetch *url* and return the ``<tr>`` rows of its infobox table.

    The page is downloaded with ``requests``, decoded as UTF-8 (bytes that
    cannot be decoded are dropped) and parsed with BeautifulSoup's ``lxml``
    parser.  Every ``<script>`` and ``<style>`` element is removed before
    the first ``<table>`` matching the ``infobox`` / ``vcard`` class filter
    is looked up.

    Args:
        url: Address of the page to scrape.

    Returns:
        The list of ``<tr>`` tags found in the table, or an empty list when
        the page contains no matching table.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    res = requests.get(url, timeout=30)
    # Do not try to parse an error page as if it were the article.
    res.raise_for_status()
    st = BeautifulSoup(res.content.decode('utf-8', 'ignore'), features='lxml')
    # Drop script/style elements so their contents never leak into cell text.
    for target in st.find_all(["script", "style"]):
        target.decompose()
    table = st.find('table', attrs={'class': ['infobox', 'vcard']})
    if table is None:
        # No infobox on this page: report "no rows" instead of failing with
        # AttributeError on None.
        return []
    # Fall back to the table itself when the markup has no explicit <tbody>.
    table_body = table.find('tbody')
    if table_body is None:
        table_body = table
    return table_body.find_all('tr')
def format_header(data):
    """Build the section record for an infobox header cell.

    Args:
        data: The header cell tag (``collect`` passes the
            ``<th class="infobox-header">`` element of a row).

    Returns:
        A dict with ``header_value`` (the cell's stripped text), then
        ``header_link`` (the raw ``href`` of the first linked anchor, only
        present when the cell contains one) and ``data`` (a new empty list
        that the caller fills with the rows belonging to this section).
    """
    header_buffer = {'header_value': data.text.strip()}
    # Ask only for anchors that carry an href: an <a> without that attribute
    # would make ['href'] raise KeyError.  The single lookup also avoids
    # searching the cell twice.
    anchor = data.find('a', href=True)
    if anchor is not None:
        header_buffer['header_link'] = anchor['href']
    header_buffer['data'] = []
    return header_buffer
def format_normal(data, url):
    """Extract label, value and link from one infobox row.

    Args:
        data: The row tag to inspect.
        url: Address of the scraped page; relative links are resolved
            against it with ``urljoin``.

    Returns:
        A dict that may contain:

        * ``label`` - stripped texts of the children of the first element
          found with class ``infobox-label``, else ``infobox-above``, else
          ``infobox-image``, joined with single spaces;
        * ``value`` - the same join over the children of the
          ``infobox-data`` element;
        * ``link`` - absolute URL of the first linked anchor inside the
          ``infobox-data`` element, replaced by the ``src`` of the row's
          first image when the row contains one.

        The dict is empty when the row has none of these.
    """
    # Try the label-like classes in priority order and keep the first hit.
    label = None
    for css_class in ('infobox-label', 'infobox-above', 'infobox-image'):
        label = data.find(attrs={'class': css_class})
        if label is not None:
            break

    buf = {}
    if label is not None:
        buf['label'] = ' '.join([r.text.strip() for r in label])

    data_element = data.find(attrs={'class': 'infobox-data'})
    if data_element is not None:
        buf['value'] = ' '.join([r.text.strip() for r in data_element])
        # Only anchors with an href qualify; indexing ['href'] on an anchor
        # without one raises KeyError.
        anchor = data_element.find('a', href=True)
        if anchor is not None:
            buf['link'] = urljoin(url, anchor['href'])

    # An image anywhere in the row takes precedence over the anchor link.
    image = data.find('img', src=True)
    if image is not None:
        buf['link'] = urljoin(url, image['src'])
    return buf
def collect(url):
    """Scrape the infobox at *url* into a list of rows and sections.

    Rows are read with ``get_table_rows``.  Rows that appear before the
    first header row are added to the result directly as the dicts produced
    by ``format_normal``.  Every row containing a
    ``<th class="infobox-header">`` opens a new section (the dict produced
    by ``format_header``); the non-empty ``format_normal`` results of that
    row and of the following rows are appended to the section's ``data``
    list until the next header row.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of dicts in document order: plain row dicts first, then one
        dict per header section.  The list is empty when no row produced
        any data.
    """
    rows = get_table_rows(url)
    data = []
    section = None  # the header section currently being filled, if any
    for row in rows:
        header = row.find('th', attrs={'class': 'infobox-header'})
        if header is not None:
            # A new header closes the previous section before opening its own.
            if section is not None:
                data.append(section)
            section = format_header(header)
        res = format_normal(row, url)
        if res:
            if section is not None:
                section['data'].append(res)
            else:
                data.append(res)
    # Flush the last open section.  When the table had no header rows there
    # is nothing to flush, so no empty placeholder dict is appended.
    if section is not None:
        data.append(section)
    return data
if __name__ == "__main__":
    # Command-line entry point: scrape the page given as the first argument
    # and print the collected infobox as indented JSON.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: {} URL".format(sys.argv[0]))
    res = collect(sys.argv[1])
    # ensure_ascii=False keeps non-ASCII characters unescaped in the output.
    out = json.dumps(res, indent=4, ensure_ascii=False)
    print(out)