-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
108 lines (87 loc) · 2.76 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import io
import os
from xml.dom import minidom
def findData(dates, record):
    """Return the value of a ``key=value`` record whose key matches a label.

    The record is divided at its first ``=``.  When the part before it
    contains any of the strings in ``dates`` (plain substring match), the
    whole text after that ``=`` is returned unchanged, surrounding
    whitespace included.

    Args:
        dates: Iterable of label strings to look for in the key part.
        record: One field of text, normally of the form ``key=value``.

    Returns:
        The text following the first ``=``, or ``None`` when no label
        matches the key or the record contains no ``=`` at all.
    """
    # partition() keeps a value that itself contains '=' intact, whereas
    # split('=')[1] would cut it off at the second '='.
    key, separator, value = record.partition('=')
    if not separator:
        # A bare key without '=' carries no value.
        return None
    if any(label in key for label in dates):
        return value
    return None
def compare(politics, record):
    """Tell whether ``record`` contains the opening of one of the templates.

    Args:
        politics: Iterable of template names.
        record: Text to search.

    Returns:
        ``True`` if, for at least one name, the string ``'{{' + name``
        occurs anywhere in ``record``; ``False`` otherwise.
    """
    return any((u'{{' + name) in record for name in politics)
def _csv_field(value):
    """Return ``value`` without newlines, or an empty string for ``None``."""
    return u'' if value is None else value.replace('\n', '')


def _scan_page(page, politics, dates, parts, works, cships, max_records):
    """Collect the CSV columns for one ``<page>`` element.

    The first text node of the page's ``<text>`` element is split on ``|``.
    One of the first ``max_records`` pieces must contain a template from
    ``politics`` and the page title (commas removed) must consist of exactly
    three space-separated words; otherwise the page is skipped.  From the
    matching piece onwards, each piece is offered to ``findData`` for every
    field that is still missing, until all four are found.

    Returns:
        A ``(title, date, work, cship, part)`` tuple in output-column order,
        with ``None`` for fields that were not found, or ``None`` when the
        page is skipped.
    """
    texts = page.getElementsByTagName('text')
    if not (texts.length > 0 and texts[0].childNodes.length > 0):
        return None

    lookups = (('date', dates), ('part', parts),
               ('work', works), ('cship', cships))
    found = {'date': None, 'part': None, 'work': None, 'cship': None}
    matched = False
    title = None

    records = texts[0].childNodes[0].nodeValue.split('|')
    for index, record in enumerate(records):
        if not matched:
            # The template is only searched for near the top of the article.
            if index >= max_records:
                break
            if not compare(politics, record):
                continue
            title_node = page.getElementsByTagName('title')[0].childNodes[0]
            title = title_node.nodeValue.replace(',', '')
            if len(title.split(' ')) != 3:
                # The title is the same for every record of this page, so
                # the page can never qualify; stop scanning it.
                return None
            matched = True

        # The piece that contains the template is itself checked for fields.
        for name, labels in lookups:
            if found[name] is None:
                found[name] = findData(labels, record)
        if all(value is not None for value in found.values()):
            break

    if not matched:
        return None
    return (title, found['date'], found['work'], found['cship'],
            found['part'])


def main():
    """Extract politician pages from the wiki dump into ``result.csv``.

    Parses ``../ruwiki-20140306-pages-articles1.xml`` with minidom, and for
    every qualifying ``<page>`` writes one UTF-8 line
    ``title;date;work;cship;part`` to ``result.csv`` in the current
    directory (the file is truncated first).  Each written title is echoed
    to stdout, followed by ``complete`` at the end.

    Raises:
        Whatever ``minidom.parse`` or the file operations raise; I/O errors
        are no longer silently discarded.
    """
    max_records = 10
    # Template names: "Politician", "Statesman".
    politics = [u'Политик', u'Государственный деятель']
    # Field labels: date of birth / party / activity, position / citizenship.
    dates = [u'дата рождения', u'Дата рождения']
    parts = [u'Партия', u'партия']
    works = [u'деятельность', u'Деятельность', u'должность']
    # The original listed the lowercase label twice; like the other lists,
    # the second entry is meant to be the capitalised variant.
    cships = [u'гражданство', u'Гражданство']

    # io.open encodes on write, so the same code produces UTF-8 output on
    # both Python 2 and Python 3; "with" closes the file even on errors.
    with io.open('result.csv', 'w', encoding='utf8') as f:
        xml = minidom.parse(
            os.path.join('../', 'ruwiki-20140306-pages-articles1.xml'))
        pages = xml.getElementsByTagName('mediawiki')[0].childNodes
        for page in pages:
            if page.localName != 'page':
                continue
            row = _scan_page(page, politics, dates, parts, works, cships,
                             max_records)
            if row is None:
                continue
            try:
                print(row[0])
            except UnicodeError:
                # The console may be unable to encode the title.  The echo
                # is only progress output and must not cost us the CSV row.
                pass
            # NOTE(review): values are not escaped, so a ';' inside a value
            # shifts the columns of that line.
            f.write(u';'.join(_csv_field(value) for value in row) + u'\n')
    print('complete')
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()