-
Notifications
You must be signed in to change notification settings - Fork 14
/
split_processed_wikipedia_file.py
58 lines (33 loc) · 1.37 KB
/
split_processed_wikipedia_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# coding: utf-8
# Author: Motaz Saad
import sys, os
from os.path import basename
import logging
# Log records are rendered as "<LEVEL> : <timestamp> : <message>"; INFO and above are shown.
logging.basicConfig(
    format='%(levelname)s : %(asctime)s : %(message)s',
    level=logging.INFO,
)

# Document separator: seven X characters on a line of their own. It is used
# when all the documents are stored in a single corpus file.
x_seperator = '\nXXXXXXX\n'
##################################################################
def usage():
    """Write the command-line synopsis to standard output.

    The line consists of the 'Usage: ' label, the script name taken from
    ``sys.argv[0]``, and the three expected positional arguments, joined by
    single spaces and terminated by a newline.
    """
    fields = (
        'Usage: ',
        sys.argv[0],
        '<corpus file> <number of parts> <output path>',
    )
    # sys.stdout.write is used rather than the print statement so that this
    # function is valid syntax on both Python 2 and Python 3.
    sys.stdout.write(' '.join(fields) + '\n')
##################################################################
# Three positional arguments are required (corpus file, number of parts,
# output path), so together with the program name sys.argv must hold at
# least four entries. Print the synopsis and exit with status 2 otherwise.
if len(sys.argv) < 4:
    usage()
    sys.exit(2)

# Load the project's text-processing helpers from the file 'textpro.py'.
# The path is relative, so it is resolved against the current working directory.
import imp
tp = imp.load_source('textpro', 'textpro.py')
def main(argv):
    """Split a processed Wikipedia corpus file into several part files.

    Args:
        argv: Command-line style argument list. ``argv[1]`` is the corpus
            file, ``argv[2]`` the number of parts (parsed with ``int``), and
            ``argv[3]`` the output directory.

    Raises:
        IndexError: If ``argv`` has fewer than four entries.
        ValueError: If ``argv[2]`` is not an integer literal.

    Each part ``i`` is written to ``<output path>/part-<i>-<name>``, where
    ``<name>`` is the first whitespace-delimited token of the corpus file's
    base name. Every document is UTF-8 encoded and followed by a newline.
    """
    # Read from the argv parameter rather than the global sys.argv so that
    # callers can invoke main() with their own argument list.
    corpus_file = argv[1]
    num_parts = int(argv[2])
    output_path = argv[3]
    if not output_path.endswith('/'):
        output_path += '/'
    tp.check_dir(output_path)  # if directory does not exist, then create

    docs = tp.split_wikipedia_docs_into_array(corpus_file)
    logging.info('corpus is loaded')
    # Presumably yields num_parts sub-lists of docs -- verify against textpro.
    parts = tp.split_list(docs, num_parts)

    # NOTE(review): split() breaks on whitespace, so only the first token of
    # the base name is kept; confirm this is the intended naming.
    f_name = basename(corpus_file).split()[0]
    for i, part in enumerate(parts):
        part_path = output_path + 'part-' + str(i) + '-' + f_name
        # The documents are encoded to UTF-8 bytes before writing, so the file
        # is opened in binary mode; this also means '\n' is written verbatim
        # on every platform. The with-block closes the file even if a write
        # raises.
        with open(part_path, 'wb') as out:
            for d in part:
                out.write(d.encode('utf-8') + b'\n')
        logging.info('part %d is done', i)
##################################################################
# Entry point: run the splitter only when this file is executed as a script,
# passing the full command-line argument list through to main().
if __name__ == '__main__':
    main(sys.argv)