UlyssesTextToData.py
import json
import re
import io
chapter_names = ['telemachus',
                 'nestor',
                 'proteus',
                 'calypso',
                 'lotuseaters',
                 'hades',
                 'aeolus',
                 'lestrygonians',
                 'scyllacharybdis',
                 'wanderingrocks',
                 'sirens',
                 'cyclops',
                 'nausicaa',
                 'oxenofthesun',
                 'circe',
                 'eumaeus',
                 'ithaca',
                 'penelope']
def std_chapter(n):
    if n > 0 and n < 19:
        txtfile = "txt/%02d%s.txt" % (n, chapter_names[n-1])
        datfile = "data/%02d%s.dat" % (n, chapter_names[n-1])

        # Load lines from file into list
        with io.open(txtfile, 'r', encoding='utf-8') as f:
            oldparagraphs = f.readlines()

        # Join consecutive non-blank lines into paragraphs;
        # blank lines mark paragraph boundaries
        paragraphs = []
        newline = ""
        for oldparagraph in oldparagraphs:
            if oldparagraph != "\n":
                # accumulate in newline variable
                newline = newline + re.sub("\n", " ", oldparagraph)
            else:
                # dump out newline variable and start it over again
                paragraphs.append(newline)
                newline = ""
        if newline:
            # flush the last paragraph if the file does not end with a blank line
            paragraphs.append(newline)

        # Parse into one dictionary per paragraph
        print(" Parsing for JSON...")
        paragraph_dictionaries = process_paragraphs(paragraphs, n)
        print(" Done parsing.")

        # Dump each dictionary to the output file, one JSON object per line
        print(" Dumping to JSON...")
        with io.open(datfile, 'w', encoding='utf-8') as o:
            for d in paragraph_dictionaries:
                result = json.dumps(d, ensure_ascii=False)
                o.write(result)
                o.write(u"\n")
        print(" Done dumping to JSON.")

        print("Done processing " + chapter_names[n-1])
        print("Input: " + txtfile)
        print("Output: " + datfile)
        print("\n")
def process_paragraphs(paragraphs, n):
    # One json object/"dictionary" per paragraph:
    # {
    #   "parid" : <int>,
    #   "par"   : ["first sentence or fragment.", "second.", ...]
    # }
    paragraph_dictionaries = []
    if n == 17:
        for pp, paragraph in enumerate(paragraphs):
            # Turn the paragraph into a list of sentences
            sentences = paragraph.strip().split(". ")
            sentences = [s for s in sentences if len(s) > 1]
            if len(sentences) > 0:
                # If the last character of a sentence is a letter,
                # restore the period removed by the split.
                for (i, s) in enumerate(sentences):
                    if s[-1].isalpha():
                        sentences[i] = sentences[i] + "."
                # Construct dictionary
                d = {}
                d['parid'] = pp
                d['par'] = sentences
                paragraph_dictionaries.append(d)
    if n == 18:
        for pp, paragraph in enumerate(paragraphs):
            # Turn the paragraph into a list of fragments starting with "I "
            fragments = paragraph.strip().split("I ")
            fragments = ["I " + j for j in fragments]
            # Construct dictionary
            d = {}
            d['parid'] = pp
            d['par'] = fragments
            paragraph_dictionaries.append(d)
    # All finished processing each paragraph... return the list to dump as JSON
    return paragraph_dictionaries
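
# A minimal sketch, not called by this script, of how the .dat output could be
# read back in: each line of the output file holds one JSON object, so
# json.loads per line recovers the paragraph dictionaries. The name load_dat
# is illustrative only and not part of the original pipeline.
def load_dat(datfile):
    paragraph_dictionaries = []
    with io.open(datfile, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                # e.g. {"parid": 0, "par": ["First sentence.", "Second sentence."]}
                paragraph_dictionaries.append(json.loads(line))
    return paragraph_dictionaries
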
if __name__ == "__main__":
    std_chapter(17)
    std_chapter(18)
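
# Example usage (a sketch, assuming txt/ and data/ directories exist next to
# this script): running
#   python UlyssesTextToData.py
# reads txt/17ithaca.txt and txt/18penelope.txt and writes data/17ithaca.dat
# and data/18penelope.dat, one JSON object per paragraph per line.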