-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdgmDict-labelsort.py
201 lines (176 loc) · 6.06 KB
/
pdgmDict-labelsort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/local/bin/python3
'''
File to be labeled and ordered is placed in dir webbapy/pdgmfiles
labeled as LANG-sort-pdgms.json. Program replaces each termcluster
label with new label made from prop values of props entered with
'pdgmPropOrder' prompt. Pdgm file with new promps is written out
to file LANG-labeled-sort-pdgms.json. After inspection, this file
can be ordered by pdgmSort.py.
[Temporary for dhaasanac trial:
Desired prop order is:
pos,conjClass,lexeme,derivedStem,derivedStemAug,extended,tam,polarity,morphClass,proClass
]
Required text transformations before, in order to preserve structure
of 'terms' as list of parallel lists (mimicking CSV output of query):
",_" => &
Require text transformations after sorting:
,_ => ,^J
terms': => terms':^J
]]} => ]]^J}
& => ', '
I.e.:
',_' => ',^J'
},_ => },^J[
],_[ => ],^J[
'''
import json
from shutil import copy
import pprint
import re
import os
import sys
def value_getter(pdgm):
    """Return the sort key of a paradigm: the value stored under 'label'.

    The lookup uses plain subscript access, so a dict without a 'label'
    key raises KeyError.
    """
    label = pdgm['label']
    return label
# The language name is the single required command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage message instead of a bare IndexError traceback.
    sys.exit('usage: ' + sys.argv[0] + ' LANG')
lang = sys.argv[1]
# Alternative for interactive use with a single language:
#lang = input('Type language name: ')
print('LANG: ' + lang)
# lfile must have already been copied from aama-data.
lfile = '../aama-data/data/' + lang + '/' + lang + '-pdgms.json'
# Backup copy of the language file, made before this script rewrites lfile.
lfilebck = '../aama-data/data/' + lang + '/' + lang + '-pdgms-old.json'
copy(lfile, lfilebck)
# 'Exploded' termcluster file.
outfile0 = 'pvlists/' + lang + '-ordered-pdgms-0.json'
# Formatted termcluster file.
outfile1 = 'pvlists/' + lang + '-termclusters.json'
# Load the whole language file into pdgmdict. The context manager closes
# the handle as soon as parsing is done; the same file is reopened for
# reading and for writing further down.
with open(lfile, 'r') as pdgmfile:
    pdgmdict = json.load(pdgmfile)
print('pdgmdict in:')
#print(pdgmdict)
# Normative order of the pdgm props for this language: each new pdgm
# label is made of the values of these props, taken in this order.
tcprops = pdgmdict['pdgmPropOrder']
# Language prefix placed at the start of every generated label.
lpref = pdgmdict['lgpref']
# Show the prop order currently used for pdgm names.
print('PDGM name label props are:')
print(tcprops)
def _make_label(prefix, props, common):
    """Build a termcluster label from the values in its 'common' section.

    prefix -- language prefix; it is followed by '-' and then the values.
    props  -- property names in the order their values appear in the label.
    common -- dict mapping property name to value for one termcluster.

    Props missing from `common` are skipped. A 'morphClass' value is
    rendered as 'morph-VALUE-' and a 'lexeme' value as '-VALUE-' (both must
    be strings, since they are concatenated directly); any other value is
    rendered with str(). The pieces are joined without a separator.
    """
    parts = []
    for prop in props:
        if prop not in common:
            continue
        value = common[prop]
        if prop == 'morphClass':
            parts.append('morph' + '-' + value + '-')
        elif prop == 'lexeme':
            parts.append('-' + value + '-')
        else:
            parts.append(str(value))
    return prefix + '-' + ''.join(parts)


# Give every termcluster a new label made of the values of its 'common'
# props, taken in the order listed in tcprops.
new_labels = []
for termcluster in pdgmdict['termclusters']:
    plabel2 = _make_label(lpref, tcprops, termcluster['common'])
    termcluster['label'] = plabel2
    new_labels.append(plabel2)
# Newline-terminated listing of all new labels (handy for debugging).
plabels = ''.join(label + '\n' for label in new_labels)
#print(plabels)
pdgms = pdgmdict['termclusters']
#print('termclusters in:')
#print(pdgms)
# Sort the paradigms by label and store the sorted list back in the
# language dictionary.
res = sorted(pdgms, key=value_getter)
pdgmdict['termclusters'] = res
print('termclusters out:')
#print(res)
# Serialize the sorted termclusters as indented JSON text. json.dumps
# yields the same text the script used to obtain by dumping to outfile0,
# reading it back and deleting it -- without leaving a file handle open
# while the file is removed.
indent = 2
tctext = json.dumps(res, ensure_ascii=False, indent=indent)
print("tctext")
#print(tctext)
# Reformat the 'termclusters' text.
# The indented JSON text puts every JSON constituent on its own line, but
# 'terms' is a list of lists in which each sublist is one row of the
# paradigm. To approximate the familiar paradigm 'table' layout, the line
# break and leading spaces are removed after each row-opening bracket,
# between the values of a row, and before the row-closing bracket.
# The number of leading spaces is indent * nesting depth: depth 4 for the
# values inside a row and depth 3 for the bracket that closes the row
# (8 and 6 spaces when indent is 2).
row_value_pad = " " * (4 * indent)
row_close_pad = " " * (3 * indent)
row_open = '[\n' + row_value_pad + '"'
row_sep = '",\n' + row_value_pad + '"'
row_close = '"\n' + row_close_pad + ']'
# Pull the first value of each row up next to its opening bracket.
tctextnew = tctext.replace(row_open, '["')
print("\nX\n")
# Put the remaining values of each row on the same line.
tctextnew = tctextnew.replace(row_sep, '" ,"')
print("\nY\n")
# Pull the closing bracket up next to the last value of the row.
tctextnew = tctextnew.replace(row_close, '"]')
print("\nNew Textn")
# Keep a separate file of the sorted and reformatted paradigms.
with open(outfile1, 'w') as f:
    f.write(tctextnew)
# Replace the 'termclusters' section of the language file with the
# sorted, reformatted text.
# Replacement text: the rest of the key, the new list, and the closing
# brace of the enclosing object (the pattern below consumes that brace).
ftext1a = 'termclusters": ' + tctextnew + '}'
print("ftext1a:")
#print(ftext1a)
# Read the current ('matrix') text of the language file.
with open(lfile) as f:
    ftext = f.read()
# re.DOTALL lets '.' run across line breaks, so the text no longer has to
# be rewritten with escaped newlines and restored afterwards; that round
# trip also turned any literal backslash-n already present in the data
# into a real line break.
# The replacement is supplied through a function so ftext1a is inserted
# verbatim: a replacement *string* would have its backslash sequences
# (JSON escapes such as \\ or \u....) interpreted by re.sub.
ftextnew2 = re.sub(r'termclusters.*]}', lambda _match: ftext1a, ftext,
                   flags=re.DOTALL)
print('ftextnew2')
#print(ftextnew2)
# The context manager guarantees the file is flushed and closed (the
# original referenced file.close without calling it).
with open(lfile, "w") as file:
    file.write(ftextnew2)