-
Notifications
You must be signed in to change notification settings - Fork 0
/
mrp2clf.py
executable file
·233 lines (212 loc) · 9.51 KB
/
mrp2clf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/env python3
#-*- coding: utf8 -*-
#################################
from os import path as op
import logging
from logging import debug, info, warning, error
import clf_referee as clfref
from collections import OrderedDict, defaultdict, Counter
import re
import os
import argparse
import json
import sys
#################################
def parse_arguments(argv=None):
    """Parse the command-line options and configure logging.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with the attributes ``mrp``, ``clf``,
        ``sig``, ``ids``, ``validate``, ``quiet``, ``throw_error`` and
        ``verbose``.

    Side effects:
        Calls ``logging.basicConfig`` with the level selected by ``-v``.
    """
    parser = argparse.ArgumentParser(description="Convert mrp graphs into clauses")
    parser.add_argument(
        # the script always opens this file, so it cannot be omitted
        '--mrp', metavar='FILE PATH', required=True,
        help='File containing mrp graphs')
    parser.add_argument(
        '--clf', metavar='FILE PATH',
        help='File where clausal form will be written')
    parser.add_argument(
        '--sig', metavar='FILE PATH',
        # adjacent literals (with explicit spaces) instead of backslash
        # continuations inside the string, which glue words together
        help='If added, this contains a file with all allowed roles '
             'otherwise a simple signature is used that '
             'mainly recognizes operators based on their formatting')
    parser.add_argument(
        '--ids', nargs='*', metavar='LIST OF IDS',
        help='List of IDs of mrp which will be processed')
    parser.add_argument(
        '--validate', action="store_true",
        help="Validate with CLF referee")
    parser.add_argument(
        '--quiet', '-q', action="store_true",
        help="Print minimal info")
    parser.add_argument(
        '--throw-error', action="store_true",
        help="Throw an error instead of counting them")
    parser.add_argument(
        '-v', dest="verbose", default=1, type=int, choices=[0, 1, 2], metavar="LEVEL",
        help="Verbosity of logging: warning(0), info(1), debug(2)")
    # pre-processing arguments
    args = parser.parse_args(argv)
    # Set verbosity
    verbose = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
    logging.basicConfig(format='%(levelname)s: %(message)s', level=verbose[args.verbose])
    return args
def print_dict(d, pr=True):
    """Format a mapping as one ``key: value`` line per key, in sorted key order.

    Args:
        d: Mapping whose keys can be sorted.
        pr: When true, the formatted text is also printed to stdout.

    Returns:
        The formatted text (lines joined with a newline).
    """
    lines = []
    for key in sorted(d):
        lines.append("{}: {}".format(key, d[key]))
    message = '\n'.join(lines)
    if pr:
        print(message)
    return message
def error_raise(message):
    """Signal a conversion failure by raising ``RuntimeError(message)``.

    Logging through ``error()`` is switched off in this helper; it only
    raises.
    """
    failure = RuntimeError(message)
    raise failure
def find_disjunctions(edges):
    """Yield node triples that are chained by two consecutive 'DIS' edges.

    Args:
        edges: Mapping from ``(source, target)`` pairs to edge dicts that
            carry the edge label under the key ``'lab'``.

    Yields:
        ``(s1, t1, t2)`` for every pair of 'DIS' edges ``(s1, t1)`` and
        ``(t1, t2)``. Triples come out ordered by the first edge and then
        by the second edge, both in sorted key order.
    """
    # Sort the DIS edges once instead of re-sorting every edge for each
    # candidate first edge.
    dis_keys = sorted(key for key, edge in edges.items() if edge['lab'] == 'DIS')
    # Index DIS targets by their source; appending in sorted key order
    # keeps the targets of each source sorted as well.
    targets_by_source = defaultdict(list)
    for source, target in dis_keys:
        targets_by_source[source].append(target)
    for s1, t1 in dis_keys:
        for t2 in targets_by_source.get(t1, ()):
            yield s1, t1, t2
def find_binary_pred_condition(node, edges, node_types):
    """Find the box and the two arguments of a binary predicate node.

    Args:
        node: Pair ``(node_id, node_dict)``; only the id is inspected, the
            whole pair is used in error messages.
        edges: Container of ``(source, target)`` pairs (the edge keys).
        node_types: Mapping from node id to its type letter; ``'b'`` marks
            a box. Every neighbour of the node must have an entry.

    Returns:
        ``(box, [arg0, arg1])`` where ``arg0`` is the source of the edge
        entering the node and ``arg1`` the target of the edge leaving it.
        When no box is attached to the node directly, the box is the first
        box-typed source of an edge that points to ``arg0``.

    Raises:
        RuntimeError: (via ``error_raise``) if the node is attached to two
            boxes, has two arguments in the same position, lacks an
            argument, has an edge count other than 2 or 3, or no box can
            be found.
        KeyError: if a neighbour has no entry in ``node_types``.
    """
    args, b, edge_num = [None, None], None, 0
    nid, _ = node
    for s_t in edges:
        if nid not in s_t:
            continue
        edge_num += 1
        # position of the *other* end of the edge: 0 = source, 1 = target
        i = 1 - s_t.index(nid)
        other = s_t[i]
        if node_types[other] == 'b':
            # compare with None: node id 0 is a legitimate (falsy) id
            if b is not None:
                error_raise("Binary pred {} is in two boxes: {}, {}".format(node, b, other))
            b = other  # box which contains the predicate found
        else:
            if args[i] is not None:
                error_raise("Binary pred {} has two {}th args: {}, {}".format(
                    node, i, args[i], other))
            args[i] = other  # argument found
    if any(j is None for j in args):
        error_raise("Binary pred {} is missing args ({})".format(node, args))
    if not (2 <= edge_num <= 3):
        error_raise("Binary pred {} has wrong number of edges ({})".format(node, edge_num))
    if b is None:
        # no explicit box edge: fall back to a box that points to the first argument
        b = next((s for (s, t) in edges if t == args[0] and node_types.get(s) == 'b'), None)
    if b is None:
        error_raise("Can't find box of Binary pred {}".format(node))
    return b, args
#######################################
def mrp2clf(mrp, fix=()):
    """Convert one mrp graph into clauses.

    Args:
        mrp: Dict with the keys ``'id'``, ``'nodes'`` (dicts with ``'id'``
            and an optional ``'label'``) and ``'edges'`` (dicts with
            ``'source'``, ``'target'`` and an optional ``'label'``).
        fix: Collection of repair names. With ``'edge_lab'`` in it, edge
            labels that are neither upper-case nor ``'in'`` are dropped.

    Returns:
        Dict that maps every clause string to a tuple describing the
        clause: ``('b', 'DRL', 'b')``, ``('b', 'DIS', 'b', 'b')``,
        ``('b', 'REF', 'x')``, ``('b', 'LEX', 'x')`` or
        ``('b', 'BIN', <type of arg0>, <type of arg1>)``.

    Raises:
        RuntimeError: (via ``error_raise``) if a binary predicate is
            malformed or some node or edge is left unprocessed.
    """
    # read nodes and edges and add None labels if they don't have any
    clf_info, node_types = {}, {}
    nodes, edges = OrderedDict(), OrderedDict()
    for n in sorted(mrp['nodes'], key=lambda x: x['id']):
        nodes[n['id']] = {'lab': n.get('label', None)}
    for e in sorted(mrp['edges'], key=lambda x: (x['source'], x['target'])):
        edge_lab = e.get('label', None)
        if 'edge_lab' in fix and edge_lab and not edge_lab.isupper() and edge_lab != 'in':
            edge_lab = None
        # edges are keyed by (source, target): a later parallel edge replaces an earlier one
        edges[(e['source'], e['target'])] = {'lab': edge_lab}
    graph_id = mrp['id']
    debug(print_dict(nodes, pr=False))
    debug(print_dict(edges, pr=False))
    # Add typing info to edges and nodes. Don't use signature file at this stage
    # as many graphs can be dubbed invalid in the middle of conversion.
    # validation will be applied in the end when clf is generated
    for (s, t), e in edges.items():
        # Process Discourse edges except disjunctions
        if e['lab'] and e['lab'] not in ('in', 'DIS'):
            if not e['lab'].isupper():
                warning('{}: Suspicious Discourse connective: {}'.format(graph_id, e['lab']))
            cl = 'b{} {} b{}'.format(s, e['lab'], t)
            clf_info[cl] = ('b', 'DRL', 'b')
            node_types[s] = node_types[t] = 'b'  # using setdefault would be stricter
            e['done'] = nodes[s]['done'] = nodes[t]['done'] = True
    for b1, b2, b3 in find_disjunctions(edges):
        cl = 'b{} DIS b{} b{}'.format(b1, b2, b3)
        clf_info[cl] = ('b', 'DIS', 'b', 'b')
        node_types[b1] = node_types[b2] = node_types[b3] = 'b'
        edges[(b1, b2)]['done'] = edges[(b2, b3)]['done'] = True
        # Mark the boxes as processed, as is done for the other discourse
        # relations; otherwise a box that only occurs in a disjunction is
        # later mistaken for a binary predicate.
        nodes[b1]['done'] = nodes[b2]['done'] = nodes[b3]['done'] = True
    # relax pattern to allow capturing even ill-formatted senses
    sense_pattern = re.compile(r'(.+)\.([avnr]\.\d.+)')
    for (s, t), e in edges.items():
        # Process in-edges
        if 'done' in e or e['lab'] != 'in':
            continue
        # some roles with explicit in-edge will leak here but their label will fail matching test
        lab = nodes[t]['lab']
        m = sense_pattern.match(lab) if lab is not None else None
        if lab is None or m:  # dealing with discourse referent
            cl2 = 'b{} REF x{}'.format(s, t)
            clf_info[cl2] = ('b', 'REF', 'x')
            node_types[s], node_types[t] = 'b', 'x'
            e['done'] = nodes[s]['done'] = nodes[t]['done'] = True
            if m:  # dealing with discourse referent with concept
                cl1 = 'b{} {} "{}" x{}'.format(s, *m.groups(), t)
                clf_info[cl1] = ('b', 'LEX', 'x')
    # process constant entities, which adds nothing to clauses yet
    for i, n in nodes.items():
        if n['lab'] and n['lab'][0] == n['lab'][-1] == '"':
            n['done'] = True
            node_types[i] = 'c'
    # add binary predicates that are applied to them: roles and operators
    # at this point they are the only unprocessed nodes
    for i, n in nodes.items():
        if 'done' in n:
            continue
        b, args = find_binary_pred_condition((i, n), edges, node_types)
        # format differently constant and variable args
        term_args = ['x{}'.format(a) if node_types[a] == 'x' else nodes[a]['lab']
                     for a in args]
        cl = 'b{} {} {} {}'.format(b, n['lab'], *term_args)
        clf_info[cl] = ('b', 'BIN', node_types[args[0]], node_types[args[1]])
        n['done'] = edges[(args[0], i)]['done'] = edges[(i, args[1])]['done'] = True
        if (b, i) in edges:
            edges[(b, i)]['done'] = True
    # check that all nodes and edges were processed
    undone_nodes = {i: n for i, n in nodes.items() if 'done' not in n}
    undone_edges = {i: n for i, n in edges.items() if 'done' not in n}
    if undone_nodes or undone_edges:
        error_raise("Undone graph fragment: nodes ({}), edges ({})".format(
            undone_nodes, undone_edges))
    return clf_info
#######################################
def write_clfs(clf_infos, meta_list, filename=None):
    """Write clausal forms together with their meta data.

    For every graph two ``%%%``-prefixed lines are written (the first two
    items of its meta entry), then its clauses in sorted order, one per
    line, then an empty line.

    Args:
        clf_infos: Sequence of clause collections; iterating one of them
            yields the clause strings.
        meta_list: Sequence of meta entries, parallel to ``clf_infos``.
        filename: Path of the output file; when falsy the text goes to
            ``sys.stdout``.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(clf_infos) == len(meta_list)
    # open stream for writing, whether it is file or stdout
    out = open(filename, 'w') if filename else sys.stdout
    try:
        # start writing meta data and clfs
        for clf_info, meta in zip(clf_infos, meta_list):
            out.write("%%% {}\n%%% {}\n".format(*meta))
            for cl in sorted(clf_info):
                out.write(cl + '\n')
            out.write('\n')
    finally:
        # close the file even when a write fails; never close stdout
        if filename:
            out.close()
#######################################################################
################################ Main #################################
if __name__ == '__main__':
    # Script entry point: read mrp graphs (one JSON object per line),
    # convert the selected drg graphs into clausal forms, write them out
    # and report the failures.
    args = parse_arguments()
    sig = clfref.get_signature(args.sig)
    with open(args.mrp) as F:
        # blank lines (e.g. a trailing one) are not JSON documents; skip them
        mrps = [json.loads(l) for l in F if l.strip()]
    info("{} mrps read".format(len(mrps)))
    # converting mrps into clfs one-by-one
    error_counter = Counter()
    drg_count = 0
    clfs_info_list, meta_list, invalids = [], [], []
    for mrp in mrps:
        # keep only drg graphs and, when --ids is given, only the listed ones
        if mrp['framework'] != 'drg' \
                or args.ids and mrp['id'] not in args.ids:
            continue
        meta_list.append((mrp['id'], mrp['input']))
        drg_count += 1
        try:
            clf = mrp2clf(mrp, fix=['edge_lab'])  # some graphs need this
            # validate against the signature only when requested
            if args.validate:
                clfref.check_clf(clf, sig)
            clfs_info_list.append(clf)
        except Exception as exc:
            # Exception, not a bare except: Ctrl-C and SystemExit must not
            # be counted as conversion errors
            if args.throw_error:
                raise
            err_message = repr(exc)
            if not args.quiet:
                error("{}: {}".format(mrp['id'], err_message))
            # numbers are masked so that errors differing only in ids are counted together
            error_counter.update([re.sub(r'\d+', 'NUM', err_message)])
            invalids.append(mrp['id'])
            # placeholder clf keeps the output aligned with meta_list
            clfs_info_list.append({'b REF x':('b', 'REF', 'x'), 'b nevermatching "n.01" x':('b', 'LEX', 'x')})
    write_clfs(clfs_info_list, meta_list, filename=args.clf)
    if error_counter and not args.quiet:
        print("Frequencies of erros")
        for err, c in error_counter.most_common():
            print("{:>5}: {}".format(c, err))
    # avoid ZeroDivisionError when no drg graph was selected
    failed_percent = len(invalids) / drg_count * 100 if drg_count else 0.0
    print("{} ({:.1f}%) mrp conversions out of {} failed: {}".format(
        len(invalids), failed_percent, drg_count, '' if args.quiet else invalids))