-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
53 lines (49 loc) · 2.09 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import subprocess
#import externalcommand
import hfst
import pyhfst
import re
def parse(fst_file, fst_format, *strings):
    """Look up ``strings`` in a transducer by running an external lookup tool.

    The strings are joined with newlines and written to the tool's standard
    input; whatever the tool prints on standard output is returned unparsed.

    Args:
        fst_file: Path to the transducer; passed as the last command-line
            argument of the lookup tool.
        fst_format: ``"xfst"`` runs ``lookup -q -flags mbTT``; any other
            value runs ``hfst-lookup -q`` (there is no foma option).
        *strings: Forms to look up, one per input line.

    Returns:
        The raw standard output of the lookup tool, as ``bytes``.

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero
            status.
        FileNotFoundError: If the lookup executable is not on ``PATH``.
    """
    if fst_format == "xfst":
        command = ["lookup", "-q", "-flags", "mbTT"]
    else:
        command = ["hfst-lookup", "-q"]

    # Encode explicitly as UTF-8 rather than relying on the locale default,
    # so the bytes the tool receives do not depend on the host's locale.
    payload = "\n".join(strings).encode("utf-8")

    # Passing the data through ``input=`` replaces the former
    # ``cat __tmp.txt | tool`` pipeline: no fixed-name scratch file is left in
    # the working directory (which concurrent calls would overwrite), no
    # helper process is spawned, and stdin/stdout are serviced together so
    # large inputs cannot stall on a full pipe buffer. No shell is involved,
    # so the strings are never interpreted as shell syntax.
    return subprocess.check_output(command + [fst_file], input=payload)
def parse_native(transducer, *strings):
    """Look up ``strings`` in a transducer loaded through the ``hfst`` bindings.

    Args:
        transducer: Argument handed to ``hfst.HfstInputStream`` to locate the
            transducer (presumably a file path -- confirm against callers).
        *strings: Forms to look up. Each distinct form is looked up once;
            repeated forms are skipped.

    Returns:
        A dict mapping each distinct input string to a list of
        ``(analysis, second_field)`` tuples, where ``analysis`` is the first
        element of a lookup result with ``@...@`` spans removed and
        ``second_field`` is the result's second element (presumably the
        weight). A string with no lookup results maps to
        ``[(string + "+?", 0.0)]``.
    """
    stream = hfst.HfstInputStream(transducer)
    try:
        parser = stream.read()
    finally:
        # The stream is only needed while loading; close it so the underlying
        # handle is not held until garbage collection.
        stream.close()
    parser.lookup_optimize()

    # Filters out flag diacritics (``@...@``), which the hfst API does not do
    # as of Dec 2023. Compiled once instead of per analysis.
    flag_diacritic = re.compile(r"@.*?@")

    analyses = {}
    for s in strings:
        if s in analyses:
            continue
        results = parser.lookup(s)
        if results:
            analyses[s] = [(flag_diacritic.sub("", q[0]), q[1]) for q in results]
        else:
            # Mark forms the transducer could not analyse.
            analyses[s] = [(s + "+?", 0.0)]
    return analyses
def parse_pyhfst(transducer, *strings):
    """Look up ``strings`` in a transducer loaded through ``pyhfst``.

    Args:
        transducer: Argument handed to ``pyhfst.HfstInputStream`` to locate
            the transducer (presumably a file path -- confirm against
            callers).
        *strings: Forms to look up. Each distinct form is looked up once;
            repeated forms are skipped.

    Returns:
        A dict mapping each distinct input string to a list of
        ``(analysis, second_field)`` tuples, where ``analysis`` is the first
        element of a lookup result with ``@...@`` spans removed and
        ``second_field`` is the result's second element (presumably the
        weight). A string with no lookup results maps to
        ``[(string + "+?", 0.0)]``.
    """
    parser = pyhfst.HfstInputStream(transducer).read()

    # Filters out flag diacritics (``@...@``) from the returned analyses.
    # Compiled once instead of per analysis.
    flag_diacritic = re.compile(r"@.*?@")

    analyses = {}
    for s in strings:
        if s in analyses:
            continue
        # No debug print of the raw lookup result here: this function returns
        # its data and should not write to stdout.
        results = parser.lookup(s)
        if results:
            analyses[s] = [(flag_diacritic.sub("", q[0]), q[1]) for q in results]
        else:
            # Mark forms the transducer could not analyse.
            analyses[s] = [(s + "+?", 0.0)]
    return analyses