-
Notifications
You must be signed in to change notification settings - Fork 0
/
template.py
executable file
·88 lines (77 loc) · 2.48 KB
/
template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
import re
import sys
class FeatureExtractor:
    """Expand CRF++-style unigram feature templates over a sequence.

    Templates are loaded with :meth:`read` and expanded for one position of
    a sequence with :meth:`apply`.  A sequence is a list of items, each a
    dict with an ``'x'`` list of field values and an ``'F'`` list that
    collects the generated attribute strings.
    """

    def __init__(self):
        # Matches a ``%x[row,col]`` macro: ``row`` is an offset relative to
        # the current position (may be negative), ``col`` is a field index.
        self.macro = re.compile(r'%x\[(?P<row>[\d-]+),(?P<col>[\d]+)\]')
        # Sequence and position currently being expanded; set by apply() so
        # that replace() can be used as a re.sub callback.
        self.inst = []
        self.t = 0
        self.templates = []

    def read(self, fi):
        """Load feature templates from the iterable of lines *fi*.

        Any previously loaded templates are discarded.  Lines starting with
        ``#`` are skipped.  Lines starting with ``U`` are stored as
        templates with every ``:`` replaced by ``=``.  A line consisting of
        just ``B`` is skipped; any other line starting with ``B`` is
        reported on STDERR and the process exits with status 1.  All other
        lines are ignored.
        """
        self.templates = []
        for line in fi:
            line = line.strip()
            if line.startswith('#'):
                continue
            if line.startswith('U'):
                self.templates.append(line.replace(':', '='))
            elif line == 'B':
                continue
            elif line.startswith('B'):
                # sys.stderr is a file object, not a callable: it has to be
                # written to, otherwise a TypeError masks the real error.
                sys.stderr.write(
                    'ERROR: bigram templates not supported: %s\n' % line)
                sys.exit(1)

    def replace(self, m):
        """Return the field value addressed by the macro match *m*.

        The row is taken relative to the current position ``self.t``.  An
        empty string is returned when the row falls outside the current
        sequence.  A column beyond the item's fields raises ``IndexError``.
        """
        row = self.t + int(m.group('row'))
        col = int(m.group('col'))
        if 0 <= row < len(self.inst):
            return self.inst[row]['x'][col]
        return ''

    def apply(self, inst, t):
        """Expand every loaded template at position *t* of sequence *inst*.

        Each expanded string is appended to ``inst[t]['F']`` in template
        order.
        """
        self.inst = inst
        self.t = t
        for template in self.templates:
            f = self.macro.sub(self.replace, template)
            self.inst[t]['F'].append(f)
def readiter(fi, sep=None):
    """Yield sequences of items parsed from the iterable of lines *fi*.

    Each non-empty line is split on *sep* (any whitespace when ``None``);
    the last field becomes the item's ``'y'`` value, the preceding fields
    its ``'x'`` list, and ``'F'`` starts out as an empty list.  An empty
    line ends the current sequence, which is yielded as a list of items
    (possibly empty when empty lines follow each other).  A final sequence
    that is not terminated by an empty line is yielded as well.
    """
    X = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            yield X
            X = []
        else:
            fields = line.split(sep)
            item = {
                'x': fields[0:-1],
                'y': fields[-1],
                'F': [],
            }
            X.append(item)
    # Input that does not end with an empty line would otherwise lose its
    # last sequence.
    if X:
        yield X
if __name__ == '__main__':
    import optparse

    fi = sys.stdin
    fo = sys.stdout

    # Parse the command-line arguments.
    parser = optparse.OptionParser(usage="""usage: %prog <template>
This utility reads a data set from STDIN, applies feature templates compatible
with CRF++, and outputs attributes to STDOUT. Each line of a data set must
consist of field values separated by SEPARATOR characters (customizable with
-s option)."""
        )
    parser.add_option(
        '-s', dest='separator', default='\t',
        help='specify the separator of columns of input data [default: "\\t"]'
        )
    (options, args) = parser.parse_args()

    # Without a template path there is nothing to do; report it as a usage
    # error instead of failing with an IndexError below.
    if not args:
        parser.error('missing <template> argument')

    # Load the templates, closing the template file as soon as it is read.
    F = FeatureExtractor()
    with open(args[0]) as ft:
        F.read(ft)

    # Emit one line per item: the label followed by tab-separated attributes,
    # with a blank line after every sequence.
    for inst in readiter(fi, options.separator):
        for t, item in enumerate(inst):
            F.apply(inst, t)
            fo.write('%s' % item['y'])
            for attr in item['F']:
                # ':' is escaped in attribute names on output.
                fo.write('\t%s' % attr.replace(':', '__COLON__'))
            fo.write('\n')
        fo.write('\n')