-
Notifications
You must be signed in to change notification settings - Fork 208
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve example/template.py for speed, functionality, and readability
- python 2-to-3 compatibility - eliminate redundant function calls - especially regex replace calls - use pool.imap for multi-process - supporting scale-decorated template, e.g. `U:%x[0,0]:-1.5` - applying several linting guidelines
- Loading branch information
1 parent
dc5b6c7
commit 8864ac2
Showing
1 changed file
with
137 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,88 +1,164 @@ | ||
#!/usr/bin/env python | ||
"""[summary].""" | ||
|
||
from __future__ import absolute_import | ||
from __future__ import division | ||
from __future__ import print_function | ||
from __future__ import unicode_literals | ||
|
||
import re | ||
import sys | ||
from builtins import map, object, range | ||
from multiprocessing import Pool | ||
|
||
from future import standard_library | ||
|
||
standard_library.install_aliases() | ||
|
||
CRFPP_PATTERN = re.compile(r'%x\[(?P<row>-?\d+),(?P<col>\d+)\]') | ||
CRFPP_COORDINATE_DELIMITER = '/' | ||
TEMPLATE_QUOTE = ':' | ||
|
||
|
||
class FeatureExtractor(object): | ||
"""FeatureExtractor.""" | ||
|
||
class FeatureExtractor: | ||
def __init__(self): | ||
self.macro = re.compile(r'%x\[(?P<row>[\d-]+),(?P<col>[\d]+)\]') | ||
def __init__(self, template_file): | ||
"""[summary]. | ||
Arguments: | ||
template_file {[type]} -- [description] | ||
Raises: | ||
ValueError -- [description] | ||
""" | ||
self.inst = [] | ||
self.inst_len = 0 | ||
self.t = 0 | ||
self.templates = [] | ||
|
||
def read(self, fi): | ||
self.templates = [] | ||
for line in fi: | ||
for line in template_file: | ||
line = line.strip() | ||
if line.startswith('#'): | ||
if not line or line == 'B': | ||
continue | ||
if line.startswith('U'): | ||
self.templates.append(line.replace(':', '=')) | ||
elif line == 'B': | ||
line_head = line[0] | ||
if line_head == '#': | ||
continue | ||
elif line.startswith('B'): | ||
sys.stderr( | ||
'ERROR: bigram templates not supported: %s\n' % line) | ||
sys.exit(1) | ||
|
||
def replace(self, m): | ||
row = self.t + int(m.group('row')) | ||
col = int(m.group('col')) | ||
if row in range(0, len(self.inst)): | ||
return self.inst[row]['x'][col] | ||
else: | ||
return '' | ||
if line_head != 'U': | ||
raise ValueError('ERROR: unsupported: %s\n' % line) | ||
|
||
def apply(self, inst, t): | ||
self.inst = inst | ||
self.t = t | ||
for template in self.templates: | ||
f = re.sub(self.macro, self.replace, template) | ||
self.inst[t]['F'].append(f) | ||
elements = line.split(TEMPLATE_QUOTE) | ||
name = elements[0] + '=' | ||
pattern = elements[1] | ||
scale = '' | ||
if len(elements) == 3: | ||
scale = elements[2] | ||
try: | ||
float(scale) | ||
except: | ||
raise ValueError( | ||
'ERROR: invalid scaling value: %s\n' % scale) | ||
scale = TEMPLATE_QUOTE + scale | ||
coordinates = [list(map( | ||
int, CRFPP_PATTERN.match(coordinate_pattern).groups())) | ||
for coordinate_pattern in pattern.split( | ||
CRFPP_COORDINATE_DELIMITER)] | ||
self.templates.append({ | ||
'name': name, 'coordinates': coordinates, 'scale': scale}) | ||
|
||
def readiter(fi, sep=None): | ||
def apply(self, inst, inst_len, t): | ||
"""[summary]. | ||
Arguments: | ||
inst {[type]} -- [description] | ||
inst_len {[type]} -- [description] | ||
t {[type]} -- [description] | ||
""" | ||
self.inst = inst | ||
self.inst_len = inst_len | ||
self.t = t | ||
self.inst[t]['F'] = [ | ||
template['name'] + CRFPP_COORDINATE_DELIMITER.join([ | ||
self.inst[row + self.t]['x'][col] | ||
if 0 <= row + self.t < self.inst_len else '' | ||
for row, col in template['coordinates'] | ||
]) + template['scale'] | ||
for template in self.templates | ||
] | ||
|
||
|
||
def readiter(input_feature_tsv, sep): | ||
"""[summary]. | ||
Arguments: | ||
input_feature_tsv {[type]} -- [description] | ||
sep {[type]} -- [description] | ||
Yields: | ||
[type] -- [description] | ||
""" | ||
X = [] | ||
for line in fi: | ||
for line in input_feature_tsv: | ||
line = line.strip('\n') | ||
if not line: | ||
yield X | ||
X = [] | ||
else: | ||
fields = line.split(sep) | ||
item = { | ||
'x': fields[0:-1], | ||
'x': [x.replace('\\', '\\\\').replace(':', r'\:') | ||
for x in fields[0:-1]], | ||
'y': fields[-1], | ||
'F': [] | ||
} | ||
X.append(item) | ||
|
||
|
||
if __name__ == '__main__': | ||
import optparse | ||
|
||
fi = sys.stdin | ||
fo = sys.stdout | ||
|
||
# Parse the command-line arguments. | ||
parser = optparse.OptionParser(usage="""usage: %prog <template> | ||
This utility reads a data set from STDIN, applies feature templates compatible | ||
with CRF++, and outputs attributes to STDOUT. Each line of a data set must | ||
consist of field values separated by SEPARATOR characters (customizable with | ||
-s option).""" | ||
) | ||
parser.add_option( | ||
'-s', dest='separator', default='\t', | ||
help='specify the separator of columns of input data [default: "\\t"]' | ||
) | ||
(options, args) = parser.parse_args() | ||
|
||
F = FeatureExtractor() | ||
F.read(open(args[0])) | ||
|
||
for inst in readiter(fi, options.separator): | ||
for t in range(len(inst)): | ||
F.apply(inst, t) | ||
fo.write('%s' % inst[t]['y']) | ||
for attr in inst[t]['F']: | ||
fo.write('\t%s' % attr.replace(':', '__COLON__')) | ||
fo.write('\n') | ||
fo.write('\n') | ||
import argparse | ||
description = ''' | ||
This utility reads a data set from INPUT_FILE_PATH or STDIN, applies | ||
feature templates compatible with CRF++, and outputs attributes to | ||
OUTPUT_FILE_PATH or STDOUT, repectively. Each line of a data set must | ||
consist of field values separated by SEPARATOR characters (customizable | ||
with -s option).''' | ||
parser = argparse.ArgumentParser(description=description) | ||
parser.add_argument('template_file', | ||
metavar='TEMPLATE_FILE_PATH', | ||
type=argparse.FileType('r')) | ||
parser.add_argument('input_feature_tsv', | ||
nargs='?', | ||
metavar='INPUT_FILE_PATH', | ||
type=argparse.FileType('r'), | ||
default=sys.stdin) | ||
parser.add_argument('output_crfsuite_feature_tsv', | ||
nargs='?', | ||
metavar='OUTPUT_FILE_PATH', | ||
type=argparse.FileType('w'), | ||
default=sys.stdout) | ||
parser.add_argument('-s', '--sep', | ||
default='\t', | ||
help='specify the separator of columns of input data' | ||
' [default: "\\t"]') | ||
args = parser.parse_args() | ||
|
||
F = FeatureExtractor(args.template_file) | ||
|
||
def _apply_F(inst): | ||
inst_len = len(inst) | ||
rows = [] | ||
|
||
for t in range(inst_len): | ||
F.apply(inst, inst_len, t) | ||
columns = [inst[t]['y']] + inst[t]['F'] | ||
rows.append('\t'.join(columns)) | ||
rows.append('') | ||
return rows | ||
|
||
with Pool(processes=4) as pool: | ||
sentence_iter = pool.imap(_apply_F, | ||
readiter(args.input_feature_tsv, args.sep), | ||
50) | ||
for rows in sentence_iter: | ||
args.output_crfsuite_feature_tsv.write('\n'.join(rows) + '\n') |