From 8864ac22602ac00757f353133d0707b95b29e41a Mon Sep 17 00:00:00 2001 From: Mike Tian-Jian Jiang Date: Mon, 26 Feb 2018 21:48:02 +0900 Subject: [PATCH] Improve example/template.py for speed, functionality, and readability - python 2-to-3 compatibility - eliminate redundant function calls - especially regex replace calls - use pool.imap for multi-process - supporting scale-decorated template, e.g. `U:%x[0,0]:-1.5` - applying several linting guidelines --- example/template.py | 198 ++++++++++++++++++++++++++++++-------------- 1 file changed, 137 insertions(+), 61 deletions(-) diff --git a/example/template.py b/example/template.py index 1bb6f8c7..c09138f9 100755 --- a/example/template.py +++ b/example/template.py @@ -1,48 +1,105 @@ #!/usr/bin/env python +"""[summary].""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals import re import sys +from builtins import map, object, range +from multiprocessing import Pool + +from future import standard_library + +standard_library.install_aliases() + +CRFPP_PATTERN = re.compile(r'%x\[(?P<row>-?\d+),(?P<col>\d+)\]') +CRFPP_COORDINATE_DELIMITER = '/' +TEMPLATE_QUOTE = ':' + + +class FeatureExtractor(object): + """FeatureExtractor.""" -class FeatureExtractor: - def __init__(self): - self.macro = re.compile(r'%x\[(?P<row>[\d-]+),(?P<col>[\d]+)\]') + def __init__(self, template_file): + """[summary]. 
+ + Arguments: + template_file {iterable of str} -- lines of the template + + Raises: + ValueError -- on an unsupported line or a non-float scaling value + + """ self.inst = [] + self.inst_len = 0 self.t = 0 self.templates = [] - - def read(self, fi): - self.templates = [] - for line in fi: + for line in template_file: line = line.strip() - if line.startswith('#'): + if not line or line == 'B': continue - if line.startswith('U'): - self.templates.append(line.replace(':', '=')) - elif line == 'B': + line_head = line[0] + if line_head == '#': continue - elif line.startswith('B'): - sys.stderr( - 'ERROR: bigram templates not supported: %s\n' % line) - sys.exit(1) - - def replace(self, m): - row = self.t + int(m.group('row')) - col = int(m.group('col')) - if row in range(0, len(self.inst)): - return self.inst[row]['x'][col] - else: - return '' + if line_head != 'U': + raise ValueError('ERROR: unsupported: %s\n' % line) - def apply(self, inst, t): - self.inst = inst - self.t = t - for template in self.templates: - f = re.sub(self.macro, self.replace, template) - self.inst[t]['F'].append(f) + elements = line.split(TEMPLATE_QUOTE) + name = elements[0] + '=' + pattern = elements[1] + scale = '' + if len(elements) == 3: + scale = elements[2] + try: + float(scale) + except: + raise ValueError( + 'ERROR: invalid scaling value: %s\n' % scale) + scale = TEMPLATE_QUOTE + scale + coordinates = [list(map( + int, CRFPP_PATTERN.match(coordinate_pattern).groups())) + for coordinate_pattern in pattern.split( + CRFPP_COORDINATE_DELIMITER)] + self.templates.append({ + 'name': name, 'coordinates': coordinates, 'scale': scale}) -def readiter(fi, sep=None): + def apply(self, inst, inst_len, t): + """Set the 'F' feature list of inst[t] from the templates. 
+ + Arguments: + inst {list} -- items of one sequence, dicts with 'x' and 'F' + inst_len {int} -- row upper bound, presumably len(inst) + t {int} -- index of the item in inst whose 'F' is set + """ + self.inst = inst + self.inst_len = inst_len + self.t = t + self.inst[t]['F'] = [ + template['name'] + CRFPP_COORDINATE_DELIMITER.join([ + self.inst[row + self.t]['x'][col] + if 0 <= row + self.t < self.inst_len else '' + for row, col in template['coordinates'] + ]) + template['scale'] + for template in self.templates + ] + + +def readiter(input_feature_tsv, sep): + """Yield sequences of items read from sep-separated lines. + + Arguments: + input_feature_tsv {iterable of str} -- lines; empty ends a sequence + sep {str} -- field separator passed to str.split + + Yields: + list -- item dicts ('x', 'y', 'F') of one sequence + + """ X = [] - for line in fi: + for line in input_feature_tsv: line = line.strip('\n') if not line: yield X @@ -50,39 +107,58 @@ def readiter(fi, sep=None): else: fields = line.split(sep) item = { - 'x': fields[0:-1], + 'x': [x.replace('\\', '\\\\').replace(':', r'\:') + for x in fields[0:-1]], 'y': fields[-1], 'F': [] } X.append(item) + if __name__ == '__main__': - import optparse - - fi = sys.stdin - fo = sys.stdout - - # Parse the command-line arguments. - parser = optparse.OptionParser(usage="""usage: %prog