Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve example/template.py for speed, functionality, and readability #100

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 137 additions & 61 deletions example/template.py
Original file line number Diff line number Diff line change
@@ -1,88 +1,164 @@
#!/usr/bin/env python
"""Apply CRF++-compatible feature templates to a column-separated data set."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import re
import sys
from builtins import map, object, range
from multiprocessing import Pool

from future import standard_library

standard_library.install_aliases()

# A single CRF++ macro such as ``%x[-1,0]``: relative row, then column.
CRFPP_PATTERN = re.compile(r'%x\[(?P<row>-?\d+),(?P<col>\d+)\]')
# Separates several macros inside one template, e.g. ``%x[-1,0]/%x[0,0]``.
CRFPP_COORDINATE_DELIMITER = '/'
# Separates the template id, the macro pattern and the optional scale.
TEMPLATE_QUOTE = ':'


class FeatureExtractor(object):
    """Expand CRF++-style unigram templates into feature strings.

    Every parsed template is kept in ``self.templates`` as a dict with the
    keys ``name`` (template id followed by ``=``), ``coordinates`` (a list of
    ``[row, col]`` pairs) and ``scale`` (``''`` or ``':<value>'``).
    """

    def __init__(self, template_file):
        """Parse the feature templates found in ``template_file``.

        Blank lines, lines starting with ``#`` and the bare bigram marker
        ``B`` are skipped.  Every remaining line must start with ``U`` and
        look like ``<id>:<macro>[/<macro>...][:<scale>]``.

        Arguments:
            template_file {iterable of str} -- template lines, e.g. an open
                text file

        Raises:
            ValueError -- a line does not start with ``U``, has no macro
                pattern, contains a malformed macro, or carries a scale that
                is not a number

        """
        self.inst = []
        self.inst_len = 0
        self.t = 0
        self.templates = []

        for line in template_file:
            line = line.strip()
            if not line or line == 'B' or line.startswith('#'):
                continue
            if not line.startswith('U'):
                raise ValueError('ERROR: unsupported: %s\n' % line)
            self.templates.append(self._parse_template(line))

    @staticmethod
    def _parse_template(line):
        """Turn one stripped ``U...`` template line into a template dict."""
        elements = line.split(TEMPLATE_QUOTE)
        if len(elements) < 2:
            # Without this guard ``elements[1]`` fails with an IndexError.
            raise ValueError('ERROR: missing macro pattern: %s\n' % line)

        name = elements[0] + '='
        pattern = elements[1]

        scale = ''
        if len(elements) == 3:
            scale = elements[2]
            try:
                float(scale)
            except ValueError:
                # No ``raise ... from``: that syntax does not exist on
                # Python 2, which this file still supports via ``future``.
                raise ValueError(
                    'ERROR: invalid scaling value: %s\n' % scale)
            scale = TEMPLATE_QUOTE + scale

        coordinates = []
        for coordinate_pattern in pattern.split(CRFPP_COORDINATE_DELIMITER):
            match = CRFPP_PATTERN.match(coordinate_pattern)
            if match is None:
                # Report the bad macro instead of crashing on ``None``.
                raise ValueError(
                    'ERROR: invalid macro: %s\n' % coordinate_pattern)
            coordinates.append(
                [int(match.group('row')), int(match.group('col'))])

        return {'name': name, 'coordinates': coordinates, 'scale': scale}

    def apply(self, inst, inst_len, t):
        """Compute the features of item ``t`` and store them in place.

        The result replaces ``inst[t]['F']``.  A macro whose row falls
        outside ``0 <= row < inst_len`` contributes an empty string.

        Arguments:
            inst {list of dict} -- sequence items; each has an ``'x'`` list
                of column values and receives an ``'F'`` list
            inst_len {int} -- number of items in ``inst``
            t {int} -- index of the item to compute features for
        """
        self.inst = inst
        self.inst_len = inst_len
        self.t = t

        features = []
        for template in self.templates:
            values = []
            for row, col in template['coordinates']:
                position = t + row
                if 0 <= position < inst_len:
                    values.append(inst[position]['x'][col])
                else:
                    values.append('')
            features.append(
                template['name']
                + CRFPP_COORDINATE_DELIMITER.join(values)
                + template['scale'])
        inst[t]['F'] = features


def readiter(input_feature_tsv, sep=None):
    """Yield one sequence at a time from column-separated input lines.

    Every non-empty line is split on ``sep``; the last field becomes the
    label and the preceding fields become the attribute columns.  In the
    attribute columns ``\\`` is escaped as ``\\\\`` and ``:`` as ``\\:``.
    An empty line ends the current sequence.  A final sequence that is not
    followed by an empty line is yielded as well.

    Arguments:
        input_feature_tsv {iterable of str} -- input lines, e.g. an open
            text file
        sep {str or None} -- column separator passed to ``str.split``;
            ``None`` splits on runs of whitespace

    Yields:
        list of dict -- the items of one sequence, each with the keys
            ``'x'`` (escaped columns), ``'y'`` (label) and ``'F'`` (empty
            list to be filled with features)

    """
    sequence = []
    for line in input_feature_tsv:
        # Also drop '\r' so CRLF input does not leak into the label.
        line = line.rstrip('\r\n')
        if not line:
            yield sequence
            sequence = []
            continue
        fields = line.split(sep)
        sequence.append({
            'x': [value.replace('\\', '\\\\').replace(':', r'\:')
                  for value in fields[:-1]],
            'y': fields[-1],
            'F': [],
        })
    if sequence:
        # Input ended without a terminating blank line: keep the tail.
        yield sequence


def apply_templates(extractor, inst):
    """Render one sequence as output rows.

    Defined at module level (not inside the ``__main__`` guard) so that
    worker processes can import it under the ``spawn`` start method.

    Arguments:
        extractor {object} -- provides ``apply(inst, inst_len, t)``, which
            fills ``inst[t]['F']``
        inst {list of dict} -- items of one sequence with ``'y'`` labels

    Returns:
        list of str -- one tab-joined ``label, feature...`` row per item,
            followed by an empty string that separates sequences

    """
    inst_len = len(inst)
    rows = []
    for t, item in enumerate(inst):
        extractor.apply(inst, inst_len, t)
        rows.append('\t'.join([item['y']] + item['F']))
    rows.append('')
    return rows


def main(argv=None):
    """Run the command-line tool.

    Arguments:
        argv {list of str or None} -- arguments to parse; ``None`` makes
            argparse read ``sys.argv``
    """
    import argparse
    import functools

    description = '''
    This utility reads a data set from INPUT_FILE_PATH or STDIN, applies
    feature templates compatible with CRF++, and outputs attributes to
    OUTPUT_FILE_PATH or STDOUT, respectively. Each line of a data set must
    consist of field values separated by SEPARATOR characters (customizable
    with -s option).'''
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('template_file',
                        metavar='TEMPLATE_FILE_PATH',
                        type=argparse.FileType('r'))
    parser.add_argument('input_feature_tsv',
                        nargs='?',
                        metavar='INPUT_FILE_PATH',
                        type=argparse.FileType('r'),
                        default=sys.stdin)
    parser.add_argument('output_crfsuite_feature_tsv',
                        nargs='?',
                        metavar='OUTPUT_FILE_PATH',
                        type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('-s', '--sep',
                        default='\t',
                        help='specify the separator of columns of input data'
                             ' [default: "\\t"]')
    args = parser.parse_args(argv)

    # The templates are fully parsed by the constructor, so the file can be
    # closed right away.
    with args.template_file:
        extractor = FeatureExtractor(args.template_file)

    # Bind the extractor explicitly instead of relying on a global that
    # only exists in the parent process.
    worker = functools.partial(apply_templates, extractor)

    # try/finally instead of ``with Pool(...)``: the context-manager form is
    # unavailable on Python 2; terminate() is what ``with`` would call.
    pool = Pool(processes=4)
    try:
        sentence_iter = pool.imap(
            worker, readiter(args.input_feature_tsv, args.sep), 50)
        for rows in sentence_iter:
            args.output_crfsuite_feature_tsv.write('\n'.join(rows) + '\n')
    finally:
        pool.terminate()


if __name__ == '__main__':
    main()