"""
ngram_extract.py
Input: d3_feats file
Output:
3 nucleic acid immunization nucleic acid immunizations US20040234539A1 JJ NN NNS
"""
# gunzip -c US5579518A.xml.gz | python2.7 ngram_extract.py
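#
# A minimal usage sketch (the paths and length bounds below are placeholder examples;
# the real arguments are taken from sys.argv in the __main__ block at the bottom):
#
#   python2.7 ngram_extract.py <d3_feats_dir> <output_file> <min_ngram_len> <max_ngram_len>
#   e.g. python2.7 ngram_extract.py data/d3_feats/ ngrams.tab 2 6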
import sys
import canon
import os
import gzopen
import dir2filelist
import re
import codecs

# a single Canon instance, used below to canonicalize noun phrases via get_canon_np()
can = canon.Canon()

def file_base_name(file_name):
    """ Given a file name, return the base filename without the extension(s). """
    if '.' in file_name:
        separator_index = file_name.index('.')
        base_name = file_name[:separator_index]
        return base_name
    else:
        return file_name

def path_base_name(path):
    """ Given a full path, strip off the path and extensions and return the base file name. """
    file_name = os.path.basename(path)
    return file_base_name(file_name)
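
# For example (the path is a placeholder; the results follow directly from the two helpers above):
#   path_base_name("/some/dir/000171485800006.xml.gz")  =>  "000171485800006"
#   file_base_name("US20040234539A1.xml.gz")            =>  "US20040234539A1"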

#def ngram_extract_from_file(infile, can, out_stream)

def file2ngram_info(infile, min_len, max_len):
    """ Given a d3_feats file, return a list of term_info lists with the (string) fields:
    <ngram_length> <canonicalized term> <surface term> <doc_id> <pos_signature> <prev_Npr> <prev_N>
    e.g., 3  epitaxial silicon process  epitaxial silicon processes  000171485800006  JNN
    NOTE: All elements are returned as strings, including the <ngram_length>.
    min_len and max_len constrain the length of ngrams to be included in the output.
    """
    #print("[file2ngram_info] %s" % infile)
    s_infile = gzopen.gzopen(infile)
    # list of term_info lists to be returned, one per qualifying line of the input file
    l_term_info = []
    # Each d3_feats line is tab-separated; this code relies on:
    #   field 0    = source file path (used to derive the doc_id)
    #   field 2    = the surface term (ngram)
    #   last field = the tag_sig feature, e.g. tag_sig=JJ_NN_NNS
    # and, optionally, a prev_Npr=<value> feature elsewhere on the line.
    for line in s_infile:
        line = line.strip("\n")
        l_fields = line.split("\t")
        filename = l_fields[0]
        doc_id = path_base_name(filename)
        term = l_fields[2]
        ngram_len = len(term.split(" "))
        # keep the term only if the ngram length is in range and it passes the phrase filter
        if (ngram_len >= min_len) and (ngram_len <= max_len) and not canon.illegal_phrase_p(term):
            canon_np = can.get_canon_np(term)
            # We assume that the last feature on the line is tag_sig!
            pos_sig = l_fields[-1]
            if pos_sig[:7] != "tag_sig":
                print("[ngram_extract.py] Error: last feature on input line is not labeled tag_sig")
                print("line: %s" % line)
                sys.exit()
            else:
                # replace pos_sig with a string made of the first char of each pos in the phrase,
                # e.g. JJ_NN_NNS => JNN
                pos_sig = "".join(item[0] for item in pos_sig[8:].split("_"))
            prev_Npr = ""
            prev_N = ""
            # extract the value of the prev_Npr feature, if there is one
            match = re.search(r'prev_Npr=(\S+) ', line)
            if match:
                prev_Npr = match.group(1)
                # canonicalize the noun (the part of the value before the first underscore)
                prev_N = can.get_canon_np(prev_Npr.split("_")[0])
            l_term_info.append([str(ngram_len), canon_np, term, doc_id, pos_sig, prev_Npr, prev_N])
    s_infile.close()
    return l_term_info
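
# A minimal usage sketch for file2ngram_info (the file name is just an example document
# from this corpus layout; substitute any gzipped d3_feats file):
#
#   l_info = file2ngram_info("000171485800006.xml.gz", 2, 6)
#   for term_info in l_info:
#       print("\t".join(term_info))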

def flist2_ngram_info(flist_file, outfile, min_len, max_len):
    """ flist_file is a file containing a list of pathnames of d3_feats files.
    outfile is the pathname of the file to which term_info lines are to be written.
    min_len and max_len are passed through to file2ngram_info.
    """
    s_flist_file = open(flist_file, "r")
    s_outfile = codecs.open(outfile, "w", encoding='utf-8')
    for filename in s_flist_file:
        filename = filename.strip("\n")
        l_term_info = file2ngram_info(filename, min_len, max_len)
        for term_info in l_term_info:
            s_outfile.write("%s\n" % "\t".join(term_info))
    s_outfile.close()
    s_flist_file.close()
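
# A minimal usage sketch for flist2_ngram_info (file names are placeholders):
# "d3_feats_files.txt" would contain one d3_feats pathname per line, and the extracted
# term_info lines would be written, tab-separated, to "ngrams.tab".
#
#   flist2_ngram_info("d3_feats_files.txt", "ngrams.tab", 2, 6)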

# python2.7 ngram_extract.py
"""
if __name__ == "__main__":
    dir = "/home/j/corpuswork/techwatch/corpora/SignalProcessing/data/d3_feats/01/files/2002/000171/"
    filen = "000171485800006.xml.gz"
    infile = dir + filen
    l_term_info = file2ngram_info(infile)
    for term_info in l_term_info:
        print("\t".join(term_info))
        #print "%i\t%s\t%s\t%s\t%s" % (ngram_len, canon_np, term, doc_id, pos_sig)
"""

if __name__ == "__main__":
    #dir = "/home/j/corpuswork/techwatch/corpora/SignalProcessing/data/d3_feats/01/files/2002/000171/"
    #filen = "000171485800006.xml.gz"
    #infile = dir + filen
    #inDir = "/home/j/corpuswork/techwatch/corpora/SignalProcessing/data/d3_feats/01/files/2002/"
    #min_len = 2
    #max_len = 6
    args = sys.argv
    inDir = args[1]
    outfile = args[2]
    min_len = int(args[3])
    max_len = int(args[4])
    s_outfile = codecs.open(outfile, "w", encoding='utf-8')
    fl = dir2filelist.dir2filelist(inDir)
    for infile in fl:
        # stdout will be piped into the ngrams.log file in the output dir
        print("file: %s" % infile)
        l_term_info = file2ngram_info(infile, min_len, max_len)
        for term_info in l_term_info:
            outline = "\t".join(term_info)
            s_outfile.write("%s\n" % outline)
    s_outfile.close()