# encoding=utf8
"""
# act_tf.py
# modified from tf.py to take a list of .xml.gz files as input
# create .tf file from term_features directory files
# These functions were originally part of role.py but separated out to separate
# generic feature processing from the ACT/PN task.
# NOTE: We assume we are running on UNIX, where directories are separated by "/"
#----
# from individual term features files, create a summary file per year
# with the freq of the term feature combination (.tf)
# NOTE: alpha filter does not apply to Chinese. Removed for now.
# 2/27/14 PGA added code to count terms and feats and write out their counts
# in separate files (.terms, .feats)
# inroot and outroot should terminate in a directory separator ("/")
# 4/4/15 added canonicalization and prob(term|feature) to .tf
# 4/18/15 added MI to .tf
.tf file sample
program prev_VNP=performs|debugging|on 1 0.000022 0.000142
pre-paid card prev_Npr=amount_of
Need to canonicalize term and feature separately
Remember that features in the seed set have to be consistent (e.g. canonicalized or not) with features here
for probabilities to be consistent. For now, we will only canonicalize the terms (not the features)
The porter stemmer in nltk (called by canon) may generate unicode errors. These can be ignored. When this
occurs, we don't apply stemming to the feature.
6/26/18 fixed a bug causing errors in mallet. Canonicalization of some hyphenated feature
values resulted in a blank replacing a hyphen. This turned up later as an erro in the
mallet svm formatting, which uses blanks as separators. Thus, we no longer try to
canonicalize hyphenated terms.
Also added canon.illegal_phrase_p filter to the doc_terms file output to eliminate noise terms.
Made minimum character length in a term = 2.
"""
import pdb
#import sys
#import collections
from collections import defaultdict
import os
import glob
import codecs
import roles_config
import canon
import math
import gzopen
# Next three lines are a hack from
# https://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte
# to avoid the error: UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# list of noise terms to omit from doc_terms_file
# The word "end" is a marker inserted in documents by our preprocessing.
# "claim" is a noise word in patents.
DOC_TERMS_NOISE = ["end", "fig", "figure", "claim", "pat", "no.", "u.s. pat"]
# canonicalizer object
can = canon.Canon()
#def dir2features_count(inroot, outroot, year, overwrite_p=False, canonicalize_p=True, filter_noise_p=True):
def dir2features_count(filelist_file, out_root, sections, year, overwrite_p, max_doc_terms_count=1000, canonicalize_p=True, filter_noise_p=True):
#pdb.set_trace()
out_path = "/".join([out_root, sections])
out_path_prefix = "/".join([out_path, year])
# term-feature output file
tf_file = out_path_prefix + ".tf"
# remember the mapping between surface head nouns and their canonicalized forms
canon_file = out_path_prefix + ".canon"
# create the outpath if it doesn't exist yet
print("[act_tf.py]creating path: %s,\n[act_tf.py]writing to %s" % (out_path, tf_file))
    try:
        # create the directory path for the corpus, if it does not already exist
        os.makedirs(out_path)
    except OSError:
        print("[act_tf.py]NOTE: Path already exists (or cannot be created).")
# Do not continue if the .tf file already exists for this corpus and year
if os.path.isfile(tf_file) and not overwrite_p:
print "[tf.py]file already exists: %s. No need to recompute." % tf_file
else:
terms_file = out_path_prefix + ".terms"
feats_file = out_path_prefix + ".feats"
corpus_size_file = out_path_prefix + ".cs"
doc_terms_file = out_path_prefix + ".doc_terms"
# store each filename with a list of its terms
s_doc_terms_file = codecs.open(doc_terms_file, "w", encoding='utf-8')
# count of number of docs a term pair cooccurs in
# dfreq is document freq, cfreq is corpus freq
#d_pair_freq = defaultdict(int)
d_pair2dfreq = defaultdict(int)
# corpus count for the pair
d_pair2cfreq = defaultdict(int)
# count of number of docs a term occurs in
#d_term_freq = defaultdict(int)
d_term2dfreq = defaultdict(int)
# count of number of instances of a term
#d_term_instance_freq = defaultdict(int)
d_term2cfreq = defaultdict(int)
# count of number of instances of a feature
#d_feat_instance_freq = defaultdict(int)
d_feat2cfreq = defaultdict(int)
# count of number of docs a feature occurs in
#d_feat_freq = defaultdict(int)
d_feat2dfreq = defaultdict(int)
# doc_count needed for computing probs
doc_count = 0
# open list of all the files in the inroot directory
s_filelist = open(filelist_file)
#print "inroot: %s, filelist: %s" % (inroot, filelist)
# iterate through files in filelist
for infile in s_filelist:
infile = infile.strip("\n")
# Create a tab separated string containing the filename and all (legal) canonicalized terms, including
# duplicates. This will be used to populate a doc_term retrieval system in
# elasticsearch.
# First field will be the filename.
# At this point, we'll collect the filename and terms into a list.
            # The filename without path or extensions should be a unique doc id.
doc_id = os.path.basename(infile).split(".")[0]
doc_terms_list = [doc_id]
            # Use sets to capture which terms, features, and pairs occur in the
            # document. We'll use them after processing each doc to update the
            # doc frequencies of terms, features, and pairs. (Within-doc count
            # dictionaries would be redundant with these sets.)
            pair_set = set()
            term_set = set()
            feature_set = set()
#pdb.set_trace()
s_infile = gzopen.gzopen(infile)
# count number of lines in file
i = 0
# iterate through lines in d3_feats file
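            # Each d3_feats line is tab-separated: field 2 (0-based) holds the
            # term, and fields 3 onward hold feature=value pairs. A sketch of a
            # hypothetical line (the leading metadata fields are assumed here):
            #   12\t3\tpre-paid card\tprev_Npr=amount_of\tlast_word=card\tsection_loc=ABSTRACT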
for term_line in s_infile:
i += 1
term_line = term_line.strip("\n")
l_fields = term_line.split("\t")
term = l_fields[2]
# Do not process noise (illegal) terms or features
# for cases where feat = "", need to filter! todo
#pdb.set_trace()
if (filter_noise_p and canon.illegal_phrase_p(term)):
pass
# eliminate lines that come from claims section of patents.
# These are not very useful and skew term frequency counts.
# We do this by eliminating lines containing the feature section_loc=CLAIM*.
if ("=CLAIM" in term_line):
pass
# NOTE: At the moment we don't test which sections of the doc should be included
# as specified by the sections parameter (ta or tas). We include every line. If
# we decide to add this functionality, this would be the place to add the filter.
else:
if canonicalize_p:
                        # canonicalize the term before incrementing counts
                        term = can.get_canon_np(term)
                    # record that the term occurred in this doc
                    term_set.add(term)
# increment the global corpus count for the term
d_term2cfreq[term] += 1
# Add the term to the list of terms for the current doc
# Ideally, we would like to ignore parts of a patent (e.g. the claims) and
# just use the title, abstract and summary. However, there is no feature
# indicating what section we are in beyond the abstract. So instead, we
# will use a simple doc_terms_count cut off (e.g. 1000). Variable i counts
# the number of lines so far.
#pdb.set_trace()
if (i <= max_doc_terms_count) and (term not in DOC_TERMS_NOISE) and not canon.illegal_phrase_p(term):
doc_terms_list.append(term)
# fields 3 and beyond are feature-value pairs
# look for features of interest using their prefixes
for feature in l_fields[3:]:
# Note that we use the prefixes of some feature names for convenience.
# The actual features are prev_V, prev_VNP, prev_J, prev_Jpr, prev_Npr, last_word
# first_word, if an adjective, may capture some indicators of dimensions (high, low), although
# many common adjectives are excluded from the chunk and would be matched by prev_J.
# we also pull out the sent and token locations to allow us to locate the full sentence for this
# term-feature instance.
if (feature[0:6] in ["prev_V", "prev_J", "prev_N", "last_w"]) and not canon.illegal_feature_p(feature):
                        if canonicalize_p and "-" not in feature:
# Do canonicalization of feature before incrementing counts.
# NOTE: There is a bug in the canonicalization code when the
# term contains hyphens. For example:
# >>> can.get_canon_feature("last_word=compass-on-a-chip")
# Returns a term with a blank in it: 'last_word=compas-on-a chip'
# for this reason, we will not try to canonicalize terms containing
# a hyphen.
feature = can.get_canon_feature(feature)
# increment global corpus count for the feature
d_feat2cfreq[feature] += 1
feature_set.add(feature)
# increment global corpus count for the pair
d_pair2cfreq[(term, feature)] += 1
                            # record that the pair occurred in this doc
                            pair_set.add((term, feature))
# construct a tab-separated string containing file_name and all terms
doc_terms_str = "\t".join(doc_terms_list)
s_doc_terms_file.write("%s\n" % doc_terms_str)
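            # e.g. a hypothetical doc_terms line: the doc id followed by every
            # retained term, tab-separated: "US7654321\tpre-paid card\tdebugging"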
s_infile.close()
            # Using the sets, increment the doc freq of each term-feature pair
            # in the doc. Because these are sets, each term, feature, and pair
            # is counted at most once per document.
for pair in pair_set:
d_pair2dfreq[pair] += 1
# also increment doc_freq for features and terms
for term in term_set:
                d_term2dfreq[term] += 1
for feature in feature_set:
d_feat2dfreq[feature] += 1
# track total number of docs
doc_count += 1
s_filelist.close()
s_tf_file = codecs.open(tf_file, "w", encoding='utf-8')
s_terms_file = codecs.open(terms_file, "w", encoding='utf-8')
s_feats_file = codecs.open(feats_file, "w", encoding='utf-8')
print "[act_tf.py]Writing to %s" % tf_file
# compute prob
print "[act_tf.py]Processed %i files" % doc_count
for pair in d_pair2dfreq.keys():
freq_pair = d_pair2dfreq[pair]
prob_pair = float(freq_pair)/doc_count
term = pair[0]
feature = pair[1]
freq_term = d_term2dfreq[term]
freq_feat = d_feat2dfreq[feature]
            # Occasionally a term (or feature) from a pair is missing from
            # d_term2dfreq/d_feat2dfreq; the defaultdict then returns a freq of
            # 0. We skip these cases, since they would cause a divide by 0.
if freq_term > 0 and freq_feat > 0:
                # probability of the feature occurring with the term in a doc,
                # given that the term appears in the doc
                prob_fgt = freq_pair / float(freq_term)
                # added 4/4/15: prob of the feature occurring with the term in
                # a doc, given that the feature appears in the doc
                prob_tgf = freq_pair / float(freq_feat)
                # 4/18/15 adding mutual information based on counts of pairs,
                # terms, and feats (each counted once per doc) and corpus size
                # (# docs): MI = log( prob(pair) / (prob(term) * prob(feature)) )
mi_denom = (freq_term) * (freq_feat) / float(doc_count)
mi = math.log(freq_pair / mi_denom)
# normalize to -1 to 1
# Note: if prob_pair == 1, then log is 0 and we risk dividing by 0
# We'll prevent this by subtracting a small amt from prob_pair
if prob_pair == 1:
prob_pair = prob_pair - .000000001
npmi = mi / (-math.log(prob_pair))
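                # Worked example with hypothetical counts: doc_count = 1000,
                # freq_pair = 10, freq_term = 50, freq_feat = 100:
                #   prob_pair = 10/1000 = 0.01
                #   mi   = log(10 / (50 * 100 / 1000.0)) = log(2.0) ~= 0.693
                #   npmi = 0.693 / -log(0.01) ~= 0.151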
s_tf_file.write( "%s\t%s\t%i\t%f\t%f\t%f\t%i\t%i\t%f\t%f\n" % (term, feature, freq_pair, prob_pair, prob_fgt, prob_tgf, freq_term, freq_feat, mi, npmi))
            else:
                # warn about pairs whose term or feature has a zero doc freq;
                # these would cause a divide by 0 above.
                print "[act_tf.py]WARNING: term-feature pair: %s has freq = 0. Ignored." % str(pair)
for term in d_term2dfreq.keys():
term_prob = float(d_term2dfreq[term])/doc_count
s_terms_file.write( "%s\t%i\t%i\t%f\n" % (term, d_term2dfreq[term], d_term2cfreq[term], term_prob))
for feat in d_feat2dfreq.keys():
feat_prob = float(d_feat2dfreq[feat])/doc_count
s_feats_file.write( "%s\t%i\t%i\t%f\n" % (feat, d_feat2dfreq[feat], d_feat2cfreq[feat], feat_prob))
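        # Each .terms/.feats line is: item, doc freq, corpus freq, doc prob.
        # A hypothetical .terms line for a term seen in 12 of 1000 docs,
        # 30 instances in total:
        #   pre-paid card\t12\t30\t0.012000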
s_canon_file = codecs.open(canon_file, "w", encoding='utf-8')
for key,value in can.d_n2canon.items():
# Only write out a line if the canonical form differs from the surface form
if key != value:
s_canon_file.write("%s\t%s\n" % (key, value))
s_canon_file.close()
s_tf_file.close()
s_terms_file.close()
s_feats_file.close()
s_doc_terms_file.close()
        # Finally, create a file storing the corpus size (# docs processed)
s_corpus_size_file = open(corpus_size_file, "w")
s_corpus_size_file.write("%i\n" % doc_count)
s_corpus_size_file.close()
print "[act_tf.py dir2features_count]Storing corpus size in %s " % corpus_size_file
#---
# Create a single file of term-feature counts for each year (from the .xml extracts of phr_feats data)
# role.run_dir2features_count()
# modified 3/3/14 to take parameters from run_tf_steps
# Set overwrite_p to True to overwrite output files if they already exist.
def run_dir2features_count(filelists_root, out_root, sections, start_year, end_year, overwrite_p=False, max_doc_terms_count=1000):
#pdb.set_trace()
int_start = int(start_year)
int_end = int(end_year) + 1
print "[act_tf.py]Output dir: %s" % out_root
    # the range end is exclusive, hence end_year + 1 above
for int_year in range(int_start, int_end):
year = str(int_year)
filelist_year = filelists_root + "/" + year + ".files"
print "[act_tf.py]Processing files in: %s" % filelist_year
dir2features_count(filelist_year, out_root, sections, year, overwrite_p, max_doc_terms_count)
print "[act_tf.py]Completed: %s" % filelist_year
# python2.7 act_tf.py /home/j/anick/tw/roles/data/corpora/sp/filelists /home/j/anick/tw/roles/data/corpora/sp ta 9999 9999 True 1000
# For a range of dates, use start_year and end_year. For a single year, make start_year and end_year the same.
if __name__ == "__main__":
args = sys.argv
filelists_root = args[1]
out_root = args[2]
sections = args[3] # ta or tas
start_year = args[4]
end_year = args[5]
    # args[6] arrives as a string; only the literal "True" enables overwriting
    overwrite_p = (args[6] == "True")
max_doc_terms_count = int(args[7])
# take the defaults, including canonicalization
run_dir2features_count(filelists_root, out_root, sections, start_year, end_year, overwrite_p=overwrite_p, max_doc_terms_count=max_doc_terms_count)