-
Notifications
You must be signed in to change notification settings - Fork 2
/
act_run_mallet.py
85 lines (66 loc) · 3.84 KB
/
act_run_mallet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# generalized from pa_runs.py
# This file is designed to keep a record of mallet runs
#
import sys
import pdb
import act_pnames
import run_mallet
import vectors_helpers
# 4/25/18
# pa_runs.run_sig("NB", "woc", "9999", False)
def run_sig(classifier_type, context_type, year, feat_info=False, num_infogain_features=50):
    """
    Run run_mallet.run_classify on the hard-coded SignalProcessing tv directory.

    classifier_type: NB or ME (mallet classifier to use); it is placed first in
        the output file prefix
    context_type: label used in the output file prefix (e.g. woc)
    year: string naming the input files <year>.tf and <year>.act.tcs (e.g. "9999")
    feat_info: passed through unchanged to run_mallet.run_classify
    num_infogain_features: number of infogain features handed to
        run_mallet.run_classify; also recorded in the output prefix as IG<number>.
        Defaults to 50, which reproduces the previously hard-coded "IG50".

    Example: run_sig("NB", "woc", "9999", False)
    """
    # Single-argument parenthesized print behaves the same under python 2 and 3.
    print("[act_run_mallet.py]starting run_sig")

    corpus_tv = "/home/j/anick/tw/roles/data/corpora/SignalProcessing/data/tv/"
    tf_path = corpus_tv + year + ".tf"
    tcs_path = corpus_tv + year + ".act.tcs"
    feature_file_path = None
    output_dir = corpus_tv

    # output file prefix should contain: ML_type (NB, ME), feature_source (PR, IG)+Number, seedset_identifier (ssi),
    # index_source (health_2002), sections (inst_all, inst_abs), pn ratio (0.8), min_freq (5), gram_type (uni, bi)
    # eg. NB.IG50.ssi.health_2002.inst_all.bi
    output_file_scheme = ".".join(["SignalProcessing", context_type, year])

    # NOTE: the classifier type (NB, ME) has to be the first thing in the file name, since
    # that is how the info is passed on to mallet.
    # The IG<number> part is derived from num_infogain_features so the file name
    # cannot drift from the feature count actually passed to run_classify.
    output_file_prefix = classifier_type + ".IG" + str(num_infogain_features) + "." + output_file_scheme

    run_mallet.run_classify(
        tf_path,
        tcs_path,
        output_dir,
        output_file_prefix,
        feature_file_path=feature_file_path,
        num_infogain_features=num_infogain_features,
        feat_info=feat_info,
    )
def run_mallet_classifier(corpus_root, corpus, sections, classifier_type, context_type, year, num_infogain_features=50, feat_info=False):
    """
    Locate the <year>.tf and <year>.act.tcs files for a corpus/sections pair and
    run run_mallet.run_classify on them, writing output into the same directory.

    corpus_root: root directory, handed to act_pnames.sections_root
    corpus: corpus name; handed to act_pnames.sections_root and used in the output prefix
    sections: ta (text and abstract only), or tas (include summary as well)
    classifier_type: NB or ME (mallet classifier to use)
    context_type: woc (without context)
    year: actual year or fictional number that looks like a year (e.g. 9999),
        given as a string
    num_infogain_features: how many features to use (sorted by infogain score)
    feat_info: outputs another info file if True.
    """
    print("[act_run_mallet.py]Starting run_mallet_classifier")

    # Input files and mallet output all live in the sections directory.
    sections_path = act_pnames.sections_root(corpus_root, corpus, sections)
    year_stem = sections_path + "/" + year
    tf_path = year_stem + ".tf"
    tcs_path = year_stem + ".act.tcs"
    output_dir = sections_path

    # NOTE: the classifier type (NB, ME) has to be the first thing in the file name, since
    # that is how the info is passed on to mallet.
    infogain_tag = "IG" + str(num_infogain_features)
    output_file_prefix = ".".join([classifier_type, infogain_tag, corpus, context_type, year])

    print("[act_run_mallet.py]tf_path:%s, tcs_path:%s, output_dir:%s, output_file_prefix:%s" % (tf_path, tcs_path, output_dir, output_file_prefix))

    run_mallet.run_classify(
        tf_path,
        tcs_path,
        output_dir,
        output_file_prefix,
        feature_file_path=None,
        num_infogain_features=num_infogain_features,
        feat_info=feat_info,
    )
# python2.7 act_run_mallet.py /home/j/anick/tw/roles/data/corpora sp ta NB woc 9999 1
if __name__ == "__main__":
    # Positional arguments:
    #   corpus_root corpus sections classifier_type context_type year [feat_info]
    args = sys.argv
    corpus_root = args[1]
    corpus = args[2]
    sections = args[3]
    classifier_type = args[4]
    context_type = args[5]
    year = args[6]

    # feat_info is the 7th argument. This previously read args[4] (the classifier
    # type), which made the flag True for every normal invocation.
    # "0" means False; any other value means True. When the argument is omitted
    # the flag falls back to False, matching run_mallet_classifier's own default.
    feat_info = args[7] if len(args) > 7 else "0"
    feat_info_p = feat_info != "0"

    run_mallet_classifier(corpus_root, corpus, sections, classifier_type, context_type, year, feat_info=feat_info_p)