forked from techknowledgist/act
-
Notifications
You must be signed in to change notification settings - Fork 0
/
act_pnames.py
80 lines (69 loc) · 3.39 KB
/
act_pnames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# act_pnames.py version adapted from pnames.py to handle new directory structure
# pnames.py
# functions to construct path names
import os
import roles_config
# create path to sections
def sections_root(corpus_root, corpus, sections):
    """Return the path to a corpus' sections directory.

    The three string components are joined with "/" exactly as given;
    no slash normalization is performed, so a trailing "/" on
    corpus_root yields a double slash in the result.
    """
    path_parts = (corpus_root, corpus, sections)
    return "/".join(path_parts)
def tv_filepath(corpus_root, corpus, sections, year, file_type, subset, cat_type=""):
    """Build the full path of a tv data file, print it, and return it.

    The result has the form
        <corpus_root>/<corpus>/<sections>/<year>[.<subset>][.<cat_type>].<file_type>

    Parameters:
        corpus_root -- non-empty root directory; a trailing "/" is added
                       if it is missing
        corpus      -- name of the corpus directory
        sections    -- name of the subdirectory below the corpus
        year        -- year prefix of the file name (converted with str())
        file_type   -- file name extension; an unrecognized value only
                       produces a warning on stdout
        subset      -- one of "", "a", "t", "c"
        cat_type    -- one of "", "pn", "act"

    Raises:
        ValueError -- if subset or cat_type is not one of the allowed values
    """
    known_file_types = (
        "diff", "tf", "cs", "cat", "cat_prob", "fc", "fc_kl", "fc_prob",
        "fc_uc", "tc", "tcs", "tfc", "feats", "terms", "ds", "filt.gold",
        "feats.1000", "unlab", "train", "tstart",
    )
    # note: besides the fixed names above, any file_type that starts with
    # "cat.w" is accepted without a warning
    if file_type not in known_file_types and not file_type.startswith("cat.w"):
        print("[tv_filepath]WARNING: unknown file type: %s" % file_type)

    # The original code had a bare `quit` here (never called), so invalid
    # values silently fell through.  Fail explicitly instead.
    if subset not in ("", "a", "t", "c"):
        # note: subset can be empty string
        message = "[tv_filepath]ERROR: unknown subset: %s" % subset
        print(message)
        raise ValueError(message)
    if cat_type not in ("", "pn", "act"):
        message = "[tv_filepath]ERROR: unknown cat_type: %s" % cat_type
        print(message)
        raise ValueError(message)

    tv_subpath = "/" + sections + "/"
    # make sure we don't create double slashes in the name
    if corpus_root[-1] != "/":
        corpus_root += "/"
    # non-empty qualifiers become dot-separated parts of the file name
    if cat_type != "":
        cat_type = "." + cat_type
    if subset != "":
        subset = "." + subset

    full_filename = corpus_root + corpus + tv_subpath + str(year) + subset + cat_type + "." + file_type
    # single-argument parenthesized print behaves the same on Python 2 and 3
    print("[tv_filepath]file: %s" % full_filename)
    return full_filename
# sections is ta or tas, depending on the sections included (title, abstract, summary)
def tf_dir(corpus_root, corpus, sections="tas"):
    """Return the term_features directory path for a corpus and print it.

    Parameters:
        corpus_root -- non-empty root directory; a trailing "/" is added
                       if it is missing
        corpus      -- name of the corpus directory
        sections    -- "ta" selects "/data/term_features_ta/"; any other
                       value (default "tas") selects "/data/term_features/"

    Returns the directory path as a string ending in "/".
    """
    if sections == "ta":
        tf_subpath = "/data/term_features_ta/"
    else:
        tf_subpath = "/data/term_features/"
    # make sure we don't create double slashes in the name
    if corpus_root[-1] != "/":
        corpus_root += "/"
    # The original concatenated the undefined name tv_subpath here, which
    # raised NameError on every call; tf_subpath is the intended value.
    full_filename = corpus_root + corpus + tf_subpath
    # single-argument parenthesized print behaves the same on Python 2 and 3
    print("[tf_filepath]file: %s" % full_filename)
    return full_filename
def tv_dir(corpus_root, corpus, sections):
    """Return the directory <corpus_root>/<corpus>/<sections>/ and print it.

    Parameters:
        corpus_root -- non-empty root directory; a trailing "/" is added
                       if it is missing
        corpus      -- name of the corpus directory
        sections    -- name of the subdirectory below the corpus

    Returns the directory path as a string ending in "/".
    """
    tv_subpath = "/" + sections + "/"
    # make sure we don't create double slashes in the name
    if corpus_root[-1] != "/":
        corpus_root += "/"
    full_filename = corpus_root + corpus + tv_subpath
    # The Python 2 print statement is a SyntaxError on Python 3; the
    # single-argument parenthesized form prints the same text on both.
    print("[tv_filepath]file: %s" % full_filename)
    return full_filename
# creates full tv filepath, including year prefix and qualifier
def tv_dir_year_file(corpus_root, corpus, sections, year, qualifier):
    """Return the full tv file path "<tv_dir><year>.<qualifier>".

    The directory part comes from tv_dir(corpus_root, corpus, sections);
    year is converted with str() and joined to qualifier by a dot.
    """
    year_prefix = str(year)
    directory = tv_dir(corpus_root, corpus, sections)
    return "".join([directory, year_prefix, ".", qualifier])
# full name of file containing the list of files for a given patent corpus and application year
# e.g. /home/j/corpuswork/fuse/FUSEData/corpora/ln-us-A21-computers/subcorpora/1997/config/files.txt
def fuse_filelist(fuse_corpus_root, corpus, year):
    """Return the path of the files.txt list for a corpus and year.

    The path is
    <fuse_corpus_root>/<corpus>/subcorpora/<year>/config/files.txt,
    assembled with os.path.join; year is converted with str().
    """
    components = [
        fuse_corpus_root,
        corpus,
        "subcorpora",
        str(year),
        "config",
        "files.txt",
    ]
    return os.path.join(*components)
# e.g., /home/j/corpuswork/fuse/FUSEData/corpora/ln-us-A21-computers/subcorpora/1997/data/d3_phr_feats/01/files/1998
def fuse_phr_feats_file(fuse_corpus_root, corpus, pipeline_step, year, filename):
    """Return the path of one file produced by a pipeline step.

    The path is
    <fuse_corpus_root>/<corpus>/subcorpora/<year>/data/<pipeline_step>/01/files/<filename>,
    assembled with os.path.join; year is converted with str().
    """
    subcorpus_dir = os.path.join(fuse_corpus_root, corpus, "subcorpora", str(year))
    return os.path.join(subcorpus_dir, "data", pipeline_step, "01", "files", filename)