# THIS MAKEFILE IS MEANT FOR LINUX ONLY. HOWEVER, IT CAN BE ADAPTED AS NEEDED FOR USE ON OTHER OPERATING SYSTEMS.
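# The targets below are intended to be run in the order in which they appear, e.g.:
#   make download udpipe_process get_nouns cleanup sample_data bootstrap morphology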
# LIST OF VARIABLES
# Change this to the location of the folder containing the UDPipe models.
UDPIPE_Models := $(HOME)/udpipe-ud-2.0-170801
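# The path can also be overridden per invocation without editing this file, e.g.:
#   make udpipe_process UDPIPE_Models=/path/to/your/udpipe/models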
# Download the corpus as parallel.tgz; after the download, extract the Hindi side into the current directory, keeping its original name.
download:
	wget http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz --user-agent "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.22 (KHTML, like Gecko) Ubuntu Chromium/25.0.1364.160 Chrome/25.0.1364.160 Safari/537.22"
	tar -xf parallel.tgz
	mv parallel/IITB.en-hi.hi ./
	rm -r parallel
	rm parallel.tgz
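# A quick sanity check on the extracted file (the Hindi side is plain text and is expected to
# contain one sentence per line, which is what --tokenizer=presegmented below relies on):
#   wc -l IITB.en-hi.hi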
# The current folder now contains the downloaded corpus. Edit the variable $(UDPIPE_Models) above to point to the folder holding UDPipe's saved models.
# This step will most likely take a few hours. Please be patient.
# Output file: hi.conllu
udpipe_process:
	udpipe --tokenizer=presegmented --tag --parse --immediate $(UDPIPE_Models)/hindi-ud-* IITB.en-hi.hi > hi.conllu
# After the step above, we have the analysed corpus in CoNLL-U format (hi.conllu).
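# A minimal sketch of a CoNLL-U token line in hi.conllu (tab-separated columns: ID, FORM,
# LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC); the values here are only illustrative:
#   3	किताब	किताब	NOUN	NN	Case=Nom|Gender=Fem|Number=Sing	2	obj	_	_
# The NOUN extraction below presumably keys on the UPOS column (column 4); to count NOUN
# tokens directly with standard tools:
#   awk -F'\t' '$4 == "NOUN"' hi.conllu | wc -l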
# This will extract the entries of the CoNLL-U file tagged NOUN into a file called hi.NOUN, separated by their line IDs.
# After getting the output, we sort the unique words found and store them back in the same file.
get_nouns:
# The --count switch at the end of the extract_dict.py command below is optional; it produces a case-insensitive list of the most common nouns, followed by their frequencies, stored in the langcode.most_common file.
# Feel free to remove the `--count` switch if you don't want those statistics. However, it is HIGHLY recommended that you keep it.
# The two files generated here, .most_common (from --count) and .NOUN (from -pos), differ in length, since the latter also includes some garbage values.
# By garbage values, we mean tokens that are not written in the Devanagari alphabet or are otherwise unusable. So we delete the .NOUN file,
# since we won't be needing it for the remaining steps.
# Output files: hi.most_common; hi.output
	python3 extract_dict.py -i hi.conllu -pos NOUN --count
	rm hi.NOUN
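# For reference, a rough shell approximation of the frequency list that --count produces
# (the exact output format of extract_dict.py may differ):
#   awk -F'\t' '$4 == "NOUN" {print $2}' hi.conllu | sort | uniq -c | sort -rn | head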
# Since we won't be working with a few of these files directly anymore, let's tidy up the directory a bit
cleanup:
	mkdir corpus
	mv IITB.en-hi.hi corpus/
	mv hi.conllu corpus/
# Having cleaned up the directory, we remove from consideration noun tokens that
# 1. have unclear gender usage, i.e. a single gender cannot be pinned down for some of their uses, or
# 2. can be used with either gender.
# The curated list of such tokens is kept in a file called remove_list.
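# remove_list is assumed to be a plain word list, one token per line; the -fl -l switches of
# filter_data.py (used in the bootstrap and morphology targets below) drop every token that
# appears in the given list.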
sample_data:
# Download UD treebanks
	curl -L https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2837/ud-treebanks-v2.2.tgz -o $(HOME)/ud-treebanks-v2.2.tgz
	tar -xf $(HOME)/ud-treebanks-v2.2.tgz -C $(HOME)/
	rm $(HOME)/ud-treebanks-v2.2.tgz
# Transfer the treebanks to another folder for ease of use, and process the CoNLL-U files contained within.
	mkdir $(HOME)/ud-treebanks-v2x2
	mv $(HOME)/ud-treebanks-v2.2/* $(HOME)/ud-treebanks-v2x2/
	python3 extract_dict.py -i $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu -pos NOUN --count
	python3 extract_dict.py -i $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu -pos NOUN --count
	python3 extract_dict.py -i $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu -pos NOUN --count
	python3 extract_dict.py -i $(HOME)/ud-treebanks-v2x2/UD_Hindi-PUD/hi_pud-ud-test.conllu -pos NOUN --count
# Create a directory and then move the results to this directory
	mkdir sample
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-dev.output sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-dev.NOUN sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-dev.most_common sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-test.output sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-test.NOUN sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-test.most_common sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-train.output sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-train.NOUN sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-HDTB/hi_hdtb-ud-train.most_common sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-PUD/hi_pud-ud-test.output sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-PUD/hi_pud-ud-test.NOUN sample/
	mv $(HOME)/ud-treebanks-v2x2/UD_Hindi-PUD/hi_pud-ud-test.most_common sample/
	mv $(HOME)/ud-treebanks-v2x2/* $(HOME)/ud-treebanks-v2.2/
	rm -r $(HOME)/ud-treebanks-v2x2
	rm sample/hi*.NOUN
	mv process_sample.py sample/
# Combine the multiple .most_common files, and delete the redundant data.
	python3 filter_data.py --combine sample/hi_*.most_common
	rm sample/hi*.most_common
# Check the translations and get Table 1 as in the source paper.
# Test how many of the true_gender entries are used in different contexts, and write the result to an intermediate file.
# Output file: sample/test
	python3 sample/process_sample.py translation true_gender_hin.list sample/*.output > sample/test
# Process this intermediate file to generate the symbols explained in the comments of process_sample.py, producing output similar to Table 1 in the reference paper.
# Output file: sample/truth_data_analysis
	python3 sample/process_sample.py translation_check sample/test true_gender_eng_to_hin.tsv
# Sort the output file, and get rid of the intermediate file
	sort -u sample/truth_data_analysis > sample/default_seeds_table.tsv
	rm sample/test
	rm sample/truth_data_analysis
# With all the data ready, we start with context bootstrapping on the sample data to update the seeds,
# and then move on to morphological analysis using a trie model.
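# (Roughly, the bootstrapping uses the contexts in which seed nouns of known gender occur to
# guess the gender of yet-unlabelled nouns, and feeds confident guesses back into seeds.LIST;
# see the --bootstrap code path of filter_data.py for the exact procedure.)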
bootstrap:
# get the list of seeds.
# Output file: seeds.LIST
	python3 filter_data.py -i true_gender_eng_to_hin.tsv -ds
# remove seeds, ambiguous content, and low-frequency items from sample/sample.most_common
# output file generated: sample/sample.most_common.list_filtered
	python3 filter_data.py -i sample/sample.most_common -fl -l remove_list
	mv sample/sample.most_common.list_filtered sample/sample.removed
# output file generated: sample/sample.removed.list_filtered > context_bootstrap
	python3 filter_data.py -i sample/sample.removed -fl -l seeds.LIST
	mv sample/sample.removed.list_filtered sample/context_bootstrap
	rm sample/sample.removed
# continue with the bootstrapping, and update seeds.LIST
	python3 filter_data.py --bootstrap -i sample/context_bootstrap --conllu sample/hi_*.output
# remove all seed values from the non-ambiguous file, and use it for morphological tagging
	python3 filter_data.py -i sample/context_bootstrap -fl -l seeds.LIST
	mv sample/context_bootstrap.list_filtered sample/morpho
# With the contextual bootstrapping completed, we move on to morphological analysis using a trie model,
# and finally we report the coverage and the final accuracy of the trie-based approach.
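# (The trie model is presumably suffix-driven: word endings seen on nouns of known gender are
# stored in a trie, and each remaining noun is assigned the gender predicted by its longest
# matching ending; see the --morphology code path of filter_data.py for the actual model.)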
morphology:
# morphologically tag all the values in the sample data
# output files: morpho.output, morpho.not_analysed
	python3 filter_data.py --morphology -i sample/morpho --conllu sample/*.output > sample/morpho.not_analysed
	cat sample/morpho.output >> seeds.LIST
# Now let's process the corpus from which we initially extracted nouns.
# remove all the tokens that are present in remove_list
# Output file: hi.most_common.list_filtered > hi.non-ambig1
	python3 filter_data.py -fl -l remove_list -i hi.most_common
	mv hi.most_common.list_filtered hi.non-ambig1
# remove all the tokens that are present in sample/morpho.not_analysed
# Output file: hi.non-ambig1.list_filtered > hi.non-ambig
	python3 filter_data.py -fl -l sample/morpho.not_analysed -i hi.non-ambig1
	mv hi.non-ambig1.list_filtered hi.non-ambig
	rm hi.non-ambig1
# remove all the tokens present in seeds.LIST
# Output file: hi.non-ambig.list_filtered > hi.morph
	python3 filter_data.py -fl -l seeds.LIST -i hi.non-ambig
	mv hi.non-ambig.list_filtered hi.morph
	rm hi.non-ambig
# Now use the trie model on the hi.morph file to get the final output.
# Output file: hi.morph.output
	python3 filter_data.py --morphology -i hi.morph > hi.morph.not_analysed
# Finally, get stats to check how the system performed on the corpus
	python3 filter_data.py --get_accuracy -i hi.morph.output --conllu hi.output