-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_dict.sh.old
executable file
·32 lines (31 loc) · 1.45 KB
/
build_dict.sh.old
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/bin/bash
#
# Build the Catalan tagger and synthesis dictionaries from diccionari.txt.
#
# Run from the directory that contains diccionari.txt, sptotabs.pl, tags.awk,
# synthesis.awk and morfologik.jar; every path below is relative to the
# current working directory.
#
# Files written:
#   diccionari.txt        re-written in place, sorted bytewise
#   catalan_tags.txt      unique output lines of tags.awk, with CRLF line ends
#   catalan.dict          tagger dictionary (morfologik fsa_build output)
#   catalan_lt.txt        dump of catalan.dict
#   catalan_synth.dict    synthesis dictionary (morfologik fsa_build output)
#   catalan_synth_lt.txt  dump of catalan_synth.dict
#
# The script stops at the first failing step. Intermediate files are only
# removed after a fully successful run, so they remain available for
# inspection when something goes wrong.

# Abort on any failing command, unset variable, or failing pipeline stage, so a
# broken step can never feed partial data into the next one.
set -euo pipefail

# Bytewise collation for every sort below. Exported once here; it also applies
# to the perl, gawk and java invocations, exactly as the original first export
# did.
export LC_ALL=C

# Sort the source dictionary in place. The original file is only replaced
# after sort has succeeded, so a failed sort cannot destroy it.
sort diccionari.txt >diccionari_sorted.txt
#cp diccionari_sorted.txt ./backups/diccionari_$(date +%Y%m%d-%H%M%S).txt
mv -f -- diccionari_sorted.txt diccionari.txt

# Replace whitespace with tabs.
perl sptotabs.pl <diccionari.txt >diccionari_tabs.txt

# Create the list of used tags.
gawk -f tags.awk diccionari_tabs.txt | sort -u >catalan_tags.txt

# Create the tagger dictionary with the morfologik tools.
java -jar morfologik.jar tab2morph -i diccionari_tabs.txt -o diccionari_morph.txt -annotation _
sort diccionari_morph.txt \
  | java -jar morfologik.jar fsa_build --sorted -f cfsa2 -o catalan.dict

# Dump the tagger dictionary.
java -jar morfologik.jar fsa_dump -d catalan.dict -x >catalan_lt.txt

# Reorder columns for the synthesis dictionary.
gawk -f synthesis.awk diccionari_tabs.txt >output2.txt

# Create the synthesis dictionary with the morfologik tools.
java -jar morfologik.jar tab2morph -nw -i output2.txt -o encoded.txt
#perl morph_data_ca.pl <output2.txt >encoded.txt
sort encoded.txt \
  | java -jar morfologik.jar fsa_build --sorted -f cfsa2 -o catalan_synth.dict

# Dump the synthesis dictionary.
java -jar morfologik.jar fsa_dump -d catalan_synth.dict -x >catalan_synth_lt.txt

# Convert catalan_tags.txt to DOS line endings by appending a carriage return
# to every line. The CR is passed to sed as a literal character so the result
# does not depend on echo's or sed's handling of the "\r" escape.
cr=$'\r'
sed "s/\$/${cr}/" catalan_tags.txt >catalan_tags_dos.txt
mv -f -- catalan_tags_dos.txt catalan_tags.txt

# Remove intermediate files (reached only when every step above succeeded).
rm -f -- diccionari_tabs.txt diccionari_morph.txt output2.txt encoded.txt