-
Notifications
You must be signed in to change notification settings - Fork 3
/
plain2snt.cpp
115 lines (106 loc) · 3.47 KB
/
plain2snt.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <strstream>
#include <vector>
using namespace std;
namespace {

// Word <-> id table for one side of the corpus.
// Ids 0 and 1 are never handed out to words (the tables start with two
// placeholder slots), so the first word seen gets id 2.
// NOTE(review): presumably 0/1 are reserved by the GIZA++ format - confirm.
struct Vocabulary {
    std::map<std::string, int> ids;   // word -> id
    std::vector<std::string> words;   // id -> word
    std::vector<int> counts;          // id -> number of occurrences seen
    Vocabulary() : words(2), counts(2, 0) {}
};

// True when `name` ends in `ext` and has at least one character before it.
bool hasExtension(const std::string& name, const std::string& ext)
{
    return name.length() > ext.length() &&
           name.compare(name.length() - ext.length(), ext.length(), ext) == 0;
}

// Returns the part of `path` after the last '/'. A path without '/' or one
// that ends in '/' is returned unchanged.
std::string baseName(const std::string& path)
{
    const std::string::size_type slash = path.rfind('/');
    if (slash == std::string::npos || slash + 1 >= path.length())
        return path;
    return path.substr(slash + 1);
}

// Splits `line` on whitespace, counts every token in `vocab` (assigning the
// next free id to words not seen before) and returns the token ids in order.
std::vector<int> encodeLine(const std::string& line, Vocabulary& vocab)
{
    std::vector<int> encoded;
    std::istringstream in(line);
    std::string word;
    while (in >> word) {
        std::map<std::string, int>::iterator pos = vocab.ids.find(word);
        if (pos == vocab.ids.end()) {
            const int id = static_cast<int>(vocab.words.size());
            pos = vocab.ids.insert(std::make_pair(word, id)).first;
            vocab.words.push_back(word);
            vocab.counts.push_back(0);
        }
        ++vocab.counts[pos->second];
        encoded.push_back(pos->second);
    }
    return encoded;
}

// Writes the ids on one line, each followed by a single space.
void writeIds(std::ostream& out, const std::vector<int>& ids)
{
    for (std::vector<int>::size_type j = 0; j < ids.size(); ++j)
        out << ids[j] << ' ';
    out << '\n';
}

// Writes one "id word count" line per vocabulary entry, starting at id 2.
void writeVocabulary(std::ostream& out, const Vocabulary& vocab)
{
    for (std::vector<std::string>::size_type i = 2; i < vocab.words.size(); ++i)
        out << i << ' ' << vocab.words[i] << ' ' << vocab.counts[i] << '\n';
}

// Reports an output file that could not be opened; returns true if it is usable.
bool checkWritable(const std::ofstream& out, const std::string& name)
{
    if (!out) {
        std::cerr << "ERROR: " << name << " cannot be written.\n";
        return false;
    }
    return true;
}

} // namespace

// Converts pairs of line-aligned plain-text files into GIZA++ snt-format.
//
// Usage: plain2snt txt1 txt2 [txt3 txt4 -weight w]
//
// File names are taken pairwise (source, target). The k-th "-weight" value
// applies to the k-th pair; pairs without a weight use 1.0. Output names are
// derived from the FIRST pair only (a common ".tok" or ".txt" extension is
// stripped first):
//   <src>.vcb, <tgt>.vcb                  - "id word count" per word
//   <src>_<tgtbase>.snt, <tgt>_<srcbase>.snt - weight / ids / ids per sentence
// Sentence pairs in which either side has no tokens are skipped with a
// warning (their tokens are still counted in the vocabularies).
//
// Returns 1 on a usage error or when output files cannot be written.
int main(int argc, char** argv)
{
    std::vector<double> weights;
    std::vector<std::string> filenames;
    for (int i = 1; i < argc; ++i) {
        if (std::string(argv[i]) == "-weight") {
            // A trailing "-weight" used to read argv[argc] (a null pointer).
            if (i + 1 >= argc) {
                std::cerr << argv[0] << ": -weight requires a value.\n";
                return 1;
            }
            weights.push_back(atof(argv[++i]));
        } else {
            filenames.push_back(argv[i]);
        }
    }
    if (filenames.empty() || filenames.size() % 2 == 1) {
        std::cerr << argv[0] << " txt1 txt2 [txt3 txt4 -weight w]\n";
        std::cerr << " Converts plain text into GIZA++ snt-format.\n";
        return 1;
    }

    // Derive the output stems from the first file pair.
    std::string stem1(filenames[0]);
    std::string stem2(filenames[1]);
    if ((hasExtension(stem1, ".tok") && hasExtension(stem2, ".tok")) ||
        (hasExtension(stem1, ".txt") && hasExtension(stem2, ".txt"))) {
        stem1.erase(stem1.length() - 4);
        stem2.erase(stem2.length() - 4);
        std::cerr << "w1:" << stem1 << " w2:" << stem2 << std::endl;
    }
    const std::string base1 = baseName(stem1);
    std::cout << stem1 << " -> " << base1 << std::endl;
    const std::string base2 = baseName(stem2);
    std::cout << stem2 << " -> " << base2 << std::endl;

    const std::string sntName1 = stem1 + "_" + base2 + ".snt";
    const std::string sntName2 = stem2 + "_" + base1 + ".snt";
    const std::string vcbName1 = stem1 + ".vcb";
    const std::string vcbName2 = stem2 + ".vcb";

    std::ofstream ovocab1(vcbName1.c_str());
    std::ofstream ovocab2(vcbName2.c_str());
    std::ofstream osnt1(sntName1.c_str());
    std::ofstream osnt2(sntName2.c_str());
    bool outputsOk = checkWritable(ovocab1, vcbName1);
    outputsOk = checkWritable(ovocab2, vcbName2) && outputsOk;
    outputsOk = checkWritable(osnt1, sntName1) && outputsOk;
    outputsOk = checkWritable(osnt2, sntName2) && outputsOk;
    if (!outputsOk)
        return 1;

    Vocabulary vocabulary1, vocabulary2;
    std::string line1, line2;
    for (std::vector<std::string>::size_type i = 0; i < filenames.size(); i += 2) {
        const std::string& name1 = filenames[i];
        const std::string& name2 = filenames[i + 1];
        std::ifstream i1(name1.c_str()), i2(name2.c_str());
        const bool bothReadable = !i1.fail() && !i2.fail();
        if (!i1) std::cerr << "WARNING: " << name1 << " cannot be read.\n";
        if (!i2) std::cerr << "WARNING: " << name2 << " cannot be read.\n";

        // The weight is per file pair, so look it up once.
        const double w = (i / 2 < weights.size()) ? weights[i / 2] : 1.0;

        for (;;) {
            // Read from both files so a length mismatch can be detected.
            const bool got1 = !std::getline(i1, line1).fail();
            const bool got2 = !std::getline(i2, line2).fail();
            if (!got1 || !got2) {
                if (bothReadable && got1 != got2)
                    std::cerr << "WARNING: " << name1 << " and " << name2
                              << " differ in number of lines; extra lines in "
                              << (got1 ? name1 : name2) << " are ignored.\n";
                break;
            }

            const std::vector<int> t1 = encodeLine(line1, vocabulary1);
            const std::vector<int> t2 = encodeLine(line2, vocabulary2);
            if (!t1.empty() && !t2.empty()) {
                osnt1 << w << "\n";
                writeIds(osnt1, t1);
                writeIds(osnt1, t2);
                osnt2 << w << "\n";
                writeIds(osnt2, t2);
                writeIds(osnt2, t1);
            } else {
                std::cerr << "WARNING: filtered out empty sentence (source: " << name1 << " " << t1.size()
                          << " target: " << name2 << " " << t2.size() << ").\n";
            }
        }
    }

    writeVocabulary(ovocab1, vocabulary1);
    writeVocabulary(ovocab2, vocabulary2);

    // Surface write failures (e.g. disk full) instead of exiting successfully.
    ovocab1.flush();
    ovocab2.flush();
    osnt1.flush();
    osnt2.flush();
    if (!ovocab1 || !ovocab2 || !osnt1 || !osnt2) {
        std::cerr << "ERROR: writing output files failed.\n";
        return 1;
    }
    return 0;
}