This repository has been archived by the owner on Jan 21, 2024. It is now read-only.
forked from OpenNMT/OpenNMT-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
corenlp_tokenize.py
66 lines (52 loc) · 2.14 KB
/
corenlp_tokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os, sys
import time
sys.path.append(os.getcwd())
from stanza.nlp.corenlp import CoreNLPClient
import json
from tqdm import tqdm
from collections import defaultdict
import re
import gzip
import argparse
# Command-line interface of the CoreNLP tokenization script.
#
# Only -input-fn and -output-fn are consumed by this script. Every other
# option is accepted but never read; they are kept so that existing command
# lines which pass them keep working.
parser = argparse.ArgumentParser(description='corenlp_tokenize.py')

##
## **Tokenization Options**
##

parser.add_argument('-config', help="Read options from this file")

# argparse derives the attribute names from the flags by dropping the leading
# dash and turning the inner dash into an underscore: opt.input_fn and
# opt.output_fn.
parser.add_argument('-input-fn', required=True,
                    help="Path to the input english data")
parser.add_argument('-output-fn', required=True,
                    help="Path to the output english data")

parser.add_argument('-src_vocab_size', type=int, default=50000,
                    help="Size of the source vocabulary")
parser.add_argument('-tgt_vocab_size', type=int, default=50000,
                    help="Size of the target vocabulary")
parser.add_argument('-src_vocab',
                    help="Path to an existing source vocabulary")
parser.add_argument('-tgt_vocab',
                    help="Path to an existing target vocabulary")

parser.add_argument('-seq_length', type=int, default=50,
                    help="Maximum sequence length")
parser.add_argument('-shuffle', type=int, default=1,
                    help="Shuffle data")
parser.add_argument('-seed', type=int, default=3435,
                    help="Random seed")

parser.add_argument('-lower', action='store_true', help='lowercase data')

parser.add_argument('-report_every', type=int, default=100000,
                    help="Report status every this many sentences")

opt = parser.parse_args()

# Single client shared by every annotation request, configured with the
# 'tokenize' and 'ssplit' annotators as its defaults.
corenlp = CoreNLPClient(default_annotators=['tokenize', 'ssplit'])
def annotate_sentence(corenlp, gloss, retry_delay=10):
    """Annotate *gloss* and return all of its tokens joined by single spaces.

    Args:
        corenlp: Client object exposing ``annotate(text)``. The returned
            object must provide a ``json`` mapping with a ``'sentence'``
            list whose items each hold a ``'token'`` list of dicts with a
            ``'word'`` entry.
        gloss: Text to annotate.
        retry_delay: Seconds to wait before the single retry that follows a
            failed request. Defaults to 10.

    Returns:
        A string with the ``'word'`` of every token of every sentence, in
        order, separated by single spaces.

    Raises:
        Exception: Whatever the second ``annotate`` attempt raises; there is
            only one retry.
        KeyError: If the response lacks the ``'sentence'``, ``'token'`` or
            ``'word'`` entries.
    """
    try:
        parse = corenlp.annotate(gloss)
    except Exception:
        # Retry once after a pause. Catching Exception (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate immediately instead
        # of being answered with a sleep and a second request.
        time.sleep(retry_delay)
        parse = corenlp.annotate(gloss)

    # Flatten the tokens of all sentences into one space-separated string.
    return ' '.join(
        token['word']
        for sentence in parse.json['sentence']
        for token in sentence['token']
    )
# Tokenize the input file line by line and write one space-joined token
# sequence per input line to the output file.
with open(opt.input_fn, 'r') as f, open(opt.output_fn, 'w') as f2:
    # Iterate the file object directly so lines are streamed; readlines()
    # would hold the entire corpus in memory before the first request.
    for sent in f:
        f2.write(annotate_sentence(corenlp, sent))
        f2.write('\n')