-
-
Notifications
You must be signed in to change notification settings - Fork 50
/
vocabulary.py
34 lines (28 loc) · 1016 Bytes
/
vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# script to create vocabulary of given model
#
# @author: Andreas Mueller
# @see: Bachelor Thesis 'Analyse von Wort-Vektoren deutscher Textkorpora'
#
# Contributors:
# Michael Egger <michael.egger@tsn.at>
#
# @example: python vocabulary.py test.model test.model.vocab
import gensim
import argparse
# configuration: two positional arguments, the model to read and the file to write
parser = argparse.ArgumentParser(description='Script for computing vocabulary of given corpus')
parser.add_argument('model', type=str, help='source file with trained model')
parser.add_argument('target', type=str, help='target file name to store vocabulary in')
args = parser.parse_args()

# load model (read as binary word2vec format)
model = gensim.models.KeyedVectors.load_word2vec_format(args.model, binary=True)

# build vocab as (word, count) pairs
if hasattr(model, 'key_to_index'):
    # gensim >= 4.0: `model.vocab` was removed; per-word counts are read
    # through get_vecattr() instead
    vocab = [(word, model.get_vecattr(word, 'count')) for word in model.key_to_index]
else:
    # gensim < 4.0: `model.vocab` maps each word to an entry carrying `.count`
    vocab = [(word, entry.count) for word, entry in model.vocab.items()]

# save vocab, highest count first, one "<count> <word>" entry per line
# explicit UTF-8 so non-ASCII words do not depend on the platform's default
# encoding (NOTE: the `encoding` argument of open() requires Python 3)
with open(args.target, 'w', encoding='utf-8') as f:
    for word, count in sorted(vocab, key=lambda pair: pair[1], reverse=True):
        f.write('{} {}\n'.format(count, word))