# -*- coding: utf-8 -*-
"""
Topic modeling for the 18th German Bundestag debates. Generates a model with the best parameters found during model
selection in tm_eval.py.

Pass as an argument which data should be loaded, i.e. with which preprocessing pipeline it was generated:

1 -> merged speeches, default pipeline
2 -> merged speeches, salutatory addresses removed, default pipeline

Markus Konrad <markus.konrad@wzb.eu>
"""
from __future__ import division
import sys
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
from lda import LDA
from tmtoolkit.topicmod.model_io import print_ldamodel_doc_topics, print_ldamodel_topic_words, \
save_ldamodel_summary_to_excel
from tmtoolkit.utils import unpickle_file, pickle_data
#%% input args
if len(sys.argv) != 2:
    print('run script as: %s <tokens preprocessing pipeline>' % sys.argv[0])
    print('<tokens preprocessing pipeline> must be 1 or 2')
    sys.exit(1)

toks = int(sys.argv[1])
assert toks in (1, 2)
#%% model hyperparameters
if toks == 1:
    K = 130
    alpha_mod = 10.0
    beta = 0.1
elif toks == 2:
    K = 130
    alpha_mod = 10.0
    beta = 0.1
else:
    print('<tokens preprocessing pipeline> must be 1 or 2')
    sys.exit(2)
LDA_PARAMS = dict(
    n_topics=K,
    alpha=alpha_mod/K,
    eta=beta,
    n_iter=2000
)
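
# alpha is scaled by the number of topics: a symmetric document-topic Dirichlet
# prior of alpha_mod/K = 10.0/130 ≈ 0.077 per topic; eta (the "beta" above) is
# the symmetric topic-word prior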
# other parameters
BURNIN = 5   # with the default refresh=10 this means 50 burn-in iterations
# paths to data files
DATA_PICKLE_DTM = 'data/speeches_tokens_%d.pickle' % toks
LDA_MODEL_PICKLE = 'data/model%d.pickle' % toks
LDA_MODEL_LL_PLOT = 'data/model%d_logliks.png' % toks
LDA_MODEL_EXCEL_OUTPUT = 'data/model%d_results.xlsx' % toks
#%% load
print('using input tokens from preprocessing pipeline %d' % toks)
print('loading DTM from `%s`...' % DATA_PICKLE_DTM)
doc_labels, vocab, dtm, tokens = unpickle_file(DATA_PICKLE_DTM)
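# dtm is the document-term matrix of shape (n_docs, n_vocab); doc_labels and
# vocab must match its dimensions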
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens' % (len(doc_labels), len(vocab), dtm.sum()))
#%% compute model
print('generating model with parameters:')
pprint(LDA_PARAMS)
model = LDA(**LDA_PARAMS)
model.fit(dtm)
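
# note: the lda package fits the model via collapsed Gibbs sampling and records
# the log-likelihood every `refresh` iterations (default: 10) in model.loglikelihoods_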
#%% output
print('saving model to `%s`' % LDA_MODEL_PICKLE)
pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)
print('saving results to `%s`' % LDA_MODEL_EXCEL_OUTPUT)
save_ldamodel_summary_to_excel(LDA_MODEL_EXCEL_OUTPUT, model.topic_word_, model.doc_topic_, doc_labels, vocab, dtm=dtm)
#%%
print('displaying loglikelihoods...')
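# each recorded log-likelihood value is refresh=10 iterations apart, hence the
# recording indices are scaled by 10 for the x-axis; the first BURNIN recordings
# are discarded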
plt.plot(np.arange(BURNIN, len(model.loglikelihoods_)) * 10, model.loglikelihoods_[BURNIN:])
plt.xlabel('iterations')
plt.ylabel('log likelihood')
plt.savefig(LDA_MODEL_LL_PLOT)
plt.show()
#%%
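# print model summaries: the highest-probability words per topic and the
# highest-probability topics per document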
print('topic-word distribution:')
print('-----')
print_ldamodel_topic_words(model.topic_word_, vocab)
print('-----')
print('document-topic distribution:')
print('-----')
print_ldamodel_doc_topics(model.doc_topic_, doc_labels)
print('-----')