-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLDA_text_model.py
70 lines (55 loc) · 2.12 KB
/
LDA_text_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Most of this code is adapted from https://github.com/priya-dwivedi/Deep-Learning
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import pandas as pd
import numpy as np
import nltk
class LDAtext:
    """Small wrapper that trains and queries a gensim LDA topic model on raw text.

    Typical use: call ``train(documents)`` once, then ``predict(text)`` for
    unseen documents or ``topics()`` to print the learned topics.
    """

    def __init__(self):
        self.stemmer = SnowballStemmer("english")
        # Both are populated by train(); None means "not trained yet".
        self.dictionary = None
        self.lda_model = None

    def _require_trained(self, caller):
        """Raise a descriptive error if ``train()`` has not completed yet.

        AttributeError is raised on purpose: that is the exception type the
        untrained object produced before this check existed, so callers that
        already catch it keep working.
        """
        # getattr with a default also covers instances built without __init__.
        if (getattr(self, "dictionary", None) is None
                or getattr(self, "lda_model", None) is None):
            raise AttributeError(
                "LDAtext model is not trained; call train() before "
                "{}()".format(caller))

    def text_preprocess_(self, documents):
        """Preprocess every document with ``preprocess``.

        documents: iterable of strings (one string per document).
        Returns a list of token lists, in the same order as ``documents``.
        """
        return [self.preprocess(doc) for doc in documents]

    def lemmatize_stemming(self, text):
        """Lemmatize ``text`` as a verb (WordNet), then stem it (Snowball).

        NOTE(review): ``preprocess`` does not currently call this helper, so
        the model is trained on unstemmed tokens - confirm this is intended.
        """
        return self.stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

    def preprocess(self, text):
        """Tokenize ``text`` and filter out uninformative tokens.

        Tokens come from ``gensim.utils.simple_preprocess``; a token is kept
        only if it is not in gensim's STOPWORDS and is longer than 3
        characters. No lemmatization or stemming is applied here.

        Returns the list of kept tokens.
        """
        stopwords = gensim.parsing.preprocessing.STOPWORDS
        return [
            token
            for token in gensim.utils.simple_preprocess(text)
            if token not in stopwords and len(token) > 3
        ]

    def train(self,
              documents,
              number_of_topics=8,
              number_of_passes=10,
              number_of_workers=2):
        """Fit the dictionary and the LDA model on ``documents``.

        documents: iterable of strings (one string per document).
        number_of_topics: passed to LdaMulticore as ``num_topics``.
        number_of_passes: passed to LdaMulticore as ``passes``.
        number_of_workers: passed to LdaMulticore as ``workers``.

        On success ``self.dictionary`` and ``self.lda_model`` are replaced.
        """
        processed_docs = self.text_preprocess_(documents)
        dictionary = gensim.corpora.Dictionary(processed_docs)
        bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
        lda_model = gensim.models.LdaMulticore(
            bow_corpus,
            num_topics=number_of_topics,
            id2word=dictionary,
            passes=number_of_passes,
            workers=number_of_workers)
        # Commit both together, and only after fitting succeeded, so a failed
        # (re)train never leaves a new dictionary paired with an old model.
        self.dictionary = dictionary
        self.lda_model = lda_model

    def predict(self, unseen_doc):
        """Return the trained model's output for one unseen document.

        unseen_doc: a single document as a string.
        Returns ``self.lda_model[bow_vector]`` (in gensim this is the
        document's topic distribution).
        Raises AttributeError if ``train()`` has not been called.
        """
        self._require_trained("predict")
        # Same preprocessing as in training, mapped through the trained
        # dictionary.
        bow_vector = self.dictionary.doc2bow(self.preprocess(unseen_doc))
        return self.lda_model[bow_vector]

    def topics(self):
        """Print each topic's index and its word/weight summary.

        For each topic, shows the words occurring in that topic and their
        relative weights, as formatted by ``lda_model.print_topics``.
        Raises AttributeError if ``train()`` has not been called.
        """
        self._require_trained("topics")
        for idx, topic in self.lda_model.print_topics(-1):
            print("Topic: {} \nWords: {}".format(idx, topic))
            print("\n")