-
Notifications
You must be signed in to change notification settings - Fork 0
/
1-data-cleaning.py
executable file
·74 lines (56 loc) · 2.19 KB
/
1-data-cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 16 23:08:25 2020
@author: nesko
"""
from inc import functions as f
from inc import stop_words as sw
from inc import extra_stopwords as esw
import pandas as pd
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Show long goal descriptions in full when printing DataFrames.
# FIX: the bare 'max_colwidth' key was deprecated in pandas 1.0 and removed
# in later releases (it raises OptionError); the fully-qualified name
# 'display.max_colwidth' works on both old and new pandas.
pd.set_option('display.max_colwidth', 150)

# Load the Goals table from the local SQLite database and key rows by goal name.
table_name = 'Goals'
engine = create_engine('sqlite:///db/goals.sqlite', echo=False)
data_df = pd.read_sql_table(table_name, engine)
data_df.set_index('Name', inplace=True, drop=True)
# Normalise the raw descriptions: lemmatise, then two rounds of text cleaning
# (the exact cleaning rules live in inc/functions.py).
data_clean = pd.DataFrame(data_df.Description.apply(f.lemmatizeText))
data_clean = pd.DataFrame(data_clean.Description.apply(f.clean_text_round1))
data_clean = pd.DataFrame(data_clean.Description.apply(f.clean_text_round2))

# Merge the project's extra stop-word sets into the base set.
sw.STOP_WORDS = sw.STOP_WORDS.union(esw.common_stopwords)
sw.STOP_WORDS = sw.STOP_WORDS.union(esw.extra_stopwords)

# cv = CountVectorizer(stop_words=sw.STOP_WORDS, ngram_range=(1, 2), max_features=1000)
cv = CountVectorizer(stop_words=sw.STOP_WORDS, max_features=1000)
data_cv = cv.fit_transform(data_clean.Description)

# Document-term matrix: one row per term, one column per goal.
# FIX: CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# prefer get_feature_names_out() and fall back for very old versions.
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    feature_names = cv.get_feature_names()
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index
data_dtm = data_dtm.transpose()
# Collect the 30 highest-count words for each goal as (word, count) pairs.
top_dict = {}
for goal in data_dtm.columns:
    top = data_dtm[goal].sort_values(ascending=False).head(30)
    top_dict[goal] = list(zip(top.index, top.values))

# Print the top 15 words within each goal.
# FIX: the original slice [0:14] only printed 14 words despite the
# "top 15" intent — an off-by-one in the slice end.
for name, top_words in top_dict.items():
    print()
    print(', '.join(word for word, count in top_words[:15]))
    print('---')
# One shared word-cloud renderer; the fixed random_state keeps the cloud
# layouts reproducible between runs.
wc = WordCloud(
    stopwords=sw.STOP_WORDS,
    background_color="white",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,
)
plt.rcParams['figure.figsize'] = [30, 12]
names = data_clean.index.values.tolist()

# Draw one subplot per Global Goal on a 5x4 grid.
for pos, goal in enumerate(data_dtm.columns, start=1):
    wc.generate(data_clean.Description[goal])
    plt.subplot(5, 4, pos)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(names[pos - 1])
plt.show()