# Wordcloud.py
# Builds and displays a word cloud from the text column of
# 'train_preprocess.tsv.txt'.
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Build and display a word cloud from the 'Text' column of the training set.
import nltk

# Fetch the tokenizer models that word_tokenize depends on (this is NOT the
# stopwords corpus). Recent NLTK releases look up 'punkt_tab' rather than
# 'punkt', so both are requested here.
# NOTE(review): nltk.download is expected to report a failed/unknown package
# and return False rather than raise — confirm for the installed version.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

# Load the data, naming the columns at read time.
# With the default header inference, read_csv would consume the first line of
# the file as column labels, and overwriting df.columns afterwards would
# silently drop that first sample.
# NOTE(review): assumes the TSV has no header row, as the original column
# overwrite suggests — confirm against the data file.
df = pd.read_csv(
    'train_preprocess.tsv.txt',
    sep='\t',
    header=None,
    names=['Text', 'Sentiment'],
)

# Combine all the text into a single string. Missing cells are dropped and
# everything is coerced to str so that str.join cannot fail on NaN/numbers.
combined_text = ' '.join(df['Text'].dropna().astype(str))

# Tokenize the combined text
words = word_tokenize(combined_text)

# Custom stopword list for Indonesian; extend as needed. The "..." entry also
# filters out literal ellipsis tokens.
custom_stop_words = {"dan", "di", "sini", "..."}

# Drop stopwords (case-insensitively) and rebuild a single string
filtered_words = [word for word in words if word.lower() not in custom_stop_words]
filtered_text = ' '.join(filtered_words)

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(filtered_text)

# Plot the word cloud without axes
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()