# sentiment.py
import streamlit as st
import pandas as pd
import plotly.express as px
import random
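
# Load the comment dataset for the selected story, plus the character names and the
# default multiselect picks for that story. persist = True keeps the cache on disk.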
@st.cache(persist = True, suppress_st_warning = True)
def load_data(my_story):
    if my_story == 'Crown of Glass' or not my_story:
        my_data = pd.read_csv("Data/cog.csv")
        my_names = ['Edwina', 'Amphitrite', 'Eric', 'Tristan', 'Drusilla', 'Aidon', 'Celestina', 'Deimos', 'Cosmo', 'Emerick', 'Thanatos', 'Ambrosine', 'Apollo', 'Titania', 'Favian', 'Vivian', 'Lucius', 'Eros', 'Elodie', 'Miriel', 'Helios']
        my_defaults = ['Edwina', 'Vivian', 'Emerick', 'Amphitrite', 'Cosmo', 'Tristan']
    else:
        my_data = pd.read_csv("Data/qod.csv")
        my_names = ['Persephone', 'Hades', 'Demeter', 'Zeus', 'Hecate', 'Athena', 'Artemis', 'Thanatos', 'Poseidon', 'Charon', 'Cerberus']
        my_defaults = ['Persephone', 'Hades', 'Demeter', 'Zeus', 'Hecate', 'Artemis']
    my_data.pop('Unnamed: 0')
    return my_data, my_names, my_defaults
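
# Count positive / neutral / negative comments that mention a single character and
# return labels, counts and matching colours for the pie chart. Reads the
# module-level `data` frame loaded for the currently selected story.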
@st.cache(suppress_st_warning = True)
def person_to_sentiments(person):
    df = data[data['Text'].str.contains(person.lower())]
    pos = df[(df.Compound > 0)].groupby('Chapter Name').count().reindex(data['Chapter Name'].unique()).reset_index().fillna(0).Positive
    neu = df[(df.Compound == 0)].groupby('Chapter Name').count().reindex(data['Chapter Name'].unique()).reset_index().fillna(0).Neutral
    neg = df[(df.Compound < 0)].groupby('Chapter Name').count().reindex(data['Chapter Name'].unique()).reset_index().fillna(0).Negative
    df = pd.DataFrame(columns = ['Positive', 'Neutral', 'Negative', 'Chapter Name'])
    df['Chapter Name'] = data['Chapter Name'].unique()
    df['Positive'] = pos
    df['Negative'] = neg
    df['Neutral'] = neu
    df = df.sum(axis = 0).reset_index()[:3]
    df.rename(columns = {'index': 'Sentiment', 0: 'Comments'}, inplace = True)
    n = list(df.Sentiment.values)
    v = list(df.Comments.values)
    c = ['#636EFA', '#00CC96', '#EF553B']
    v, n, c = [list(i) for i in zip(*sorted(zip(v, n, c), reverse=True))]
    return n, v, c
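
# Build a per-character breakdown of positive / neutral / negative comment
# percentages, melted into long form for the grouped bar chart.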
@st.cache(suppress_st_warning = True)
def people_to_sentiments(people):
    # Collect one row per character, then build the frame in one go instead of
    # using the deprecated DataFrame.append.
    rows = []
    for person in people:
        df = data[data['Text'].str.contains(person.lower())]
        df = sentiments_to_chapter(df)
        df = df.sum(axis = 0)
        rows.append([person, df.Positive, df.Neutral, df.Negative])
    temp = pd.DataFrame(rows, columns = ['Character', 'Positive', 'Neutral', 'Negative'])
    temp[['Positive', 'Neutral', 'Negative']] = temp[['Positive', 'Neutral', 'Negative']].div(temp[['Positive', 'Neutral', 'Negative']].sum(axis=1), axis=0) * 100
    temp[['Positive', 'Neutral', 'Negative']] = round(temp[['Positive', 'Neutral', 'Negative']], 2).astype(str) + '%'
    temp = pd.melt(temp, id_vars=['Character'], value_vars=['Positive', 'Negative', 'Neutral'])
    temp.rename(columns = {'variable': 'Sentiment', 'value': '% Comments'}, inplace = True)
    return temp
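
# Per-chapter counts of positive / neutral / negative comments, reindexed so every
# chapter appears even when it has no matching comments.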
@st.cache(suppress_st_warning = True)
def sentiments_to_chapter(df):
    pos = df[(df.Compound > 0)].groupby('Chapter Name').count().reindex(data['Chapter Name'].unique()).reset_index().fillna(0).Positive
    neu = df[(df.Compound == 0)].groupby('Chapter Name').count().reindex(data['Chapter Name'].unique()).reset_index().fillna(0).Neutral
    neg = df[(df.Compound < 0)].groupby('Chapter Name').count().reindex(data['Chapter Name'].unique()).reset_index().fillna(0).Negative
    df = pd.DataFrame(columns = ['Positive', 'Neutral', 'Negative', 'Chapter Name'])
    df['Chapter Name'] = data['Chapter Name'].unique()
    df['Positive'] = pos
    df['Negative'] = neg
    df['Neutral'] = neu
    return df
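
# Daily comment counts for a single sentiment ('Positive', 'Neutral' or 'Negative'),
# returned as a Date / count frame for the time histogram.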
@st.cache(suppress_st_warning = True)
def sentiments_to_time(df, choice):
    mydict = dict()
    df = df.reset_index()
    if choice == 'Positive':
        for i in range(len(df)):
            if df.loc[i, 'Compound'] > 0:
                mydict[df.loc[i, 'Date']] = mydict.get(df.loc[i, 'Date'], 0) + 1
        mydict = pd.DataFrame(list(mydict.items()), columns = ['Date', 'Positive Comments']).sort_values(by = 'Date')
    if choice == 'Neutral':
        for i in range(len(df)):
            if df.loc[i, 'Compound'] == 0:
                mydict[df.loc[i, 'Date']] = mydict.get(df.loc[i, 'Date'], 0) + 1
        mydict = pd.DataFrame(list(mydict.items()), columns = ['Date', 'Neutral Comments']).sort_values(by = 'Date')
    if choice == 'Negative':
        for i in range(len(df)):
            if df.loc[i, 'Compound'] < 0:
                mydict[df.loc[i, 'Date']] = mydict.get(df.loc[i, 'Date'], 0) + 1
        mydict = pd.DataFrame(list(mydict.items()), columns = ['Date', 'Negative Comments']).sort_values(by = 'Date')
    return mydict
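
# Show a random comment in the sidebar, skipping flagged or very short ones.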
def load_random():
    while True:
        i = random.randint(0, len(data) - 1)
        if 'This comment may be offensive.' in data.loc[i]['Original'] or len(data.loc[i]['Original']) < 100:
            # st.write(data.loc[i]['Original'])
            continue
        else:
            st.sidebar.markdown(f"**@{data.loc[i]['Username']}** commented on {data.loc[i]['Chapter Name']}:")
            st.sidebar.markdown(f"{data.loc[i]['Original']}")
            break
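
# Page header, sidebar links, story picker and a random sample comment.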
st.title('sentifluent.')
st.markdown('A dashboard for viewing story stats and character-targeted sentiment analysis.')
st.sidebar.markdown('[![Github Badge](https://img.shields.io/badge/-github-grey?style=for-the-badge&logo=github&logoColor=white&link=https://github.com/rubyruins/)](https://www.github.com/rubyruins/) [![Wattpad Badge](https://img.shields.io/badge/-wattpad-orange?style=for-the-badge&logo=wattpad&logoColor=white&link=https://www.wattpad.com/user/rubyruins)](https://www.wattpad.com/user/rubyruins)')
st.sidebar.title('Pick a story.')
story = st.sidebar.selectbox('Story', ['Crown of Glass', 'Queen of Death'])
data, names, defaults = load_data(story)
load_random()
st.header("Let's see how the readers feel about particular characters.")
st.subheader("Get started by picking a character from the story. ")
st.markdown("💡 Pro tip: Switch between different stories in the sidebar!")
character = st.selectbox('Select', names)
labels, values, colors = person_to_sentiments(character)
st.plotly_chart(px.pie(values = values, names = labels, color_discrete_sequence = colors, hole=0.5, title=f"Most reader sentiment towards {character} is {labels[values.index(max(values))].lower()}.").update_layout(width=600, height=400))
st.subheader("Compare a few of these characters together... ")
characters = st.multiselect('Select', names, default = defaults)
temp = people_to_sentiments(characters)
st.plotly_chart(px.bar(temp, x = 'Character', y = '% Comments', color='Sentiment', barmode='group'))
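# Pick out the characters with the highest negative and positive shares for the summary sentence below.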
neg_ind = temp[temp.Sentiment == 'Negative']['% Comments'].str[:-1].astype('float').idxmax(axis = 0)
pos_ind = temp[temp.Sentiment == 'Positive']['% Comments'].str[:-1].astype('float').idxmax(axis = 0)
st.write(f"Phew! Most readers really dislike {temp.iloc[neg_ind]['Character']}, who has nearly {temp.iloc[neg_ind]['% Comments']} negative comments. {temp.iloc[pos_ind]['Character']} seems to be the most popular character, with {temp.iloc[pos_ind]['% Comments']} positive comments.")
st.subheader("General sentiment over all chapters.")
option = st.radio("Type", ['All', 'Positive', 'Neutral', 'Negative'], key = 3)
temp = data[['Chapter Name', 'Positive', 'Neutral', 'Negative', 'Compound']]
temp = sentiments_to_chapter(temp)
if option == 'All':
    st.plotly_chart(px.line(temp, x = 'Chapter Name', y = ['Positive', 'Negative', 'Neutral']).update_xaxes(showticklabels=False).update_layout(yaxis_title="Total comments"))
else:
    c = ['', '#636EFA', '#00CC96', '#EF553B'][['All', 'Positive', 'Neutral', 'Negative'].index(option)]
    st.plotly_chart(px.line(temp, x = 'Chapter Name', y = option).update_traces(line_color=c).update_xaxes(showticklabels=False).update_layout(yaxis_title=f"Total {option.lower()} comments"))
st.subheader("General sentiment over all time.")
option = st.radio("Type", ['Positive', 'Neutral', 'Negative'], key = 4)
temp = sentiments_to_time(data, option)
c = ['', '#636EFA', '#00CC96', '#EF553B'][['All', 'Positive', 'Neutral', 'Negative'].index(option)]
st.plotly_chart(px.histogram(temp, x = 'Date', y = f'{option} Comments', histfunc='sum').update_traces(xbins_size="M1", marker_color=c).update_layout(bargap=0.35, yaxis_title=f"Total {option.lower()} comments"))
st.subheader("Stats over all chapters.")
option = st.radio("Type", ['Comments', 'Reads', 'Votes'], key = 1)
temp = data[['Comments', 'Reads', 'Votes', 'Chapter Name']].drop_duplicates()
st.plotly_chart(px.line(temp, x = 'Chapter Name', y = option).update_xaxes(showticklabels=False))
a, b = list(temp[option]), list(temp['Chapter Name'])
option = option.lower()
st.write(f"Each of the chapters has an average of around {round(sum(a) / len(a))} {option}! {b[a.index(max(a))]} performed the best with around {max(a)} {option}, while {b[a.index(min(a))]} achieved only {min(a)} {option}.")
st.subheader("Stats over all parts.")
option = st.radio("Type", ['Comments', 'Reads', 'Votes'], key = 2)
temp = data[['Comments', 'Reads', 'Votes', 'Part']].drop_duplicates().groupby(by=["Part"])[option].sum().reset_index()
st.plotly_chart(px.pie(temp, names = 'Part', values = option, hole=0.5).update_layout(width=400, height=400))
st.subheader("Comments over all time.")
temp = dict()
for i in data.Date:
    temp[i] = temp.get(i, 0) + 1
temp = pd.DataFrame(list(temp.items()), columns = ['Date', 'Comments']).sort_values(by = 'Date')
st.plotly_chart(px.histogram(temp, x = 'Date', y = 'Comments', histfunc='sum').update_traces(xbins_size="M1").update_layout(
    bargap=0.35,
    yaxis_title="Total comments",
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(visible=True),
        type="date"
    )
))
temp = temp[temp['Comments']==temp['Comments'].max()]
st.write(f"{story} reached max popularity on {temp.Date.values[0]} with {temp.Comments.values[0]} comments on a single day!")
st.subheader("That's all for now!")
st.markdown("**Thanks for making it all the way down here. 🎉**")
st.markdown("As a featured writer on Wattpad, I've always wanted to do some sentiment analysis on what users thought of my stories. This pet project gave me the perfect opportunity to work with real data and visualise clear insights about characters from these tales! I've used Selenium and BeautifulSoup for scraping comments + Pandas and NLTK for data preprocessing and sentiment analysis + Streamlit and Plotly Express for the dashboard and visulization. You can find the full code on Github [here](https://www.github.com/rubyruins/sentifluent), or give my work on Wattpad a quick read [here!](https://www.wattpad.com/user/rubyruins) 💖 ")