-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_f.py
164 lines (128 loc) · 4.92 KB
/
main_f.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter
import numpy as np
#sns.set()
from urlextract import URLExtract
extra=URLExtract()
def fetch_stat(user, data_f):
    """Return top-line chat statistics for one user or the whole chat.

    Parameters
    ----------
    user : str
        'Overall' for the entire chat, otherwise a participant name.
    data_f : pandas.DataFrame
        Chat frame with at least 'user' and 'message' columns.

    Returns
    -------
    tuple
        (message count, word count, media-message count, link count).
    """
    # Restrict to the selected user once; 'Overall' keeps every row.
    # This replaces the original's two near-identical branches.
    if user != 'Overall':
        data_f = data_f[data_f['user'] == user]
    # number of messages
    num_message = data_f.shape[0]
    # number of whitespace-separated words across all messages
    words = []
    for message in data_f['message']:
        words.extend(message.split())
    # WhatsApp exports media messages as this literal placeholder line
    no_of_media = data_f[data_f['message'] == '<Media omitted>\n'].shape[0]
    # URLs found in message text (module-level URLExtract instance `extra`)
    links = []
    for message in data_f['message']:
        links.extend(extra.find_urls(message))
    return num_message, len(words), no_of_media, len(links)
def most_busy_users(data_f):
    """Top-5 chatters and each user's percentage share of all messages.

    Returns
    -------
    tuple
        (x, df) where x is a Series of the five highest message counts
        indexed by user, and df has columns 'name' and 'percentage'.
    """
    x = data_f['user'].value_counts().head()
    # Percentage of total messages per user, rounded to 2 decimals.
    pct = round((data_f['user'].value_counts() / data_f.shape[0]) * 100, 2)
    df = pct.reset_index()
    # Assign column names positionally: reset_index() on a value_counts
    # Series labels its columns differently across pandas versions
    # (pandas >= 2.0 produces 'user'/'count'), so the original
    # rename(columns={'index': 'name', 'user': 'percentage'}) silently
    # does nothing there.  Positional assignment works on every version.
    df.columns = ['name', 'percentage']
    return x, df
#wordcloud
def create_wordcloud(user, data_f):
    """Build a word-cloud image from a user's (or the whole chat's) messages.

    Stop words are read from hinglish.txt; group notifications and media
    placeholders are excluded, and stop words removed, before generating.

    Returns
    -------
    wordcloud.WordCloud
        The generated cloud, ready for imshow().
    """
    # "r" is enough — the original's "r+" needlessly required write access.
    stop_word = set()
    with open("hinglish.txt", "r") as f:
        for line in f:
            # set gives O(1) membership tests inside remove_stop
            stop_word.add(line.replace("\n", ""))
    if user != 'Overall':
        data_f = data_f[data_f['user'] == user]
    # drop system notifications and the media placeholder rows
    temp = data_f[data_f['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    def remove_stop(message):
        # case-insensitive stop-word removal
        return " ".join(w for w in message.lower().split() if w not in stop_word)

    wcloud = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    temp = temp.copy()  # filtered slice: copy before assigning a column
    temp['message'] = temp['message'].apply(remove_stop)
    # BUG FIX: the original generated from data_f['message'], which discarded
    # all the filtering and stop-word removal done into `temp` above.
    df_wcloud = wcloud.generate(temp['message'].str.cat(sep=" "))
    return df_wcloud
#most used words
def most_used_words(user, data_f):
    """Return a DataFrame of the 50 most frequent non-stop-words.

    Stop words come from hinglish.txt; group notifications and media
    placeholders are excluded.  Columns keep Counter.most_common's
    default positional labels (0: word, 1: count) so existing callers
    indexing df[0] / df[1] still work.
    """
    # "r" instead of the original "r+": a read needs no write permission.
    with open("hinglish.txt", "r") as f:
        # set gives O(1) membership tests in the nested loop below
        stop_word = {line.replace("\n", "") for line in f}
    if user != 'Overall':
        data_f = data_f[data_f['user'] == user]
    # drop system notifications and the media placeholder rows
    temp = data_f[data_f['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']
    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_word:
                words.append(word)
    return pd.DataFrame(Counter(words).most_common(50))
#timeline
def monthly_timeline(user, data_f):
    """Monthly message counts with a 'time' label such as 'January-2023'.

    Parameters
    ----------
    user : str
        'Overall' or a participant name.
    data_f : pandas.DataFrame
        Needs 'year', 'month_num', 'month', 'message' (and 'user') columns.
    """
    if user != 'Overall':
        data_f = data_f[data_f['user'] == user]
    timeline = data_f.groupby(['year', 'month_num', 'month']).count()['message'].reset_index()
    # Vectorized label construction replaces the original Python row loop.
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline
#daily time line
def daily_timeline(user, data_f):
    """Per-day message counts for the selected user ('Overall' = everyone)."""
    selected = data_f if user == 'Overall' else data_f[data_f['user'] == user]
    return selected.groupby('timeline_date').count()['message'].reset_index()
#activity map
def weekly_act(user, data_f):
    """Message counts per weekday name for the selected user."""
    frame = data_f[data_f['user'] == user] if user != 'Overall' else data_f
    return frame['day_name'].value_counts()
def month_activity(user, data_f):
    """Message counts per month name for the selected user."""
    frame = data_f[data_f['user'] == user] if user != 'Overall' else data_f
    return frame['month'].value_counts()
#busy and quiet day
def busy_quiet(user, data_f):
    """Return (busiest_day, quiet_day): daily counts sorted desc / asc.

    Both frames are re-indexed 1..n so row 1 is the busiest (resp.
    quietest) day; columns come from the daily groupby
    ('timeline_date', 'message', plus the carried-over index column).
    """
    if user != 'Overall':
        data_f = data_f[data_f['user'] == user]
    # NOTE: the original also built a monthly timeline/'time' frame here
    # and never used it — that dead code is removed.
    date_timeline = data_f.groupby('timeline_date').count()['message'].reset_index()
    # proper booleans instead of the original ascending=0 / ascending=1
    busiest_day = date_timeline.sort_values('message', ascending=False).reset_index()
    busiest_day.index = np.arange(1, len(busiest_day) + 1)
    quiet_day = date_timeline.sort_values('message', ascending=True).reset_index()
    quiet_day.index = np.arange(1, len(quiet_day) + 1)
    return busiest_day, quiet_day
#heat map
def heat_map(user, data_f):
    """Day-of-week x period pivot of message counts (missing cells -> 0)."""
    frame = data_f if user == 'Overall' else data_f[data_f['user'] == user]
    return frame.pivot_table(index='day_name', columns='period',
                             values='message', aggfunc='count').fillna(0)