#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
Find the most common words in a user's Twitter feed.

Usage:
    tweet_analyzer.py [-u USER] [-n NUM] [-f OUTPUTFILE]

Options:
    -h --help    show this help message and exit
    -u --user    username of the account to analyze
    -n --num     minimum number of occurrences a word must appear [default: 20]
    -f --file    name of the output file (.csv) [default: results.csv]
'''
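# Example invocation (assumes a credentials.py module in the same directory
# exposing the four Twitter API keys imported below; the username is
# illustrative):
#     python tweet_analyzer.py --user jack --num 30 --file jack_words.csv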
import argparse
import csv
import string
from collections import defaultdict

import nltk
import tweepy
from nltk.corpus import stopwords

from credentials import (consumer_key, consumer_secret, access_token,
                         access_token_secret)
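# NOTE: stopwords.words('english') and nltk.word_tokenize below require the
# 'stopwords' corpus and 'punkt' tokenizer models, which must be downloaded
# once beforehand:
#     python -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')"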


def main():
    # set up command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-u', '--user', type=str, help='username of account to analyze')
    parser.add_argument(
        '-n',
        '--num',
        type=int,
        help='minimum number of occurrences the word appears [default: 20]',
        default=20)
    parser.add_argument(
        '-f',
        '--file',
        type=str,
        help='name of output file (.csv)',
        default='results.csv')
    args = parser.parse_args()
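    # argparse fills in the defaults above, so args.num and args.file are
    # always set; it also generates the -h/--help flag automatically.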
    # parse arguments
    if args.user:
        user = '@' + args.user
    else:
        user = '@' + input('Username to analyze: ')
    num_occurrences = args.num
    if not args.file.endswith('.csv'):
        output_file = args.file + '.csv'
    else:
        output_file = args.file

    tweets = []  # list of tweet texts collected from the user's timeline
    # defaultdict(int) yields 0 for missing keys, so new words can be
    # tallied without a membership check
    count = defaultdict(int)
    total_tweets_analyzed = 0
    total_results = 0
    # authenticate with the Twitter API
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)
    print('\nAnalyzing tweets. This could take a few seconds...')

    # get tweets from the user's timeline
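    # tweepy.Cursor pages through the timeline transparently; with
    # wait_on_rate_limit=True the client sleeps through rate-limit windows
    # instead of raising an error. The standard API returns at most roughly
    # the 3,200 most recent tweets of a timeline.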
    for tweet in tweepy.Cursor(api.user_timeline, screen_name=user).items():
        tweets.append(tweet.text.lower())
    # get rid of stop words: standard English stop words plus punctuation
    # list of stop words: https://gist.github.com/sebleier/554280
    stop_words = stopwords.words('english') + list(
        string.punctuation) + ['“', '”']
    # words to ignore in the results
    BLACKLIST_WORDS = [
        'https',
        '``',
        '...',
        'http',
        'rt',
    ]
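    # nltk.word_tokenize splits attached punctuation into separate tokens and
    # converts straight double quotes to `` and '' pairs, which is why '``'
    # appears in the blacklist above.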
    # analyze tweets: tokenize each chunk, drop stop words, tally the rest
    for tweet in tweets:
        total_tweets_analyzed += 1
        for word in tweet.split(' '):
            tokens = nltk.word_tokenize(word.replace("'", ""))
            filtered_words = [t for t in tokens if t not in stop_words]
            for filtered_word in filtered_words:
                if filtered_word not in BLACKLIST_WORDS:
                    count[filtered_word] += 1
    # create a csv file with the results
    with open(output_file, 'w', newline='') as file:
        output = csv.writer(file)
        output.writerow(['Word occurrences in tweets by ' + user])
        output.writerow(['Word', '# of Occurrences'])
        for word, occurrences in count.items():
            # keep words that reach the minimum count; skip single characters
            if occurrences >= num_occurrences and len(word) > 1:
                output.writerow([word, str(occurrences)])
                total_results += 1

    print(f'\nCompleted! Total tweets analyzed: {total_tweets_analyzed}')
    print(f'Total results found: {total_results}')


if __name__ == '__main__':
    main()