-
Notifications
You must be signed in to change notification settings - Fork 0
/
noapiaccesstweetextraction.py
70 lines (55 loc) · 3.13 KB
/
noapiaccesstweetextraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
"""NoAPIAccessTweetExtraction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1YTTZ1zO71oYV8RpLJsgD0wbZU4yxCpoR
# Tweet Extraction with the Twint Library
Allows for the extraction of tweets from Twitter without the need to access [Twitter's API](https://developer.twitter.com/en/docs/twitter-api/getting-started/getting-access-to-the-twitter-api) which now costs money to access.
Relevant Links:<br>
[Official PyPi](https://pypi.org/project/twint/) <br>
[Medium Blog / Article](https://basilkjose.medium.com/twint-twitter-scraping-without-twitters-api-aca8ba1b210e#:~:text=What%20is%20Twint%20%3F,scraping%20Tweets%20from%20Twitter%20profiles%20.) <br>
[Kaggle Discussion Page / Post](https://www.kaggle.com/general/207512) <br>
[Twint Status](https://www.reddit.com/r/Python/comments/vb8cmu/twint_python_twitter_crawler_no_longer_being/) <br>
[Twint Usage](https://www.reddit.com/r/OSINT/comments/1101aud/anyone_familiar_with_the_opensource_twitter_api/)
"""
# Install commands — run these in a notebook cell or shell BEFORE executing
# this script (left commented so the file stays valid plain Python):
#!pip3 install twint
#!pip3 uninstall twint
#!pip3 install --user --upgrade "git+https://github.com/twintproject/twint.git@origin/master#egg=twint"
#!pip install nest_asyncio
import twint
import nest_asyncio
import pandas as pd

# nest_asyncio patches the running asyncio event loop so twint (which starts
# its own loop) can run inside Colab/Jupyter's already-running loop.
nest_asyncio.apply()

# Configure and set up the details of what is to be scraped.
user = twint.Config()

# twint expects an integer tweet limit, so convert the raw input() string.
tweetExtractCount = int(input("Enter the number of tweets you would like to scrape:"))
user.Limit = tweetExtractCount  # Limit : Number of tweets to pull.

twitterUsername = input("Enter the username of the Twitter user which you would like to extract tweets from:")
user.Username = twitterUsername

# Optional settings
user.Lang = "en"
user.Hide_output = True       # Suppress per-tweet console output while scraping.
user.Since = '2022-10-12'     # Since : Filter tweets from this date.
user.Until = '2023-01-20'     # Until : Filter tweets up to this date.
#user.Images = True           # Images : Display only tweets with images.
#user.Videos = True           # Videos : Display only tweets with videos.
user.Media = True             # Media : Display tweets with only images or videos.
user.Popular_tweets = True    # Popular_tweets : Scrape popular tweets instead of most recent (default=False).
user.Min_likes = 50           # Min_likes : Filter tweets by minimum number of likes.
user.Min_replies = 10         # Min_replies : Filter tweets by minimum number of replies.
user.Min_retweets = 10        # Min_retweets : Filter tweets by minimum number of retweets.

twint.run.Search(user)                     # Run twint with the configuration above.
Tweets_df = twint.storage.panda.Tweets_df  # Store scraped data in a pandas DataFrame.
print(Tweets_df)                           # Print out the pandas DataFrame.
# Optional Display - Displays Tweets using HTML and requests library
from IPython.display import HTML
import requests
def show_tweet(link):
    '''Render a single tweet inline via Twitter's public oEmbed endpoint.

    Parameters
    ----------
    link : str
        Canonical URL of the tweet to display.

    Raises
    ------
    requests.HTTPError
        If the oEmbed request returns a 4xx/5xx status (e.g. the tweet was
        deleted or the account is private).
    '''
    url = 'https://publish.twitter.com/oembed?url=%s' % link
    # timeout prevents hanging indefinitely on an unresponsive endpoint.
    response = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of an opaque KeyError("html") below.
    response.raise_for_status()
    html = response.json()["html"]
    display(HTML(html))
# Pick one random scraped tweet, show its URL, then render it inline.
random_row = Tweets_df.sample(1)
sample_tweet_link = random_row['link'].values[0]
display(sample_tweet_link)
show_tweet(sample_tweet_link)