-
Notifications
You must be signed in to change notification settings - Fork 0
/
profiles.py
109 lines (89 loc) · 3.86 KB
/
profiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import csv
import time
from pathlib import Path
from urllib.parse import urlparse, parse_qs

import requests
import streamlit as st
from bs4 import BeautifulSoup
# Function to scrape Instagram profiles
def get_insta_accounts(location, num_pages=10, timeout=10):
    """Collect Instagram profile links from Google search result pages.

    Requests ``num_pages`` pages of Google results for the query
    ``instagram smoke shop {location}`` and keeps every result whose
    target link starts with ``https://www.instagram.com/`` and does not
    contain ``/p/``, ``/explore/``, ``?utm_medium=copy_link`` or ``?hl=ne``.

    Args:
        location: Free-text place name appended to the search query.
        num_pages: Number of result pages to request; page ``i`` is
            requested with ``start = i * 10``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Profile Link"`` and
        ``"Profile ID"``, in the order the results were encountered.
        Empty when ``num_pages`` is zero or negative.

    Raises:
        Whatever ``requests.get`` raises on connection failure or timeout;
        such errors are propagated unchanged.
    """
    # The query and headers are identical for every page, so build them once.
    query = f"instagram smoke shop {location}"
    # Set user agent header to avoid being blocked by Google
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    profiles = []
    for page in range(num_pages):
        # Passing the query through ``params`` lets requests URL-encode it,
        # so a location containing "&", "#" or "+" cannot corrupt the URL.
        res = requests.get(
            "https://www.google.com/search",
            params={"q": query, "start": page * 10},
            headers=headers,
            timeout=timeout,
        )
        soup = BeautifulSoup(res.text, "html.parser")
        for div in soup.select("div.egMi0.kCrYT"):
            profile = _parse_profile(div)
            if profile is not None:
                profiles.append(profile)
    return profiles


def _parse_profile(div):
    """Return the profile dict for one search-result block, or ``None``."""
    anchor = div.find("a")
    href = anchor.get("href") if anchor is not None else None
    if not href:
        # Result block without a usable link: nothing to extract.
        return None

    # The destination is carried in the link's "url" query parameter.
    profile_link = parse_qs(urlparse(href).query).get("url", [""])[0]
    if not profile_link.startswith("https://www.instagram.com/"):
        return None

    # Links containing any of these markers are rejected as non-profile links.
    excluded_markers = ("/p/", "/explore/", "?utm_medium=copy_link", "?hl=ne")
    if any(marker in profile_link for marker in excluded_markers):
        return None

    # Derive the ID from the URL path only, so a trailing query string or
    # fragment (e.g. "?hl=en") is never mistaken for the profile ID.
    segments = [part for part in urlparse(profile_link).path.split("/") if part]
    if not segments:
        # Bare "https://www.instagram.com/" carries no profile at all.
        return None

    # A missing title element yields an empty title instead of a crash.
    title_element = div.select_one(
        "div.DnJfK div.j039Wc h3 div.BNeawe.vvjwJb.AP7Wnd"
    )
    title = title_element.text if title_element is not None else ""

    return {
        "Title": title,
        "Profile Link": profile_link,
        "Profile ID": segments[-1],
    }
# Main function to scrape profiles for selected city
def scrape_profiles_for_city(city):
    """Scrape Instagram profiles for ``city`` and save them to a CSV file.

    Calls ``get_insta_accounts(city, num_pages=5)`` and writes the result,
    with a header row, to
    ``./profiles/{city}_profiles_{YYYYmmddHHMMSS}.csv``. The ``profiles``
    directory is created when it does not exist yet.

    Args:
        city: City name used both as the search location and as the
            prefix of the output filename.

    Returns:
        The list of profile dicts that was written to the CSV file.
    """
    profiles = get_insta_accounts(city, num_pages=5)

    # Generate timestamp for the filename
    timestamp = time.strftime("%Y%m%d%H%M%S")

    # Create the output directory up front; opening a file inside a
    # missing directory would otherwise raise FileNotFoundError.
    output_dir = Path("./profiles")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Path separators inside the city name would redirect the file into a
    # different (probably non-existent) directory, so neutralise them.
    safe_city = str(city).replace("/", "_").replace("\\", "_")
    output_file = output_dir / f"{safe_city}_profiles_{timestamp}.csv"

    # Explicit UTF-8: titles may contain characters the platform default
    # encoding cannot represent.
    with open(output_file, mode="w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile, fieldnames=["Title", "Profile Link", "Profile ID"]
        )
        writer.writeheader()
        writer.writerows(profiles)

    return profiles
if __name__ == "__main__":
    # Read the selectable city names from the "City" column of the CSV file.
    with open("./data/cities.csv", newline="") as cities_file:
        cities = [record["City"] for record in csv.DictReader(cities_file)]

    # Streamlit UI: page title plus a drop-down to pick the city.
    st.title("Instagram Profiles Scraper")
    selected_city = st.selectbox("Select a city:", cities)

    # Scraping only runs on the script pass in which the button is clicked.
    if st.button("Scrape Profiles"):
        profiles = scrape_profiles_for_city(selected_city)
        st.subheader(f"Found {len(profiles)} profiles!")

        if not profiles:
            st.write("No profiles found.")
        else:
            st.write("Scraped profiles:")
            # One entry per profile, each followed by a "---" separator.
            for entry in profiles:
                st.write(f"Title: {entry['Title']}")
                st.write(f"Profile Link: {entry['Profile Link']}")
                st.write(f"Profile ID: {entry['Profile ID']}")
                st.write("---")

        st.success(f"Scraped profiles for {selected_city}!")