-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
87 lines (61 loc) · 2.49 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
# Request English-language responses from the server.
headers = {"Accept-Language": "en-US, en;q=0.5"}

# First page of the "top_1000" title search.
url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"

# requests waits indefinitely by default; an explicit timeout keeps the script
# from hanging forever if the server accepts the connection but never answers.
results = requests.get(url, headers=headers, timeout=30)

# Parsed HTML of the first page.
soup = BeautifulSoup(results.text, "html.parser")
# One independent accumulator list per scraped field. Note that `time` is a
# plain list here (it collects runtimes), not the standard-library module.
titles, years, time, imdb_ratings, metascores, votes, us_gross = (
    [] for _ in range(7)
)
# Result containers of the page parsed so far; every movie entry is a <div>
# carrying this exact class pair. The loop below replaces this value with the
# containers of each listing page it downloads.
movie_div = soup.find_all('div', class_='lister-item mode-advanced')

# Values for the `start` query parameter: 1, 51, ..., 951 (steps of 50).
pages = np.arange(1, 1001, 50)

for start in pages:
    page = requests.get(
        "https://www.imdb.com/search/title/?groups=top_1000&start=" + str(start) + "&ref_=adv_nxt",
        headers=headers,
        # Without a timeout a stalled connection would block the run forever.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of parsing an error page, which
    # would silently contribute zero movies to the result.
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')

    # Random 2-10 second pause between consecutive requests.
    sleep(randint(2, 10))

    for container in movie_div:
        # Title and year both live under the entry's <h3> header.
        titles.append(container.h3.a.text)
        years.append(container.h3.find('span', class_='lister-item-year').text)

        # Store the span's *text*, like the metascore below, rather than the
        # Tag object itself; an entry without a runtime span records ''.
        runtime_tag = container.p.find('span', class_='runtime')
        time.append(runtime_tag.text if runtime_tag is not None else '')

        imdb_ratings.append(float(container.strong.text))

        # The metascore span is optional; a missing one records ''.
        metascore_tag = container.find('span', class_='metascore')
        metascores.append(metascore_tag.text if metascore_tag is not None else '')

        # The first name="nv" span is taken as the vote count and the second,
        # when present, as the US gross.
        nv = container.find_all('span', attrs={'name': 'nv'})
        votes.append(nv[0].text)
        us_gross.append(nv[1].text if len(nv) > 1 else '')
# Column name -> scraped list, in the order the columns appear in the table.
_movie_columns = [
    ('movie', titles),
    ('year', years),
    ('imdb', imdb_ratings),
    ('metascore', metascores),
    ('votes', votes),
    ('us_grossMillions', us_gross),
    ('timeMin', time),
]

# Assemble the per-field lists into a single table with one row per movie.
movies = pd.DataFrame(dict(_movie_columns))
# votes: drop the thousands separators, then cast to int.
movies['votes'] = movies['votes'].str.replace(',', '', regex=False).astype(int)

# year: keep the four characters before the final one (presumably the digits
# inside a trailing "(YYYY)") and cast to int. Plain column assignment is used
# because `movies.loc[:, 'year'] = ...` writes into the existing object column
# in place on pandas >= 2.0 and would leave the dtype as object.
movies['year'] = movies['year'].str[-5:-1].astype(int)

# timeMin: take the first run of digits. The values are stringified first so
# non-string entries are handled too. Entries with no digits become NaN rather
# than raising, consistent with the metascore and gross columns below.
runtime_digits = movies['timeMin'].astype(str).str.extract(r'(\d+)', expand=False)
movies['timeMin'] = pd.to_numeric(runtime_digits, errors='coerce')

# metascore: first run of digits; entries without a score become NaN.
metascore_digits = movies['metascore'].str.extract(r'(\d+)', expand=False)
movies['metascore'] = pd.to_numeric(metascore_digits, errors='coerce')

# us_grossMillions: strip the leading '$' and trailing 'M'; anything that is
# not a number afterwards (e.g. the empty placeholder) becomes NaN.
gross_text = movies['us_grossMillions'].str.lstrip('$').str.rstrip('M')
movies['us_grossMillions'] = pd.to_numeric(gross_text, errors='coerce')