-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
123 lines (103 loc) · 4.17 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import pandas as pd
import shutil
from flask import Flask, render_template, send_from_directory, request
from filmweb_scraper import FilmWebScraper
from imdb_top250_scraper import ImdbTop250Scraper
from imdb_top100_popular_scraper import ImdbPopularMovies
from netflix_top10_PL_scraper import NetflixTop10PL
from waitress import serve
from loguru import logger
# WSGI application object: the route handlers in this module register
# themselves on it, and it is handed to waitress when run as a script.
app = Flask(__name__)
def remove_http_cache(directory_path):
    """Delete the ``http_cache`` folder located inside *directory_path*.

    The removal is best-effort: any failure (for example, the folder not
    existing) is reported on stdout instead of being raised.

    Args:
        directory_path: Directory that is expected to contain ``http_cache``.

    Returns:
        None. The outcome is only printed.
    """
    cache_dir = os.path.join(directory_path, "http_cache")
    try:
        shutil.rmtree(cache_dir)
        outcome = f"Removed http_cache folder in {directory_path}"
    except Exception as exc:
        # Never let cache cleanup abort the caller; just report the problem.
        outcome = f"Error removing http_cache folder in {directory_path}: {exc}"
    print(outcome)
def get_country_headers():
    """Build the HTTP headers passed to the scrapers for the current request.

    The visitor's country is read from the ``Cf-Ipcountry`` header of the
    current Flask request and used to compose the ``Accept-Language`` value.
    ``"US"`` is used instead when the header is missing, cannot be read, or
    does not hold a plain two-letter code.

    Returns:
        dict: ``"Accept-Language"`` and ``"User-Agent"`` header values.
    """
    default_country = "US"

    try:
        user_country = request.headers.get("Cf-Ipcountry")
        logger.info(f"Country code: {user_country}")

        if not user_country:
            logger.warning("Cf-Ipcountry header not found.")
            user_country = default_country  # Default when header is absent
    except Exception as e:
        logger.error(f"Error retrieving country information: {e}")
        user_country = default_country  # Default in case of an error

    # The header value originates outside this application and ends up
    # inside an outgoing header, so accept only two ASCII letters. This also
    # rejects non-country placeholders such as "T1"; "XX" (documented by
    # Cloudflare as "no country data") is rejected explicitly.
    candidate = user_country.strip()
    is_alpha2 = len(candidate) == 2 and all(
        "a" <= ch <= "z" or "A" <= ch <= "Z" for ch in candidate
    )
    if is_alpha2 and candidate.upper() != "XX":
        user_country = candidate.upper()
    else:
        logger.warning(
            f"Unusable Cf-Ipcountry value {user_country!r}; "
            f"defaulting to {default_country}."
        )
        user_country = default_country

    headers = {
        "Accept-Language": f"{user_country.lower()}-{user_country};q=0.9,en-US;q=0.8,en;q=0.7",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
        "like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    }

    return headers
# Landing page served at the site root
@app.route("/", methods=["GET", "POST"])
def root():
    """Render the ``home.html`` template for GET and POST requests to ``/``."""
    home_page = render_template("home.html")
    return home_page
@app.route("/imdb_top250", methods=["GET", "POST"])
def imdb_top250():
    """Scrape ``https://www.imdb.com/chart/top/`` and render the result.

    The scraper result is used as a pandas DataFrame with a ``rank`` column
    (presumably what ``ImdbTop250Scraper.scrape_imdb_top250`` returns --
    verify against the scraper). Rows repeating a rank are dropped before
    rendering.

    Returns:
        The rendered ``imdb_top250.html`` template.
    """
    chart = ImdbTop250Scraper.scrape_imdb_top250(
        "https://www.imdb.com/chart/top/",
        get_country_headers(),
    )

    # Keep a single row per rank.
    ranked = chart.drop_duplicates(subset=["rank"])

    # The template receives the table as HTML plus raw column names and rows.
    context = {
        "tables": [ranked.to_html(classes="data")],
        "titles": ranked.columns.values,
        "row_data": ranked.to_numpy().tolist(),
    }
    return render_template("imdb_top250.html", **context)
@app.route("/imdb_popular_movies", methods=["GET", "POST"])
def imdb_popular_movies():
    """Scrape ``https://www.imdb.com/chart/moviemeter/`` and render the result.

    The scraper result is used as a pandas DataFrame with ``rank`` and
    ``year`` columns (presumably what
    ``ImdbPopularMovies.scrape_imdb_popular_movies`` returns -- verify against
    the scraper). Rows repeating a rank are dropped and missing years are
    shown as ``0``.

    Returns:
        The rendered ``imdb_popular_movies.html`` template.
    """
    chart = ImdbPopularMovies.scrape_imdb_popular_movies(
        "https://www.imdb.com/chart/moviemeter/",
        get_country_headers(),
    )

    # Keep a single row per rank.
    ranked = chart.drop_duplicates(subset=["rank"])

    # Missing years become 0 so the whole column can be cast to integers.
    years = ranked["year"].fillna(0)
    ranked["year"] = years.astype(int)

    context = {
        "tables": [ranked.to_html(classes="data")],
        "titles": ranked.columns.values,
        "row_data": ranked.to_numpy().tolist(),
    }
    return render_template("imdb_popular_movies.html", **context)
@app.route("/filmweb_top100", methods=["GET", "POST"])
def filmweb_top100():
    """Scrape ``https://www.filmweb.pl`` and render the result.

    The scraper result is used as a pandas DataFrame with a ``rank`` column
    (presumably what ``FilmWebScraper.scrape_filmweb_top100`` returns --
    verify against the scraper). Rows repeating a rank are dropped before
    rendering.

    Returns:
        The rendered ``filmweb_top100.html`` template.
    """
    chart = FilmWebScraper.scrape_filmweb_top100(
        "https://www.filmweb.pl",
        get_country_headers(),
    )

    # Keep a single row per rank.
    ranked = chart.drop_duplicates(subset=["rank"])

    context = {
        "tables": [ranked.to_html(classes="data")],
        "titles": ranked.columns.values,
        "row_data": ranked.to_numpy().tolist(),
    }
    return render_template("filmweb_top100.html", **context)
@app.route("/netflix_pl_top10", methods=["GET", "POST"])
def results_netflix_top10_pl():
    """Run the ``NetflixTop10PL`` scraper and render its results.

    After ``parse`` is called with the request-derived headers, the
    scraper's ``dates`` and ``results`` attributes are read. The results are
    loaded into a DataFrame and rows repeating a ``rank`` are dropped.

    Returns:
        The rendered ``netflix_pl_top10.html`` template.
    """
    netflix = NetflixTop10PL()
    netflix.parse(get_country_headers())

    # Dates exposed by the scraper; forwarded to the template unchanged.
    ranking_dates = netflix.dates

    # Tabulate the scraped entries, keeping a single row per rank.
    ranked = pd.DataFrame(netflix.results).drop_duplicates(subset=["rank"])

    # NOTE: persisting the results via the scraper's to_db() is currently
    # disabled.

    context = {
        "tables": [ranked.to_html(classes="data")],
        "titles": ranked.columns.values,
        "row_data": ranked.to_numpy().tolist(),
        "dates": ranking_dates,
    }
    return render_template("netflix_pl_top10.html", **context)
# Script entry point: serve the Flask app with the waitress WSGI server
if __name__ == "__main__":
    # Alternative for local debugging with Flask's built-in server:
    # app.run(debug=True, host="0.0.0.0", port=5000, threaded=True)
    serve(app, threads=8, port="5000")