scraper.py
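"""Scrape links to political news articles from four Greek news sites
(in.gr, zougla.gr, naftemporiki.gr, news247.gr) and write them to a CSV
file of (id, url) rows.

Usage: python scraper.py <outfile.csv> <0/1>
(pass 1 to overwrite an existing output file, 0 to append to it).
"""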
import csv
import logging
import os
import random
import re
import sys
from typing import List

import requests
from bs4 import BeautifulSoup

from extract import VALID_SITES

ses = requests.Session()
logger = logging.getLogger()
logger.setLevel("INFO")

# Select links to news articles from in.gr
def get_in_gr(soup: BeautifulSoup) -> List[str]:
    links = soup.find_all("a", {"class": "tile relative-title"})
    return [i["href"] for i in links]

# Select links to news articles from zougla.gr
def get_zougla(soup: BeautifulSoup) -> List[str]:
    links = soup.find_all("div", {"class": "secondary_story_content"})
    return [
        "https://www.zougla.gr/politiki/" + a["href"]
        for a in (i.find("a", href=True) for i in links)
        if a is not None  # skip story blocks without an anchor
    ]

# Select links to news articles from naftemporiki.gr
def get_naftemporiki(soup: BeautifulSoup) -> List[str]:
    links = soup.find_all("h4")
    prepend = "https://www.naftemporiki.gr"
    return [
        prepend + a["href"]
        for a in (i.find("a", href=True) for i in links)
        if a is not None and a["href"].startswith("/story")
    ]

# Select links to news articles from news247.gr
def get_news247(soup: BeautifulSoup) -> List[str]:
    links = soup.find_all("h3", {"class": "article__title bold"})
    return [i.find("a", href=True)["href"] for i in links]
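
# To support another site, add a get_<site>() selector like the ones above,
# append the site's base URL to VALID_SITES (imported from extract), add a
# matching branch in get_latest_from_url below, and list the section page in
# base_urls inside main(). The dispatch below assumes VALID_SITES lists the
# four sites in the same order as base_urls.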

# Perform a GET request to a section page and gather its article links
def get_latest_from_url(url: str) -> List[str]:
    ses.cookies.clear()
    # Ask for HTML explicitly; the original Content-Type header describes a
    # request body (which a GET has none of), so Accept is the right header.
    res = ses.get(url, timeout=2, headers={"Accept": "text/html"})
    if res.ok:
        soup = BeautifulSoup(res.text, "html.parser")
        # Keep everything up to the first single "/" after the host,
        # e.g. "https://www.in.gr/politics/" -> "https://www.in.gr"
        # (the "//" of the scheme is skipped by the (?!\1) lookahead).
        base_url = re.split(r"\b(?:(/)(?!\1))+\b", url)[0]
        if base_url in VALID_SITES:
            if base_url == VALID_SITES[0]:
                articles = get_in_gr(soup)
            elif base_url == VALID_SITES[1]:
                articles = get_zougla(soup)
            elif base_url == VALID_SITES[2]:
                articles = get_naftemporiki(soup)
            elif base_url == VALID_SITES[3]:
                articles = get_news247(soup)
            logger.info("Found %d articles from %s", len(articles), base_url)
            return articles
    raise ValueError(f"Provided URL is invalid: {url}")
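
# Note: a non-2xx response, or a URL whose base is not in VALID_SITES, raises
# ValueError; main() does not catch it, so one failing site aborts the run.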

# Flatten a list of per-site link lists into one flat list,
# e.g. flatten([["a"], ["b", "c"]]) -> ["a", "b", "c"]
def flatten(t: List[List[str]]) -> List[str]:
    return [item for sublist in t for item in sublist]

def file_exists(filepath: str) -> bool:
    if os.path.isfile(filepath) and os.stat(filepath, follow_symlinks=False).st_size:
        logger.info("Found non-empty file %s", os.path.abspath(filepath))
        return True
    logger.warning("File %s not found or empty", filepath)
    return False

def get_num_lines(filepath: str) -> int:
    with open(filepath, encoding="utf-8") as f:
        count = sum(1 for _ in f) - 1  # exclude the header row
    logger.info("Setting the starting index to %d", count)
    return count

# Write gathered links to a CSV file of (id, url) rows
def links_to_file(outfile: str, links: List[str], override: bool = False):
    fmode, start_idx = "w", 0
    if file_exists(outfile) and not override:
        logger.info("Appending to already existing file %s", outfile)
        fmode = "a"
        start_idx = get_num_lines(outfile)
    # newline="" avoids the blank rows the csv module otherwise emits on Windows
    with open(outfile, mode=fmode, encoding="utf-8", newline="") as out:
        writer = csv.writer(out)
        if fmode == "w":
            logger.info("Writing to new file %s", outfile)
            writer.writerow(("id", "url"))
        # when writing a new file start_idx is 0, so one writerows covers both modes
        writer.writerows(zip(range(start_idx, start_idx + len(links)), links))
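
# Resulting CSV layout (URLs are illustrative):
#   id,url
#   0,https://www.in.gr/politics/<article>
#   1,https://www.zougla.gr/politiki/<article>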

def main():
    logging.basicConfig(
        format="[%(levelname)s] %(asctime)s : %(message)s",
        datefmt="%d/%m/%Y %I:%M:%S %p",
    )
    assert len(sys.argv) == 3, "Usage: scraper.py <outfile.csv> <0/1>"
    base_urls = (
        "https://www.in.gr/politics/",
        "https://www.zougla.gr/politiki/main",
        "https://www.naftemporiki.gr/politics",
        "https://www.news247.gr/politiki/",
    )
    links = flatten([get_latest_from_url(url) for url in base_urls])
    # random.shuffle(links)  # optional: uncomment to randomise link order
    if int(sys.argv[2]) == 1:
        logger.warning("Discarding any existing contents of %s", sys.argv[1])
        links_to_file(sys.argv[1], links, override=True)
    else:
        links_to_file(sys.argv[1], links)
    logger.info("Done")


if __name__ == "__main__":
    main()
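
# Example invocations (assuming extract.VALID_SITES matches the base URLs above):
#   python scraper.py links.csv 0   # create links.csv, or append if it exists
#   python scraper.py links.csv 1   # overwrite links.csv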