-
Notifications
You must be signed in to change notification settings - Fork 0
/
util_html.py
94 lines (67 loc) · 2.99 KB
/
util_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import requests
import re
from bs4 import BeautifulSoup
def url_to_string(url):
    """
    Download a web page and return its text.

    The page at *url* (a URL string) is fetched and parsed by
    ``url_to_html``; the parsed document is then reduced to plain text
    by ``html_to_string``, whose result is returned.
    """
    return html_to_string(url_to_html(url))
def html_to_string(parser_content):
    """Return the text of a parsed HTML document with newlines/tabs flattened.

    Every <script>, <style> and <aside> element is first detached from
    *parser_content* (the object passed in is modified), so their contents
    do not end up in the result.
    """
    # Drop the elements whose content is not part of the readable text.
    for unwanted in parser_content.find_all(["script", "style", "aside"]):
        unwanted.extract()

    # Each run of newline/tab characters becomes a single space.
    raw_text = parser_content.get_text()
    return re.sub(r'[\n\t]+', ' ', raw_text)
def url_to_html(url):
    """Scrape the html content from a web page.

    Args:
        url: Address of the page to download, as a string.

    Returns:
        The response text parsed into a BeautifulSoup object using the
        'html5lib' parser.
    """
    # Download the page exactly once. The previous version first issued a
    # header-less request whose response was discarded immediately.
    # The User-Agent header was added by Etienne to fix an error in 1.4.
    res = requests.get(url, headers={"User-Agent": "XY"})
    html = res.text
    return BeautifulSoup(html, 'html5lib')
# We are looking for the author information at places where it can often be found.
# If we do not find it, it does not mean that it is not there.
def parse_author(html_content):
    """Look for the author of a page in the places where it is often stored.

    The metadata is checked first: a <meta> tag whose ``name`` or
    ``property`` attribute contains "author" (case-insensitive). If that
    yields no name, the text of an element with ``itemprop="author"`` or
    ``class="byline"`` is used instead.

    Args:
        html_content: Parsed page (e.g. the BeautifulSoup object returned
            by ``url_to_html``) offering ``find``.

    Returns:
        The author name with whitespace normalised and a leading "by"
        removed, or "" if nothing was found. An empty result does not
        mean the page names no author.
    """
    author_pattern = re.compile('author', re.IGNORECASE)
    name = ""

    # The author information might be encoded as a value of the attribute
    # name, or as a property.
    meta_by_name = html_content.find('meta', attrs={'name': author_pattern})
    meta_by_property = html_content.find('meta', property=author_pattern)
    meta_tag = meta_by_name or meta_by_property
    if meta_tag:
        # .get() rather than ['content']: a matching <meta> tag without a
        # content attribute must not raise KeyError.
        name = meta_tag.get('content') or ""

    # If the author name cannot be found in the metadata, we might find it
    # as an attribute of the text.
    if not name.strip():
        itemprop = html_content.find(attrs={'itemprop': 'author'})
        byline = html_content.find(attrs={'class': 'byline'})
        text_tag = itemprop or byline
        if text_tag:
            name = text_tag.text

    # Collapse line breaks and repeated blanks into single spaces so words
    # split across lines are not glued together.
    name = re.sub(r'\s+', ' ', name).strip()

    # Only strip "by" as a leading word; removing "by " everywhere would
    # damage names such as "Toby Smith".
    return re.sub(r'^by\s+', '', name, flags=re.IGNORECASE)
#This function requires the HTML content of the result as an input parameter
#It returns the actual text content
def parse_news_text(html_content):
    """Extract the running text of a news article from a parsed page.

    The article body is taken from the first <article> element. If there is
    none, the elements whose class contains "body", "article" or "main"
    (case-insensitive) are considered and the one holding the most <p>
    elements is used.

    Args:
        html_content: Parsed page (e.g. the BeautifulSoup object returned
            by ``url_to_html``) offering ``find`` and ``find_all``.

    Returns:
        The text of all paragraphs in the article body that contain at
        least one of ``. , ! ?``, joined by spaces, with every run of
        whitespace collapsed to one space. "" if no article body is found.
    """
    # Try to find the article body by semantic tag.
    article = html_content.find('article')

    # Otherwise, try to find it by class name.
    if not article:
        candidates = html_content.find_all(
            class_=re.compile('(body|article|main)', re.IGNORECASE)
        )
        if candidates:
            # max() yields the first candidate with the highest paragraph
            # count - the same element a stable descending sort puts first.
            article = max(candidates, key=lambda tag: len(tag.find_all('p')))

    if not article:
        return ""

    # Keep only paragraphs that contain punctuation; this skips fragments
    # without any sentence punctuation.
    paragraph_texts = (tag.text for tag in article.find_all('p'))
    kept = [
        paragraph for paragraph in paragraph_texts
        if re.search("[.,!?]", paragraph)
    ]
    return re.sub(r"\s+", " ", " ".join(kept))