-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.py
41 lines (36 loc) · 1.25 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from urllib import urlopen
from bs4 import BeautifulSoup
from textblob import TextBlob
from re import findall
def check_spaces(sentence, tolerance=2):
    """
    Heuristic that filters out scraped menu items and other site structure
    contents by looking at the length of each whitespace run in the text.
    Original code https://stackoverflow.com/a/34460986/863923

    sentence (str) - Sentence to inspect
    tolerance (int) - (Optional) Longest run of consecutive whitespace
                      characters still considered part of a normal sentence
    return (bool) - True if no whitespace run in the sentence is longer than
                    tolerance (also True when there is no whitespace at all),
                    else False
    """
    # Raw string: a plain '\s' is an invalid escape sequence and triggers a
    # warning on modern Python versions.
    whitespace_runs = findall(r'\s+', sentence)
    return all(len(run) <= tolerance for run in whitespace_runs)
""" Original source https://stackoverflow.com/a/1983219/863923 """
def get_body(url):
    """
    Returns an article's full text

    The page is fetched and parsed with BeautifulSoup (lxml parser), the text
    of its <body> is split into sentences with TextBlob, and only sentences
    that pass check_spaces() and end with a period are kept.

    url (str) - URL of article's full text
    return (str) - Kept sentences, each followed by a single space; empty
                   string if the page could not be read or parsed

    Errors raised while opening the URL are not caught and propagate to the
    caller.
    """
    html = urlopen(url)
    try:
        bs = BeautifulSoup(html.read(), 'lxml')
        texts = bs.html.body.text
    except Exception:
        # Best effort: an unreadable response or a page without <html>/<body>
        # yields no text. KeyboardInterrupt/SystemExit are left alone.
        return ''
    finally:
        # Always release the connection, whether or not parsing succeeded.
        html.close()

    kept = []
    for section in TextBlob(texts).sentences:
        section = str(section)
        # On Python 2 str() gives a UTF-8 byte string that must be decoded;
        # on Python 3 it is already text and has no .decode().
        if isinstance(section, bytes):
            section = section.decode('utf-8')
        section = section.strip()
        if not section:
            continue
        if check_spaces(section) and section.endswith('.'):
            kept.append(section)

    # Same output format as before: every kept sentence is followed by a space.
    return ''.join(sentence + ' ' for sentence in kept)