-
Notifications
You must be signed in to change notification settings - Fork 1
/
article_sents.py
58 lines (54 loc) · 2.02 KB
/
article_sents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import click
import datetime
from scrapers import webmd_article_scraper
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
write_time = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
@click.command(context_settings=CONTEXT_SETTINGS)
@click.option(
'-c', '--chromedriver-path', 'chromedriver_path',
default = None,
required = True,
help = "Used to specify the path to your Chromedriver binary."
)
@click.option(
'-i', '--input-articles-file', 'input_articles_file',
default = None,
required = True,
help = (
f"Used to specify the path to the newline-delimited .txt file containing the "
f"links to the articles that this scraper should evaluated. The simplest "
f"approach is to specify a file that was created by the Search Results Scraper."
)
)
@click.option(
'-o', '--output-file', 'output_file',
default = 'data/test/webmd_sentences/article_sents_' + write_time + '.csv',
show_default = True,
help = (
f"Used to specify the path to the CSV file where the results will be stored. "
f"If the directory doesn't exist, it will be created. If an output file "
f"containing some results is specified, this program will skip links that "
f"have already been evaluated. This is a good technique to use if you need "
f"to resume scraping after quitting the scraper before it finishes."
)
)
@click.option(
'-f', '--filter-results', 'should_filter_results',
default = False,
show_default = True,
is_flag=True,
help = (
f"Used to specify that the sentences collected from WebMD articles should be "
f"filtered to remove potentially irrelevant sentences based on the logic in "
f"utils/filter_results.py."
)
)
def main(chromedriver_path, input_articles_file, output_file, should_filter_results):
webmd_article_scraper.scrape_sents(
input_articles_file,
output_file,
chromedriver_path,
should_filter_results
)
if __name__ == '__main__':
main()