Merge pull request #3 from akb89/develop
Adding note in README on how to avoid request rejection from Wikimedia servers
akb89 authored Nov 16, 2018
2 parents a5c6573 + 0aa1e9a commit a4fada1
Showing 3 changed files with 7 additions and 6 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -46,6 +46,8 @@
 The `--date` parameter expects a string corresponding to one of the dates
 found under the Wikimedia dump site corresponding to a given Wikipedia dump
 (e.g. https://dumps.wikimedia.org/enwiki/ for the English Wikipedia).
 
+**Important** Keep num-threads <= 3 to avoid rejection from Wikimedia servers
+
 ### Extract
 To extract the content of the downloaded .bz2 archives, do:
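For context, a download invocation that respects this cap could look like the line below. This is a hypothetical sketch: only the `--date` parameter is documented in the hunk above, and the `download` subcommand plus the `--lang`, `--output` and `--num-threads` flag spellings are assumptions inferred from the surrounding README text, not confirmed by this diff.

    witokit download --lang en --date 20181101 --output /tmp/wiki --num-threads 3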
2 changes: 1 addition & 1 deletion setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.1.2',
+    version='0.1.4',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
9 changes: 4 additions & 5 deletions witokit/main.py
@@ -4,7 +4,6 @@
 """
 
 import os
-import sys
 import argparse
 import multiprocessing
 import urllib

@@ -38,9 +37,9 @@ def _download_href(output_dirpath, wiki_dump_url, href):
                             href)
     try:
         urllib.request.urlretrieve(url, output_filepath)
-    except urllib.error.HTTPError:
+    except urllib.error.HTTPError as error:
         logger.error('Could not download archive from {}'.format(url))
-        sys.exit(1)
+        raise error
 
 
 def _parallel_download(wiki_arxiv_hrefs, wiki_dump_url, num_threads,

@@ -69,11 +68,11 @@ def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date):
             href = link.get('href')
             if re.match(pattern, href):
                 wiki_arxiv_hrefs.append(href)
-    except urllib.error.HTTPError:
+    except urllib.error.HTTPError as error:
         logger.error('HTTPError using lang = \'{}\' and date = \'{}\'. '
                      'Could not retrieve any Wikipedia data at URL = {}'
                      .format(lang, date, wiki_dump_url))
-        sys.exit(1)
+        raise error
     return wiki_arxiv_hrefs
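The substance of the main.py change is error propagation: instead of calling sys.exit(1) from inside a helper, both HTTPError handlers now log and re-raise, leaving the decision to retry, skip, or abort to the caller. Below is a minimal, self-contained sketch of the resulting pattern. The names fetch and download_all are hypothetical (the module's own helpers are _download_href and _parallel_download), and the sketch uses a thread pool rather than the module's multiprocessing for brevity; the cap of 3 workers mirrors the README note added in this commit.

import logging
import os
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger(__name__)


def fetch(output_dirpath, url):
    """Download one archive; log and re-raise on HTTP failure."""
    output_filepath = os.path.join(output_dirpath, url.rsplit('/', 1)[-1])
    try:
        urllib.request.urlretrieve(url, output_filepath)
    except urllib.error.HTTPError:
        # Log for diagnostics, then propagate to the caller instead of
        # killing the interpreter from inside a helper (the old
        # sys.exit(1) behavior this commit removes).
        logger.error('Could not download archive from %s', url)
        raise
    return output_filepath


def download_all(urls, output_dirpath, num_threads=3):
    """Fetch all archives over a small worker pool (capped at 3)."""
    with ThreadPoolExecutor(max_workers=min(num_threads, 3)) as pool:
        # list() forces evaluation, so any HTTPError raised in a worker
        # surfaces here in the caller rather than being silently dropped.
        return list(pool.map(lambda url: fetch(output_dirpath, url), urls))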
