Merge pull request #10 from akb89/develop
Develop
akb89 authored Sep 12, 2019
2 parents 74986e8 + 4f9ad0d commit d6af1eb
Showing 7 changed files with 141 additions and 70 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -10,3 +10,6 @@ data
*.7z
RELEASE.md
dist/
XP.md
scripts/
build/
19 changes: 15 additions & 4 deletions README.md
@@ -4,11 +4,11 @@
[![Build][travis-image]][travis-url]
[![MIT License][license-image]][license-url]

Welcome to `WiToKit`, a Python toolkit to download and generate
preprocessed Wikipedia dumps for NLP in a single .txt file, one
sentence per line.
Welcome to `WiToKit`, a Python toolkit to download and generate preprocessed Wikipedia dumps for all languages.

*Note: WiToKit currently only supports `xx-pages-articles.xml.xx.bz2` Wikipedia archives corresponding to articles, templates, media/file descriptions, and primary meta-pages. Also, the preprocessing is currently only supported for English. If you'd like support in other languages, please create an issue on Github.*
WiToKit can be used to convert a Wikipedia archive into a single .txt file, one (tokenized) sentence per line.

*Note: WiToKit currently only supports `xx-pages-articles.xml.xx.bz2` Wikipedia archives corresponding to articles, templates, media/file descriptions, and primary meta-pages.*

## Install

@@ -67,6 +67,17 @@ witokit process \
--num-threads num_cpu_threads
```

Preprocessing for all languages is performed with [Polyglot](https://github.com/aboSamoor/polyglot).
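For illustration, the sketch below shows what this preprocessing amounts to for a single piece of text: polyglot guesses the language, splits the text into sentences, and tokenizes each sentence. This is a minimal sketch, not part of the witokit CLI, and it assumes polyglot and its pyicu/pycld2 dependencies are installed; the sample string is made up.

```python
from polyglot.text import Text

# Language is guessed automatically, so the same code works for any language.
text = Text("WiToKit turns Wikipedia dumps into plain text. One sentence per line.")
for sent in text.sentences:
    print(' '.join(token.strip() for token in sent.words))
```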

### Sample
You can also use WiToKit to sample the content of a preprocessed .txt file, using:
```bash
witokit sample \
--input /abs/path/to/witokit/preprocessed/txt/file \
--percent percentage_to_keep \ # percentage of total lines to keep
--balance # if set, balance the sampling across the corpus; otherwise, keep only the top n sentences
```
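The sampling is deterministic (see `_sample` in `witokit/main.py` further down in this diff): with `--balance` it keeps roughly every k-th line, where k ≈ 100 / percent; without it, it simply keeps the top n lines. A minimal standalone sketch of that logic, with a hypothetical helper name:

```python
def sample_lines(lines, percent, balance):
    """Keep `percent`% of the lines, either spread evenly or from the top."""
    count = len(lines)
    final_count = count * percent / 100
    if balance:
        step = round(count / final_count)  # roughly 100 / percent
        return [line for idx, line in enumerate(lines) if idx % step == 0]
    return lines[:int(final_count)]
```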

[release-image]:https://img.shields.io/github/release/akb89/witokit.svg?style=flat-square
[release-url]:https://github.com/akb89/witokit/releases/latest
[pypi-image]:https://img.shields.io/pypi/v/witokit.svg?style=flat-square
12 changes: 6 additions & 6 deletions setup.py
@@ -16,7 +16,7 @@
author_email='akb@3azouz.net',
long_description=long_description,
long_description_content_type='text/markdown',
version='0.2.1',
version='1.0.0',
url='https://github.com/akb89/witokit',
download_url='https://pypi.org/project/witokit/#files',
license='MIT',
@@ -31,13 +31,13 @@
'witokit = witokit.main:main'
],
},
install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.3',
install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.4',
'natsort==5.4.1', 'beautifulsoup4==4.6.3',
'polyglot==16.7.4', 'numpy==1.16.0', 'pyicu==2.2',
'pycld2==0.31', 'morfessor==2.0.4'],
'polyglot==16.7.4', 'pyicu==2.3.1',
'pycld2==0.31', 'morfessor==2.0.4', 'tqdm==4.35.0'],
dependency_links=[
'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.3'],
classifiers=['Development Status :: 4 - Beta',
'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.4'],
classifiers=['Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Education',
4 changes: 4 additions & 0 deletions witokit/__init__.py
@@ -0,0 +1,4 @@
"""To export for toolkit use."""
from .main import tokenize

__all__ = ('tokenize',)
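Since `tokenize` is now exported at package level, the toolkit can be used as a library as well as a CLI. A minimal usage sketch (assuming witokit 1.0.0 and polyglot's dependencies are installed; the sample string is made up):

```python
from witokit import tokenize

raw = "WiToKit preprocesses Wikipedia dumps. It relies on polyglot for tokenization."
# Prints one tokenized sentence per line, lowercased here.
print(tokenize(raw, lowercase=True))
```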
152 changes: 100 additions & 52 deletions witokit/main.py
@@ -15,6 +15,7 @@
import logging.config
import pycld2

from tqdm import tqdm
from polyglot.text import Text
from bs4 import BeautifulSoup

@@ -30,10 +31,12 @@

logger = logging.getLogger(__name__)

__all__ = ('tokenize',)


def _download_href(output_dirpath, wiki_dump_url, href):
url = uutils.get_wiki_arxiv_url(wiki_dump_url, href)
logger.info('Downloading {}'.format(url))
logger.debug('Downloading {}'.format(url))
output_filepath = futils.get_download_output_filepath(output_dirpath,
href)
try:
@@ -49,13 +52,10 @@ def _parallel_download(wiki_arxiv_hrefs, wiki_dump_url, num_threads,
_download_href_to_output_dir = functools.partial(_download_href,
output_dirpath,
wiki_dump_url)
total_arxivs = len(wiki_arxiv_hrefs)
arxiv_num = 0
for _ in pool.imap_unordered(_download_href_to_output_dir,
wiki_arxiv_hrefs):
arxiv_num += 1
logger.info('Downloaded {}/{} archives'.format(arxiv_num,
total_arxivs))
for _ in tqdm(pool.imap_unordered(_download_href_to_output_dir,
wiki_arxiv_hrefs),
total=len(wiki_arxiv_hrefs)):
continue


def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date):
Expand All @@ -66,10 +66,20 @@ def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date):
html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')
for link in soup.find_all('a'):
pattern = uutils.get_wikipedia_pattern(lang, date)
pattern = uutils.get_wikipedia_multi_pattern(lang, date)
href = link.get('href')
if re.match(pattern, href):
wiki_arxiv_hrefs.append(href)
if not wiki_arxiv_hrefs:
logger.info('No multi arxivs found. Trying for single arxiv')
# If wikipedia arxiv is too small, check for single arxiv
for link in soup.find_all('a'):
pattern = uutils.get_wikipedia_single_pattern(lang, date)
href = link.get('href')
if re.match(pattern, href):
wiki_arxiv_hrefs.append(href)
if not wiki_arxiv_hrefs:
logger.warning('No wikipedia arxiv found')
except urllib.error.HTTPError as error:
logger.error('HTTPError using lang = \'{}\' and date = \'{}\'. '
'Could not retrieve any Wikipedia data at URL = {}'
@@ -102,12 +112,10 @@ def _extract(args):
logger.info('Extracting .bz2 files from {}'.format(args.bz2_input_dirpath))
bz2_arxivs = futils.get_bz2_arxivs(args.bz2_input_dirpath)
total_arxivs = len(bz2_arxivs)
arxiv_num = 0
with multiprocessing.Pool(args.num_threads) as pool:
for _ in pool.imap_unordered(_decompress_arxiv, bz2_arxivs):
arxiv_num += 1
logger.info('Extracted {}/{} archives'.format(arxiv_num,
total_arxivs))
for _ in tqdm(pool.imap_unordered(_decompress_arxiv, bz2_arxivs),
total=total_arxivs):
continue


def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
@@ -123,61 +131,96 @@ def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
output_txt_filepath)
with open(output_filepath, 'w', encoding='utf-8') as output_stream:
logger.info('Writing output to file {}'.format(output_filepath))
for json_object in wikiextractor.extract(input_xml_filepath):
for json_object in tqdm(wikiextractor.extract(input_xml_filepath)):
try:
text = Text(json_object['text']) # lang will be guessed
for sent in text.sentences:
if lowercase:
tokens = [token.lower().strip() for token in sent.words]
else:
tokens = [token.strip() for token in sent.words]
output_sent = ' '.join(tokens)
print(output_sent, file=output_stream)
print(tokenize(json_object['text'], lowercase),
file=output_stream)
except UnicodeEncodeError as err:
logger.error('UnicodeEncodeError processing '
'json_object[\'text\'] with spacy: {}'
'json_object[\'text\'] with polyglot: {}'
.format(str(err)))
except ValueError as err:
logger.warning('Skipping empty text sequence')
except pycld2.error as err:
logger.warning('{}. Skipping sequence'.format(str(err)))
return input_xml_filepath


def tokenize(raw_text, lowercase):
"""Tokenize raw_text with polyglot."""
output = []
try:
text = Text(raw_text)
for sent in text.sentences:
if lowercase:
tokens = [token.lower().strip() for token in sent.words]
else:
tokens = [token.strip() for token in sent.words]
output.append(' '.join(tokens))
except ValueError as err:
logger.debug('Skipping empty text sequence')
except pycld2.error as err:
logger.debug('{}. Skipping sequence'.format(str(err)))
return '\n'.join(output)


def _process(args):
logger.info('Processing content of wikipedia archives under {}'
.format(args.wiki_input_dirpath))
if args.lower:
logger.info('Lowercasing archives')
input_filepaths = futils.get_input_filepaths(args.wiki_input_dirpath)
total_arxivs = len(input_filepaths)
arxiv_num = 0
left = input_filepaths
with open(args.wiki_output_filepath, 'w', encoding='utf-8') as output_strm:
# with multiprocessing.Pool(processes=args.num_threads,
# maxtasksperchild=args.max_tasks) as pool:
# for wiki_input_filepath in input_filepaths:
# _preprocess(args.wiki_output_filepath, args.lower, wiki_input_filepath)
with multiprocessing.Pool(processes=args.num_threads) as pool:
preprocess = functools.partial(
_preprocess, args.wiki_output_filepath, args.lower)
for process in pool.imap_unordered(preprocess, input_filepaths):
arxiv_num += 1
logger.info('Done processing content of {}'.format(process))
logger.info('Completed processing of {}/{} archives'
.format(arxiv_num, total_arxivs))
left = [item for item in left if item != process]
logger.info('Left to process: {}'.format(left))
for _ in tqdm(pool.imap_unordered(preprocess, input_filepaths),
total=total_arxivs):
continue
# concatenate all .txt files into single output .txt file
logger.info('Concatenating tmp files...')
tmp_filepaths = futils.get_tmp_filepaths(args.wiki_output_filepath)
tmp_filepaths = futils.get_tmp_filepaths(args.wiki_input_dirpath)
for tmp_filepath in tmp_filepaths:
with open(tmp_filepath, 'r') as tmp_stream:
with open(tmp_filepath, 'r', encoding='utf-8') as tmp_stream:
for line in tmp_stream:
line = line.strip()
print(line, file=output_strm)
logger.info('Done processing content of Wikipedia archives')
shutil.rmtree(futils.get_tmp_dirpath(args.wiki_output_filepath))
shutil.rmtree(futils.get_tmp_dirpath(args.wiki_input_dirpath))


def _sample(args):
if not 0 < args.percent < 100:
raise Exception('Specified percent param should be in ]0, 100[')
logger.info('Sampling input file {}'.format(args.input_filepath))

logger.info('Counting number of lines in file...')
if args.input_filepath.endswith('.txt'):
input_basename = args.input_filepath.split('.txt')[0]
else:
input_basename = args.input_filepath
with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
count = sum(1 for x in input_stream)
logger.info('Total lines = {}'.format(count))
final_count = count * args.percent / 100
sampling = count / final_count
logger.info('Sampling file to {} lines with balance = {}'
.format(int(final_count), args.balance))
if args.balance:
output_filepath = '{}.sample{}.balanced.txt'.format(input_basename,
args.percent)
with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
with open(output_filepath, 'w', encoding='utf-8') as output_stream:
for idx, line in enumerate(input_stream):
if idx % round(sampling) == 0:
print(line.strip(), file=output_stream)
else:
output_filepath = '{}.sample{}.txt'.format(input_basename,
args.percent)
with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
with open(output_filepath, 'w', encoding='utf-8') as output_stream:
for idx, line in enumerate(input_stream):
if idx >= final_count:
break
print(line.strip(), file=output_stream)
logger.info('Done sampling file to {}'.format(output_filepath))


def main():
@@ -223,15 +266,20 @@ def main():
help='absolute path to output .txt file')
parser_process.add_argument('-l', '--lower', action='store_true',
help='whether or not to lowercase splits')
parser_process.add_argument('-m', '--max-len', type=int, default=10000000,
dest='max_length',
help='spacy .max_length option for string '
'processing')
parser_process.add_argument('-t', '--max-tasks', type=int, default=0,
help='max task per child for fine-grained '
'control over python multiprocessing '
'pool memory management')
parser_process.add_argument('-n', '--num-threads', type=int, default=1,
help='number of CPU threads to be used')
parser_sample = subparsers.add_parser(
'sample', formatter_class=argparse.ArgumentDefaultsHelpFormatter,
help='sample a given .txt file deterministically')
parser_sample.set_defaults(func=_sample)
parser_sample.add_argument('-i', '--input', required=True,
dest='input_filepath',
help='absolute path to .txt file to sample')
parser_sample.add_argument('-p', '--percent', required=True, type=float,
help='percentage of input file to keep')
parser_sample.add_argument('-b', '--balance', action='store_true',
help='whether or not to balance the sampling '
'within the corpus or to take the top '
'p%% of sentences')
args = parser.parse_args()
args.func(args)
10 changes: 5 additions & 5 deletions witokit/utils/files.py
@@ -20,14 +20,14 @@ def get_download_output_filepath(output_dirpath, href):
return os.path.join(output_dirpath, href)


def get_tmp_dirpath(output_txt_filepath):
def get_tmp_dirpath(input_xml_filepath):
"""Return absolute path to output_txt_dirpath/tmp/."""
return os.path.join(os.path.dirname(output_txt_filepath), 'tmp')
return os.path.join(os.path.dirname(input_xml_filepath), 'tmp')


def get_tmp_filepaths(output_txt_filepath):
def get_tmp_filepaths(xml_input_dirpath):
"""Return all .txt files under the output_txt_dirpath/tmp/ dir."""
tmp_dirpath = get_tmp_dirpath(output_txt_filepath)
tmp_dirpath = get_tmp_dirpath(xml_input_dirpath)
return natsort.natsorted([os.path.join(tmp_dirpath, filename) for filename
in os.listdir(tmp_dirpath)],
alg=natsort.ns.IGNORECASE)
@@ -38,7 +38,7 @@ def get_output_filepath(input_xml_filepath, output_txt_filepath):
Create the tmp dir if it does not exist.
"""
tmp_dirpath = get_tmp_dirpath(output_txt_filepath)
tmp_dirpath = get_tmp_dirpath(input_xml_filepath)
os.makedirs(tmp_dirpath, exist_ok=True)
output_filename = os.path.basename(input_xml_filepath)
output_txt_filepath = os.path.join(
11 changes: 8 additions & 3 deletions witokit/utils/urls.py
@@ -2,20 +2,25 @@

import witokit.utils.constants as const

__all__ = ('get_wikipedia_dump_url', 'get_wikipedia_pattern',
'get_wiki_arxiv_url')
__all__ = ('get_wikipedia_dump_url', 'get_wikipedia_multi_pattern',
'get_wiki_arxiv_url', 'get_wikipedia_single_pattern')


def get_wikipedia_dump_url(lang, date):
"""Return the Wikipedia download URL corresponding to the lang and data."""
return '{}/{}wiki/{}'.format(const.WIKI_DL_URL, lang, date)


def get_wikipedia_pattern(lang, date):
def get_wikipedia_multi_pattern(lang, date):
"""Return a regex pattern matching for wiki .bz2 files to be extracted."""
return r'({}wiki-{}-pages-articles[0-9]+.xml.*bz2$)'.format(lang, date)


def get_wikipedia_single_pattern(lang, date):
"""Return a regex pattern matching for wiki .bz2 files to be extracted."""
return r'({}wiki-{}-pages-articles+.xml.*bz2$)'.format(lang, date)


def get_wiki_arxiv_url(wiki_dump_url, href):
"""Return a full URL from the href of a .bz2 archive."""
return '{}/{}'.format(wiki_dump_url, href)
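
To make the two patterns concrete: the multi pattern matches the numbered parts of a split dump, while the new single pattern matches dumps that ship as one archive (typically small wikis). A quick check with illustrative archive names (the filenames are examples, not taken from this commit):

```python
import re

from witokit.utils.urls import (get_wikipedia_multi_pattern,
                                get_wikipedia_single_pattern)

multi = get_wikipedia_multi_pattern('en', '20190912')    # split dump, many parts
single = get_wikipedia_single_pattern('kk', '20190912')  # one-archive dump
print(bool(re.match(multi, 'enwiki-20190912-pages-articles1.xml-p10p30302.bz2')))  # True
print(bool(re.match(single, 'kkwiki-20190912-pages-articles.xml.bz2')))            # True
```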
