From d52901c73243dc1f42e48c14b1de6b51bccd99c6 Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 11:39:36 +0100
Subject: [PATCH 01/14] Added sampling

---
 setup.py        |  2 +-
 witokit/main.py | 36 ++++++++++++++++++++++++++++++++----
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 4fe5d24..eceba7c 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.2.1',
+    version='0.3.0',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
diff --git a/witokit/main.py b/witokit/main.py
index 27433db..9757a30 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -154,10 +154,6 @@ def _process(args):
     arxiv_num = 0
     left = input_filepaths
     with open(args.wiki_output_filepath, 'w', encoding='utf-8') as output_strm:
-        # with multiprocessing.Pool(processes=args.num_threads,
-        #                           maxtasksperchild=args.max_tasks) as pool:
-        #     for wiki_input_filepath in input_filepaths:
-        #         _preprocess(args.wiki_output_filepath, args.lower, wiki_input_filepath)
        with multiprocessing.Pool(processes=args.num_threads) as pool:
             preprocess = functools.partial(
                 _preprocess, args.wiki_output_filepath, args.lower)
@@ -180,6 +176,29 @@
     shutil.rmtree(futils.get_tmp_dirpath(args.wiki_output_filepath))
 
 
+def _sample(args):
+    if args.percent > 100:
+        raise Exception('Specified percent arg should be a percentage < 100')
+    logger.info('Sampling input file {}'.format(args.input_filepath))
+    output_filepath = '{}.sample{}'.format(args.input_filepath, args.percent)
+    logger.info('Counting number of lines in file...')
+    count = 0
+    with open(args.input_filepath, 'r') as input_stream:
+        for line in input_stream:
+            count += 1
+    logger.info('Total lines = {}'.format(count))
+    final_count = count * args.percent / 100
+    sampling = count / final_count
+    logger.info('Sampling file to {} lines with sampling rate = {}'
+                .format(final_count, sampling))
+    with open(args.input_filepath, 'r') as input_stream:
+        with open(output_filepath, 'w', encoding='utf-8') as output_stream:
+            for idx, line in enumerate(input_stream):
+                if idx % sampling == 0:
+                    print(line.strip(), file=output_stream)
+    logger.info('Done sampling file')
+
+
 def main():
     """Launch WiToKit."""
     parser = argparse.ArgumentParser(prog='witokit')
@@ -233,5 +252,14 @@ def main():
                                 'pool memory management')
     parser_process.add_argument('-n', '--num-threads', type=int, default=1,
                                 help='number of CPU threads to be used')
+    parser_sample = subparsers.add_parser(
+        'sample', formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        help='sample a given .txt file deterministically')
+    parser_sample.set_defaults(func=_sample)
+    parser_sample.add_argument('-i', '--input', required=True,
+                               dest='input_filepath',
+                               help='absolute path to .txt file to sample')
+    parser_sample.add_argument('-p', '--percent', required=True, type=float,
+                               help='percentage of input file to keep')
     args = parser.parse_args()
     args.func(args)
From 9432da677748d4307f3b18081cb1daae94f79d49 Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 13:05:36 +0100
Subject: [PATCH 02/14] Fixed wikiextractor encoding bug

---
 setup.py        | 6 +++---
 witokit/main.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index eceba7c..1303d90 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.3.0',
+    version='0.3.1',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
@@ -31,12 +31,12 @@
             'witokit = witokit.main:main'
         ],
     },
-    install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.3',
+    install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.4',
                       'natsort==5.4.1', 'beautifulsoup4==4.6.3',
                       'polyglot==16.7.4', 'numpy==1.16.0', 'pyicu==2.2',
                       'pycld2==0.31', 'morfessor==2.0.4'],
     dependency_links=[
-        'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.3'],
+        'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.4'],
     classifiers=['Development Status :: 4 - Beta',
                  'Environment :: Console',
                  'Intended Audience :: Developers',
diff --git a/witokit/main.py b/witokit/main.py
index 9757a30..8a9d223 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -168,7 +168,7 @@ def _process(args):
         logger.info('Concatenating tmp files...')
         tmp_filepaths = futils.get_tmp_filepaths(args.wiki_output_filepath)
         for tmp_filepath in tmp_filepaths:
-            with open(tmp_filepath, 'r') as tmp_stream:
+            with open(tmp_filepath, 'r', encoding='utf-8') as tmp_stream:
                 for line in tmp_stream:
                     line = line.strip()
                     print(line, file=output_strm)
@@ -183,7 +183,7 @@ def _sample(args):
     output_filepath = '{}.sample{}'.format(args.input_filepath, args.percent)
     logger.info('Counting number of lines in file...')
     count = 0
-    with open(args.input_filepath, 'r') as input_stream:
+    with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
         for line in input_stream:
             count += 1
     logger.info('Total lines = {}'.format(count))
@@ -191,7 +191,7 @@
     sampling = count / final_count
     logger.info('Sampling file to {} lines with sampling rate = {}'
                 .format(final_count, sampling))
-    with open(args.input_filepath, 'r') as input_stream:
+    with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
         with open(output_filepath, 'w', encoding='utf-8') as output_stream:
             for idx, line in enumerate(input_stream):
                 if idx % sampling == 0:
From bf42ce5743fb0fafa42a322e0dc5269e1aeaeb2e Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 13:16:54 +0100
Subject: [PATCH 03/14] updated logging

---
 setup.py        | 2 +-
 witokit/main.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 1303d90..34ed655 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.3.1',
+    version='0.3.2',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
diff --git a/witokit/main.py b/witokit/main.py
index 8a9d223..0e8f615 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -138,9 +138,9 @@ def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
                          'json_object[\'text\'] with spacy: {}'
                          .format(str(err)))
         except ValueError as err:
-            logger.warning('Skipping empty text sequence')
+            logger.debug('Skipping empty text sequence')
         except pycld2.error as err:
-            logger.warning('{}. Skipping sequence'.format(str(err)))
+            logger.debug('{}. Skipping sequence'.format(str(err)))
     return input_xml_filepath
 
 
From 895494f0976e8cd91ae2462af17c7b43531dd711 Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 13:26:49 +0100
Subject: [PATCH 04/14] updated tmp filepath management

---
 setup.py               |  2 +-
 witokit/main.py        |  2 +-
 witokit/utils/files.py | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 34ed655..2e33a24 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.3.2',
+    version='0.3.3',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
diff --git a/witokit/main.py b/witokit/main.py
index 0e8f615..18e0116 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -166,7 +166,7 @@ def _process(args):
                 logger.info('Left to process: {}'.format(left))
         # concatenate all .txt files into single output .txt file
         logger.info('Concatenating tmp files...')
-        tmp_filepaths = futils.get_tmp_filepaths(args.wiki_output_filepath)
+        tmp_filepaths = futils.get_tmp_filepaths(args.wiki_input_dirpath)
         for tmp_filepath in tmp_filepaths:
             with open(tmp_filepath, 'r', encoding='utf-8') as tmp_stream:
                 for line in tmp_stream:
diff --git a/witokit/utils/files.py b/witokit/utils/files.py
index 04200c5..ccffc94 100644
--- a/witokit/utils/files.py
+++ b/witokit/utils/files.py
@@ -20,14 +20,14 @@ def get_download_output_filepath(output_dirpath, href):
     return os.path.join(output_dirpath, href)
 
 
-def get_tmp_dirpath(output_txt_filepath):
+def get_tmp_dirpath(input_xml_filepath):
     """Return absolute path to output_txt_dirpath/tmp/."""
-    return os.path.join(os.path.dirname(output_txt_filepath), 'tmp')
+    return os.path.join(os.path.dirname(input_xml_filepath), 'tmp')
 
 
-def get_tmp_filepaths(output_txt_filepath):
+def get_tmp_filepaths(xml_input_dirpath):
     """Return all .txt files under the output_txt_dirpath/tmp/ dir."""
-    tmp_dirpath = get_tmp_dirpath(output_txt_filepath)
+    tmp_dirpath = os.path.join(xml_input_dirpath, 'tmp')
     return natsort.natsorted([os.path.join(tmp_dirpath, filename)
                               for filename in os.listdir(tmp_dirpath)],
                              alg=natsort.ns.IGNORECASE)
@@ -38,7 +38,7 @@ def get_output_filepath(input_xml_filepath, output_txt_filepath):
 
     Create tmp dir if not exists.
""" - tmp_dirpath = get_tmp_dirpath(output_txt_filepath) + tmp_dirpath = get_tmp_dirpath(input_xml_filepath) os.makedirs(tmp_dirpath, exist_ok=True) output_filename = os.path.basename(input_xml_filepath) output_txt_filepath = os.path.join( From d696b39d0100775a8f04f53882573b2c24318577 Mon Sep 17 00:00:00 2001 From: AKB Date: Mon, 28 Jan 2019 13:30:32 +0100 Subject: [PATCH 05/14] Fixed wrong filepath --- setup.py | 2 +- witokit/main.py | 2 +- witokit/utils/files.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 2e33a24..c2f1843 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='0.3.3', + version='0.3.4', url='https://github.com/akb89/witokit', download_url='https://pypi.org/project/witokit/#files', license='MIT', diff --git a/witokit/main.py b/witokit/main.py index 18e0116..6421ffc 100644 --- a/witokit/main.py +++ b/witokit/main.py @@ -173,7 +173,7 @@ def _process(args): line = line.strip() print(line, file=output_strm) logger.info('Done processing content of Wikipedia archives') - shutil.rmtree(futils.get_tmp_dirpath(args.wiki_output_filepath)) + shutil.rmtree(futils.get_tmp_dirpath(args.wiki_input_dirpath)) def _sample(args): diff --git a/witokit/utils/files.py b/witokit/utils/files.py index ccffc94..bfb9e6e 100644 --- a/witokit/utils/files.py +++ b/witokit/utils/files.py @@ -27,7 +27,7 @@ def get_tmp_dirpath(input_xml_filepath): def get_tmp_filepaths(xml_input_dirpath): """Return all .txt files under the output_txt_dirpath/tmp/ dir.""" - tmp_dirpath = os.path.join(xml_input_dirpath, 'tmp') + tmp_dirpath = get_tmp_dirpath(xml_input_dirpath) return natsort.natsorted([os.path.join(tmp_dirpath, filename) for filename in os.listdir(tmp_dirpath)], alg=natsort.ns.IGNORECASE) From 4bf764c88ebb274c114c7508609bce02c83b2362 Mon Sep 17 00:00:00 2001 From: AKB Date: Mon, 28 Jan 2019 14:35:39 +0100 Subject: [PATCH 06/14] Added support for downloading small wikipedia arxivs --- setup.py | 2 +- witokit/main.py | 12 +++++++++++- witokit/utils/urls.py | 11 ++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index c2f1843..7dbff70 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='0.3.4', + version='0.4.0', url='https://github.com/akb89/witokit', download_url='https://pypi.org/project/witokit/#files', license='MIT', diff --git a/witokit/main.py b/witokit/main.py index 6421ffc..d7d6f86 100644 --- a/witokit/main.py +++ b/witokit/main.py @@ -66,10 +66,20 @@ def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date): html_doc = response.read() soup = BeautifulSoup(html_doc, 'html.parser') for link in soup.find_all('a'): - pattern = uutils.get_wikipedia_pattern(lang, date) + pattern = uutils.get_wikipedia_multi_pattern(lang, date) href = link.get('href') if re.match(pattern, href): wiki_arxiv_hrefs.append(href) + if not wiki_arxiv_hrefs: + logger.info('No multi arxivs found. 
Trying for single arxiv') + # If wikipedia arxiv is too small, check for single arxiv + for link in soup.find_all('a'): + pattern = uutils.get_wikipedia_single_pattern(lang, date) + href = link.get('href') + if re.match(pattern, href): + wiki_arxiv_hrefs.append(href) + if not wiki_arxiv_hrefs: + logger.warning('No wikipedia arxiv found') except urllib.error.HTTPError as error: logger.error('HTTPError using lang = \'{}\' and date = \'{}\'. ' 'Could not retrieve any Wikipedia data at URL = {}' diff --git a/witokit/utils/urls.py b/witokit/utils/urls.py index ed3a3b2..62d9a14 100644 --- a/witokit/utils/urls.py +++ b/witokit/utils/urls.py @@ -2,8 +2,8 @@ import witokit.utils.constants as const -__all__ = ('get_wikipedia_dump_url', 'get_wikipedia_pattern', - 'get_wiki_arxiv_url') +__all__ = ('get_wikipedia_dump_url', 'get_wikipedia_multi_pattern', + 'get_wiki_arxiv_url', 'get_wikipedia_single_pattern') def get_wikipedia_dump_url(lang, date): @@ -11,11 +11,16 @@ def get_wikipedia_dump_url(lang, date): return '{}/{}wiki/{}'.format(const.WIKI_DL_URL, lang, date) -def get_wikipedia_pattern(lang, date): +def get_wikipedia_multi_pattern(lang, date): """Return a regex pattern matching for wiki .bz2 files to be extracted.""" return r'({}wiki-{}-pages-articles[0-9]+.xml.*bz2$)'.format(lang, date) +def get_wikipedia_single_pattern(lang, date): + """Return a regex pattern matching for wiki .bz2 files to be extracted.""" + return r'({}wiki-{}-pages-articles+.xml.*bz2$)'.format(lang, date) + + def get_wiki_arxiv_url(wiki_dump_url, href): """Return a full URL from the href of a .bz2 archive.""" return '{}/{}'.format(wiki_dump_url, href) From 96dfea204205137e826ee8d863c0a5a9ba578371 Mon Sep 17 00:00:00 2001 From: AKB Date: Wed, 30 Jan 2019 11:26:15 +0100 Subject: [PATCH 07/14] added tokenize function --- setup.py | 2 +- witokit/__init__.py | 3 +++ witokit/main.py | 34 ++++++++++++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/setup.py b/setup.py index 7dbff70..db0a5de 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='0.4.0', + version='0.5.0', url='https://github.com/akb89/witokit', download_url='https://pypi.org/project/witokit/#files', license='MIT', diff --git a/witokit/__init__.py b/witokit/__init__.py index e69de29..d211f46 100644 --- a/witokit/__init__.py +++ b/witokit/__init__.py @@ -0,0 +1,3 @@ +from .main import tokenize + +__all__ = ('tokenize') diff --git a/witokit/main.py b/witokit/main.py index d7d6f86..fbdacab 100644 --- a/witokit/main.py +++ b/witokit/main.py @@ -30,6 +30,8 @@ logger = logging.getLogger(__name__) +__all__ = ('tokenize') + def _download_href(output_dirpath, wiki_dump_url, href): url = uutils.get_wiki_arxiv_url(wiki_dump_url, href) @@ -135,25 +137,33 @@ def _preprocess(output_txt_filepath, lowercase, input_xml_filepath): logger.info('Writing output to file {}'.format(output_filepath)) for json_object in wikiextractor.extract(input_xml_filepath): try: - text = Text(json_object['text']) # lang will be guessed - for sent in text.sentences: - if lowercase: - tokens = [token.lower().strip() for token in sent.words] - else: - tokens = [token.strip() for token in sent.words] - output_sent = ' '.join(tokens) - print(output_sent, file=output_stream) + print(tokenize(json_object['text'], lowercase), + file=output_stream) except UnicodeEncodeError as err: logger.error('UnicodeEncodeError processing ' 'json_object[\'text\'] 
with spacy: {}' .format(str(err))) - except ValueError as err: - logger.debug('Skipping empty text sequence') - except pycld2.error as err: - logger.debug('{}. Skipping sequence'.format(str(err))) return input_xml_filepath +def tokenize(raw_text, lowercase): + """Tokenize raw_text with polyglot.""" + output = [] + try: + text = Text(raw_text) + for sent in text.sentences: + if lowercase: + tokens = [token.lower().strip() for token in sent.words] + else: + tokens = [token.strip() for token in sent.words] + output.append(' '.join(tokens)) + except ValueError as err: + logger.debug('Skipping empty text sequence') + except pycld2.error as err: + logger.debug('{}. Skipping sequence'.format(str(err))) + return '\n'.join(output) + + def _process(args): logger.info('Processing content of wikipedia archives under {}' .format(args.wiki_input_dirpath)) From 1b0e0fd08e9ca5f49ea549f5f63fa3db9511db23 Mon Sep 17 00:00:00 2001 From: AKB Date: Wed, 30 Jan 2019 11:26:26 +0100 Subject: [PATCH 08/14] added batch-tokenize script --- scripts/batch-tokenize.py | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 scripts/batch-tokenize.py diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py new file mode 100644 index 0000000..aea1494 --- /dev/null +++ b/scripts/batch-tokenize.py @@ -0,0 +1,44 @@ +"""Tokenize a batch of .txt files with polyglot.""" + +import os +import functools +import multiprocessing + +import witokit + + +def _process(lowercase, output_dirpath, input_filepath): + output_filepath = os.path.join( + output_dirpath, + '{}.tkz.txt'.format(os.path.basename(input_filepath).split('.txt')[0])) + input = [] + with open(input_filepath, 'r', encoding='utf-8') as input_stream: + for line in input_stream: + line = line.strip() + input.append(line) + with open(output_filepath, 'w', encoding='utf-8') as output_stream: + tokenized_txt = witokit.tokenize(' '.join(input), lowercase) + print(tokenized_txt, file=output_stream) + return input_filepath + + +if __name__ == '__main__': + BATCH_DIRPATH = '/Users/akb/Github/witokit/data/' + OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/' + LOWERCASE = True + NUM_THREADS = 1 + + assert os.path.exists(BATCH_DIRPATH) + assert os.path.exists(OUTPUT_DIRPATH) + + txt_filepaths = [os.path.join(BATCH_DIRPATH, filename) for filename in + os.listdir(BATCH_DIRPATH) if not filename.startswith('.')] + + file_num = 0 + with multiprocessing.Pool(NUM_THREADS) as pool: + file_num += 1 + process = functools.partial(_process, LOWERCASE, OUTPUT_DIRPATH) + for filepath in pool.imap_unordered(process, txt_filepaths): + print('Done processing file {}'.format(filepath)) + print('Completed processing of {}/{} files' + .format(file_num, len(txt_filepaths))) From 219d67d18ab3c5085996b4d09cb41e6e01686eaf Mon Sep 17 00:00:00 2001 From: AKB Date: Wed, 30 Jan 2019 11:28:34 +0100 Subject: [PATCH 09/14] updated script --- scripts/batch-tokenize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py index aea1494..6f9176e 100644 --- a/scripts/batch-tokenize.py +++ b/scripts/batch-tokenize.py @@ -23,10 +23,10 @@ def _process(lowercase, output_dirpath, input_filepath): if __name__ == '__main__': - BATCH_DIRPATH = '/Users/akb/Github/witokit/data/' - OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/' + BATCH_DIRPATH = '/home/kabbach/witokit/data/ud23/raw/' + OUTPUT_DIRPATH = '/home/kabbach/witokit/data/ud23/tokenized/' LOWERCASE = True - NUM_THREADS = 1 + NUM_THREADS = 38 
 
     assert os.path.exists(BATCH_DIRPATH)
     assert os.path.exists(OUTPUT_DIRPATH)
From 41ca02ec29dad95ee6deb2504ec5e61e7ec7ba02 Mon Sep 17 00:00:00 2001
From: AKB
Date: Wed, 30 Jan 2019 11:30:11 +0100
Subject: [PATCH 10/14] bugfix on counter

---
 scripts/batch-tokenize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py
index 6f9176e..72bfe57 100644
--- a/scripts/batch-tokenize.py
+++ b/scripts/batch-tokenize.py
@@ -36,9 +36,9 @@ def _process(lowercase, output_dirpath, input_filepath):
 
     file_num = 0
     with multiprocessing.Pool(NUM_THREADS) as pool:
-        file_num += 1
         process = functools.partial(_process, LOWERCASE, OUTPUT_DIRPATH)
         for filepath in pool.imap_unordered(process, txt_filepaths):
+            file_num += 1
             print('Done processing file {}'.format(filepath))
             print('Completed processing of {}/{} files'
                   .format(file_num, len(txt_filepaths)))
From 5a2045cdbd75fb7455b88e459301789c0a7009c0 Mon Sep 17 00:00:00 2001
From: AKB
Date: Wed, 13 Feb 2019 14:13:17 +0100
Subject: [PATCH 11/14] Updated sampling script to add balance param

---
 witokit/main.py | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/witokit/main.py b/witokit/main.py
index fbdacab..c70b6bf 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -197,24 +197,40 @@
 
 
 def _sample(args):
-    if args.percent > 100:
-        raise Exception('Specified percent arg should be a percentage < 100')
+    if not (0 < args.percent < 100):
+        raise Exception('Specified percent param should be in ]0, 100[')
     logger.info('Sampling input file {}'.format(args.input_filepath))
-    output_filepath = '{}.sample{}'.format(args.input_filepath, args.percent)
+
     logger.info('Counting number of lines in file...')
     count = 0
+    if args.input_filepath.endswith('.txt'):
+        input_basename = args.input_filepath.split('.txt')[0]
+    else:
+        input_basename = args.input_filepath
     with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
         for line in input_stream:
             count += 1
     logger.info('Total lines = {}'.format(count))
     final_count = count * args.percent / 100
     sampling = count / final_count
-    logger.info('Sampling file to {} lines with sampling rate = {}'
-                .format(final_count, sampling))
-    with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
-        with open(output_filepath, 'w', encoding='utf-8') as output_stream:
-            for idx, line in enumerate(input_stream):
-                if idx % sampling == 0:
+    logger.info('Sampling file to {} lines'.format(final_count))
+    print(round(sampling))
+    if args.balance:
+        output_filepath = '{}.sample{}.balanced.txt'.format(input_basename,
+                                                            args.percent)
+        with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
+            with open(output_filepath, 'w', encoding='utf-8') as output_stream:
+                for idx, line in enumerate(input_stream):
+                    if idx % round(sampling) == 0:
+                        print(line.strip(), file=output_stream)
+    else:
+        output_filepath = '{}.sample{}.txt'.format(input_basename,
+                                                   args.percent)
+        with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
+            with open(output_filepath, 'w', encoding='utf-8') as output_stream:
+                for idx, line in enumerate(input_stream):
+                    if idx >= final_count:
+                        break
                     print(line.strip(), file=output_stream)
     logger.info('Done sampling file')
 
@@ -281,5 +297,9 @@ def main():
                                help='absolute path to .txt file to sample')
     parser_sample.add_argument('-p', '--percent', required=True, type=float,
                                help='percentage of input file to keep')
+    parser_sample.add_argument('-b', '--balance', action='store_true',
+                               help='whether or not to balance the sampling'
+                                    'within the corpus or to take the top'
+                                    'p% sentences')
     args = parser.parse_args()
     args.func(args)
From c9122d4d6ada99ab25eecab85d4dad8402efa408 Mon Sep 17 00:00:00 2001
From: AKB
Date: Thu, 12 Sep 2019 14:03:58 +0200
Subject: [PATCH 12/14] v1.0

---
 .gitignore                |  3 ++
 README.md                 | 19 ++++++++++---
 scripts/batch-tokenize.py | 13 ++++-----
 setup.py                  |  8 +++---
 witokit/__init__.py       |  1 +
 witokit/main.py           | 60 +++++++++++++--------------------------
 6 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3f3a061..430e6ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,6 @@ data
 *.7z
 RELEASE.md
 dist/
+XP.md
+scripts/
+build/
diff --git a/README.md b/README.md
index 6572871..6903d61 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,11 @@
 [![Build][travis-image]][travis-url]
 [![MIT License][license-image]][license-url]
 
-Welcome to `WiToKit`, a Python toolkit to download and generate
-preprocessed Wikipedia dumps for NLP in a single .txt file, one
-sentence per line.
+Welcome to `WiToKit`, a Python toolkit to download and generate preprocessed Wikipedia dumps for all languages.
+
+WiToKit can be used to convert a Wikipedia archive into a single .txt file, one (tokenized) sentence per line.
 
-*Note: WiToKit currently only supports `xx-pages-articles.xml.xx.bz2` Wikipedia archives corresponding to articles, templates, media/file descriptions, and primary meta-pages. Also, the preprocessing is currently only supported for English. If you'd like support in other languages, please create an issue on Github.*
+*Note: WiToKit currently only supports `xx-pages-articles.xml.xx.bz2` Wikipedia archives corresponding to articles, templates, media/file descriptions, and primary meta-pages.*
 
 ## Install
 
@@ -67,6 +67,17 @@ witokit process \
   --num-threads num_cpu_threads
 ```
 
+Preprocessing for all languages is performed with [Polyglot](https://github.com/aboSamoor/polyglot)
+
+### Sample
+You can also use WiToKit to sample the content of a preprocessed .txt file, using:
+```bash
+witokit sample \
+  --input /abs/path/to/witokit/preprocessed/txt/file \
+  --percent \ # percentage of total lines to keep
+  --balance # if set, will balance sampling, otherwise, will take top n sentences only
+```
+
 [release-image]:https://img.shields.io/github/release/akb89/witokit.svg?style=flat-square
 [release-url]:https://github.com/akb89/witokit/releases/latest
 [pypi-image]:https://img.shields.io/pypi/v/witokit.svg?style=flat-square
diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py
index 72bfe57..ceb1125 100644
--- a/scripts/batch-tokenize.py
+++ b/scripts/batch-tokenize.py
@@ -11,22 +11,21 @@ def _process(lowercase, output_dirpath, input_filepath):
     output_filepath = os.path.join(
         output_dirpath,
         '{}.tkz.txt'.format(os.path.basename(input_filepath).split('.txt')[0]))
-    input = []
+    processing_input = []
     with open(input_filepath, 'r', encoding='utf-8') as input_stream:
         for line in input_stream:
-            line = line.strip()
-            input.append(line)
+            processing_input.append(line.strip())
     with open(output_filepath, 'w', encoding='utf-8') as output_stream:
-        tokenized_txt = witokit.tokenize(' '.join(input), lowercase)
+        tokenized_txt = witokit.tokenize(' '.join(processing_input), lowercase)
         print(tokenized_txt, file=output_stream)
     return input_filepath
 
 
 if __name__ == '__main__':
-    BATCH_DIRPATH = '/home/kabbach/witokit/data/ud23/raw/'
-    OUTPUT_DIRPATH = '/home/kabbach/witokit/data/ud23/tokenized/'
+    BATCH_DIRPATH = '/Users/akb/Github/witokit/data/'
+    OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/'
     LOWERCASE = True
-    NUM_THREADS = 38
+    NUM_THREADS = 1
 
     assert os.path.exists(BATCH_DIRPATH)
     assert os.path.exists(OUTPUT_DIRPATH)
diff --git a/setup.py b/setup.py
index db0a5de..c6c279a 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.5.0',
+    version='1.0.0',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
@@ -33,11 +33,11 @@
     },
     install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.4',
                       'natsort==5.4.1', 'beautifulsoup4==4.6.3',
-                      'polyglot==16.7.4', 'numpy==1.16.0', 'pyicu==2.2',
-                      'pycld2==0.31', 'morfessor==2.0.4'],
+                      'polyglot==16.7.4', 'pyicu==2.3.1',
+                      'pycld2==0.31', 'morfessor==2.0.4', 'tqdm==4.35.0'],
     dependency_links=[
         'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.4'],
-    classifiers=['Development Status :: 4 - Beta',
+    classifiers=['Development Status :: 5 - Production/Stable',
                  'Environment :: Console',
                  'Intended Audience :: Developers',
                  'Intended Audience :: Education',
diff --git a/witokit/__init__.py b/witokit/__init__.py
index d211f46..da76f89 100644
--- a/witokit/__init__.py
+++ b/witokit/__init__.py
@@ -1,3 +1,4 @@
+"""To export for toolkit use."""
 from .main import tokenize
 
 __all__ = ('tokenize')
diff --git a/witokit/main.py b/witokit/main.py
index c70b6bf..64c3b72 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -15,6 +15,7 @@
 import logging.config
 
 import pycld2
+from tqdm import tqdm
 from polyglot.text import Text
 from bs4 import BeautifulSoup
@@ -35,7 +36,7 @@
 
 def _download_href(output_dirpath, wiki_dump_url, href):
     url = uutils.get_wiki_arxiv_url(wiki_dump_url, href)
-    logger.info('Downloading {}'.format(url))
+    logger.debug('Downloading {}'.format(url))
     output_filepath = futils.get_download_output_filepath(output_dirpath,
                                                           href)
     try:
@@ -51,13 +52,10 @@ def _parallel_download(wiki_arxiv_hrefs, wiki_dump_url, num_threads,
         _download_href_to_output_dir = functools.partial(_download_href,
                                                          output_dirpath,
                                                          wiki_dump_url)
-        total_arxivs = len(wiki_arxiv_hrefs)
-        arxiv_num = 0
-        for _ in pool.imap_unordered(_download_href_to_output_dir,
-                                     wiki_arxiv_hrefs):
-            arxiv_num += 1
-            logger.info('Downloaded {}/{} archives'.format(arxiv_num,
-                                                           total_arxivs))
+        for _ in tqdm(pool.imap_unordered(_download_href_to_output_dir,
+                                          wiki_arxiv_hrefs),
+                      total=len(wiki_arxiv_hrefs)):
+            continue
 
 
 def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date):
@@ -114,12 +112,10 @@ def _extract(args):
     logger.info('Extracting .bz2 files from {}'.format(args.bz2_input_dirpath))
     bz2_arxivs = futils.get_bz2_arxivs(args.bz2_input_dirpath)
     total_arxivs = len(bz2_arxivs)
-    arxiv_num = 0
     with multiprocessing.Pool(args.num_threads) as pool:
-        for _ in pool.imap_unordered(_decompress_arxiv, bz2_arxivs):
-            arxiv_num += 1
-            logger.info('Extracted {}/{} archives'.format(arxiv_num,
-                                                          total_arxivs))
+        for _ in tqdm(pool.imap_unordered(_decompress_arxiv, bz2_arxivs),
+                      total=total_arxivs):
+            continue
 
 
 def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
@@ -135,13 +131,13 @@ def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
                                            output_txt_filepath)
     with open(output_filepath, 'w', encoding='utf-8') as output_stream:
         logger.info('Writing output to file {}'.format(output_filepath))
-        for json_object in wikiextractor.extract(input_xml_filepath):
+        for json_object in tqdm(wikiextractor.extract(input_xml_filepath)):
             try:
                 print(tokenize(json_object['text'], lowercase),
                       file=output_stream)
             except UnicodeEncodeError as err:
                 logger.error('UnicodeEncodeError processing '
-                             'json_object[\'text\'] with spacy: {}'
+                             'json_object[\'text\'] with polyglot: {}'
                              .format(str(err)))
     return input_xml_filepath
 
@@ -167,19 +163,13 @@ def _process(args):
         logger.info('Lowercasing archives')
     input_filepaths = futils.get_input_filepaths(args.wiki_input_dirpath)
     total_arxivs = len(input_filepaths)
-    arxiv_num = 0
-    left = input_filepaths
     with open(args.wiki_output_filepath, 'w', encoding='utf-8') as output_strm:
         with multiprocessing.Pool(processes=args.num_threads) as pool:
             preprocess = functools.partial(
                 _preprocess, args.wiki_output_filepath, args.lower)
-            for process in pool.imap_unordered(preprocess, input_filepaths):
-                arxiv_num += 1
-                logger.info('Done processing content of {}'.format(process))
-                logger.info('Completed processing of {}/{} archives'
-                            .format(arxiv_num, total_arxivs))
-                left = [item for item in left if item != process]
-                logger.info('Left to process: {}'.format(left))
+            for _ in tqdm(pool.imap_unordered(preprocess, input_filepaths),
+                          total=total_arxivs):
+                continue
         # concatenate all .txt files into single output .txt file
         logger.info('Concatenating tmp files...')
         tmp_filepaths = futils.get_tmp_filepaths(args.wiki_input_dirpath)
@@ -197,24 +187,22 @@
 
 
 def _sample(args):
-    if not (0 < args.percent < 100):
+    if not 0 < args.percent < 100:
         raise Exception('Specified percent param should be in ]0, 100[')
     logger.info('Sampling input file {}'.format(args.input_filepath))
 
     logger.info('Counting number of lines in file...')
-    count = 0
     if args.input_filepath.endswith('.txt'):
         input_basename = args.input_filepath.split('.txt')[0]
     else:
         input_basename = args.input_filepath
     with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
-        for line in input_stream:
-            count += 1
-    logger.info('Total lines = {}'.format(count))
+        count = sum(1 for x in input_stream)
+        logger.info('Total lines = {}'.format(count))
     final_count = count * args.percent / 100
     sampling = count / final_count
-    logger.info('Sampling file to {} lines'.format(final_count))
-    print(round(sampling))
+    logger.info('Sampling file to {} lines with balance = {}'
+                .format(int(final_count), args.balance))
     if args.balance:
         output_filepath = '{}.sample{}.balanced.txt'.format(input_basename,
                                                             args.percent)
@@ -232,7 +220,7 @@ def _sample(args):
                     if idx >= final_count:
                         break
                     print(line.strip(), file=output_stream)
-    logger.info('Done sampling file')
+    logger.info('Done sampling file to {}'.format(output_filepath))
 
 
 def main():
@@ -278,14 +266,6 @@
                                 help='absolute path to output .txt file')
     parser_process.add_argument('-l', '--lower', action='store_true',
                                 help='whether or not to lowercase splits')
-    parser_process.add_argument('-m', '--max-len', type=int, default=10000000,
-                                dest='max_length',
-                                help='spacy .max_length option for string '
-                                     'processing')
-    parser_process.add_argument('-t', '--max-tasks', type=int, default=0,
-                                help='max task per child for fine-grained '
-                                     'control over python multiprocessing '
-                                     'pool memory management')
     parser_process.add_argument('-n', '--num-threads', type=int, default=1,
                                 help='number of CPU threads to be used')
     parser_sample = subparsers.add_parser(
From 0a1fcf93f6851e378658417e4cfd5949c4f49ec7 Mon Sep 17 00:00:00 2001
From: AKB
Date: Thu, 12 Sep 2019 14:05:26 +0200
Subject: [PATCH 13/14] Removed scripts

---
 scripts/batch-tokenize.py | 43 ---------------------------------------
 1 file changed, 43 deletions(-)
 delete mode 100644 scripts/batch-tokenize.py

diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py
deleted file mode 100644
index ceb1125..0000000
--- a/scripts/batch-tokenize.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""Tokenize a batch of .txt files with polyglot."""
-
-import os
-import functools
-import multiprocessing
-
-import witokit
-
-
-def _process(lowercase, output_dirpath, input_filepath):
-    output_filepath = os.path.join(
-        output_dirpath,
-        '{}.tkz.txt'.format(os.path.basename(input_filepath).split('.txt')[0]))
-    processing_input = []
-    with open(input_filepath, 'r', encoding='utf-8') as input_stream:
-        for line in input_stream:
-            processing_input.append(line.strip())
-    with open(output_filepath, 'w', encoding='utf-8') as output_stream:
-        tokenized_txt = witokit.tokenize(' '.join(processing_input), lowercase)
-        print(tokenized_txt, file=output_stream)
-    return input_filepath
-
-
-if __name__ == '__main__':
-    BATCH_DIRPATH = '/Users/akb/Github/witokit/data/'
-    OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/'
-    LOWERCASE = True
-    NUM_THREADS = 1
-
-    assert os.path.exists(BATCH_DIRPATH)
-    assert os.path.exists(OUTPUT_DIRPATH)
-
-    txt_filepaths = [os.path.join(BATCH_DIRPATH, filename) for filename in
-                     os.listdir(BATCH_DIRPATH) if not filename.startswith('.')]
-
-    file_num = 0
-    with multiprocessing.Pool(NUM_THREADS) as pool:
-        process = functools.partial(_process, LOWERCASE, OUTPUT_DIRPATH)
-        for filepath in pool.imap_unordered(process, txt_filepaths):
-            file_num += 1
-            print('Done processing file {}'.format(filepath))
-            print('Completed processing of {}/{} files'
-                  .format(file_num, len(txt_filepaths)))
From 4f9ad0de4ee3cfa7a82a98df54546bb290f76ab4 Mon Sep 17 00:00:00 2001
From: AKB
Date: Thu, 12 Sep 2019 14:07:00 +0200
Subject: [PATCH 14/14] Fixed typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6903d61..284b713 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ witokit process \
   --num-threads num_cpu_threads
 ```
 
-Preprocessing for all languages is performed with [Polyglot](https://github.com/aboSamoor/polyglot)
+Preprocessing for all languages is performed with [Polyglot](https://github.com/aboSamoor/polyglot).
 
 ### Sample
 You can also use WiToKit to sample the content of a preprocessed .txt file, using: