From d52901c73243dc1f42e48c14b1de6b51bccd99c6 Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 11:39:36 +0100
Subject: [PATCH 01/14] Added sampling

---
 setup.py        |  2 +-
 witokit/main.py | 36 ++++++++++++++++++++++++++++++++----
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 4fe5d24..eceba7c 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.2.1',
+    version='0.3.0',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
diff --git a/witokit/main.py b/witokit/main.py
index 27433db..9757a30 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -154,10 +154,6 @@ def _process(args):
     arxiv_num = 0
     left = input_filepaths
     with open(args.wiki_output_filepath, 'w', encoding='utf-8') as output_strm:
-        # with multiprocessing.Pool(processes=args.num_threads,
-        #                           maxtasksperchild=args.max_tasks) as pool:
-        #     for wiki_input_filepath in input_filepaths:
-        #         _preprocess(args.wiki_output_filepath, args.lower, wiki_input_filepath)
        with multiprocessing.Pool(processes=args.num_threads) as pool:
             preprocess = functools.partial(
                 _preprocess, args.wiki_output_filepath, args.lower)
@@ -180,6 +176,29 @@
     shutil.rmtree(futils.get_tmp_dirpath(args.wiki_output_filepath))
 
 
+def _sample(args):
+    if args.percent > 100:
+        raise Exception('Specified percent arg should be a percentage < 100')
+    logger.info('Sampling input file {}'.format(args.input_filepath))
+    output_filepath = '{}.sample{}'.format(args.input_filepath, args.percent)
+    logger.info('Counting number of lines in file...')
+    count = 0
+    with open(args.input_filepath, 'r') as input_stream:
+        for line in input_stream:
+            count += 1
+    logger.info('Total lines = {}'.format(count))
+    final_count = count * args.percent / 100
+    sampling = count / final_count
+    logger.info('Sampling file to {} lines with sampling rate = {}'
+                .format(final_count, sampling))
+    with open(args.input_filepath, 'r') as input_stream:
+        with open(output_filepath, 'w', encoding='utf-8') as output_stream:
+            for idx, line in enumerate(input_stream):
+                if idx % sampling == 0:
+                    print(line.strip(), file=output_stream)
+    logger.info('Done sampling file')
+
+
 def main():
     """Launch WiToKit."""
     parser = argparse.ArgumentParser(prog='witokit')
@@ -233,5 +252,14 @@ def main():
                                 'pool memory management')
     parser_process.add_argument('-n', '--num-threads', type=int, default=1,
                                 help='number of CPU threads to be used')
+    parser_sample = subparsers.add_parser(
+        'sample', formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        help='sample a given .txt file deterministically')
+    parser_sample.set_defaults(func=_sample)
+    parser_sample.add_argument('-i', '--input', required=True,
+                               dest='input_filepath',
+                               help='absolute path to .txt file to sample')
+    parser_sample.add_argument('-p', '--percent', required=True, type=float,
+                               help='percentage of input file to keep')
     args = parser.parse_args()
     args.func(args)
From 9432da677748d4307f3b18081cb1daae94f79d49 Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 13:05:36 +0100
Subject: [PATCH 02/14] Fixed wikiextractor encoding bug

---
 setup.py        | 6 +++---
 witokit/main.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index eceba7c..1303d90 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.3.0',
+    version='0.3.1',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
@@ -31,12 +31,12 @@
             'witokit = witokit.main:main'
         ],
     },
-    install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.3',
+    install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.4',
                       'natsort==5.4.1', 'beautifulsoup4==4.6.3',
                       'polyglot==16.7.4', 'numpy==1.16.0', 'pyicu==2.2',
                       'pycld2==0.31', 'morfessor==2.0.4'],
     dependency_links=[
-        'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.3'],
+        'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.4'],
     classifiers=['Development Status :: 4 - Beta',
                  'Environment :: Console',
                  'Intended Audience :: Developers',
diff --git a/witokit/main.py b/witokit/main.py
index 9757a30..8a9d223 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -168,7 +168,7 @@ def _process(args):
         logger.info('Concatenating tmp files...')
         tmp_filepaths = futils.get_tmp_filepaths(args.wiki_output_filepath)
         for tmp_filepath in tmp_filepaths:
-            with open(tmp_filepath, 'r') as tmp_stream:
+            with open(tmp_filepath, 'r', encoding='utf-8') as tmp_stream:
                 for line in tmp_stream:
                     line = line.strip()
                     print(line, file=output_strm)
@@ -183,7 +183,7 @@ def _sample(args):
     output_filepath = '{}.sample{}'.format(args.input_filepath, args.percent)
     logger.info('Counting number of lines in file...')
     count = 0
-    with open(args.input_filepath, 'r') as input_stream:
+    with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
         for line in input_stream:
             count += 1
     logger.info('Total lines = {}'.format(count))
@@ -191,7 +191,7 @@
     sampling = count / final_count
     logger.info('Sampling file to {} lines with sampling rate = {}'
                 .format(final_count, sampling))
-    with open(args.input_filepath, 'r') as input_stream:
+    with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
         with open(output_filepath, 'w', encoding='utf-8') as output_stream:
             for idx, line in enumerate(input_stream):
                 if idx % sampling == 0:
From bf42ce5743fb0fafa42a322e0dc5269e1aeaeb2e Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 13:16:54 +0100
Subject: [PATCH 03/14] updated logging

---
 setup.py        | 2 +-
 witokit/main.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 1303d90..34ed655 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.3.1',
+    version='0.3.2',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
diff --git a/witokit/main.py b/witokit/main.py
index 8a9d223..0e8f615 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -138,9 +138,9 @@ def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
                          'json_object[\'text\'] with spacy: {}'
                          .format(str(err)))
         except ValueError as err:
-            logger.warning('Skipping empty text sequence')
+            logger.debug('Skipping empty text sequence')
         except pycld2.error as err:
-            logger.warning('{}. Skipping sequence'.format(str(err)))
+            logger.debug('{}. Skipping sequence'.format(str(err)))
     return input_xml_filepath
 
 
From 895494f0976e8cd91ae2462af17c7b43531dd711 Mon Sep 17 00:00:00 2001
From: AKB
Date: Mon, 28 Jan 2019 13:26:49 +0100
Subject: [PATCH 04/14] updated tmp filepath management

---
 setup.py               |  2 +-
 witokit/main.py        |  2 +-
 witokit/utils/files.py | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 34ed655..2e33a24 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.3.2',
+    version='0.3.3',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
diff --git a/witokit/main.py b/witokit/main.py
index 0e8f615..18e0116 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -166,7 +166,7 @@ def _process(args):
                 logger.info('Left to process: {}'.format(left))
         # concatenate all .txt files into single output .txt file
         logger.info('Concatenating tmp files...')
-        tmp_filepaths = futils.get_tmp_filepaths(args.wiki_output_filepath)
+        tmp_filepaths = futils.get_tmp_filepaths(args.wiki_input_dirpath)
         for tmp_filepath in tmp_filepaths:
             with open(tmp_filepath, 'r', encoding='utf-8') as tmp_stream:
                 for line in tmp_stream:
diff --git a/witokit/utils/files.py b/witokit/utils/files.py
index 04200c5..ccffc94 100644
--- a/witokit/utils/files.py
+++ b/witokit/utils/files.py
@@ -20,14 +20,14 @@ def get_download_output_filepath(output_dirpath, href):
     return os.path.join(output_dirpath, href)
 
 
-def get_tmp_dirpath(output_txt_filepath):
+def get_tmp_dirpath(input_xml_filepath):
     """Return absolute path to output_txt_dirpath/tmp/."""
-    return os.path.join(os.path.dirname(output_txt_filepath), 'tmp')
+    return os.path.join(os.path.dirname(input_xml_filepath), 'tmp')
 
 
-def get_tmp_filepaths(output_txt_filepath):
+def get_tmp_filepaths(xml_input_dirpath):
     """Return all .txt files under the output_txt_dirpath/tmp/ dir."""
-    tmp_dirpath = get_tmp_dirpath(output_txt_filepath)
+    tmp_dirpath = os.path.join(xml_input_dirpath, 'tmp')
     return natsort.natsorted([os.path.join(tmp_dirpath, filename)
                               for filename in os.listdir(tmp_dirpath)],
                              alg=natsort.ns.IGNORECASE)
@@ -38,7 +38,7 @@ def get_output_filepath(input_xml_filepath, output_txt_filepath):
 
     Create tmp dir if not exists.
""" - tmp_dirpath = get_tmp_dirpath(output_txt_filepath) + tmp_dirpath = get_tmp_dirpath(input_xml_filepath) os.makedirs(tmp_dirpath, exist_ok=True) output_filename = os.path.basename(input_xml_filepath) output_txt_filepath = os.path.join( From d696b39d0100775a8f04f53882573b2c24318577 Mon Sep 17 00:00:00 2001 From: AKB Date: Mon, 28 Jan 2019 13:30:32 +0100 Subject: [PATCH 05/14] Fixed wrong filepath --- setup.py | 2 +- witokit/main.py | 2 +- witokit/utils/files.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 2e33a24..c2f1843 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='0.3.3', + version='0.3.4', url='https://github.com/akb89/witokit', download_url='https://pypi.org/project/witokit/#files', license='MIT', diff --git a/witokit/main.py b/witokit/main.py index 18e0116..6421ffc 100644 --- a/witokit/main.py +++ b/witokit/main.py @@ -173,7 +173,7 @@ def _process(args): line = line.strip() print(line, file=output_strm) logger.info('Done processing content of Wikipedia archives') - shutil.rmtree(futils.get_tmp_dirpath(args.wiki_output_filepath)) + shutil.rmtree(futils.get_tmp_dirpath(args.wiki_input_dirpath)) def _sample(args): diff --git a/witokit/utils/files.py b/witokit/utils/files.py index ccffc94..bfb9e6e 100644 --- a/witokit/utils/files.py +++ b/witokit/utils/files.py @@ -27,7 +27,7 @@ def get_tmp_dirpath(input_xml_filepath): def get_tmp_filepaths(xml_input_dirpath): """Return all .txt files under the output_txt_dirpath/tmp/ dir.""" - tmp_dirpath = os.path.join(xml_input_dirpath, 'tmp') + tmp_dirpath = get_tmp_dirpath(xml_input_dirpath) return natsort.natsorted([os.path.join(tmp_dirpath, filename) for filename in os.listdir(tmp_dirpath)], alg=natsort.ns.IGNORECASE) From 4bf764c88ebb274c114c7508609bce02c83b2362 Mon Sep 17 00:00:00 2001 From: AKB Date: Mon, 28 Jan 2019 14:35:39 +0100 Subject: [PATCH 06/14] Added support for downloading small wikipedia arxivs --- setup.py | 2 +- witokit/main.py | 12 +++++++++++- witokit/utils/urls.py | 11 ++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index c2f1843..7dbff70 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='0.3.4', + version='0.4.0', url='https://github.com/akb89/witokit', download_url='https://pypi.org/project/witokit/#files', license='MIT', diff --git a/witokit/main.py b/witokit/main.py index 6421ffc..d7d6f86 100644 --- a/witokit/main.py +++ b/witokit/main.py @@ -66,10 +66,20 @@ def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date): html_doc = response.read() soup = BeautifulSoup(html_doc, 'html.parser') for link in soup.find_all('a'): - pattern = uutils.get_wikipedia_pattern(lang, date) + pattern = uutils.get_wikipedia_multi_pattern(lang, date) href = link.get('href') if re.match(pattern, href): wiki_arxiv_hrefs.append(href) + if not wiki_arxiv_hrefs: + logger.info('No multi arxivs found. 
Trying for single arxiv') + # If wikipedia arxiv is too small, check for single arxiv + for link in soup.find_all('a'): + pattern = uutils.get_wikipedia_single_pattern(lang, date) + href = link.get('href') + if re.match(pattern, href): + wiki_arxiv_hrefs.append(href) + if not wiki_arxiv_hrefs: + logger.warning('No wikipedia arxiv found') except urllib.error.HTTPError as error: logger.error('HTTPError using lang = \'{}\' and date = \'{}\'. ' 'Could not retrieve any Wikipedia data at URL = {}' diff --git a/witokit/utils/urls.py b/witokit/utils/urls.py index ed3a3b2..62d9a14 100644 --- a/witokit/utils/urls.py +++ b/witokit/utils/urls.py @@ -2,8 +2,8 @@ import witokit.utils.constants as const -__all__ = ('get_wikipedia_dump_url', 'get_wikipedia_pattern', - 'get_wiki_arxiv_url') +__all__ = ('get_wikipedia_dump_url', 'get_wikipedia_multi_pattern', + 'get_wiki_arxiv_url', 'get_wikipedia_single_pattern') def get_wikipedia_dump_url(lang, date): @@ -11,11 +11,16 @@ def get_wikipedia_dump_url(lang, date): return '{}/{}wiki/{}'.format(const.WIKI_DL_URL, lang, date) -def get_wikipedia_pattern(lang, date): +def get_wikipedia_multi_pattern(lang, date): """Return a regex pattern matching for wiki .bz2 files to be extracted.""" return r'({}wiki-{}-pages-articles[0-9]+.xml.*bz2$)'.format(lang, date) +def get_wikipedia_single_pattern(lang, date): + """Return a regex pattern matching for wiki .bz2 files to be extracted.""" + return r'({}wiki-{}-pages-articles+.xml.*bz2$)'.format(lang, date) + + def get_wiki_arxiv_url(wiki_dump_url, href): """Return a full URL from the href of a .bz2 archive.""" return '{}/{}'.format(wiki_dump_url, href) From 96dfea204205137e826ee8d863c0a5a9ba578371 Mon Sep 17 00:00:00 2001 From: AKB Date: Wed, 30 Jan 2019 11:26:15 +0100 Subject: [PATCH 07/14] added tokenize function --- setup.py | 2 +- witokit/__init__.py | 3 +++ witokit/main.py | 34 ++++++++++++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/setup.py b/setup.py index 7dbff70..db0a5de 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ author_email='akb@3azouz.net', long_description=long_description, long_description_content_type='text/markdown', - version='0.4.0', + version='0.5.0', url='https://github.com/akb89/witokit', download_url='https://pypi.org/project/witokit/#files', license='MIT', diff --git a/witokit/__init__.py b/witokit/__init__.py index e69de29..d211f46 100644 --- a/witokit/__init__.py +++ b/witokit/__init__.py @@ -0,0 +1,3 @@ +from .main import tokenize + +__all__ = ('tokenize') diff --git a/witokit/main.py b/witokit/main.py index d7d6f86..fbdacab 100644 --- a/witokit/main.py +++ b/witokit/main.py @@ -30,6 +30,8 @@ logger = logging.getLogger(__name__) +__all__ = ('tokenize') + def _download_href(output_dirpath, wiki_dump_url, href): url = uutils.get_wiki_arxiv_url(wiki_dump_url, href) @@ -135,25 +137,33 @@ def _preprocess(output_txt_filepath, lowercase, input_xml_filepath): logger.info('Writing output to file {}'.format(output_filepath)) for json_object in wikiextractor.extract(input_xml_filepath): try: - text = Text(json_object['text']) # lang will be guessed - for sent in text.sentences: - if lowercase: - tokens = [token.lower().strip() for token in sent.words] - else: - tokens = [token.strip() for token in sent.words] - output_sent = ' '.join(tokens) - print(output_sent, file=output_stream) + print(tokenize(json_object['text'], lowercase), + file=output_stream) except UnicodeEncodeError as err: logger.error('UnicodeEncodeError processing ' 'json_object[\'text\'] 
with spacy: {}' .format(str(err))) - except ValueError as err: - logger.debug('Skipping empty text sequence') - except pycld2.error as err: - logger.debug('{}. Skipping sequence'.format(str(err))) return input_xml_filepath +def tokenize(raw_text, lowercase): + """Tokenize raw_text with polyglot.""" + output = [] + try: + text = Text(raw_text) + for sent in text.sentences: + if lowercase: + tokens = [token.lower().strip() for token in sent.words] + else: + tokens = [token.strip() for token in sent.words] + output.append(' '.join(tokens)) + except ValueError as err: + logger.debug('Skipping empty text sequence') + except pycld2.error as err: + logger.debug('{}. Skipping sequence'.format(str(err))) + return '\n'.join(output) + + def _process(args): logger.info('Processing content of wikipedia archives under {}' .format(args.wiki_input_dirpath)) From 1b0e0fd08e9ca5f49ea549f5f63fa3db9511db23 Mon Sep 17 00:00:00 2001 From: AKB Date: Wed, 30 Jan 2019 11:26:26 +0100 Subject: [PATCH 08/14] added batch-tokenize script --- scripts/batch-tokenize.py | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 scripts/batch-tokenize.py diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py new file mode 100644 index 0000000..aea1494 --- /dev/null +++ b/scripts/batch-tokenize.py @@ -0,0 +1,44 @@ +"""Tokenize a batch of .txt files with polyglot.""" + +import os +import functools +import multiprocessing + +import witokit + + +def _process(lowercase, output_dirpath, input_filepath): + output_filepath = os.path.join( + output_dirpath, + '{}.tkz.txt'.format(os.path.basename(input_filepath).split('.txt')[0])) + input = [] + with open(input_filepath, 'r', encoding='utf-8') as input_stream: + for line in input_stream: + line = line.strip() + input.append(line) + with open(output_filepath, 'w', encoding='utf-8') as output_stream: + tokenized_txt = witokit.tokenize(' '.join(input), lowercase) + print(tokenized_txt, file=output_stream) + return input_filepath + + +if __name__ == '__main__': + BATCH_DIRPATH = '/Users/akb/Github/witokit/data/' + OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/' + LOWERCASE = True + NUM_THREADS = 1 + + assert os.path.exists(BATCH_DIRPATH) + assert os.path.exists(OUTPUT_DIRPATH) + + txt_filepaths = [os.path.join(BATCH_DIRPATH, filename) for filename in + os.listdir(BATCH_DIRPATH) if not filename.startswith('.')] + + file_num = 0 + with multiprocessing.Pool(NUM_THREADS) as pool: + file_num += 1 + process = functools.partial(_process, LOWERCASE, OUTPUT_DIRPATH) + for filepath in pool.imap_unordered(process, txt_filepaths): + print('Done processing file {}'.format(filepath)) + print('Completed processing of {}/{} files' + .format(file_num, len(txt_filepaths))) From 219d67d18ab3c5085996b4d09cb41e6e01686eaf Mon Sep 17 00:00:00 2001 From: AKB Date: Wed, 30 Jan 2019 11:28:34 +0100 Subject: [PATCH 09/14] updated script --- scripts/batch-tokenize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py index aea1494..6f9176e 100644 --- a/scripts/batch-tokenize.py +++ b/scripts/batch-tokenize.py @@ -23,10 +23,10 @@ def _process(lowercase, output_dirpath, input_filepath): if __name__ == '__main__': - BATCH_DIRPATH = '/Users/akb/Github/witokit/data/' - OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/' + BATCH_DIRPATH = '/home/kabbach/witokit/data/ud23/raw/' + OUTPUT_DIRPATH = '/home/kabbach/witokit/data/ud23/tokenized/' LOWERCASE = True - NUM_THREADS = 1 + NUM_THREADS = 38 
 
     assert os.path.exists(BATCH_DIRPATH)
     assert os.path.exists(OUTPUT_DIRPATH)
From 41ca02ec29dad95ee6deb2504ec5e61e7ec7ba02 Mon Sep 17 00:00:00 2001
From: AKB
Date: Wed, 30 Jan 2019 11:30:11 +0100
Subject: [PATCH 10/14] bugfix on counter

---
 scripts/batch-tokenize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py
index 6f9176e..72bfe57 100644
--- a/scripts/batch-tokenize.py
+++ b/scripts/batch-tokenize.py
@@ -36,9 +36,9 @@ def _process(lowercase, output_dirpath, input_filepath):
 
     file_num = 0
     with multiprocessing.Pool(NUM_THREADS) as pool:
-        file_num += 1
         process = functools.partial(_process, LOWERCASE, OUTPUT_DIRPATH)
         for filepath in pool.imap_unordered(process, txt_filepaths):
+            file_num += 1
             print('Done processing file {}'.format(filepath))
             print('Completed processing of {}/{} files'
                   .format(file_num, len(txt_filepaths)))
From 5a2045cdbd75fb7455b88e459301789c0a7009c0 Mon Sep 17 00:00:00 2001
From: AKB
Date: Wed, 13 Feb 2019 14:13:17 +0100
Subject: [PATCH 11/14] Updated sampling script to add balance param

---
 witokit/main.py | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/witokit/main.py b/witokit/main.py
index fbdacab..c70b6bf 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -197,24 +197,40 @@
 
 
 def _sample(args):
-    if args.percent > 100:
-        raise Exception('Specified percent arg should be a percentage < 100')
+    if not (0 < args.percent < 100):
+        raise Exception('Specified percent param should be in ]0, 100[')
     logger.info('Sampling input file {}'.format(args.input_filepath))
-    output_filepath = '{}.sample{}'.format(args.input_filepath, args.percent)
+
     logger.info('Counting number of lines in file...')
     count = 0
+    if args.input_filepath.endswith('.txt'):
+        input_basename = args.input_filepath.split('.txt')[0]
+    else:
+        input_basename = args.input_filepath
     with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
         for line in input_stream:
             count += 1
     logger.info('Total lines = {}'.format(count))
     final_count = count * args.percent / 100
     sampling = count / final_count
-    logger.info('Sampling file to {} lines with sampling rate = {}'
-                .format(final_count, sampling))
-    with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
-        with open(output_filepath, 'w', encoding='utf-8') as output_stream:
-            for idx, line in enumerate(input_stream):
-                if idx % sampling == 0:
+    logger.info('Sampling file to {} lines'.format(final_count))
+    print(round(sampling))
+    if args.balance:
+        output_filepath = '{}.sample{}.balanced.txt'.format(input_basename,
+                                                            args.percent)
+        with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
+            with open(output_filepath, 'w', encoding='utf-8') as output_stream:
+                for idx, line in enumerate(input_stream):
+                    if idx % round(sampling) == 0:
+                        print(line.strip(), file=output_stream)
+    else:
+        output_filepath = '{}.sample{}.txt'.format(input_basename,
+                                                   args.percent)
+        with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
+            with open(output_filepath, 'w', encoding='utf-8') as output_stream:
+                for idx, line in enumerate(input_stream):
+                    if idx >= final_count:
+                        break
                     print(line.strip(), file=output_stream)
     logger.info('Done sampling file')
 
@@ -281,5 +297,9 @@ def main():
                                help='absolute path to .txt file to sample')
     parser_sample.add_argument('-p', '--percent', required=True, type=float,
                                help='percentage of input file to keep')
+    parser_sample.add_argument('-b', '--balance', action='store_true',
+                               help='whether or not to balance the sampling'
+                                    'within the corpus or to take the top'
+                                    'p% sentences')
     args = parser.parse_args()
     args.func(args)
From c9122d4d6ada99ab25eecab85d4dad8402efa408 Mon Sep 17 00:00:00 2001
From: AKB
Date: Thu, 12 Sep 2019 14:03:58 +0200
Subject: [PATCH 12/14] v1.0

---
 .gitignore                |  3 ++
 README.md                 | 19 ++++++++++---
 scripts/batch-tokenize.py | 13 ++++-----
 setup.py                  |  8 +++---
 witokit/__init__.py       |  1 +
 witokit/main.py           | 60 +++++++++++++--------------------------
 6 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3f3a061..430e6ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,6 @@ data
 *.7z
 RELEASE.md
 dist/
+XP.md
+scripts/
+build/
diff --git a/README.md b/README.md
index 6572871..6903d61 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,11 @@
 [![Build][travis-image]][travis-url]
 [![MIT License][license-image]][license-url]
 
-Welcome to `WiToKit`, a Python toolkit to download and generate
-preprocessed Wikipedia dumps for NLP in a single .txt file, one
-sentence per line.
+Welcome to `WiToKit`, a Python toolkit to download and generate preprocessed Wikipedia dumps for all languages.
+
+WiToKit can be used to convert a Wikipedia archive into a single .txt file, one (tokenized) sentence per line.
 
-*Note: WiToKit currently only supports `xx-pages-articles.xml.xx.bz2` Wikipedia archives corresponding to articles, templates, media/file descriptions, and primary meta-pages. Also, the preprocessing is currently only supported for English. If you'd like support in other languages, please create an issue on Github.*
+*Note: WiToKit currently only supports `xx-pages-articles.xml.xx.bz2` Wikipedia archives corresponding to articles, templates, media/file descriptions, and primary meta-pages.*
 
 ## Install
 
@@ -67,6 +67,17 @@ witokit process \
   --num-threads num_cpu_threads
 ```
 
+Preprocessing for all languages is performed with [Polyglot](https://github.com/aboSamoor/polyglot)
+
+### Sample
+You can also use WiToKit to sample the content of a preprocessed .txt file, using:
+```bash
+witokit sample \
+  --input /abs/path/to/witokit/preprocessed/txt/file \
+  --percent \ # percentage of total lines to keep
+  --balance # if set, will balance sampling, otherwise, will take top n sentences only
+```
+
 [release-image]:https://img.shields.io/github/release/akb89/witokit.svg?style=flat-square
 [release-url]:https://github.com/akb89/witokit/releases/latest
 [pypi-image]:https://img.shields.io/pypi/v/witokit.svg?style=flat-square
diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py
index 72bfe57..ceb1125 100644
--- a/scripts/batch-tokenize.py
+++ b/scripts/batch-tokenize.py
@@ -11,22 +11,21 @@ def _process(lowercase, output_dirpath, input_filepath):
     output_filepath = os.path.join(
         output_dirpath,
         '{}.tkz.txt'.format(os.path.basename(input_filepath).split('.txt')[0]))
-    input = []
+    processing_input = []
     with open(input_filepath, 'r', encoding='utf-8') as input_stream:
         for line in input_stream:
-            line = line.strip()
-            input.append(line)
+            processing_input.append(line.strip())
     with open(output_filepath, 'w', encoding='utf-8') as output_stream:
-        tokenized_txt = witokit.tokenize(' '.join(input), lowercase)
+        tokenized_txt = witokit.tokenize(' '.join(processing_input), lowercase)
         print(tokenized_txt, file=output_stream)
     return input_filepath
 
 
 if __name__ == '__main__':
-    BATCH_DIRPATH = '/home/kabbach/witokit/data/ud23/raw/'
-    OUTPUT_DIRPATH = '/home/kabbach/witokit/data/ud23/tokenized/'
+    BATCH_DIRPATH = '/Users/akb/Github/witokit/data/'
+    OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/'
     LOWERCASE = True
-    NUM_THREADS = 38
+    NUM_THREADS = 1
 
     assert os.path.exists(BATCH_DIRPATH)
     assert os.path.exists(OUTPUT_DIRPATH)
diff --git a/setup.py b/setup.py
index db0a5de..c6c279a 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     author_email='akb@3azouz.net',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    version='0.5.0',
+    version='1.0.0',
     url='https://github.com/akb89/witokit',
     download_url='https://pypi.org/project/witokit/#files',
     license='MIT',
@@ -33,11 +33,11 @@
     },
     install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.4',
                       'natsort==5.4.1', 'beautifulsoup4==4.6.3',
-                      'polyglot==16.7.4', 'numpy==1.16.0', 'pyicu==2.2',
-                      'pycld2==0.31', 'morfessor==2.0.4'],
+                      'polyglot==16.7.4', 'pyicu==2.3.1',
+                      'pycld2==0.31', 'morfessor==2.0.4', 'tqdm==4.35.0'],
     dependency_links=[
         'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.4'],
-    classifiers=['Development Status :: 4 - Beta',
+    classifiers=['Development Status :: 5 - Production/Stable',
                  'Environment :: Console',
                  'Intended Audience :: Developers',
                  'Intended Audience :: Education',
diff --git a/witokit/__init__.py b/witokit/__init__.py
index d211f46..da76f89 100644
--- a/witokit/__init__.py
+++ b/witokit/__init__.py
@@ -1,3 +1,4 @@
+"""To export for toolkit use."""
 from .main import tokenize
 
 __all__ = ('tokenize')
diff --git a/witokit/main.py b/witokit/main.py
index c70b6bf..64c3b72 100644
--- a/witokit/main.py
+++ b/witokit/main.py
@@ -15,6 +15,7 @@
 import logging.config
 
 import pycld2
+from tqdm import tqdm
 from polyglot.text import Text
 from bs4 import BeautifulSoup
@@ -35,7 +36,7 @@
 
 def _download_href(output_dirpath, wiki_dump_url, href):
     url = uutils.get_wiki_arxiv_url(wiki_dump_url, href)
-    logger.info('Downloading {}'.format(url))
+    logger.debug('Downloading {}'.format(url))
     output_filepath = futils.get_download_output_filepath(output_dirpath,
                                                           href)
     try:
@@ -51,13 +52,10 @@ def _parallel_download(wiki_arxiv_hrefs, wiki_dump_url, num_threads,
         _download_href_to_output_dir = functools.partial(_download_href,
                                                          output_dirpath,
                                                          wiki_dump_url)
-        total_arxivs = len(wiki_arxiv_hrefs)
-        arxiv_num = 0
-        for _ in pool.imap_unordered(_download_href_to_output_dir,
-                                     wiki_arxiv_hrefs):
-            arxiv_num += 1
-            logger.info('Downloaded {}/{} archives'.format(arxiv_num,
-                                                           total_arxivs))
+        for _ in tqdm(pool.imap_unordered(_download_href_to_output_dir,
+                                          wiki_arxiv_hrefs),
+                      total=len(wiki_arxiv_hrefs)):
+            continue
 
 
 def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date):
@@ -114,12 +112,10 @@ def _extract(args):
     logger.info('Extracting .bz2 files from {}'.format(args.bz2_input_dirpath))
     bz2_arxivs = futils.get_bz2_arxivs(args.bz2_input_dirpath)
     total_arxivs = len(bz2_arxivs)
-    arxiv_num = 0
     with multiprocessing.Pool(args.num_threads) as pool:
-        for _ in pool.imap_unordered(_decompress_arxiv, bz2_arxivs):
-            arxiv_num += 1
-            logger.info('Extracted {}/{} archives'.format(arxiv_num,
-                                                          total_arxivs))
+        for _ in tqdm(pool.imap_unordered(_decompress_arxiv, bz2_arxivs),
+                      total=total_arxivs):
+            continue
 
 
 def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
@@ -135,13 +131,13 @@ def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
                                            output_txt_filepath)
     with open(output_filepath, 'w', encoding='utf-8') as output_stream:
         logger.info('Writing output to file {}'.format(output_filepath))
-        for json_object in wikiextractor.extract(input_xml_filepath):
+        for json_object in tqdm(wikiextractor.extract(input_xml_filepath)):
             try:
                 print(tokenize(json_object['text'], lowercase),
                       file=output_stream)
             except UnicodeEncodeError as err:
                 logger.error('UnicodeEncodeError processing '
-                             'json_object[\'text\'] with spacy: {}'
+                             'json_object[\'text\'] with polyglot: {}'
                              .format(str(err)))
     return input_xml_filepath
 
@@ -167,19 +163,13 @@ def _process(args):
         logger.info('Lowercasing archives')
     input_filepaths = futils.get_input_filepaths(args.wiki_input_dirpath)
     total_arxivs = len(input_filepaths)
-    arxiv_num = 0
-    left = input_filepaths
     with open(args.wiki_output_filepath, 'w', encoding='utf-8') as output_strm:
         with multiprocessing.Pool(processes=args.num_threads) as pool:
             preprocess = functools.partial(
                 _preprocess, args.wiki_output_filepath, args.lower)
-            for process in pool.imap_unordered(preprocess, input_filepaths):
-                arxiv_num += 1
-                logger.info('Done processing content of {}'.format(process))
-                logger.info('Completed processing of {}/{} archives'
-                            .format(arxiv_num, total_arxivs))
-                left = [item for item in left if item != process]
-                logger.info('Left to process: {}'.format(left))
+            for _ in tqdm(pool.imap_unordered(preprocess, input_filepaths),
+                          total=total_arxivs):
+                continue
         # concatenate all .txt files into single output .txt file
         logger.info('Concatenating tmp files...')
         tmp_filepaths = futils.get_tmp_filepaths(args.wiki_input_dirpath)
@@ -197,24 +187,22 @@
 
 
 def _sample(args):
-    if not (0 < args.percent < 100):
+    if not 0 < args.percent < 100:
         raise Exception('Specified percent param should be in ]0, 100[')
     logger.info('Sampling input file {}'.format(args.input_filepath))
 
     logger.info('Counting number of lines in file...')
-    count = 0
     if args.input_filepath.endswith('.txt'):
         input_basename = args.input_filepath.split('.txt')[0]
     else:
         input_basename = args.input_filepath
     with open(args.input_filepath, 'r', encoding='utf-8') as input_stream:
-        for line in input_stream:
-            count += 1
-    logger.info('Total lines = {}'.format(count))
+        count = sum(1 for x in input_stream)
+        logger.info('Total lines = {}'.format(count))
     final_count = count * args.percent / 100
     sampling = count / final_count
-    logger.info('Sampling file to {} lines'.format(final_count))
-    print(round(sampling))
+    logger.info('Sampling file to {} lines with balance = {}'
+                .format(int(final_count), args.balance))
     if args.balance:
         output_filepath = '{}.sample{}.balanced.txt'.format(input_basename,
                                                             args.percent)
@@ -232,7 +220,7 @@ def _sample(args):
                     if idx >= final_count:
                         break
                     print(line.strip(), file=output_stream)
-    logger.info('Done sampling file')
+    logger.info('Done sampling file to {}'.format(output_filepath))
 
 
 def main():
@@ -278,14 +266,6 @@
                                 help='absolute path to output .txt file')
     parser_process.add_argument('-l', '--lower', action='store_true',
                                 help='whether or not to lowercase splits')
-    parser_process.add_argument('-m', '--max-len', type=int, default=10000000,
-                                dest='max_length',
-                                help='spacy .max_length option for string '
-                                     'processing')
-    parser_process.add_argument('-t', '--max-tasks', type=int, default=0,
-                                help='max task per child for fine-grained '
-                                     'control over python multiprocessing '
-                                     'pool memory management')
     parser_process.add_argument('-n', '--num-threads', type=int, default=1,
                                 help='number of CPU threads to be used')
     parser_sample = subparsers.add_parser(
From 0a1fcf93f6851e378658417e4cfd5949c4f49ec7 Mon Sep 17 00:00:00 2001
From: AKB
Date: Thu, 12 Sep 2019 14:05:26 +0200
Subject: [PATCH 13/14] Removed scripts

---
 scripts/batch-tokenize.py | 43 ---------------------------------------
 1 file changed, 43 deletions(-)
 delete mode 100644 scripts/batch-tokenize.py

diff --git a/scripts/batch-tokenize.py b/scripts/batch-tokenize.py
deleted file mode 100644
index ceb1125..0000000
--- a/scripts/batch-tokenize.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""Tokenize a batch of .txt files with polyglot."""
-
-import os
-import functools
-import multiprocessing
-
-import witokit
-
-
-def _process(lowercase, output_dirpath, input_filepath):
-    output_filepath = os.path.join(
-        output_dirpath,
-        '{}.tkz.txt'.format(os.path.basename(input_filepath).split('.txt')[0]))
-    processing_input = []
-    with open(input_filepath, 'r', encoding='utf-8') as input_stream:
-        for line in input_stream:
-            processing_input.append(line.strip())
-    with open(output_filepath, 'w', encoding='utf-8') as output_stream:
-        tokenized_txt = witokit.tokenize(' '.join(processing_input), lowercase)
-        print(tokenized_txt, file=output_stream)
-    return input_filepath
-
-
-if __name__ == '__main__':
-    BATCH_DIRPATH = '/Users/akb/Github/witokit/data/'
-    OUTPUT_DIRPATH = '/Users/akb/Github/witokit/data/'
-    LOWERCASE = True
-    NUM_THREADS = 1
-
-    assert os.path.exists(BATCH_DIRPATH)
-    assert os.path.exists(OUTPUT_DIRPATH)
-
-    txt_filepaths = [os.path.join(BATCH_DIRPATH, filename) for filename in
-                     os.listdir(BATCH_DIRPATH) if not filename.startswith('.')]
-
-    file_num = 0
-    with multiprocessing.Pool(NUM_THREADS) as pool:
-        process = functools.partial(_process, LOWERCASE, OUTPUT_DIRPATH)
-        for filepath in pool.imap_unordered(process, txt_filepaths):
-            file_num += 1
-            print('Done processing file {}'.format(filepath))
-            print('Completed processing of {}/{} files'
-                  .format(file_num, len(txt_filepaths)))
From 4f9ad0de4ee3cfa7a82a98df54546bb290f76ab4 Mon Sep 17 00:00:00 2001
From: AKB
Date: Thu, 12 Sep 2019 14:07:00 +0200
Subject: [PATCH 14/14] Fixed typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6903d61..284b713 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ witokit process \
   --num-threads num_cpu_threads
 ```
 
-Preprocessing for all languages is performed with [Polyglot](https://github.com/aboSamoor/polyglot)
+Preprocessing for all languages is performed with [Polyglot](https://github.com/aboSamoor/polyglot).
 
 ### Sample
 You can also use WiToKit to sample the content of a preprocessed .txt file, using: