Merge pull request #9 from akb89/develop
Develop
akb89 authored Jan 28, 2019
2 parents 0b38b56 + 43aa1c6 commit 74986e8
Showing 3 changed files with 49 additions and 33 deletions.
7 changes: 7 additions & 0 deletions INSTALL.md
@@ -0,0 +1,7 @@
# Install Polyglot

```
ln -fs /usr/local/Cellar/icu4c/63.1/bin/icu-config /usr/local/bin/icu-config
```
https://stackoverflow.com/questions/52309891/import-or-symbol-not-found-error-with-polyglot-when-pyicu-and-icu4c-installed-co
https://stackoverflow.com/questions/50217214/import-error-for-icu-in-mac-and-ubuntu-although-pyicu-is-installed-correctly
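If the symlink fix is in place, Polyglot's ICU-backed stack should import without the symbol-not-found errors described in the links above. A minimal sanity check, not part of the repository, could look like this:

```
# Hypothetical sanity check (not in witokit): verify the ICU-backed modules
# that polyglot depends on can be imported after the icu-config symlink fix.
import icu      # provided by PyICU, built against Homebrew's icu4c
import pycld2   # language-detection backend used by polyglot

from polyglot.text import Text

# Tokenize a short sentence; polyglot guesses the language internally.
print(Text('Polyglot and ICU are importable.').words)
```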
14 changes: 7 additions & 7 deletions setup.py
@@ -6,7 +6,7 @@

from setuptools import setup

with open('README.md', 'r') as fh:
with open('README.md', 'r', encoding='utf-8') as fh:
long_description = fh.read()

setup(
@@ -16,7 +16,7 @@
author_email='akb@3azouz.net',
long_description=long_description,
long_description_content_type='text/markdown',
version='0.1.13',
version='0.2.1',
url='https://github.com/akb89/witokit',
download_url='https://pypi.org/project/witokit/#files',
license='MIT',
@@ -31,12 +31,12 @@
'witokit = witokit.main:main'
],
},
install_requires=['PyYAML==3.13', 'wikiextractor==3.0.3', 'spacy-nightly==2.1.0a1',
'en_core_web_sm==2.1.0a0', 'natsort==5.4.1',
'beautifulsoup4==4.6.3', 'polyglot==16.7.4'],
install_requires=['pyyaml>=4.2b1', 'wikiextractor==3.0.3',
'natsort==5.4.1', 'beautifulsoup4==4.6.3',
'polyglot==16.7.4', 'numpy==1.16.0', 'pyicu==2.2',
'pycld2==0.31', 'morfessor==2.0.4'],
dependency_links=[
'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.3',
'https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0a0/en_core_web_sm-2.1.0a0.tar.gz'],
'https://github.com/akb89/wikiextractor/tarball/master#egg=wikiextractor-3.0.3'],
classifiers=['Development Status :: 4 - Beta',
'Environment :: Console',
'Intended Audience :: Developers',
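The dependency pins change substantially in this commit: spaCy and its `en_core_web_sm` model are dropped, polyglot's runtime stack (numpy, pyicu, pycld2, morfessor) is pinned explicitly, and PyYAML is bumped to at least 4.2b1. A hedged post-install check along these lines, not part of the repository, can confirm that the active environment matches the new pins:

```
# Hypothetical post-install check (not in the repo): verify that the
# dependency pins from setup.py resolve in the current environment.
import pkg_resources

PINNED = [
    'pyyaml>=4.2b1', 'wikiextractor==3.0.3', 'natsort==5.4.1',
    'beautifulsoup4==4.6.3', 'polyglot==16.7.4', 'numpy==1.16.0',
    'pyicu==2.2', 'pycld2==0.31', 'morfessor==2.0.4',
]

for requirement in PINNED:
    # Raises DistributionNotFound or VersionConflict if the pin is not met.
    pkg_resources.require(requirement)
print('All pinned dependencies are satisfied.')
```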
61 changes: 35 additions & 26 deletions witokit/main.py
@@ -6,17 +6,18 @@
import os
import argparse
import multiprocessing
import urllib
import urllib.request
import functools
import shutil
import re
import bz2
import logging
import logging.config
import pycld2

from polyglot.text import Text
from bs4 import BeautifulSoup

import spacy
import wikiextractor

import witokit.utils.config as cutils
@@ -60,6 +61,7 @@ def _parallel_download(wiki_arxiv_hrefs, wiki_dump_url, num_threads,
def _collect_wiki_arxiv_hrefs(wiki_dump_url, lang, date):
wiki_arxiv_hrefs = []
try:
logger.info('Collecting arxiv from {}'.format(wiki_dump_url))
response = urllib.request.urlopen(wiki_dump_url)
html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')
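The rest of `_collect_wiki_arxiv_hrefs` is collapsed in this hunk, but the pattern it builds on is plain BeautifulSoup link scraping over the dump index page. A standalone sketch of that pattern, with an assumed dump URL and a simplified `.bz2` filter, might look like this:

```
# Hedged sketch of the href-collection pattern; the URL and the filter are
# assumptions, not witokit's exact logic (which is collapsed in the diff).
import urllib.request

from bs4 import BeautifulSoup

wiki_dump_url = 'https://dumps.wikimedia.org/enwiki/latest/'  # assumed example
with urllib.request.urlopen(wiki_dump_url) as response:
    soup = BeautifulSoup(response.read(), 'html.parser')

# Keep only links to compressed archive files found on the index page.
hrefs = [link.get('href') for link in soup.find_all('a')
         if link.get('href') and link.get('href').endswith('.bz2')]
print('Found {} archive links'.format(len(hrefs)))
```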
@@ -108,36 +110,37 @@ def _extract(args):
total_arxivs))


def _preprocess(output_txt_filepath, lowercase, max_length,
input_xml_filepath):
def _preprocess(output_txt_filepath, lowercase, input_xml_filepath):
"""Extract content of wikipedia XML file.
Extract content of json.text as given by wikiextractor and tokenize
content with spacy. Output one-sentence-per-line, lowercase, tokenized
content with polyglot. Output one-sentence-per-line, lowercase, tokenized
text.
"""
logger.info('Processing content of wikipedia file {}'
.format(input_xml_filepath))
output_filepath = futils.get_output_filepath(input_xml_filepath,
output_txt_filepath)
spacy_nlp = spacy.load('en_core_web_sm')
spacy_nlp.max_length = max_length # avoid bug with very long input
with open(output_filepath, 'w', encoding='utf-8') as output_stream:
logger.info('Writing output to file {}'.format(output_filepath))
for json_object in wikiextractor.extract(input_xml_filepath):
try:
doc = spacy_nlp(json_object['text'])
for sent in doc.sents:
text = Text(json_object['text']) # lang will be guessed
for sent in text.sentences:
if lowercase:
tokens = [token.text.lower().strip() for token in sent]
tokens = [token.lower().strip() for token in sent.words]
else:
tokens = [token.text.strip() for token in sent]
tokens = [token.strip() for token in sent.words]
output_sent = ' '.join(tokens)
print(output_sent, file=output_stream)
except UnicodeEncodeError as err:
logger.error('UnicodeEncodeError processing '
'json_object[\'text\'] with spacy: {}'
.format(str(err)))
except ValueError as err:
logger.warning('Skipping empty text sequence')
except pycld2.error as err:
logger.warning('{}. Skipping sequence'.format(str(err)))
return input_xml_filepath
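The heart of this change is the switch from spaCy to polyglot for sentence splitting and tokenization. A self-contained sketch of the tokenization pattern used above, with an invented input string, reads:

```
# Standalone sketch of the polyglot tokenization now used in _preprocess.
# The input text is invented for illustration.
from polyglot.text import Text

raw = 'WiToKit preprocesses Wikipedia dumps. It outputs one sentence per line.'
text = Text(raw)  # the language is guessed internally (via pycld2)
for sent in text.sentences:
    tokens = [token.lower().strip() for token in sent.words]
    print(' '.join(tokens))
```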


@@ -149,26 +152,32 @@ def _process(args):
input_filepaths = futils.get_input_filepaths(args.wiki_input_dirpath)
total_arxivs = len(input_filepaths)
arxiv_num = 0
with multiprocessing.Pool(processes=args.num_threads,
maxtasksperchild=args.max_tasks) as pool:
preprocess = functools.partial(_preprocess, args.wiki_output_filepath,
args.lower, args.max_length)
for process in pool.imap_unordered(preprocess, input_filepaths):
arxiv_num += 1
logger.info('Done processing content of {}'.format(process))
logger.info('Completed processing of {}/{} archives'
.format(arxiv_num, total_arxivs))
# concatenate all .txt files into single output .txt file
logger.info('Concatenating tmp files...')
tmp_filepaths = futils.get_tmp_filepaths(args.wiki_output_filepath)
left = input_filepaths
with open(args.wiki_output_filepath, 'w', encoding='utf-8') as output_strm:
# with multiprocessing.Pool(processes=args.num_threads,
# maxtasksperchild=args.max_tasks) as pool:
# for wiki_input_filepath in input_filepaths:
# _preprocess(args.wiki_output_filepath, args.lower, wiki_input_filepath)
with multiprocessing.Pool(processes=args.num_threads) as pool:
preprocess = functools.partial(
_preprocess, args.wiki_output_filepath, args.lower)
for process in pool.imap_unordered(preprocess, input_filepaths):
arxiv_num += 1
logger.info('Done processing content of {}'.format(process))
logger.info('Completed processing of {}/{} archives'
.format(arxiv_num, total_arxivs))
left = [item for item in left if item != process]
logger.info('Left to process: {}'.format(left))
# concatenate all .txt files into single output .txt file
logger.info('Concatenating tmp files...')
tmp_filepaths = futils.get_tmp_filepaths(args.wiki_output_filepath)
for tmp_filepath in tmp_filepaths:
with open(tmp_filepath, 'r') as tmp_stream:
for line in tmp_stream:
line = line.strip()
print(line, file=output_strm)
logger.info('Done processing content of Wikipedia archives')
shutil.rmtree(futils.get_tmp_dirpath(args.wiki_output_filepath))
logger.info('Done processing content of Wikipedia archives')
shutil.rmtree(futils.get_tmp_dirpath(args.wiki_output_filepath))
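Stepping back from the diff: the reworked `_process` drops `maxtasksperchild`, binds the fixed arguments with `functools.partial`, consumes results through `imap_unordered`, and keeps a `left` list so the log shows which archives remain. A minimal, self-contained sketch of that pool pattern, with invented paths and a stand-in worker, is:

```
# Minimal sketch of the pool pattern used in _process; the file paths and the
# worker body are invented stand-ins, not witokit's real implementation.
import functools
import multiprocessing


def _preprocess(output_filepath, lowercase, input_filepath):
    # Stand-in worker: the real _preprocess writes a .txt file and, like
    # this stub, returns the input path it just handled.
    return input_filepath


if __name__ == '__main__':
    input_filepaths = ['arxiv_1.bz2', 'arxiv_2.bz2', 'arxiv_3.bz2']  # invented
    left = list(input_filepaths)
    with multiprocessing.Pool(processes=2) as pool:
        preprocess = functools.partial(_preprocess, 'wiki.txt', True)
        for done in pool.imap_unordered(preprocess, input_filepaths):
            left = [item for item in left if item != done]
            print('Done processing {}. Left: {}'.format(done, left))
```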


def main():
@@ -218,7 +227,7 @@ def main():
dest='max_length',
help='spacy .max_length option for string '
'processing')
parser_process.add_argument('-t', '--max-tasks', type=int, default=10,
parser_process.add_argument('-t', '--max-tasks', type=int, default=0,
help='max task per child for fine-grained '
'control over python multiprocessing '
'pool memory management')
