Skip to content
This repository has been archived by the owner on Mar 8, 2023. It is now read-only.

Commit

Permalink
fix #85
Browse files: browse the repository at this point in the history
  • Loading branch information
Mte90 committed Jul 31, 2020
1 parent e258f98 commit e66cae3
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 5 deletions.
2 changes: 1 addition & 1 deletion MITADS/merge_txt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
i=0
parsing=''
loop=0
for f in ./corpus_api.py ./eulogos_chat_importer.py ./ananas_exporter.py ./tg_ita_exporter.py ./ted_importer.py ./gutenberg_exporter.py ./wikiquote_exporter.py ./wikisource_importer.py ./opensubtitles_exporter.py
for f in ./wikisource_importer.py ./opensubtitles_exporter.py ./corpus_api.py ./eulogos_chat_importer.py ./ananas_exporter.py ./tg_ita_exporter.py ./ted_importer.py ./gutenberg_exporter.py ./wikiquote_exporter.py
do
echo "========="
echo $f
Expand Down
24 changes: 20 additions & 4 deletions MITADS/wikisource_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
from utils import sanitize, line_rules, download
from urllib import parse
import time
import os

OUTFILE = "output/wikisource.txt"
PARSING = './parsing/wikisource/'
if not os.path.isdir(PARSING):
os.mkdir(PARSING)
DISCARD_FILE = 'output/discarded/wikisource.json'
DOWNLOAD_LINK = 'https://wsexport.wmflabs.org/tool/book.php?lang=it&format=txt&page='

Expand Down Expand Up @@ -33,8 +37,17 @@ def process_line(line, out_file):


def process_book(book, out_file):
book_file = PARSING + book.replace('/','-') + '.txt'
book = parse.quote(book) # percent-encode (URL-quote) the book title so non-ASCII characters are safe in the download URL
raw_text = download_me.download_page(DOWNLOAD_LINK + book)
if not os.path.isfile(book_file):
print(" Downloading in progress")
time.sleep(5) # to avoid being banned from wikipedia servers for excess in requests
raw_text = download_me.download_page(DOWNLOAD_LINK + book)
result = open(book_file, "w", encoding='utf-8')
result.write(raw_text)
else:
print(" Already downloaded in " + book_file)
raw_text = open(book_file, 'r').read()
raw_text = clean_me.maybe_normalize(raw_text)
raw_text = clean_me.prepare_splitlines(raw_text).splitlines()
tot_lines = 0
Expand All @@ -52,9 +65,12 @@ def main():

tot_lines = 0
for count, book in enumerate(books):
time.sleep(5) # to avoid being banned from wikipedia servers for excess in requests
print(" Processing book : {}\n {} of {}".format(book, count, len(books)))
tot_lines += process_book(book, result)
print(" Processing book: {}\n {} of {}".format(book, count, len(books)))
try:
tot_lines += process_book(book, result)
except:
# if processing fails, try once more
tot_lines += process_book(book, result)

result.close()

Expand Down

0 comments on commit e66cae3

Please sign in to comment.