Skip to content

Commit

Permalink
use argparse to get arguments, including optional number of threads a…
Browse files Browse the repository at this point in the history
…nd connections
  • Loading branch information
gcerretani committed Nov 20, 2021
1 parent 029b440 commit 2a3d500
Showing 1 changed file with 26 additions and 14 deletions.
40 changes: 26 additions & 14 deletions antenati.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
antenati.py: a tool to download data from the Portale Antenati
"""

__author__ = "Giovanni Cerretani"
__copyright__ = "Copyright (c) 2021, MIT License"
__author__ = 'Giovanni Cerretani'
__copyright__ = 'Copyright (c) 2021, Giovanni Cerretani'
__license__ = 'MIT License'
__version__ = '2.1'

import argparse
import json
import sys
import os
import re
import concurrent.futures
Expand All @@ -16,7 +18,7 @@
import slugify
import humanize

def get_manifest_from_url(url):
def get_mirador_manifest(url):
pool_manager = urllib3.PoolManager()
http_reply = pool_manager.request('GET', url)
manifest_url = None
Expand All @@ -29,7 +31,7 @@ def get_manifest_from_url(url):
http_reply = pool_manager.request('GET', manifest_url)
return json.loads(http_reply.data.decode('utf-8'))

def get_foldername_from_manifest(manifest):
def generate_foldername(manifest):
archive_label = manifest['label']
archive_content_type = 'unknown'
for metadata in manifest['metadata']:
Expand Down Expand Up @@ -59,11 +61,11 @@ def run(img, pool):
http_reply_size = len(http_reply.data)
return http_reply_size

def get_images(manifest):
def get_images(manifest, n_workers, n_connections):
total_size = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
host = 'iiif-antenati.san.beniculturali.it'
pool_http = urllib3.HTTPSConnectionPool(host, maxsize=10, block=True)
pool_http = urllib3.HTTPSConnectionPool(host, maxsize=n_connections, block=True)
canvases = manifest['sequences'][0]['canvases']
img_list = [ get_img_data(i) for i in canvases ]
future_img = { executor.submit(run, i, pool_http): i for i in img_list }
Expand All @@ -78,23 +80,33 @@ def get_images(manifest):
total_size = total_size + size
print(f'{filename} done ({humanize.naturalsize(size)})')
return total_size

def print_result(total_size):
print(f'Done. Total size: {humanize.naturalsize(total_size)}')

def main():

# Parse arguments
parser = argparse.ArgumentParser(
description='a tool to download data from the Portale Antenati',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('url', metavar='URL', type=str, help='url of the gallery page')
parser.add_argument('-n', '--nthreads', type=int, help='max n. of threads', default=8)
parser.add_argument('-c', '--nconn', type=int, help='max n. of connections', default=4)
args = parser.parse_args()

# Get Mirador manifest from HTTP
manifest = get_manifest_from_url(sys.argv[1])
manifest = get_mirador_manifest(args.url)

# Get folder name from metadata
foldername = get_foldername_from_manifest(manifest)
foldername = generate_foldername(manifest)

# Check if folder already exists and chdir to it
check_folder(foldername)

# Run
total_size = get_images(manifest)
total_size = get_images(manifest, args.nthreads, args.nconn)

# Done
print_result(total_size)
Expand Down

0 comments on commit 2a3d500

Please sign in to comment.