Skip to content

Commit

Permalink
Merge pull request #31 from unt-libraries/add-token-support
Browse files Browse the repository at this point in the history
Add token support
  • Loading branch information
ldko authored Jul 23, 2019
2 parents 1fe354c + 4d6ca9e commit 20b2991
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 20 deletions.
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ That gives you usage instructions:

```
usage: wasapi-client [-h] [-b BASE_URI] [-d DESTINATION] [-l LOG] [-n] [-v]
[--profile PROFILE | -u USER]
[--profile PROFILE | -u USER | -t TOKEN]
[-c | -m | -p PROCESSES | -s | -r]
[--collection COLLECTION [COLLECTION ...]]
[--filename FILENAME] [--crawl CRAWL]
Expand Down Expand Up @@ -57,6 +57,8 @@ optional arguments:
-v, --verbose log verbosely; -v is INFO, -vv is DEBUG
--profile PROFILE profile to use for API authentication
-u USER, --user USER username for API authentication
-t TOKEN, --token TOKEN
token for API authentication
-c, --count print number of files for download and exit
-m, --manifest generate checksum files only and exit
-p PROCESSES, --processes PROCESSES
Expand Down Expand Up @@ -109,12 +111,26 @@ Order of precedence is command line, environment, config file.

The following command downloads the WARC files available from a crawl
with `crawl id` 256119 and logs program output to a file named
`out.log`. Downloads are carried out by one process.
`out.log`. The program will prompt the user to enter the password for
user `myusername`. Downloads are carried out by one process.

```
$ wasapi-client -u myusername --crawl 256119 --log /tmp/out.log -p 1
```

The following command downloads similarly, but user credentials are
supplied by a configuration file.

```
$ wasapi-client --profile unt --crawl 256119 --log out.log -p 1
```

You may supply an API token instead of user credentials.

```
$ wasapi-client --token thisistheAPItokenIwasgiven --crawl 256119 --log out.log -p 1
```

The following command downloads the WARC files available from crawls
that occurred in the specified time range. Verbose logging is being
written to a file named out.log. Downloads are happening via four
Expand Down
4 changes: 3 additions & 1 deletion tests/test_wasapi_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,10 @@ def __init__(self):
class Test_make_session:
def test_make_session_auth(self):
auth = ('user', 'pass')
session = wc.make_session(auth)
headers = {'Authorization': 'Token lalala'}
session = wc.make_session(auth, headers)
assert session.auth == auth
assert 'Authorization' in session.headers

def test_make_session_no_auth(self):
session = wc.make_session(None)
Expand Down
49 changes: 32 additions & 17 deletions wasapi_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,14 @@ class WASAPIManifestError(Exception):
pass


def make_session(auth=None):
def make_session(auth=None, headers={}):
"""Make a session that will store our auth.
`auth` is a tuple of the form (user, password)
"""
session = requests.Session()
session.auth = auth
session.headers.update(headers)
return session


Expand All @@ -100,17 +101,17 @@ def get_webdata(webdata_uri, session):
sys.exit('Non-JSON response from {}:\n{}'.format(webdata_uri, err))


def get_files_count(webdata_uri, auth=None):
def get_files_count(webdata_uri, auth=None, headers={}):
"""Return total number of downloadable files."""
session = make_session(auth)
session = make_session(auth, headers)
webdata = get_webdata(webdata_uri, session)
session.close()
return webdata.get('count', None)


def get_files_size(page_uri, auth=None):
def get_files_size(page_uri, auth=None, headers={}):
"""Return total size (bytes) of downloadable files."""
session = make_session(auth)
session = make_session(auth, headers)
total = 0
count = 0
webdata = None
Expand Down Expand Up @@ -145,7 +146,8 @@ class Downloads:
each available hash algorithm.
"""

def __init__(self, page_uri, auth=None, download=True, destination=''):
def __init__(self, page_uri, auth=None, download=True, destination='',
headers={}):
self.page_uri = page_uri
self.auth = auth
self.download = download
Expand All @@ -154,11 +156,12 @@ def __init__(self, page_uri, auth=None, download=True, destination=''):
self.checksums = defaultdict(list)
self.urls = []
self.destination = '' if destination == '.' else destination
self.headers = headers
self.populate_downloads()

def populate_downloads(self):
"""Repeat webdata requests to gather downloadable file info."""
session = make_session(self.auth)
session = make_session(self.auth, self.headers)
current_uri = self.page_uri
while current_uri:
webdata = get_webdata(current_uri, session)
Expand Down Expand Up @@ -333,11 +336,11 @@ class Downloader(multiprocessing.Process):
"""Worker for downloading web files with a persistent session."""

def __init__(self, get_q, result_q, log_q, log_level=logging.ERROR,
auth=None, destination='.', *args, **kwargs):
auth=None, destination='.', headers={}, *args, **kwargs):
super(Downloader, self).__init__(*args, **kwargs)
self.get_q = get_q
self.result_q = result_q
self.session = make_session(auth)
self.session = make_session(auth, headers)
self.destination = destination
configure_worker_logging(log_q, log_level)

Expand Down Expand Up @@ -430,6 +433,10 @@ def _parse_args(args=sys.argv[1:]):
'--user',
dest='user',
help='username for API authentication')
auth_group.add_argument('-t',
'--token',
dest='token',
help='token for API authentication')

out_group = parser.add_mutually_exclusive_group()
out_group.add_argument('-c',
Expand Down Expand Up @@ -569,39 +576,46 @@ def stop_listener_logging():
query = ''
webdata_uri = '{}{}'.format(args.base_uri, query)

# Generate authentication tuple for the API calls.
auth = get_credentials(args.user, args.profile)
# Set up authentication.
auth = None
headers = {}
if args.token:
# Set the HTTP Authentication header.
headers['Authorization'] = 'Token {}'.format(args.token)
else:
# Generate authentication tuple for the API calls.
auth = get_credentials(args.user, args.profile)

# If user wants the size, don't download files.
if args.size:
count, size = get_files_size(webdata_uri, auth)
count, size = get_files_size(webdata_uri, auth, headers)
print('Number of Files: ', count)
print('Size of Files: ', convert_bytes(size))
sys.exit()

# If user wants a count, don't download files.
if args.count:
print('Number of Files: ', get_files_count(webdata_uri, auth))
print('Number of Files: ', get_files_count(webdata_uri, auth, headers))
sys.exit()

# Process webdata requests to generate checksum files.
if args.manifest:
downloads = Downloads(webdata_uri, auth, download=False,
destination=args.destination)
destination=args.destination, headers=headers)
downloads.generate_manifests()
sys.exit()

# Print the URLs for files that can be downloaded; don't download them.
if args.urls:
downloads = Downloads(webdata_uri, auth, download=False,
destination=args.destination)
destination=args.destination, headers=headers)
for url in downloads.urls:
print(url)
sys.exit()

# Process webdata requests to fill webdata file queue.
downloads = Downloads(webdata_uri, auth, download=True,
destination=args.destination)
destination=args.destination, headers=headers)

# Write manifest file(s).
if not args.skip_manifest:
Expand All @@ -617,7 +631,8 @@ def stop_listener_logging():
except NotImplementedError:
num_processes = args.processes
for _ in range(num_processes):
dp = Downloader(get_q, result_q, log_q, log_level, auth, args.destination)
dp = Downloader(get_q, result_q, log_q, log_level, auth,
args.destination, headers=headers)
dp.start()
download_processes.append(dp)
for dp in download_processes:
Expand Down

0 comments on commit 20b2991

Please sign in to comment.