From 5584566773ac2e48e86893c79ca08605c6510a74 Mon Sep 17 00:00:00 2001
From: Trapsilo Bumi <tbumi@thpd.io>
Date: Fri, 20 Jan 2023 22:04:57 +0700
Subject: [PATCH] Miscellaneous updates and fixes (#39)

* Update readme & TODO

* Update some logic and fix typos
---
 README.md                         | 29 +++++++++++++++++++----------
 TODO                              |  1 +
 src/glacier_upload/archives.py    | 26 +++++++++++---------------
 src/glacier_upload/inventories.py |  5 +++++
 src/glacier_upload/upload.py      |  6 +++---
 5 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index f6dd74a..fd5eaf5 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,8 @@
 
 A helper tool to upload and manage archives in
 [Amazon S3 Glacier](https://docs.aws.amazon.com/amazonglacier/latest/dev/introduction.html)
-Vaults.
+Vaults. Amazon S3 Glacier is a cloud storage service that is optimized for
+long-term storage at a relatively low price.
 
 ## Installation
 
@@ -20,9 +21,7 @@ $ pip install glacier_upload
 
 ### Prerequisites
 
-Amazon S3 Glacier is a cloud storage service that is optimized for long term
-storage for a relatively cheap price. To upload an archive to Amazon S3 Glacier
-vault, ensure you have:
+To upload an archive to an Amazon S3 Glacier vault, ensure you have:
 
 -   Created an AWS account
 -   Created an Amazon S3 Glacier vault from the AWS CLI tool or the Management
@@ -58,6 +57,15 @@ There are additional options to customize your upload, such as adding a
 description to the archive or configuring the number of threads or the size of
 parts. Run `glacier upload --help` for more information.
 
+If a multipart upload is interrupted in the middle (because of an exception,
+manual interruption, or another reason), the script will show you the upload ID.
+That upload ID can be used to resume the upload, using the same command but
+adding the `--upload-id` option, like so:
+
+```
+glacier upload --upload-id UPLOAD_ID VAULT_NAME FILE_NAME [FILE_NAME ...]
+```
+
 ### Retrieving an archive
 
 Retrieving an archive in glacier requires two steps. First, initiate a
@@ -84,7 +92,7 @@ a vault contains, you need to request an inventory of the archive, in a similar
 manner to retrieving an archive. To initiate an inventory, run:
 
 ```
-glacier archive init-retrieval VAULT_NAME ARCHIVE_ID
+glacier inventory init-retrieval VAULT_NAME
 ```
 
 Then, the inventory job will take some time to complete. Run the next step to
@@ -92,14 +100,15 @@ both check whether the job is complete and retrieve the inventory if it has been
 completed.
 
 ```
-glacier archive get VAULT_NAME JOB_ID FILE_NAME
+glacier inventory get VAULT_NAME JOB_ID
 ```
 
-### Deleting an archive, deleting an upload, creating/deleting a vault etc.
+### Deleting an archive, deleting an upload job, creating/deleting a vault, etc.
 
-Anything that is not listed above can be done using the AWS CLI. Those
-functionalities are not implemented here to avoid duplication of work, and
-minimize maintenance efforts of this package.
+All jobs other than uploading an archive and requesting/downloading an inventory
+or archive can be done using the AWS CLI. Those functionalities are not
+implemented here to avoid duplication of work, and minimize maintenance efforts
+of this package.
 
 ## Contributing
 
diff --git a/TODO b/TODO
index 377722c..2361afe 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,7 @@
 -   Unit tests
 -   https://help.github.com/en/articles/setting-guidelines-for-repository-contributors
 
+-   Support multipart download of archives like in #25
 -   Add progress indication for archiving
 -   implement auto-retry on multipart upload failure
     -   it should ask the user for confirmation to continue
diff --git a/src/glacier_upload/archives.py b/src/glacier_upload/archives.py
index 6aaad9c..60c2332 100644
--- a/src/glacier_upload/archives.py
+++ b/src/glacier_upload/archives.py
@@ -14,7 +14,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-import json
 import os
 
 import boto3
@@ -46,6 +45,11 @@ def get(vault_name, job_id, file_name):
     except glacier.exceptions.ResourceNotFoundException as e:
         raise click.ClickException(e.response["Error"]["Message"])
 
+    if response["Action"] != "ArchiveRetrieval":
+        raise click.ClickException(
+            "Job is not an archive retrieval. Check the Job ID again."
+        )
+
     click.echo(f"Job status: {response['StatusCode']}")
 
     if not response["Completed"]:
@@ -60,20 +64,12 @@ def get(vault_name, job_id, file_name):
     click.echo("Retrieving job data...")
     response = glacier.get_job_output(vaultName=vault_name, jobId=job_id)
 
-    if response["contentType"] == "application/json":
-        inventory_json = json.load(response["body"])
-        click.echo(json.dumps(inventory_json, indent=2))
-    elif response["contentType"] == "text/csv":
-        click.echo(response["body"].read())
-    else:
-        content_length = int(
-            response["ResponseMetadata"]["HTTPHeaders"]["content-length"]
-        )
-        response_stream = response["body"]
-        try:
-            download_archive(content_length, response_stream, file_name)
-        finally:
-            response_stream.close()
+    content_length = int(response["ResponseMetadata"]["HTTPHeaders"]["content-length"])
+    response_stream = response["body"]
+    try:
+        download_archive(content_length, response_stream, file_name)
+    finally:
+        response_stream.close()
 
 
 def download_archive(content_length, response_stream, file_name):
diff --git a/src/glacier_upload/inventories.py b/src/glacier_upload/inventories.py
index 51a6f6d..c1eacf6 100644
--- a/src/glacier_upload/inventories.py
+++ b/src/glacier_upload/inventories.py
@@ -45,6 +45,11 @@ def get(vault_name, job_id):
     except glacier.exceptions.ResourceNotFoundException as e:
         raise click.ClickException(e.response["Error"]["Message"])
 
+    if response["Action"] != "InventoryRetrieval":
+        raise click.ClickException(
+            "Job is not an inventory retrieval. Check the Job ID again."
+        )
+
     click.echo(f"Inventory status: {response['StatusCode']}")
 
     if not response["Completed"]:
diff --git a/src/glacier_upload/upload.py b/src/glacier_upload/upload.py
index 464f09b..428fac7 100644
--- a/src/glacier_upload/upload.py
+++ b/src/glacier_upload/upload.py
@@ -189,14 +189,14 @@ def multipart_upload(
             futures_list, return_when=concurrent.futures.FIRST_EXCEPTION
         )
         if len(not_done) > 0:
-            # an exception occured
+            # an exception occurred
             for future in not_done:
                 future.cancel()
             for future in done:
                 exc = future.exception()
                 if exc is not None:
                     exc_string = "".join(traceback.format_exception(exc))
-                    click.secho(f"Exception occured: {exc_string}", err=True, fg="red")
+                    click.secho(f"Exception occurred: {exc_string}", err=True, fg="red")
             click.echo(f"Upload can still be resumed. Upload ID: {upload_id}")
             raise click.Abort
         else:
@@ -240,6 +240,7 @@ def upload_part(
     range_header = f"bytes {start_pos}-{end_pos}/{file_size_bytes}"
     part_num = start_pos // part_size_bytes
     percentage = part_num / num_parts
+    checksum = calculate_tree_hash(part, part_size_bytes)
 
     click.echo(f"Uploading part {part_num + 1} of {num_parts}... ({percentage:.2%})")
 
@@ -249,7 +250,6 @@ def upload_part(
             response = glacier.upload_multipart_part(
                 vaultName=vault_name, uploadId=upload_id, range=range_header, body=part
             )
-            checksum = calculate_tree_hash(part, part_size_bytes)
             if checksum != response["checksum"]:
                 raise Exception("Local checksum does not match Glacier checksum")