Skip to content

Commit

Permalink
Merge pull request #11 from utkdigitalinitiatives/checksum_and_hashing
Browse files Browse the repository at this point in the history
Checksum and hashing
  • Loading branch information
Weston49 authored May 16, 2024
2 parents 85c7ed2 + 1b26a0a commit 62cfa69
Show file tree
Hide file tree
Showing 14 changed files with 520 additions and 116 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,18 @@ Before you start, you need to have a few things in place:

There are several interfaces for the application.

You can always find out what interfaces exist with:

```shell
exodus --help
```

Similarly, you can get help for a specific interface with:

```shell
exodus <interface> --help
```

If you want to get works and files, and you have metadata files, use:

```shell
Expand Down Expand Up @@ -74,6 +86,12 @@ If you want to generate a full template for a metadata import, use:
exodus generate_template --model book -o /path/to/sheet.csv
```
If you want to generate a sheet of checksums for files that failed to import, use:
```shell
exodus hash_errors --path /path/to/directory --output /path/to/sheet.csv
```
## What's Missing Here Right Now

* The ability to create pcdm:Collection objects.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "utk-exodus"
version = "0.1.7"
version = "0.1.8"
description = "A tool for building import sheets from UTK legacy systems"
authors = ["Mark Baggett <mbagget1@utk.edu>"]
readme = "README.md"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
source_identifier,model,remote_files,parents,has_work_type,primary_identifier,local_identifier,ark,acquisition_identifier,oclc,issn,isbn,title,alternative_title,abstract,table_of_contents,date_created,date_issued,date_other,date_created_d,date_issued_d,date_other_d,publisher,utk_publisher,publication_place,utk_place_of_publication,note,extent,instrumentation,first_line,intended_audience,rights_statement,spatial,spatial_local,coordinates,temporal,call_number,bibliographic_citation,provider,intermediate_provider,repository,archival_collection,subject,keyword,form,resource_type,form_local,language,sheetmusic_hostitem,is_part_of,rdf_type,file_language,visibility
mpaekefauver:248_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:248_MODS.xml,mpaekefauver:248_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:309_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:309_MODS.xml,mpaekefauver:309_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:309,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:118_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:118_OBJ.jp2,mpaekefauver:118_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:118,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:99_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:99_MODS.xml,mpaekefauver:99_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:374_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:374_MODS.xml,mpaekefauver:374_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:465_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:465_OBJ.jp2,mpaekefauver:465_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:356_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:356_MODS.xml,mpaekefauver:356_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:356,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:254_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:254_OBJ.jp2,mpaekefauver:254_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:254,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:451_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:451_OBJ.jp2,mpaekefauver:451_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:451,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:104_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:104_MODS.xml,mpaekefauver:104_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:104,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:419_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:419_MODS.xml,mpaekefauver:419_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:318_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:318_OBJ.jp2,mpaekefauver:318_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:318,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:463_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:463_MODS.xml,mpaekefauver:463_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:463,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:350_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:350_MODS.xml,mpaekefauver:350_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:108_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:108_OBJ.jp2,mpaekefauver:108_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:108,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:420_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:420_MODS.xml,mpaekefauver:420_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
66 changes: 66 additions & 0 deletions tests/fixtures/cdf_13238.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@

<mods xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-5.xsd">
<identifier type="local">0116_000050_000351</identifier>
<titleInfo>
<title>Head Start: Helping Families Move from Welfare to Work</title>
</titleInfo>
<location>
<physicalLocation valueURI="http://id.loc.gov/authorities/names/no2017113530">Langston Hughes Library (Children&apos;s Defense Fund Haley Farm)</physicalLocation>
</location>
<abstract>This report describes some of the ways in which Head Start agencies are helping to set parents on the path to self-sufficiency. It illustrates the variety of initiatives that are underway relative to the hundreds of Head Start agencies. The report highlights the important work all Head Start agencies are doing to support parents as they move from welfare to work.</abstract>
<relatedItem displayLabel="Digital Collection" type="host">
<titleInfo>
<title>Children&apos;s Defense Fund</title>
</titleInfo>
</relatedItem>
<originInfo>
<dateIssued>1998</dateIssued>
<publisher>Children&apos;s Defense Fund (U.S.)</publisher>
</originInfo>
<physicalDescription>
<form authority="aat" valueURI="http://vocab.getty.edu/aat/300026816">reports</form>
<extent>24 pages</extent>
<digitalOrigin>reformatted digital</digitalOrigin>
</physicalDescription>
<language>
<languageTerm authority="iso639-2b" type="code">eng</languageTerm>
</language>
<typeOfResource>text</typeOfResource>
<name authority="naf" valueURI="http://id.loc.gov/authorities/names/n92112591">
<namePart>Finlay, Belva</namePart>
<role>
<roleTerm authority="marcrelator">Author</roleTerm>
</role>
</name>
<name authority="naf" valueURI="http://id.loc.gov/authorities/names/n86833543">
<namePart>Blank, Helen</namePart>
<role>
<roleTerm authority="marcrelator">Author</roleTerm>
</role>
</name>
<name>
<namePart> Poersch, Oxendine</namePart>
<role>
<roleTerm authority="marcrelator">Author</roleTerm>
</role>
</name>
<subject>
<geographic valueURI="http://id.loc.gov/authorities/names/n78095330">United States</geographic>
</subject>
<subject>
<topic valueURI="http://id.loc.gov/authorities/subjects/sh85023396 ">Child welfare</topic>
</subject>
<subject>
<topic valueURI="http://id.loc.gov/authorities/subjects/sh87000100">Head Start programs</topic>
</subject>
<subject>
<name valueURI=" http://id.loc.gov/authorities/names/n79059917">
<namePart>Project Head Start (U.S.)</namePart>
</name>
</subject>
<recordInfo>
<recordContentSource valueURI="http://id.loc.gov/authorities/names/no2017113530">Langston Hughes Library (Children&apos;s Defense Fund Haley Farm)</recordContentSource>
</recordInfo>
<note displayLabel="Intermediate Provider">University of Tennessee, Knoxville. Libraries</note>
<accessCondition type="use and reproduction" xlink:href="http://rightsstatements.org/vocab/InC-NC/1.0/">In Copyright - Non-Commercial Use Permitted</accessCondition>
</mods>
28 changes: 28 additions & 0 deletions tests/test_checksum_checksum_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pytest
from utk_exodus.checksum import HashSheet
from pathlib import Path

# Directory that holds the fixture files, resolved relative to this test module
fixtures_path = Path(__file__).parent / "fixtures"

# One entry per scenario: the fixture directory to walk and the checksum row
# expected back for the remote file hashed by the test.
_CHECKSUM_CASES = [
    {
        "filename": "bad_imports",
        "expected_results": {
            "url": "https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml",
            "checksum": "081a51fae0200f266d2933756d48441c4ea77b1e",
        },
    },
]


@pytest.fixture(params=_CHECKSUM_CASES)
def fixture(request):
    """Return the current scenario with ``fixtures_path`` added.

    ``fixtures_path`` is the fixtures directory joined with the scenario's
    ``filename``; the scenario dict is updated in place and returned.
    """
    scenario = request.param
    scenario["fixtures_path"] = fixtures_path / scenario.get("filename")
    return scenario

def test_checksum_file(fixture):
    """Check that ``checksum_file`` returns the expected url/checksum pair.

    The file is fetched over HTTP, so this test needs network access.
    """
    remote_url = "https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml"
    sheet = HashSheet(fixture.get("fixtures_path"), "example.csv")
    assert sheet.checksum_file(remote_url) == fixture["expected_results"]
40 changes: 40 additions & 0 deletions tests/test_metadata_keword.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pytest
from utk_exodus.metadata import KeywordProperty
from pathlib import Path

# Directory that holds the fixture files, resolved relative to this test module
fixtures_path = Path(__file__).parent / "fixtures"

# XML namespace prefixes (MODS and XLink) passed to the property class under test
NAMESPACES = {
"mods": "http://www.loc.gov/mods/v3",
"xlink": "http://www.w3.org/1999/xlink",
}


# One entry per scenario: the MODS fixture to parse and the keywords expected from it.
_KEYWORD_CASES = [
    {
        "filename": "civilwar_1438.xml",
        "expected_results": {
            "keyword": [
                "Jurisdiction -- Tennessee, East -- History -- Civil War, 1861-1865",
                "Actions and defenses -- Tennessee, East -- History -- Civil War, 1861-1865",
                "Tennessee, East -- Politics and government -- 19th century",
                "Wallace, Jesse G. -- Correspondence",
                "Temple, Oliver Perry, 1820-1907 -- Correspondence",
            ]
        },
    },
]


@pytest.fixture(params=_KEYWORD_CASES)
def fixture(request):
    """Return the current scenario with ``fixture_path`` added.

    ``fixture_path`` is the fixtures directory joined with the scenario's
    ``filename``; the scenario dict is updated in place and returned.
    """
    scenario = request.param
    scenario["fixture_path"] = fixtures_path / scenario.get("filename")
    return scenario


def test_find_method_on_keyword(fixture):
    """Check that ``KeywordProperty.find_topic`` returns the expected keywords."""
    keyword_property = KeywordProperty(fixture.get("fixture_path"), NAMESPACES)
    assert keyword_property.find_topic() == fixture["expected_results"]
12 changes: 12 additions & 0 deletions tests/test_metadata_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@
'copyright_holder': ['http://id.loc.gov/authorities/names/n79144615'],
'utk_creator': ['Kefauver, Estes 1903-1963']
}
},
{
"filename": "cdf_13238.xml",
"expected_results": {
'author': [
'http://id.loc.gov/authorities/names/n92112591',
'http://id.loc.gov/authorities/names/n86833543'
],
'utk_author': [
'Poersch, Oxendine'
]
}
}
]
)
Expand Down
55 changes: 55 additions & 0 deletions tests/test_metadata_subject.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest
from utk_exodus.metadata import SubjectProperty
from pathlib import Path

# Directory that holds the fixture files, resolved relative to this test module
fixtures_path = Path(__file__).parent / "fixtures"

# XML namespace prefixes (MODS and XLink) passed to the property class under test
NAMESPACES = {
"mods": "http://www.loc.gov/mods/v3",
"xlink": "http://www.w3.org/1999/xlink",
}


# One entry per scenario: the MODS fixture to parse and the subject URIs expected from it.
_SUBJECT_CASES = [
    {
        "filename": "cdf_13238.xml",
        "expected_results": {
            "subject": [
                "http://id.loc.gov/authorities/subjects/sh85023396",
                "http://id.loc.gov/authorities/subjects/sh87000100",
                "http://id.loc.gov/authorities/names/n79059917",
            ]
        },
    },
    {
        "filename": "egypt_224.xml",
        "expected_results": {
            "subject": ["http://id.loc.gov/authorities/subjects/sh85016233"]
        },
    },
    {
        "filename": "knoxgardens_125.xml",
        "expected_results": {
            "subject": [
                "http://id.loc.gov/authorities/subjects/sh85101348",
                "http://id.loc.gov/authorities/subjects/sh85053123",
                "http://id.loc.gov/authorities/subjects/sh85103022",
                "http://id.loc.gov/authorities/subjects/sh2008120720",
            ]
        },
    },
]


@pytest.fixture(params=_SUBJECT_CASES)
def fixture(request):
    """Return the current scenario with ``fixture_path`` added.

    ``fixture_path`` is the fixtures directory joined with the scenario's
    ``filename``; the scenario dict is updated in place and returned.
    """
    scenario = request.param
    scenario["fixture_path"] = fixtures_path / scenario.get("filename")
    return scenario


def test_find_method_on_subject(fixture):
    """Check that ``SubjectProperty.find_topic`` returns the expected subject URIs."""
    subject_property = SubjectProperty(fixture.get("fixture_path"), NAMESPACES)
    assert subject_property.find_topic() == fixture["expected_results"]
2 changes: 2 additions & 0 deletions utk_exodus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .curate import FileCurator
from .validate import ValidateMigration
from .fedora import FedoraObject
from.checksum import HashSheet
from .controller import InterfaceController
from .combine import ImportRefactor
from .template import ImportTemplate
Expand All @@ -13,6 +14,7 @@
"FedoraObject",
"FileCurator",
"FileOrganizer",
"HashSheet",
"ImportRefactor",
"ImportTemplate",
"InterfaceController",
Expand Down
3 changes: 3 additions & 0 deletions utk_exodus/checksum/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .checksum import HashSheet

__all__ = ["HashSheet"]
91 changes: 91 additions & 0 deletions utk_exodus/checksum/checksum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import hashlib
from csv import DictWriter, DictReader
import os
import requests
from tqdm import tqdm


class HashSheet:
    """Build a CSV of SHA-1 checksums for the remote files listed in import sheets.

    Attributes:
        path: Directory that is walked for CSV sheets.
        output: Path of the CSV file created by ``write``.
        all_files: Values collected from the ``remote_files`` column of every
            sheet found under ``path``.
    """

    def __init__(self, path, output):
        self.path = path
        self.output = output
        # The sheets are read eagerly, at construction time.
        self.all_files = self.walk_sheets(path)

    @staticmethod
    def walk_sheets(path):
        """Walk through a directory and return every ``remote_files`` value found.

        Every file below ``path`` is read as a CSV with a header row. Rows with
        no ``remote_files`` value (and sheets without that column) are skipped
        so that only usable URLs are returned. Directories and files are
        visited in sorted order, which keeps the result deterministic.

        Args:
            path (str): The path to the directory to walk through.

        Returns:
            list: The ``remote_files`` values from all sheets in the directory.

        Examples:
            >>> hs = HashSheet("tests/fixtures/bad_imports", "example.csv")
            >>> hs.walk_sheets("tests/fixtures/bad_imports")
            ['https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:248_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:309_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:118_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:99_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:374_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:465_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:356_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:254_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:451_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:104_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:419_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:318_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:463_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:350_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:108_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:420_MODS.xml']
        """
        all_files = []
        for root, directories, files in os.walk(path):
            # Sorting in place steers os.walk's descent order as well.
            directories.sort()
            for filename in sorted(files):
                sheet_path = os.path.join(root, filename)
                # newline="" is what the csv module expects for correct
                # handling of newlines embedded in quoted fields.
                with open(sheet_path, "r", newline="", encoding="utf-8") as sheet:
                    for row in DictReader(sheet):
                        remote_file = row.get("remote_files")
                        if remote_file:
                            all_files.append(remote_file)
        return all_files

    def checksum(self):
        """Calculate the sha1 checksum of all files listed in csvs in a directory.

        Returns:
            list: A list of dictionaries with the url and checksum of each file.

        Examples:
            No example to keep tests running quickly.
        """
        # tqdm wraps the iteration so long runs show progress.
        return [self.checksum_file(file) for file in tqdm(self.all_files)]

    @staticmethod
    def checksum_file(file, timeout=60):
        """Calculate the sha1 checksum of a remote file.

        The body is streamed in chunks so large files are never held in
        memory, and the connection is released as soon as hashing finishes.

        Args:
            file (str): The URL of the file to checksum.
            timeout (float): Timeout in seconds handed to ``requests.get`` so
                an unresponsive server cannot block forever.

        Returns:
            dict: A dictionary with the url and checksum of the file.

        Raises:
            requests.HTTPError: If the server answers with an error status.

        Examples:
            >>> hs = HashSheet("tests/fixtures/bad_imports", "example.csv")
            >>> hs.checksum_file("https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml")
            {'url': 'https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml', 'checksum': '081a51fae0200f266d2933756d48441c4ea77b1e'}
        """
        sha1 = hashlib.sha1()
        # A streamed response keeps its connection open until it is closed,
        # so use it as a context manager.
        with requests.get(file, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    sha1.update(chunk)
        return {"url": file, "checksum": sha1.hexdigest()}

    def write(self):
        """Write the url and checksum of every listed file to ``self.output``.

        The output file is opened before any file is hashed, so an unwritable
        output path fails before the downloads start.
        """
        # newline="" stops the csv module's own line endings from being
        # translated again, which would produce blank rows on Windows.
        with open(self.output, "w", newline="", encoding="utf-8") as csvfile:
            writer = DictWriter(csvfile, fieldnames=["url", "checksum"])
            writer.writeheader()
            writer.writerows(self.checksum())


if __name__ == "__main__":
    # Manual run against the bundled fixture sheets; this downloads every
    # listed file, so it needs network access.
    path = "tests/fixtures/bad_imports"
    output = "delete/sample_checksums.csv"
    # open() does not create missing parent directories, so make sure the
    # scratch directory exists before HashSheet.write() opens the output file.
    os.makedirs(os.path.dirname(output), exist_ok=True)
    checksum = HashSheet(path, output)
    checksum.write()
Loading

0 comments on commit 62cfa69

Please sign in to comment.