-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #11 from utkdigitalinitiatives/checksum_and_hashing
Checksum and hashing
- Loading branch information
Showing
14 changed files
with
520 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
17 changes: 17 additions & 0 deletions
17
tests/fixtures/bad_imports/import_411_20240426180251_885_errored_entries.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
source_identifier,model,remote_files,parents,has_work_type,primary_identifier,local_identifier,ark,acquisition_identifier,oclc,issn,isbn,title,alternative_title,abstract,table_of_contents,date_created,date_issued,date_other,date_created_d,date_issued_d,date_other_d,publisher,utk_publisher,publication_place,utk_place_of_publication,note,extent,instrumentation,first_line,intended_audience,rights_statement,spatial,spatial_local,coordinates,temporal,call_number,bibliographic_citation,provider,intermediate_provider,repository,archival_collection,subject,keyword,form,resource_type,form_local,language,sheetmusic_hostitem,is_part_of,rdf_type,file_language,visibility | ||
mpaekefauver:248_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:248_MODS.xml,mpaekefauver:248_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:309_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:309_MODS.xml,mpaekefauver:309_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:309,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:118_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:118_OBJ.jp2,mpaekefauver:118_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:118,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted | ||
mpaekefauver:99_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:99_MODS.xml,mpaekefauver:99_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:374_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:374_MODS.xml,mpaekefauver:374_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:465_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:465_OBJ.jp2,mpaekefauver:465_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted | ||
mpaekefauver:356_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:356_MODS.xml,mpaekefauver:356_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:356,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:254_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:254_OBJ.jp2,mpaekefauver:254_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:254,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted | ||
mpaekefauver:451_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:451_OBJ.jp2,mpaekefauver:451_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:451,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted | ||
mpaekefauver:104_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:104_MODS.xml,mpaekefauver:104_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:104,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:419_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:419_MODS.xml,mpaekefauver:419_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:318_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:318_OBJ.jp2,mpaekefauver:318_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:318,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted | ||
mpaekefauver:463_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:463_MODS.xml,mpaekefauver:463_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:463,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:350_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:350_MODS.xml,mpaekefauver:350_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted | ||
mpaekefauver:108_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:108_OBJ.jp2,mpaekefauver:108_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:108,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted | ||
mpaekefauver:420_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:420_MODS.xml,mpaekefauver:420_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
|
||
<mods xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-5.xsd"> | ||
<identifier type="local">0116_000050_000351</identifier> | ||
<titleInfo> | ||
<title>Head Start: Helping Families Move from Welfare to Work</title> | ||
</titleInfo> | ||
<location> | ||
<physicalLocation valueURI="http://id.loc.gov/authorities/names/no2017113530">Langston Hughes Library (Children's Defense Fund Haley Farm)</physicalLocation> | ||
</location> | ||
<abstract>This report describes some of the ways in which Head Start agencies are helping to set parents on the path to self-sufficiency. It illustrates the variety of initiatives that are underway relative to the hundreds of Head Start agencies. The report highlights the important work all Head Start agencies are doing to support parents as they move from welfare to work.</abstract> | ||
<relatedItem displayLabel="Digital Collection" type="host"> | ||
<titleInfo> | ||
<title>Children's Defense Fund</title> | ||
</titleInfo> | ||
</relatedItem> | ||
<originInfo> | ||
<dateIssued>1998</dateIssued> | ||
<publisher>Children's Defense Fund (U.S.)</publisher> | ||
</originInfo> | ||
<physicalDescription> | ||
<form authority="aat" valueURI="http://vocab.getty.edu/aat/300026816">reports</form> | ||
<extent>24 pages</extent> | ||
<digitalOrigin>reformatted digital</digitalOrigin> | ||
</physicalDescription> | ||
<language> | ||
<languageTerm authority="iso639-2b" type="code">eng</languageTerm> | ||
</language> | ||
<typeOfResource>text</typeOfResource> | ||
<name authority="naf" valueURI="http://id.loc.gov/authorities/names/n92112591"> | ||
<namePart>Finlay, Belva</namePart> | ||
<role> | ||
<roleTerm authority="marcrelator">Author</roleTerm> | ||
</role> | ||
</name> | ||
<name authority="naf" valueURI="http://id.loc.gov/authorities/names/n86833543"> | ||
<namePart>Blank, Helen</namePart> | ||
<role> | ||
<roleTerm authority="marcrelator">Author</roleTerm> | ||
</role> | ||
</name> | ||
<name> | ||
<namePart> Poersch, Oxendine</namePart> | ||
<role> | ||
<roleTerm authority="marcrelator">Author</roleTerm> | ||
</role> | ||
</name> | ||
<subject> | ||
<geographic valueURI="http://id.loc.gov/authorities/names/n78095330">United States</geographic> | ||
</subject> | ||
<subject> | ||
<topic valueURI="http://id.loc.gov/authorities/subjects/sh85023396 ">Child welfare</topic> | ||
</subject> | ||
<subject> | ||
<topic valueURI="http://id.loc.gov/authorities/subjects/sh87000100">Head Start programs</topic> | ||
</subject> | ||
<subject> | ||
<name valueURI=" http://id.loc.gov/authorities/names/n79059917"> | ||
<namePart>Project Head Start (U.S.)</namePart> | ||
</name> | ||
</subject> | ||
<recordInfo> | ||
<recordContentSource valueURI="http://id.loc.gov/authorities/names/no2017113530">Langston Hughes Library (Children's Defense Fund Haley Farm)</recordContentSource> | ||
</recordInfo> | ||
<note displayLabel="Intermediate Provider">University of Tennessee, Knoxville. Libraries</note> | ||
<accessCondition type="use and reproduction" xlink:href="http://rightsstatements.org/vocab/InC-NC/1.0/">In Copyright - Non-Commercial Use Permitted</accessCondition> | ||
</mods> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import pytest | ||
from utk_exodus.checksum import HashSheet | ||
from pathlib import Path | ||
|
||
# Directory holding the fixture files used by the tests in this module,
# resolved relative to this test file rather than the working directory.
fixtures_path = Path(__file__).parent / "fixtures"
|
||
@pytest.fixture(
    params=[
        {
            "filename": "bad_imports",
            "expected_results": {
                "url": "https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml",
                "checksum": "081a51fae0200f266d2933756d48441c4ea77b1e",
            },
        },
    ]
)
def fixture(request):
    """Return one parametrized case with the fixture directory resolved.

    Args:
        request: The pytest request object; ``request.param`` is one of the
            dictionaries listed in ``params`` above.

    Returns:
        dict: The case's own keys plus ``fixtures_path``, the location of
        ``filename`` underneath the module-level ``fixtures_path``.
    """
    # Build a new dict instead of writing into request.param, so the shared
    # parameter object declared in the decorator is never modified.
    return {
        **request.param,
        "fixtures_path": fixtures_path / request.param["filename"],
    }
|
||
def test_checksum_file(fixture):
    """Check that ``HashSheet.checksum_file`` returns the expected url/checksum pair.

    The URL is taken from the expected results so that the request and the
    assertion cannot drift apart.
    """
    expected = fixture["expected_results"]
    hs = HashSheet(fixture["fixtures_path"], "example.csv")
    results = hs.checksum_file(expected["url"])
    assert results == expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import pytest | ||
from utk_exodus.metadata import KeywordProperty | ||
from pathlib import Path | ||
|
||
# Directory holding the fixture files used by the tests in this module,
# resolved relative to this test file rather than the working directory.
fixtures_path = Path(__file__).parent / "fixtures"

# Prefix-to-URI map handed to the property class under test.
NAMESPACES = {
    "mods": "http://www.loc.gov/mods/v3",
    "xlink": "http://www.w3.org/1999/xlink",
}
|
||
|
||
@pytest.fixture(
    params=[
        {
            "filename": "civilwar_1438.xml",
            "expected_results": {
                "keyword": [
                    "Jurisdiction -- Tennessee, East -- History -- Civil War, 1861-1865",
                    "Actions and defenses -- Tennessee, East -- History -- Civil War, 1861-1865",
                    "Tennessee, East -- Politics and government -- 19th century",
                    "Wallace, Jesse G. -- Correspondence",
                    "Temple, Oliver Perry, 1820-1907 -- Correspondence",
                ]
            },
        },
    ]
)
def fixture(request):
    """Return one parametrized case with the fixture file path resolved.

    Args:
        request: The pytest request object; ``request.param`` is one of the
            dictionaries listed in ``params`` above.

    Returns:
        dict: The case's own keys plus ``fixture_path``, the location of
        ``filename`` underneath the module-level ``fixtures_path``.
    """
    # Build a new dict instead of writing into request.param, so the shared
    # parameter object declared in the decorator is never modified.
    return {
        **request.param,
        "fixture_path": fixtures_path / request.param["filename"],
    }
|
||
|
||
def test_find_method_on_keyword(fixture):
    """Check that ``KeywordProperty.find_topic`` returns the expected keywords."""
    keywords = KeywordProperty(fixture["fixture_path"], NAMESPACES)
    results = keywords.find_topic()
    assert results == fixture["expected_results"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import pytest | ||
from utk_exodus.metadata import SubjectProperty | ||
from pathlib import Path | ||
|
||
# Directory holding the fixture files used by the tests in this module,
# resolved relative to this test file rather than the working directory.
fixtures_path = Path(__file__).parent / "fixtures"

# Prefix-to-URI map handed to the property class under test.
NAMESPACES = {
    "mods": "http://www.loc.gov/mods/v3",
    "xlink": "http://www.w3.org/1999/xlink",
}
|
||
|
||
@pytest.fixture(
    params=[
        {
            "filename": "cdf_13238.xml",
            "expected_results": {
                "subject": [
                    "http://id.loc.gov/authorities/subjects/sh85023396",
                    "http://id.loc.gov/authorities/subjects/sh87000100",
                    "http://id.loc.gov/authorities/names/n79059917",
                ]
            },
        },
        {
            "filename": "egypt_224.xml",
            "expected_results": {
                "subject": ["http://id.loc.gov/authorities/subjects/sh85016233"]
            },
        },
        {
            "filename": "knoxgardens_125.xml",
            "expected_results": {
                "subject": [
                    "http://id.loc.gov/authorities/subjects/sh85101348",
                    "http://id.loc.gov/authorities/subjects/sh85053123",
                    "http://id.loc.gov/authorities/subjects/sh85103022",
                    "http://id.loc.gov/authorities/subjects/sh2008120720",
                ]
            },
        },
    ]
)
def fixture(request):
    """Return one parametrized case with the fixture file path resolved.

    Args:
        request: The pytest request object; ``request.param`` is one of the
            dictionaries listed in ``params`` above.

    Returns:
        dict: The case's own keys plus ``fixture_path``, the location of
        ``filename`` underneath the module-level ``fixtures_path``.
    """
    # Build a new dict instead of writing into request.param, so the shared
    # parameter object declared in the decorator is never modified.
    return {
        **request.param,
        "fixture_path": fixtures_path / request.param["filename"],
    }
|
||
|
||
def test_find_method_on_subject(fixture):
    """Check that ``SubjectProperty.find_topic`` returns the expected subject URIs."""
    subjects = SubjectProperty(fixture["fixture_path"], NAMESPACES)
    results = subjects.find_topic()
    assert results == fixture["expected_results"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .checksum import HashSheet | ||
|
||
__all__ = ["HashSheet"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import hashlib | ||
from csv import DictWriter, DictReader | ||
import os | ||
import requests | ||
from tqdm import tqdm | ||
|
||
|
||
class HashSheet:
    """Collect remote file URLs from CSV sheets and record their sha1 checksums.

    Attributes set by ``__init__``:
        path: The directory that is walked for CSV sheets.
        output: The path of the CSV file created by ``write``.
        all_files: The ``remote_files`` values gathered by ``walk_sheets``.
    """

    def __init__(self, path, output):
        self.path = path
        self.output = output
        self.all_files = self.walk_sheets(path)

    @staticmethod
    def walk_sheets(path):
        """Walk through a directory and return the remote files listed in its csvs.

        Only files whose name ends in ``.csv`` (case-insensitive) are read, and
        rows whose ``remote_files`` cell is empty are skipped.

        Args:
            path (str): The path to the directory to walk through.

        Returns:
            list: The non-empty ``remote_files`` values of every csv found.

        Raises:
            KeyError: If a csv has rows but no ``remote_files`` column.

        Examples:
            >>> hs = HashSheet("tests/fixtures/bad_imports", "example.csv")
            >>> hs.walk_sheets("tests/fixtures/bad_imports")
            ['https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:248_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:309_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:118_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:99_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:374_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:465_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:356_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:254_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:451_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:104_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:419_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:318_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:463_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:350_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:108_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:420_MODS.xml']
        """
        all_files = []
        # Distinct loop names keep the ``path`` argument from being rebound.
        for directory, _subdirectories, filenames in os.walk(path):
            for filename in filenames:
                # Stray non-csv files in the tree are not sheets; skip them.
                if not filename.lower().endswith(".csv"):
                    continue
                sheet_path = os.path.join(directory, filename)
                # newline="" is what the csv module expects of its input file;
                # utf-8-sig also accepts files that start with a byte order mark.
                with open(sheet_path, "r", newline="", encoding="utf-8-sig") as sheet:
                    for row in DictReader(sheet):
                        remote_file = row["remote_files"]
                        # Short or blank rows yield None / "" here; there is
                        # nothing to download for them.
                        if remote_file:
                            all_files.append(remote_file)
        return all_files

    def checksum(self):
        """Calculate the sha1 checksum of all files listed in csvs in a directory.

        Returns:
            list: A list of dictionaries with the url and checksum of each file.

        Examples:
            No example to keep tests running quickly.
        """
        return [self.checksum_file(file) for file in tqdm(self.all_files)]

    @staticmethod
    def checksum_file(file, timeout=60):
        """Calculate the sha1 checksum of a remote file.

        The body is streamed in 8192-byte chunks so it is never held in
        memory as a whole.

        Args:
            file (str): The url of the file to checksum.
            timeout (float): Value passed to ``requests.get`` as ``timeout``,
                so an unresponsive server cannot block the run indefinitely.

        Returns:
            dict: A dictionary with the url and checksum of the file.

        Raises:
            requests.exceptions.HTTPError: If the response status is an error
                (raised by ``raise_for_status``).

        Examples:
            >>> hs = HashSheet("tests/fixtures/bad_imports", "example.csv")
            >>> hs.checksum_file("https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml")
            {'url': 'https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml', 'checksum': '081a51fae0200f266d2933756d48441c4ea77b1e'}
        """
        sha1 = hashlib.sha1()
        # The context manager closes the streamed response even when an
        # exception interrupts the download.
        with requests.get(file, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    sha1.update(chunk)
        return {"url": file, "checksum": sha1.hexdigest()}

    def write(self):
        """Write the url and checksum of every collected file to ``self.output``.

        The output is a csv with the header ``url,checksum``.
        """
        # Compute everything first: a failed download then leaves any existing
        # output file untouched instead of truncated.
        rows = self.checksum()
        # newline="" lets the csv writer control line endings itself.
        with open(self.output, "w", newline="", encoding="utf-8") as csvfile:
            writer = DictWriter(csvfile, fieldnames=["url", "checksum"])
            writer.writeheader()
            writer.writerows(rows)
|
||
|
||
if __name__ == "__main__":
    # Sample run: checksum every file listed in the fixture sheets and write
    # the results to a scratch csv.
    path = "tests/fixtures/bad_imports"
    output = "delete/sample_checksums.csv"
    # open() does not create missing directories, so make sure the scratch
    # directory exists before HashSheet.write() opens the output file.
    os.makedirs(os.path.dirname(output), exist_ok=True)
    checksum = HashSheet(path, output)
    checksum.write()
Oops, something went wrong.