From f283bd9184e540da476f9f49a03344852b6d9cbb Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Wed, 15 May 2024 11:29:00 -0400 Subject: [PATCH 01/11] Add Hash Sheet. --- utk_exodus/__init__.py | 2 ++ utk_exodus/checksum/__init__.py | 3 ++ utk_exodus/checksum/checksum.py | 49 +++++++++++++++++++++++++++++++++ utk_exodus/exodus.py | 27 ++++++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 utk_exodus/checksum/__init__.py create mode 100644 utk_exodus/checksum/checksum.py diff --git a/utk_exodus/__init__.py b/utk_exodus/__init__.py index 29bf09d..180ba38 100644 --- a/utk_exodus/__init__.py +++ b/utk_exodus/__init__.py @@ -4,6 +4,7 @@ from .curate import FileCurator from .validate import ValidateMigration from .fedora import FedoraObject +from .checksum import HashSheet from .controller import InterfaceController from .combine import ImportRefactor from .template import ImportTemplate @@ -13,6 +14,7 @@ "FedoraObject", "FileCurator", "FileOrganizer", + "HashSheet", "ImportRefactor", "ImportTemplate", "InterfaceController", diff --git a/utk_exodus/checksum/__init__.py b/utk_exodus/checksum/__init__.py new file mode 100644 index 0000000..cad9434 --- /dev/null +++ b/utk_exodus/checksum/__init__.py @@ -0,0 +1,3 @@ +from .checksum import HashSheet + +__all__ = ["HashSheet"] \ No newline at end of file diff --git a/utk_exodus/checksum/checksum.py b/utk_exodus/checksum/checksum.py new file mode 100644 index 0000000..b9e189a --- /dev/null +++ b/utk_exodus/checksum/checksum.py @@ -0,0 +1,49 @@ +import hashlib +from csv import DictWriter, DictReader +import os +import requests +from tqdm import tqdm + + +class HashSheet: + def __init__(self, path, output): + self.path = path + self.output = output + self.all_files = self.walk_sheets(path) + + @staticmethod + def walk_sheets(path): + all_files = [] + for path, directories, files in os.walk(path): + for filename in files: + with open(f"{path}/{filename}", "r") as f: + reader = DictReader(f) + for row in reader: + 
all_files.append(row["remote_files"]) + return all_files + + def checksum(self): + files_with_checksums = [] + for file in tqdm(self.all_files): + response = requests.get(file, stream=True) + response.raise_for_status() + sha1 = hashlib.sha1() + for chunk in response.iter_content(chunk_size=8192): + if chunk: + sha1.update(chunk) + files_with_checksums.append({"url": file, "checksum": sha1.hexdigest()}) + return files_with_checksums + + def write(self): + with open(self.output, "w") as csvfile: + writer = DictWriter(csvfile, fieldnames=["url", "checksum"]) + writer.writeheader() + writer.writerows(self.checksum()) + return + + +if __name__ == "__main__": + path = "delete/bad_imports" + output = "delete/sample_checksums.csv" + checksum = HashSheet(path, output) + checksum.write() diff --git a/utk_exodus/exodus.py b/utk_exodus/exodus.py index 1bbf9fa..f453caa 100644 --- a/utk_exodus/exodus.py +++ b/utk_exodus/exodus.py @@ -6,6 +6,7 @@ from utk_exodus.controller import InterfaceController from utk_exodus.template import ImportTemplate from utk_exodus.combine import ImportRefactor +from utk_exodus.checksum import HashSheet import click import requests @@ -213,3 +214,29 @@ def remove_old_values( ir = ImportRefactor(sheet, old_sheet) ir.create_csv_with_fields_to_nuke(sheet, new_sheet) print(f"Refactored sheet written to {new_sheet}.") + + +@cli.command( + "hash_errors", + help="Create sheet from a directory of errored import sheets.", +) +@click.option( + "--path", + "-p", + required=True, + help="Specify the path to the directory of sheets.", +) +@click.option( + "--output", + "-o", + required=True, + help="Specify where you want to write your sheets.", +) +def hash_errors( + path: str, + output: str, +) -> None: + print(f"Generating checksums for bad files in csvs in {path}.") + hs = HashSheet(path, output) + hs.write() + print(f"Hash sheet written to {output}.") From 2e7608fe464f4c293868c56ca398b852d37caced Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Wed, 15 May 
2024 11:32:41 -0400 Subject: [PATCH 02/11] Update README and tag. --- README.md | 6 ++++++ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9723ca6..4f855c1 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,12 @@ If you want to generate a full template for a metadata import, use: exodus generate_template --model book -o /path/to/sheet.csv ``` +If you want to generate a sheet of checksums for files that failed to import, you can: + +```shell +exodus hash_errors --path /path/to/directory --output /path/to/sheet.csv +``` + ## What's Missing Here Right Now * The ability to create pcdm:Collection objects. diff --git a/pyproject.toml b/pyproject.toml index 964670e..586122b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "utk-exodus" -version = "0.1.7" +version = "0.1.8" description = "A tool for building import sheets from UTK legacy systems" authors = ["Mark Baggett "] readme = "README.md" From cdca8754c63522b424879ff3b5c001cfd5a96fa7 Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Wed, 15 May 2024 12:31:01 -0400 Subject: [PATCH 03/11] Add unit test for checksum checking. 
--- ...411_20240426180251_885_errored_entries.csv | 17 +++++++++ tests/test_checksum_checksum_file.py | 28 +++++++++++++++ utk_exodus/checksum/checksum.py | 35 ++++++++++++++----- 3 files changed, 72 insertions(+), 8 deletions(-) create mode 100644 tests/fixtures/bad_imports/import_411_20240426180251_885_errored_entries.csv create mode 100644 tests/test_checksum_checksum_file.py diff --git a/tests/fixtures/bad_imports/import_411_20240426180251_885_errored_entries.csv b/tests/fixtures/bad_imports/import_411_20240426180251_885_errored_entries.csv new file mode 100644 index 0000000..0c38270 --- /dev/null +++ b/tests/fixtures/bad_imports/import_411_20240426180251_885_errored_entries.csv @@ -0,0 +1,17 @@ +source_identifier,model,remote_files,parents,has_work_type,primary_identifier,local_identifier,ark,acquisition_identifier,oclc,issn,isbn,title,alternative_title,abstract,table_of_contents,date_created,date_issued,date_other,date_created_d,date_issued_d,date_other_d,publisher,utk_publisher,publication_place,utk_place_of_publication,note,extent,instrumentation,first_line,intended_audience,rights_statement,spatial,spatial_local,coordinates,temporal,call_number,bibliographic_citation,provider,intermediate_provider,repository,archival_collection,subject,keyword,form,resource_type,form_local,language,sheetmusic_hostitem,is_part_of,rdf_type,file_language,visibility +mpaekefauver:248_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:248_MODS.xml,mpaekefauver:248_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:309_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:309_MODS.xml,mpaekefauver:309_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:309,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted 
+mpaekefauver:118_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:118_OBJ.jp2,mpaekefauver:118_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:118,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted +mpaekefauver:99_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:99_MODS.xml,mpaekefauver:99_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:374_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:374_MODS.xml,mpaekefauver:374_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:465_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:465_OBJ.jp2,mpaekefauver:465_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted +mpaekefauver:356_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:356_MODS.xml,mpaekefauver:356_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:356,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:254_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:254_OBJ.jp2,mpaekefauver:254_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:254,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted +mpaekefauver:451_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:451_OBJ.jp2,mpaekefauver:451_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:451,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted 
+mpaekefauver:104_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:104_MODS.xml,mpaekefauver:104_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:104,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:419_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:419_MODS.xml,mpaekefauver:419_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:318_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:318_OBJ.jp2,mpaekefauver:318_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:318,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted +mpaekefauver:463_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:463_MODS.xml,mpaekefauver:463_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:463,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:350_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:350_MODS.xml,mpaekefauver:350_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted +mpaekefauver:108_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:108_OBJ.jp2,mpaekefauver:108_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:108,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted +mpaekefauver:420_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:420_MODS.xml,mpaekefauver:420_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted diff --git a/tests/test_checksum_checksum_file.py 
b/tests/test_checksum_checksum_file.py new file mode 100644 index 0000000..0cab440 --- /dev/null +++ b/tests/test_checksum_checksum_file.py @@ -0,0 +1,28 @@ +import pytest +from utk_exodus.checksum import HashSheet +from pathlib import Path + +# Set path to fixtures +fixtures_path = Path(__file__).parent / "fixtures" + +@pytest.fixture( + params=[ + { + "filename": "bad_imports", + "expected_results": { + 'url': 'https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml', + 'checksum': '081a51fae0200f266d2933756d48441c4ea77b1e' + } + }, + ] +) +def fixture(request): + request.param["fixtures_path"] = fixtures_path / request.param.get("filename") + return request.param + +def test_checksum_file(fixture): + hs = HashSheet(fixture.get("fixtures_path"), "example.csv") + results = hs.checksum_file( + "https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml" + ) + assert results == fixture["expected_results"] diff --git a/utk_exodus/checksum/checksum.py b/utk_exodus/checksum/checksum.py index b9e189a..781e8d1 100644 --- a/utk_exodus/checksum/checksum.py +++ b/utk_exodus/checksum/checksum.py @@ -25,15 +25,34 @@ def walk_sheets(path): def checksum(self): files_with_checksums = [] for file in tqdm(self.all_files): - response = requests.get(file, stream=True) - response.raise_for_status() - sha1 = hashlib.sha1() - for chunk in response.iter_content(chunk_size=8192): - if chunk: - sha1.update(chunk) - files_with_checksums.append({"url": file, "checksum": sha1.hexdigest()}) + hash = self.checksum_file(file) + files_with_checksums.append(hash) return files_with_checksums + @staticmethod + def checksum_file(file): + """Calculate the sha1 checksum of a file. + + Args: + file (str): The path to the file to checksum. + + Returns: + dict: A dictionary with the url and checksum of the file. 
+ + Examples: + >>> hs = HashSheet("tests/fixtures/bad_imports", "example.csv") + >>> hs.checksum_file("https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml") + {'url': 'https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml', 'checksum': '081a51fae0200f266d2933756d48441c4ea77b1e'} + + """ + response = requests.get(file, stream=True) + response.raise_for_status() + sha1 = hashlib.sha1() + for chunk in response.iter_content(chunk_size=8192): + if chunk: + sha1.update(chunk) + return {"url": file, "checksum": sha1.hexdigest()} + def write(self): with open(self.output, "w") as csvfile: writer = DictWriter(csvfile, fieldnames=["url", "checksum"]) @@ -43,7 +62,7 @@ def write(self): if __name__ == "__main__": - path = "delete/bad_imports" + path = "tests/fixtures/bad_imports" output = "delete/sample_checksums.csv" checksum = HashSheet(path, output) checksum.write() From d20b5fc10f02344f3cacb987aafec7870abd6c9b Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Thu, 16 May 2024 12:38:51 -0400 Subject: [PATCH 04/11] Point out that help is always available. --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 4f855c1..4ce76e7 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,18 @@ Before you start, you need to have a few things in place: There are several interfaces for the application. +You can always find out what interfaces exist with: + +```shell +exodus --help +``` + +Similarly, you can get help for a specific interface with: + +```shell +exodus [COMMAND] --help +``` + If you want to get works and files, and you have metadata files, use: ```shell From 57cadd71adc8db6bdf94078b395e0693c0fe1d1e Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Wed, 15 May 2024 12:51:29 -0400 Subject: [PATCH 05/11] Add doc test for walk_sheets. 
--- utk_exodus/checksum/checksum.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/utk_exodus/checksum/checksum.py b/utk_exodus/checksum/checksum.py index 781e8d1..eb8ce8f 100644 --- a/utk_exodus/checksum/checksum.py +++ b/utk_exodus/checksum/checksum.py @@ -13,6 +13,20 @@ def __init__(self, path, output): @staticmethod def walk_sheets(path): + """Walk through a directory and return a list of all files. + + Args: + path (str): The path to the directory to walk through. + + Returns: + list: A list of all files in the directory. + + Examples: + >>> hs = HashSheet("tests/fixtures/bad_imports", "example.csv") + >>> hs.walk_sheets("tests/fixtures/bad_imports") + ['https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:248_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:309_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:118_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:99_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:374_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:465_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:356_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:254_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:451_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:104_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:419_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:318_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:463_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:350_MODS.xml', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:108_OBJ.jp2', 'https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:420_MODS.xml'] + + """ all_files = [] for 
path, directories, files in os.walk(path): for filename in files: @@ -23,6 +37,15 @@ def walk_sheets(path): return all_files def checksum(self): + """Calculate the sha1 checksum of all files listed in csvs in a directory. + + Returns: + list: A list of dictionaries with the url and checksum of each file. + + Examples: + No example to keep tests running quickly. + + """ files_with_checksums = [] for file in tqdm(self.all_files): hash = self.checksum_file(file) From 850efe97a3e3692271bf16523fe252dd809a3e83 Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Thu, 16 May 2024 08:48:31 -0400 Subject: [PATCH 06/11] Add problematic fixture for tests. --- tests/fixtures/cdf_13238.xml | 66 ++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/fixtures/cdf_13238.xml diff --git a/tests/fixtures/cdf_13238.xml b/tests/fixtures/cdf_13238.xml new file mode 100644 index 0000000..1c08a13 --- /dev/null +++ b/tests/fixtures/cdf_13238.xml @@ -0,0 +1,66 @@ + + + 0116_000050_000351 + + Head Start: Helping Families Move from Welfare to Work + + + Langston Hughes Library (Children's Defense Fund Haley Farm) + + This report describes some of the ways in which Head Start agencies are helping to set parents on the path to self-sufficiency. It illustrates the variety of initiatives that are underway relative to the hundreds of Head Start agencies. The report highlights the important work all Head Start agencies are doing to support parents as they move from welfare to work. + + + Children's Defense Fund + + + + 1998 + Children's Defense Fund (U.S.) + + +
reports
+ 24 pages + reformatted digital +
+ + eng + + text + + Finlay, Belva + + Author + + + + Blank, Helen + + Author + + + + Poersch, Oxendine + + Author + + + + United States + + + Child welfare + + + Head Start programs + + + + Project Head Start (U.S.) + + + + Langston Hughes Library (Children's Defense Fund Haley Farm) + + University of Tennessee, Knoxville. Libraries + In Copyright - Non-Commercial Use Permitted +
From dec77b030a2b16f54ba3d60f8d291503fb691707 Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Thu, 16 May 2024 08:55:39 -0400 Subject: [PATCH 07/11] Add failing test. --- tests/test_metadata_names.py | 12 +++++++++ tests/test_metadata_subject.py | 44 +++++++++++++++++++++++++++++++++ utk_exodus/metadata/__init__.py | 4 +-- 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 tests/test_metadata_subject.py diff --git a/tests/test_metadata_names.py b/tests/test_metadata_names.py index 3f88df3..9b5a103 100644 --- a/tests/test_metadata_names.py +++ b/tests/test_metadata_names.py @@ -35,6 +35,18 @@ 'copyright_holder': ['http://id.loc.gov/authorities/names/n79144615'], 'utk_creator': ['Kefauver, Estes 1903-1963'] } + }, + { + "filename": "cdf_13238.xml", + "expected_results": { + 'author': [ + 'http://id.loc.gov/authorities/names/n92112591', + 'http://id.loc.gov/authorities/names/n86833543' + ], + 'utk_author': [ + 'Poersch, Oxendine' + ] + } } ] ) diff --git a/tests/test_metadata_subject.py b/tests/test_metadata_subject.py new file mode 100644 index 0000000..7517c11 --- /dev/null +++ b/tests/test_metadata_subject.py @@ -0,0 +1,44 @@ +import pytest +from utk_exodus.metadata import SubjectProperty +from pathlib import Path + +# Set path to fixtures +fixtures_path = Path(__file__).parent / "fixtures" + +# Set namespaces +NAMESPACES = { + "mods": "http://www.loc.gov/mods/v3", + "xlink": "http://www.w3.org/1999/xlink", +} + + +@pytest.fixture( + params=[ + { + "filename": "cdf_13238.xml", + "expected_results": { + "subject": [ + "http://id.loc.gov/authorities/subjects/sh85023396 ", + "http://id.loc.gov/authorities/subjects/sh87000100", + "http://id.loc.gov/authorities/names/n79059917", + ] + }, + }, + { + "filename": "egypt_224.xml", + "expected_results": { + "subject": ["http://id.loc.gov/authorities/subjects/sh85016233"] + }, + }, + ] +) +def fixture(request): + param = request.param + param["fixture_path"] = fixtures_path / param.get("filename") + return 
param + + +def test_find_method_on_subject(fixture): + subjects = SubjectProperty(fixture.get("fixture_path"), NAMESPACES) + results = subjects.find_topic() + assert results == fixture["expected_results"] diff --git a/utk_exodus/metadata/__init__.py b/utk_exodus/metadata/__init__.py index 7649212..8f615b6 100644 --- a/utk_exodus/metadata/__init__.py +++ b/utk_exodus/metadata/__init__.py @@ -1,2 +1,2 @@ -from .metadata import DataProvider, ExtentProperty, GeoNamesProperty, LanguageURIProperty, LocalTypesProperties, MachineDate, MetadataMapping, NameProperty, PhysicalLocationsProperties, PublicationPlaceProperty, PublisherProperty, RightsOrLicenseProperties, RoleAndNameProperty, TitleProperty, TypesProperties -__all__ = ['DataProvider', 'ExtentProperty', 'GeoNamesProperty', 'LanguageURIProperty', 'LocalTypesProperties', 'MachineDate', 'MetadataMapping', 'NameProperty', 'PhysicalLocationsProperties', 'PublicationPlaceProperty', 'PublisherProperty', 'RightsOrLicenseProperties', 'RoleAndNameProperty', 'TitleProperty', 'TypesProperties'] +from .metadata import DataProvider, ExtentProperty, GeoNamesProperty, LanguageURIProperty, LocalTypesProperties, MachineDate, MetadataMapping, NameProperty, PhysicalLocationsProperties, PublicationPlaceProperty, PublisherProperty, RightsOrLicenseProperties, RoleAndNameProperty, SubjectProperty, TitleProperty, TypesProperties +__all__ = ['DataProvider', 'ExtentProperty', 'GeoNamesProperty', 'LanguageURIProperty', 'LocalTypesProperties', 'MachineDate', 'MetadataMapping', 'NameProperty', 'PhysicalLocationsProperties', 'PublicationPlaceProperty', 'PublisherProperty', 'RightsOrLicenseProperties', 'RoleAndNameProperty', 'SubjectProperty', 'TitleProperty', 'TypesProperties'] From f696d2645d32b97885f8df017243bacaa161706c Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Thu, 16 May 2024 09:02:23 -0400 Subject: [PATCH 08/11] Fix bug and make tests pass. 
--- tests/test_metadata_subject.py | 13 ++++++++++++- utk_exodus/metadata/metadata.py | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/test_metadata_subject.py b/tests/test_metadata_subject.py index 7517c11..c1974c8 100644 --- a/tests/test_metadata_subject.py +++ b/tests/test_metadata_subject.py @@ -18,7 +18,7 @@ "filename": "cdf_13238.xml", "expected_results": { "subject": [ - "http://id.loc.gov/authorities/subjects/sh85023396 ", + "http://id.loc.gov/authorities/subjects/sh85023396", "http://id.loc.gov/authorities/subjects/sh87000100", "http://id.loc.gov/authorities/names/n79059917", ] @@ -30,6 +30,17 @@ "subject": ["http://id.loc.gov/authorities/subjects/sh85016233"] }, }, + { + "filename": "knoxgardens_125.xml", + "expected_results": { + "subject": [ + "http://id.loc.gov/authorities/subjects/sh85101348", + "http://id.loc.gov/authorities/subjects/sh85053123", + "http://id.loc.gov/authorities/subjects/sh85103022", + "http://id.loc.gov/authorities/subjects/sh2008120720", + ] + }, + }, ] ) def fixture(request): diff --git a/utk_exodus/metadata/metadata.py b/utk_exodus/metadata/metadata.py index 3c7df14..7961b0d 100644 --- a/utk_exodus/metadata/metadata.py +++ b/utk_exodus/metadata/metadata.py @@ -540,7 +540,7 @@ def find_topic(self): return_values = [] for iterable in all_initial_values: for value in iterable: - return_values.append(value) + return_values.append(value.strip()) return {"subject": return_values} From 23e7add80bd6fd607dbf6f13182aeda5b18a9951 Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Thu, 16 May 2024 09:04:01 -0400 Subject: [PATCH 09/11] Refactor for efficiency. 
--- utk_exodus/metadata/metadata.py | 201 ++++++++++++++------------------ 1 file changed, 89 insertions(+), 112 deletions(-) diff --git a/utk_exodus/metadata/metadata.py b/utk_exodus/metadata/metadata.py index 7961b0d..0b4bdac 100644 --- a/utk_exodus/metadata/metadata.py +++ b/utk_exodus/metadata/metadata.py @@ -34,18 +34,14 @@ def __extract_titles(self): "supplied": self.__get_titles_from_xpath( "mods:titleInfo[@supplied]/mods:title" ), - "part_names": self.__get_titles_from_xpath( - "mods:titleInfo/mods:partName" - ), + "part_names": self.__get_titles_from_xpath("mods:titleInfo/mods:partName"), "part_numbers": self.__get_titles_from_xpath( "mods:titleInfo/mods:partNumber" ), - "non_sorts": self.__get_titles_from_xpath( - "mods:titleInfo/mods:nonSort" - ), + "non_sorts": self.__get_titles_from_xpath("mods:titleInfo/mods:nonSort"), "alternatives": self.__get_titles_from_xpath( 'mods:titleInfo[@type="alternative"]/mods:title' - ) + ), } return titles_data @@ -83,10 +79,7 @@ def find(self): # Handle alternative titles mapping to dcterms:alternative. alternatives.extend(titles_data["alternatives"]) - return { - "title": titles, - "alternative_title": alternatives - } + return {"title": titles, "alternative_title": alternatives} class RoleAndNameProperty(XMLtoDictProperty): @@ -145,7 +138,7 @@ def find(self): roles_and_names[role] = [part["mods:namePart"]] else: roles_and_names[role].append([part["mods:namePart"]]) - if isinstance(part, str) and not part.startswith("http"): + if isinstance(part, str) and not part.startswith("http"): if role not in roles_and_names: roles_and_names[role] = [part] else: @@ -161,21 +154,22 @@ class NameProperty(XMLtoDictProperty): """ Used for names. 
""" + def __init__(self, file): super().__init__(file) self.all_names = self.__find_all_names() def __find_all_names(self): - if 'mods:name' in self.doc['mods:mods']: - all_names = self.doc['mods:mods']['mods:name'] - if isinstance(all_names, list): + if "mods:name" in self.doc["mods:mods"]: + all_names = self.doc["mods:mods"]["mods:name"] + if isinstance(all_names, list): return all_names elif isinstance(all_names, dict): return [all_names] elif isinstance(all_names, str): return [all_names] else: - return ['Problem'] + return ["Problem"] else: return [] @@ -196,50 +190,68 @@ def find(self): roles = [] local_roles = [] try: - roles.append(name['mods:role']['mods:roleTerm']['#text'].lower().replace(' ', '_')) - local_roles.append(f"utk_{name['mods:role']['mods:roleTerm']['#text'].lower().replace(' ', '_')}") + roles.append( + name["mods:role"]["mods:roleTerm"]["#text"] + .lower() + .replace(" ", "_") + ) + local_roles.append( + f"utk_{name['mods:role']['mods:roleTerm']['#text'].lower().replace(' ', '_')}" + ) except KeyError: print(name) # TODO: A name can have multiple roles except TypeError: - if isinstance(name['mods:role'], list): - for role in name['mods:role']: - if '#text' in role['mods:roleTerm']: - roles.append(role['mods:roleTerm']['#text'].lower().replace(' ', '_')) - local_roles.append(f"utk_{role['mods:roleTerm']['#text'].lower().replace(' ', '_')}") + if isinstance(name["mods:role"], list): + for role in name["mods:role"]: + if "#text" in role["mods:roleTerm"]: + roles.append( + role["mods:roleTerm"]["#text"].lower().replace(" ", "_") + ) + local_roles.append( + f"utk_{role['mods:roleTerm']['#text'].lower().replace(' ', '_')}" + ) else: - roles.append(role['mods:roleTerm'].lower().replace(' ', '_')) - local_roles.append(f"utk_{role['mods:roleTerm'].lower().replace(' ', '_')}") + roles.append( + role["mods:roleTerm"].lower().replace(" ", "_") + ) + local_roles.append( + f"utk_{role['mods:roleTerm'].lower().replace(' ', '_')}" + ) else: - 
roles.append(name['mods:role']['mods:roleTerm'].lower().replace(' ', '_')) - local_roles.append(f"utk_{name['mods:role']['mods:roleTerm'].lower().replace(' ', '_')}") + roles.append( + name["mods:role"]["mods:roleTerm"].lower().replace(" ", "_") + ) + local_roles.append( + f"utk_{name['mods:role']['mods:roleTerm'].lower().replace(' ', '_')}" + ) # TODO: Rework this. It's not pretty but it works. - name_value = name['mods:namePart'] - if '@valueURI' in name: - name_value = name['@valueURI'] + name_value = name["mods:namePart"] + if "@valueURI" in name: + name_value = name["@valueURI"] for role in roles: if isinstance(name_value, list): for part in name_value: - if isinstance(part, dict) and '@valueURI' in part: + if isinstance(part, dict) and "@valueURI" in part: if role not in roles_and_names: - roles_and_names[role] = [part['@valueURI']] + roles_and_names[role] = [part["@valueURI"]] else: - roles_and_names[role].append([part['@valueURI']]) - elif role not in roles_and_names and name_value.startswith('http'): + roles_and_names[role].append([part["@valueURI"]]) + elif role not in roles_and_names and name_value.startswith("http"): roles_and_names[role] = [name_value] - elif name_value.startswith('http'): + elif name_value.startswith("http"): roles_and_names[role].append(name_value) for role in local_roles: if isinstance(name_value, list): for part in name_value: - if isinstance(part, str) and not part.startswith('http'): + if isinstance(part, str) and not part.startswith("http"): if role not in roles_and_names: roles_and_names[role] = [part] else: roles_and_names[role].append(part) - elif role not in roles_and_names and not name_value.startswith('http'): + elif role not in roles_and_names and not name_value.startswith("http"): roles_and_names[role] = [name_value] - elif not name_value.startswith('http'): + elif not name_value.startswith("http"): roles_and_names[role].append(name_value) return roles_and_names @@ -252,6 +264,7 @@ class GeoNamesProperty(BaseProperty): 
path (str): The path to the XML file. namespaces (dict): A dictionary containing the namespaces used in the XML file. """ + def __init__(self, path, namespaces): super().__init__(path, namespaces) @@ -267,7 +280,7 @@ def find(self, name): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> geonames = GeoNamesProperty("tests/fixtures/webster_1127.xml", NAMESPACES) >>> geonames.find("spatial") @@ -333,7 +346,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> physical_location = PhysicalLocationsProperties("tests/fixtures/civilwar_1438.xml", NAMESPACES) >>> physical_location.find() @@ -398,7 +411,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> dataProvider = DataProvider("tests/fixtures/egypt_224.xml", NAMESPACES) >>> dataProvider.find() @@ -433,7 +446,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 
'xlink': 'http://www.w3.org/1999/xlink' } >>> machineDate = MachineDate("tests/fixtures/volvoices_2993.xml", NAMESPACES) >>> machineDate.find() @@ -485,62 +498,22 @@ def __init__(self, path, namespaces): super().__init__(path, namespaces) def find_topic(self): - subject_topic_value_uris = [ - uri - for uri in self.root.xpath( - "mods:subject[mods:topic]/@valueURI", namespaces=self.namespaces - ) - ] - topic_value_uris = [ - uri - for uri in self.root.xpath( - "mods:subject/mods:topic/@valueURI", namespaces=self.namespaces - ) - ] - subject_name_value_uris = [ - uri - for uri in self.root.xpath( - "mods:subject[mods:name/mods:namePart]/@valueURI", - namespaces=self.namespaces, - ) - ] - name_value_uris = [ - uri - for uri in self.root.xpath( - "mods:subject/mods:name/@valueURI", namespaces=self.namespaces - ) - ] - aat_genres = [ - uri - for uri in self.root.xpath( - 'mods:genre[@authority="aat"]/@valueURI', namespaces=self.namespaces - ) - ] - lcmpt_genres = [ - uri - for uri in self.root.xpath( - 'mods:genre[@authority="lcmpt"]/@valueURI', namespaces=self.namespaces - ) - ] - lcsh_genres = [ - uri - for uri in self.root.xpath( - 'mods:genre[@authority="lcsh"]/@valueURI', namespaces=self.namespaces - ) - ] - all_initial_values = [ - subject_topic_value_uris, - topic_value_uris, - subject_name_value_uris, - name_value_uris, - aat_genres, - lcmpt_genres, - lcsh_genres, + xpaths = [ + "mods:subject[mods:topic]/@valueURI", + "mods:subject/mods:topic/@valueURI", + "mods:subject[mods:name/mods:namePart]/@valueURI", + "mods:subject/mods:name/@valueURI", + 'mods:genre[@authority="aat"]/@valueURI', + 'mods:genre[@authority="lcmpt"]/@valueURI', + 'mods:genre[@authority="lcsh"]/@valueURI', ] + + # Execute each XPath query and collect results return_values = [] - for iterable in all_initial_values: - for value in iterable: - return_values.append(value.strip()) + for xpath in xpaths: + uris = self.root.xpath(xpath, namespaces=self.namespaces) + return_values.extend(uri.strip() 
for uri in uris) + return {"subject": return_values} @@ -584,7 +557,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> types = TypesProperties("tests/fixtures/utsmc_17870.xml", NAMESPACES) >>> types.find() @@ -739,7 +712,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> local_types = LocalTypesProperties("tests/fixtures/egypt_224.xml", NAMESPACES) >>> local_types.find() @@ -872,7 +845,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> publisher = PublisherProperty("tests/fixtures/playbills:1052.xml", NAMESPACES) >>> publisher.find() @@ -902,7 +875,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> rights = RightsOrLicenseProperties("tests/fixtures/heilman:1010.xml", NAMESPACES) >>> rights.find() @@ -954,7 +927,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> publication_place = PublicationPlaceProperty("tests/fixtures/volvoices_2495.xml", NAMESPACES) >>> publication_place.find() @@ -984,7 +957,7 @@ def find_term(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> language_uri = LanguageURIProperty("tests/fixtures/utsmc:725.xml", NAMESPACES) >>> language_uri.find_term() @@ -1023,6 +996,7 @@ class ExtentProperty(BaseProperty): path (str): The path to the file. 
namespaces (dict): Namespaces to be used in the XPath queries. """ + def __init__(self, path: str, namespaces: dict): super().__init__(path, namespaces) @@ -1035,7 +1009,7 @@ def find(self): Examples: >>> NAMESPACES = { - ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'mods': 'http://www.loc.gov/mods/v3', ... 'xlink': 'http://www.w3.org/1999/xlink' } >>> extent_property = ExtentProperty("tests/fixtures/knoxgardens_125.xml", NAMESPACES) >>> extent_property.find() @@ -1056,7 +1030,7 @@ def find(self): "mods:physicalDescription/mods:extent[@unit]", namespaces=self.namespaces, ) - if node.text is not None and 'unit' in node.attrib + if node.text is not None and "unit" in node.attrib ] # Combine extents with and without units into a single list @@ -1097,7 +1071,7 @@ def __execute(self, namespaces): .replace("_MODS.xml", "") .replace(".xml", ""), "model": model, - 'sequence': '', + "sequence": "", "remote_files": "", "parents": " | ".join( ResourceIndexSearch().get_parent_collections( @@ -1137,18 +1111,18 @@ def __execute(self, namespaces): pages = self.look_for_pages(item) for page in pages: new_page = item.copy() - new_page['source_identifier'] = page['pid'].replace('info:fedora/', '') - new_page['parents'] = item['source_identifier'] - new_page['model'] = 'Page' - new_page['sequence'] = page['page'] + new_page["source_identifier"] = page["pid"].replace("info:fedora/", "") + new_page["parents"] = item["source_identifier"] + new_page["model"] = "Page" + new_page["sequence"] = page["page"] all_pages.append(new_page) for page in all_pages: all_file_data.append(page) return all_file_data def look_for_pages(self, data): - if data['model'] == 'Book': - return ResourceIndexSearch().find_pages_in_book(data['source_identifier']) + if data["model"] == "Book": + return ResourceIndexSearch().find_pages_in_book(data["source_identifier"]) return [] def __find_unique_fieldnames(self, data): @@ -1168,7 +1142,10 @@ def __dereference_islandora_type(self, file): 
"info:fedora/islandora:sp_videoCModel": "Video", } x = ResourceIndexSearch().get_islandora_work_type( - file.split("/")[-1].replace("_MODS.xml", "").replace(".xml", "").replace("_", ":") + file.split("/")[-1] + .replace("_MODS.xml", "") + .replace(".xml", "") + .replace("_", ":") ) return islandora_types[x] From 2095f348e638a8d45e9358fd9c1d9e3e262b1492 Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Thu, 16 May 2024 09:10:59 -0400 Subject: [PATCH 10/11] Add doctest. --- utk_exodus/metadata/metadata.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/utk_exodus/metadata/metadata.py b/utk_exodus/metadata/metadata.py index 0b4bdac..67386ad 100644 --- a/utk_exodus/metadata/metadata.py +++ b/utk_exodus/metadata/metadata.py @@ -493,11 +493,25 @@ def __sort_if_range(values): class SubjectProperty(BaseProperty): - # TODO: Should this even exist? Can't this just be BaseProperty? + """Get subjects from the MODS XML file.""" def __init__(self, path, namespaces): super().__init__(path, namespaces) def find_topic(self): + """Find all topics in the XML file. + + Returns: + dict: A dictionary containing topics information. + + Examples: + >>> NAMESPACES = { + ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'xlink': 'http://www.w3.org/1999/xlink' } + >>> subjects = SubjectProperty("tests/fixtures/knoxgardens_125.xml", NAMESPACES) + >>> subjects.find_topic() + {'subject': ['http://id.loc.gov/authorities/subjects/sh85101348', 'http://id.loc.gov/authorities/subjects/sh85053123', 'http://id.loc.gov/authorities/subjects/sh85103022', 'http://id.loc.gov/authorities/subjects/sh2008120720']} + + """ xpaths = [ "mods:subject[mods:topic]/@valueURI", "mods:subject/mods:topic/@valueURI", From 1b26a0a7e8e7370f1a88862f64f43665a4a0d149 Mon Sep 17 00:00:00 2001 From: Mark Baggett Date: Thu, 16 May 2024 10:07:47 -0400 Subject: [PATCH 11/11] Test Keyword Property. 
--- tests/test_metadata_keword.py | 40 ++++++++++++++++++++++++++++++++ utk_exodus/metadata/__init__.py | 41 +++++++++++++++++++++++++++++++-- utk_exodus/metadata/metadata.py | 17 ++++++++++++++ 3 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 tests/test_metadata_keword.py diff --git a/tests/test_metadata_keword.py b/tests/test_metadata_keword.py new file mode 100644 index 0000000..c33ad68 --- /dev/null +++ b/tests/test_metadata_keword.py @@ -0,0 +1,40 @@ +import pytest +from utk_exodus.metadata import KeywordProperty +from pathlib import Path + +# Set path to fixtures +fixtures_path = Path(__file__).parent / "fixtures" + +# Set namespaces +NAMESPACES = { + "mods": "http://www.loc.gov/mods/v3", + "xlink": "http://www.w3.org/1999/xlink", +} + + +@pytest.fixture( + params=[ + { + "filename": "civilwar_1438.xml", + "expected_results": { + 'keyword': [ + 'Jurisdiction -- Tennessee, East -- History -- Civil War, 1861-1865', + 'Actions and defenses -- Tennessee, East -- History -- Civil War, 1861-1865', + 'Tennessee, East -- Politics and government -- 19th century', + 'Wallace, Jesse G. 
-- Correspondence', + 'Temple, Oliver Perry, 1820-1907 -- Correspondence' + ] + } + }, + ] +) +def fixture(request): + param = request.param + param["fixture_path"] = fixtures_path / param.get("filename") + return param + + +def test_find_method_on_keyword(fixture): + subjects = KeywordProperty(fixture.get("fixture_path"), NAMESPACES) + results = subjects.find_topic() + assert results == fixture["expected_results"] diff --git a/utk_exodus/metadata/__init__.py b/utk_exodus/metadata/__init__.py index 8f615b6..393d581 100644 --- a/utk_exodus/metadata/__init__.py +++ b/utk_exodus/metadata/__init__.py @@ -1,2 +1,39 @@ -from .metadata import DataProvider, ExtentProperty, GeoNamesProperty, LanguageURIProperty, LocalTypesProperties, MachineDate, MetadataMapping, NameProperty, PhysicalLocationsProperties, PublicationPlaceProperty, PublisherProperty, RightsOrLicenseProperties, RoleAndNameProperty, SubjectProperty, TitleProperty, TypesProperties -__all__ = ['DataProvider', 'ExtentProperty', 'GeoNamesProperty', 'LanguageURIProperty', 'LocalTypesProperties', 'MachineDate', 'MetadataMapping', 'NameProperty', 'PhysicalLocationsProperties', 'PublicationPlaceProperty', 'PublisherProperty', 'RightsOrLicenseProperties', 'RoleAndNameProperty', 'SubjectProperty', 'TitleProperty', 'TypesProperties'] +from .metadata import ( + DataProvider, + ExtentProperty, + GeoNamesProperty, + KeywordProperty, + LanguageURIProperty, + LocalTypesProperties, + MachineDate, + MetadataMapping, + NameProperty, + PhysicalLocationsProperties, + PublicationPlaceProperty, + PublisherProperty, + RightsOrLicenseProperties, + RoleAndNameProperty, + SubjectProperty, + TitleProperty, + TypesProperties, +) + +__all__ = [ + "DataProvider", + "ExtentProperty", + "GeoNamesProperty", + "KeywordProperty", + "LanguageURIProperty", + "LocalTypesProperties", + "MachineDate", + "MetadataMapping", + "NameProperty", + "PhysicalLocationsProperties", + "PublicationPlaceProperty", + "PublisherProperty", + 
"RightsOrLicenseProperties", + "RoleAndNameProperty", + "SubjectProperty", + "TitleProperty", + "TypesProperties", +] diff --git a/utk_exodus/metadata/metadata.py b/utk_exodus/metadata/metadata.py index 67386ad..ac78b88 100644 --- a/utk_exodus/metadata/metadata.py +++ b/utk_exodus/metadata/metadata.py @@ -494,6 +494,7 @@ def __sort_if_range(values): class SubjectProperty(BaseProperty): """Get subjects from the MODS XML file.""" + def __init__(self, path, namespaces): super().__init__(path, namespaces) @@ -532,10 +533,26 @@ def find_topic(self): class KeywordProperty(BaseProperty): + """Get keywords from the MODS XML file.""" + def __init__(self, path, namespaces): super().__init__(path, namespaces) def find_topic(self): + """Find all topics in the XML file. + + Returns: + dict: A dictionary containing topics information. + + Examples: + >>> NAMESPACES = { + ... 'mods': 'http://www.loc.gov/mods/v3', + ... 'xlink': 'http://www.w3.org/1999/xlink' } + >>> keywords = KeywordProperty("tests/fixtures/civilwar_1438.xml", NAMESPACES) + >>> keywords.find_topic() + {'keyword': ['Jurisdiction -- Tennessee, East -- History -- Civil War, 1861-1865', 'Actions and defenses -- Tennessee, East -- History -- Civil War, 1861-1865', 'Tennessee, East -- Politics and government -- 19th century', 'Wallace, Jesse G. -- Correspondence', 'Temple, Oliver Perry, 1820-1907 -- Correspondence']} + + """ non_uris_topics = [ value.text for value in self.root.xpath(