Banish MODS and Policy files. #13

Merged · 9 commits · May 29, 2024
365 changes: 360 additions & 5 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "utk-exodus"
version = "0.1.9"
version = "0.2.0"
description = "A tool for building import sheets from UTK legacy systems"
authors = ["Mark Baggett <mbagget1@utk.edu>"]
readme = "README.md"
@@ -19,6 +19,9 @@ black = "^24.4.2"
ruff = "^0.4.2"
pytest = "^8.2.0"
click = "^8.1.7"
requestium = "^0.4.0"
selenium = "^4.21.0"
selenium-requests = "^2.0.4"

[tool.poetry.scripts]
exodus = "utk_exodus.exodus:cli"
4 changes: 4 additions & 0 deletions utk_exodus/__init__.py
@@ -10,10 +10,14 @@
from .template import ImportTemplate
from .restrict import Restrictions, RestrictionsSheet
from .collection import CollectionMetadata, CollectionImporter
from .banish import BanishFiles
from .review import ExistingImport

__all__ = [
    "BanishFiles",
    "CollectionMetadata",
    "CollectionImporter",
    "ExistingImport",
    "FedoraObject",
    "FileCurator",
    "FileOrganizer",
3 changes: 3 additions & 0 deletions utk_exodus/banish/__init__.py
@@ -0,0 +1,3 @@
from .banish import BanishFiles

__all__ = ["BanishFiles"]
20 changes: 20 additions & 0 deletions utk_exodus/banish/banish.py
@@ -0,0 +1,20 @@
from csv import DictReader, DictWriter


class BanishFiles:
    def __init__(self, csv_file):
        self.csv_file = csv_file
        self.csv_contents = self.__read_csv()

    def __read_csv(self):
        with open(self.csv_file, "r") as file:
            reader = DictReader(file)
            return [
                row
                for row in reader
                if "MODS" not in row.get("source_identifier", "")
                and "POLICY" not in row.get("source_identifier", "")
            ]

    def write(self, output_file):
        with open(output_file, "w", newline="") as file:
            writer = DictWriter(file, fieldnames=self.csv_contents[0].keys())
            writer.writeheader()
            writer.writerows(self.csv_contents)
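
A quick usage sketch for the new class (the CSV path is hypothetical): BanishFiles filters rows at construction time, so writing back to the same path rewrites the sheet in place without its MODS and POLICY rows.

from utk_exodus.banish import BanishFiles

bf = BanishFiles("tmp/import_sheet.csv")  # rows whose source_identifier mentions MODS or POLICY are dropped on read
bf.write("tmp/import_sheet.csv")  # write the surviving rows back over the original sheet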

99 changes: 98 additions & 1 deletion utk_exodus/exodus.py
@@ -9,8 +9,14 @@
from utk_exodus.checksum import HashSheet
from utk_exodus.collection import CollectionImporter
from utk_exodus.risearch import ResourceIndexSearch
from utk_exodus.banish import BanishFiles
from utk_exodus.fedora import FedoraObject
from utk_exodus.review import ExistingImport
import click
import requests
import os
from tqdm import tqdm
from csv import DictReader


@click.group()
@@ -99,7 +105,7 @@ def add_files(sheet: str, files_sheet: str, what_to_add: str, remote: str) -> None
"--model",
"-m",
type=click.Choice(
["book", "image", "large_image", "pdf", "audio", "video"], case_sensitive=False
["book", "image", "large_image", "pdf", "audio", "video", "compound"], case_sensitive=False
),
help="The model you want to download metadata for.",
)
@@ -274,3 +280,94 @@ def generate_collection_metadata(
    x = CollectionImporter(collections)
    x.write_csv(output)
    print("Done. Metadata written to tmp/all_collections.csv.")

@cli.command(
    "banish",
    help="Remove POLICY and MODS entries from import sheets",
)
@click.option(
    "--directory",
    "-d",
    required=True,
    help="The directory of CSVs you want to remove POLICY and MODS from",
)
def banish(
    directory: str,
) -> None:
    print(f"Banishing MODS and POLICY entries from {directory}.")
    for path, directories, files in os.walk(directory):
        for file in tqdm(files):
            if file.endswith(".csv"):
                bf = BanishFiles(os.path.join(path, file))
                bf.write(os.path.join(path, file))

@cli.command(
    "get_all_versions",
    help="Download all versions of a datastream.",
)
@click.option(
    "--directory",
    "-d",
    required=True,
    help="The directory to write the versions to.",
)
@click.option(
    "--type",
    "-t",
    required=True,
    type=click.Choice(
        [
            "book", "image", "large_image", "pdf", "audio", "video", "compound", "page", "binary", "oral_history"
        ],
        case_sensitive=False
    ),
    help="The content model you want.",
)
@click.option(
    "--dsid",
    "-ds",
    required=True,
    help="The datastream you want to download versions of.",
)
def get_all_versions(
    directory: str,
    type: str,
    dsid: str,
) -> None:
    print(f"Downloading all versions of {dsid} to {directory}.")
    for pid in tqdm(ResourceIndexSearch().get_works_of_a_type_with_dsid(type, dsid)):
        fedora = FedoraObject(
            auth=(os.getenv("FEDORA_USERNAME"), os.getenv("FEDORA_PASSWORD")),
            fedora_uri=os.getenv("FEDORA_URI"),
            pid=pid,
        )
        fedora.write_all_versions(dsid, directory)
    print("Done.")

@cli.command(
    "export_errors",
    help="Using a CSV, export all the errors from failed imports",
)
@click.option(
    "--csv",
    "-c",
    required=True,
    help="The CSV you want to read in",
)
@click.option(
    "--directory",
    "-d",
    required=True,
    help="Where to export the errors to",
)
def export_errors(
    csv: str,
    directory: str,
) -> None:
    print(f"Exporting errors from {csv} to {directory}.")
    with open(csv, "r") as file:
        reader = DictReader(file)
        import_ids = [
            row["Link to Errors"].split("/")[-2]
            for row in reader
            if row["Ongoing Issues"] == "Y"
        ]
    ei = ExistingImport(
        import_ids,
        directory,
        initial_auth=(os.getenv("HYKU_BASIC_AUTH_USER"), os.getenv("HYKU_BASIC_AUTH_PASS")),
    )
    ei.sign_in_to_hyku(os.getenv("HYKU_USER"), os.getenv("HYKU_PASS"))
    ei.export_errors()
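
All three new commands hang off the existing click group, which pyproject.toml exposes as the exodus entry point, so they can be smoke-tested with click's built-in runner. A minimal sketch, assuming a hypothetical tmp/sheets directory of CSVs (get_all_versions and export_errors additionally expect the FEDORA_* and HYKU_* environment variables used above):

from click.testing import CliRunner
from utk_exodus.exodus import cli

runner = CliRunner()
# equivalent to running: exodus banish -d tmp/sheets
result = runner.invoke(cli, ["banish", "-d", "tmp/sheets"])
print(result.output)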
56 changes: 49 additions & 7 deletions utk_exodus/fedora/fedora.py
@@ -1,4 +1,5 @@
import requests
import xmltodict


class FedoraObject:
@@ -24,13 +25,25 @@ def __guess_extension(content_type):
        }
        return mimetypes.get(content_type, "bin")

    def getDatastream(self, dsid, output):
        r = requests.get(
            f"{self.fedora_uri}/objects/{self.pid}/datastreams/{dsid}/content",
            auth=self.auth,
            allow_redirects=True,
        )
        if r.status_code == 200:
    def getDatastream(self, dsid, output, as_of_date=None):
        if as_of_date:
            r = requests.get(
                f"{self.fedora_uri}/objects/{self.pid}/datastreams/{dsid}/content?asOfDateTime={as_of_date}",
                auth=self.auth,
                allow_redirects=True,
            )
        else:
            r = requests.get(
                f"{self.fedora_uri}/objects/{self.pid}/datastreams/{dsid}/content",
                auth=self.auth,
                allow_redirects=True,
            )
        if r.status_code == 200 and as_of_date:
            open(
                f'{output}/{self.pid}_{dsid}_{as_of_date}.{self.__guess_extension(r.headers.get("Content-Type", "application/binary"))}',
                "wb",
            ).write(r.content)
        elif r.status_code == 200:
            open(
                f'{output}/{self.pid}_{dsid}.{self.__guess_extension(r.headers.get("Content-Type", "application/binary"))}',
                "wb",
Expand All @@ -47,3 +60,32 @@ def streamDatastream(self, dsid):
            stream=True,
        )
        return r

    def getDatastreamHistory(self, dsid):
        r = requests.get(
            f"{self.fedora_uri}/objects/{self.pid}/datastreams/{dsid}/history?format=xml",
            auth=self.auth,
            allow_redirects=True,
        )
        return xmltodict.parse(r.content.decode("utf-8"))

    def write_all_versions(self, dsid, output):
        history = self.getDatastreamHistory(dsid)
        if isinstance(history["datastreamHistory"]["datastreamProfile"], dict):
            self.getDatastream(
                dsid, output, history["datastreamHistory"]["datastreamProfile"]["dsCreateDate"]
            )
        else:
            for version in history["datastreamHistory"]["datastreamProfile"]:
                self.getDatastream(dsid, output, version["dsCreateDate"])
        return


if __name__ == "__main__":
    import os

    x = FedoraObject(
        auth=(os.getenv("FEDORA_USERNAME"), os.getenv("FEDORA_PASSWORD")),
        fedora_uri=os.getenv("FEDORA_URI"),
        pid="roth:10"
    )
    x.getDatastream("OBJ", "tmp/roth2")
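
For context on the isinstance check in write_all_versions: xmltodict collapses a lone repeated element into a dict and only returns a list when several siblings are present, so a datastream with one version parses differently from one with many. A sketch of the two shapes the method handles (dates are illustrative; other datastreamProfile fields are omitted):

# one version: datastreamProfile is a dict
single = {"datastreamHistory": {"datastreamProfile": {"dsCreateDate": "2024-05-01T12:00:00.000Z"}}}
# several versions: datastreamProfile is a list of dicts
several = {
    "datastreamHistory": {
        "datastreamProfile": [
            {"dsCreateDate": "2024-05-20T09:30:00.000Z"},
            {"dsCreateDate": "2024-05-01T12:00:00.000Z"},
        ]
    }
}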
4 changes: 3 additions & 1 deletion utk_exodus/finder/finder.py
@@ -219,7 +219,9 @@ def write_csv(self, filename):

class FileSetFinder:
    def __init__(self, pid):
        self.universal_ignores = ('DC', 'RELS-EXT', 'TECHMD', 'PREVIEW', 'JPG', 'JP2', 'MEDIUM_SIZE', 'POLICY', 'TN')
        self.universal_ignores = (
            'DC', 'RELS-EXT', 'TECHMD', 'PREVIEW', 'JPG', 'JP2', 'MEDIUM_SIZE', 'POLICY', 'TN', 'MODS'
        )
        self.pid = pid.replace('.xml', '')
        self.files = self.__get_all_files()

3 changes: 3 additions & 0 deletions utk_exodus/review/__init__.py
@@ -0,0 +1,3 @@
from .review import ExistingImport

__all__ = ["ExistingImport"]
54 changes: 54 additions & 0 deletions utk_exodus/review/review.py
@@ -0,0 +1,54 @@
from requestium import Session, Keys
import os
import requests
from requests.auth import HTTPBasicAuth
from tqdm import tqdm


class ExistingImport:
    def __init__(
        self,
        import_ids,
        output_dir,
        initial_auth=None,
        hyku_instance='https://dc.utk-hyku-production.notch8.cloud',
        webdriver_path='/usr/local/bin/chromedriver',
    ):
        self.import_ids = import_ids
        self.output = output_dir
        self.s = Session(
            webdriver_path=webdriver_path,
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['headless']}
        )
        self.hyku_instance = hyku_instance
        self.initial_auth = initial_auth
        if initial_auth is not None:
            self.s.driver.get(
                f'https://{initial_auth[0]}:{initial_auth[1]}@{hyku_instance.replace("https://", "")}/users/sign_in?locale=en'
            )

    def sign_in_to_hyku(self, username, password):
        print('\nSigning in to Hyku\n')
        self.s.driver.ensure_element_by_xpath("//input[@id='user_email']").send_keys(username, Keys.ENTER)
        self.s.driver.ensure_element_by_xpath("//input[@id='user_password']").send_keys(password, Keys.ENTER)
        return

    def transfer_cookies_to_requests(self):
        self.requests_session = requests.Session()
        for cookie in self.s.driver.get_cookies():
            cookie_dict = {cookie['name']: cookie['value']}
            self.requests_session.cookies.update(cookie_dict)
        self.requests_session.auth = HTTPBasicAuth(self.initial_auth[0], self.initial_auth[1])

    def export_errors(self):
        self.transfer_cookies_to_requests()
        for import_id in tqdm(self.import_ids):
            url = f'{self.hyku_instance}/importers/{import_id}/export_errors'
            response = self.requests_session.get(url)
            if response.status_code == 200:
                file_path = os.path.join(os.getcwd(), self.output, f'export_errors_{import_id}.csv')
                with open(file_path, 'wb') as file:
                    file.write(response.content)
        return
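
A minimal usage sketch mirroring the export_errors command above (importer ids and the output directory are hypothetical; the default chromedriver path is assumed): the class signs in through headless Chrome, transfer_cookies_to_requests copies the authenticated cookies into a plain requests.Session, and export_errors downloads one CSV per importer.

import os

from utk_exodus.review import ExistingImport

ei = ExistingImport(
    ["201", "202"],  # hypothetical importer ids
    "tmp/errors",
    initial_auth=(os.getenv("HYKU_BASIC_AUTH_USER"), os.getenv("HYKU_BASIC_AUTH_PASS")),
)
ei.sign_in_to_hyku(os.getenv("HYKU_USER"), os.getenv("HYKU_PASS"))
ei.export_errors()  # writes export_errors_<id>.csv into tmp/errors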
24 changes: 19 additions & 5 deletions utk_exodus/risearch/risearch.py
Expand Up @@ -16,7 +16,7 @@ def __init__(
        self.format = self.validate_format(riformat)
        self.base_url = (
            f"{self.risearch_endpoint}?type=tuples"
            f"&lang={self.language}&format={self.format}"
            f"&lang={self.language}&format={self.format}&limit=1000000"
        )

    def validate_language(self, language):
@@ -176,6 +176,8 @@ def __lookup_work_type(work_type):
"video": "info:fedora/islandora:sp_videoCModel",
"pdf": "info:fedora/islandora:sp_pdf",
"page": "info:fedora/islandora:pageCModel",
"binary": "info:fedora/islandora:binaryObjectCModel",
"oral_history": "info:fedora/islandora:oralhistoriesCModel",
}
return work_types.get(work_type, "unknown")

@@ -291,10 +293,22 @@ def find_all_collections(self):
            if result != "" and result not in ignore and result != '"collection"'
        ]

    def get_works_of_a_type_with_dsid(self, work_type, dsid):
        query = quote(
            f"""PREFIX system: <info:fedora/fedora-system:def/view#>
            SELECT ?pid WHERE {{
                ?pid <info:fedora/fedora-system:def/model#hasModel> <{self.__lookup_work_type(work_type)}> ;
                    system:disseminates ?dsid .
                FILTER(REGEX(STR(?dsid), "{dsid}"))
            }}"""
        )
        results = requests.get(f"{self.base_url}&query={query}").content.decode("utf-8")
        return [
            result.replace("info:fedora/", "")
            for result in results.split("\n")
            if result != "" and result != '"pid"'
        ]

if __name__ == "__main__":
    risearch = ResourceIndexSearch()
    x = risearch.get_policies_based_on_type_and_collection(
        "book", "collections:galston"
    )
    print(x)
    x = risearch.get_works_of_a_type_with_dsid("book", "MODS")
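
The parsing in get_works_of_a_type_with_dsid assumes the resource index returns one value per line with a quoted header row; an illustrative sketch of that logic against a made-up response body (real PIDs will vary):

# hypothetical CSV-style tuple response from the resource index
body = '"pid"\ninfo:fedora/roth:10\ninfo:fedora/roth:11\n'
pids = [
    line.replace("info:fedora/", "")
    for line in body.split("\n")
    if line != "" and line != '"pid"'
]
assert pids == ["roth:10", "roth:11"]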