-
Notifications
You must be signed in to change notification settings - Fork 0
/
export.py
151 lines (119 loc) · 4.69 KB
/
export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import configparser
from get_extract import *
from pprint import pprint as pp
from asnake.aspace import ASpace
import json
import argparse
import re
import logging
import os.path
import jinja2
from transform import MAPPING, Transforms
# Command-line interface: an optional --cachefile flag and a positional
# OUTPUTPATH. Arguments are parsed as soon as this module is loaded.
argparser = argparse.ArgumentParser(
    description="Export MODS metadata from ArchivesSpace digital objects",
)
argparser.add_argument(
    '--cachefile',
    default=None,
    help="Name of a json file containing cached data. For development and testing purposes.",
)
argparser.add_argument(
    "OUTPUTPATH",
    help="File path for record output.",
)
cliargs = argparser.parse_args()

# None unless --cachefile was supplied on the command line.
CACHEFILE = cliargs.cachefile

# Emit INFO-level messages and above from the root logger.
logging.basicConfig(level=logging.INFO)
def get_export_list(EXTRACTED_DATA):
    """Collect the digital objects that can be exported.

    Iterates over EXTRACTED_DATA['digital_objects'] and asks
    get_compass_pid() for each object's pid. Objects with a pid are paired
    with an output file name built from the pid (':' replaced by '_', then
    '_MODS.xml' appended). Objects without one are reported with
    logging.error and left out.

    Returns:
        A list of (digital object id, file name) tuples.
    """
    exportable = []
    for object_id, object_data in EXTRACTED_DATA['digital_objects'].items():
        pid = get_compass_pid(object_data)
        if pid is None:
            logging.error("Could not find valid pid for %s" % object_id)
            continue
        mods_filename = pid.replace(':', '_') + '_MODS.xml'
        exportable.append((object_id, mods_filename))
    return exportable
def get_compass_pid(do_data, namespace='smith'):
    """Return the first Compass pid found in a digital object's file versions.

    A pid has the form '<namespace>:<digits>' (for example 'smith:12345') and
    is only looked for in file URIs that contain the substring 'compass'.

    Args:
        do_data: Digital object record; do_data['file_versions'] must be an
            iterable of dicts, each normally carrying a 'file_uri' string.
        namespace: Pid namespace to search for. Defaults to 'smith'.

    Returns:
        The first matching pid string, or None when no file version holds a
        Compass URI with a pid in it.
    """
    # Escape the namespace so it is always matched literally.
    pid_pattern = re.compile(r'%s:[0-9]+' % re.escape(namespace))

    for file_version in do_data['file_versions']:
        # A missing or empty file_uri is treated as "no pid here".
        file_uri = file_version.get('file_uri') or ''
        if 'compass' not in file_uri:  # Make sure that this is a compass URL
            continue
        match = pid_pattern.search(file_uri)
        # A Compass URI without a pid must not stop the search: keep looking
        # at the remaining file versions instead of raising.
        if match:
            return match.group(0)
    return None
def render_record(mapping):
    """Render one MODS record from a template context.

    Loads 'compass-mods-template.xml' through a jinja2 FileSystemLoader
    whose search path is "." and renders it with `mapping` as the template
    variables.

    Returns:
        The rendered template text.
    """
    # NOTE(review): the Environment is built with jinja2's default settings,
    # i.e. no autoescaping is requested here -- confirm that the template or
    # the transforms escape XML special characters.
    environment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(searchpath="."),
    )
    # Merge the template and data
    return environment.get_template('compass-mods-template.xml').render(mapping)
def _load_config(path):
    """Read and parse the JSON configuration file at `path`.

    Logs an error and exits with status 1 when the file cannot be opened or
    does not contain valid JSON.
    """
    try:
        with open(path) as config_file:
            return json.load(config_file)
    except OSError:
        logging.error('No config file found')
    except ValueError as err:  # json.JSONDecodeError is a ValueError
        logging.error('Config file %s is not valid JSON: %s', path, err)
    raise SystemExit(1)


def _load_relators(path):
    """Parse the relator JSON file at `path`; return None if it is not valid JSON.

    A file that cannot be opened raises OSError; only a parse failure is
    tolerated, in which case a warning is logged.
    """
    with open(path) as json_file:
        try:
            return json.load(json_file)
        except ValueError as err:
            logging.warning('Could not parse relator file %s: %s', path, err)
            return None


def _build_template_context(transforms, mapping, extracted_data, do_id):
    """Run every mapped transform for one digital object.

    Args:
        transforms: Object whose methods are looked up by each recipe's
            'transform_function' name.
        mapping: Dict of field name -> recipe dict.
        extracted_data: Data passed as the first argument to every transform.
        do_id: Id of the digital object, passed as the second argument.

    Returns:
        A (template_context, record_valid) tuple. record_valid is False when
        any field whose recipe has 'required' set to True produced no value.

    Raises:
        SystemExit: a recipe names a transform that `transforms` lacks.
    """
    template_context = {}
    record_valid = True

    for field_name, field_recipe in mapping.items():
        transform_name = field_recipe['transform_function']
        transform_method = getattr(transforms, transform_name, None)
        if transform_method is None:
            logging.error(
                "No transform named '%s'. Please add a transform method to the Transforms class in transform.py.",
                transform_name,
            )
            raise SystemExit(1)

        # A failing transform is deliberately treated like a missing value so
        # that one bad field does not abort the whole export run.
        try:
            value = transform_method(extracted_data, do_id)
        except Exception as err:
            logging.warning("%s %s %s", do_id, field_name, err)
            value = None

        if value is None:
            # .get() keeps recipes without a 'required' key from raising
            # KeyError; they are simply optional.
            if field_recipe.get('required') is True:
                logging.error("Required field '%s' missing in %s. Skipping record.", field_name, do_id)
                record_valid = False
                continue
            logging.warning("Information not found in %s field for digital object %s", field_name, do_id)

        template_context[field_name] = value

    return template_context, record_valid


if __name__ == '__main__':
    # NOTE(review): `aspace` is not referenced again in this script; the
    # instance is still created in case construction has side effects -- confirm.
    aspace = ASpace()

    configs = _load_config('config.json')

    ## Uncomment when testing:
    # repo_data = aspace.client.get('/repositories?all_ids=true').json()
    # repos = []
    # for repo in repo_data:
    #     repo_id = repo['uri'].split('/')[-1]
    #     repos.append(repo_id)
    ## Comment out this line when testing:
    repos = configs['config']['repos']

    if CACHEFILE is None:
        EXTRACTED_DATA = get_extract(repos)
        # Keep a copy of the fresh extract on disk.
        with open('extract.json', 'w') as outfile:
            json.dump(EXTRACTED_DATA, outfile)
    else:
        # Read the cache file named on the command line.
        with open(CACHEFILE, 'r') as infile:
            EXTRACTED_DATA = json.load(infile)

    # Add repository names and finding aid url to extracted data
    EXTRACTED_DATA['repositories'] = configs['config']['repositories']
    EXTRACTED_DATA['url_stem'] = configs['config']['findingaid_url']
    EXTRACTED_DATA['relator_file'] = configs['config']['relator_file']

    # Adding Library of Congress relator data for agent output
    EXTRACTED_DATA['relators'] = _load_relators(EXTRACTED_DATA['relator_file'])

    to_export = get_export_list(EXTRACTED_DATA)
    transforms = Transforms()
    save_path = cliargs.OUTPUTPATH

    if not os.path.isdir(save_path):
        logging.error("Directory not found. Please create if not created. Files cannot be written without an existing directory to store them.")
        raise SystemExit(1)

    for do_id, handle in to_export:
        template_context, record_valid = _build_template_context(
            transforms, MAPPING, EXTRACTED_DATA, do_id
        )
        if not record_valid:
            continue

        logging.info('Rendering MODS record for %s', do_id)
        xml = render_record(template_context)
        filename = os.path.join(save_path, handle)
        try:
            # NOTE(review): UTF-8 is assumed to match the encoding declared by
            # the MODS template -- confirm against compass-mods-template.xml.
            with open(filename, "w", encoding="utf-8") as fh:
                logging.info('Writing %s', filename)
                fh.write(xml)
        except (OSError, UnicodeError) as err:
            # Best effort: report the failure and carry on with the next record.
            logging.error('File could not be written for %s: %s', handle, err)

    logging.info('All files written.')