populate_publication_hansard.py
# This script populates table publication,
# and also other tables connected to publication:
# translation, translation_text, event,
# event_connection, event_occurrence. It also creates all
# the needed XML files for each publication and
# updates the db with the file paths.
# The starting point is a csv file containing info about hansards,
# which will be made into publications. This script adds more info
# to the file: the publication id and title. They are needed later
# when populating table facsimile_collection.
# Sample input and output (CSV) at end of file.
import psycopg2
import re
import os
from bs4 import BeautifulSoup

conn_db = psycopg2.connect(
    host="",
    database="",
    user="",
    port="",
    password=""
)
cursor = conn_db.cursor()

# input the parameters for the hansards
COLLECTION_ID = 1
XML_OUTPUT_FOLDER = "documents/Delutgava_1/Lantdagen"
GENRE = "lantdagsprotokoll"
CSV_IN = "csv/lantdagen_1.csv"
CSV_OUT = "csv/lantdagen_1_id.csv"

# create a list from the original csv file
# replace empty values with None
def create_list_from_csv(filename):
    with open(filename, "r", encoding="utf-8-sig") as source_file:
        result = []
        for line in source_file:
            row = line.rstrip()
            elements = row.split(";")
            for i in range(len(elements)):
                if elements[i] == "":
                    elements[i] = None
            result.append(elements)
        return result
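
# Example, using the sample input at the end of this file:
# the row "28.3.1877;Lantdagen. Allmän värnplikt" becomes
# ["28.3.1877", "Lantdagen. Allmän värnplikt"]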

# create a csv file
def write_list_to_csv(rows, filename):
    with open(filename, "w", encoding="utf-8-sig") as output_file:
        for row in rows:
            for item in row:
                if item is None:
                    item = ""
                output_file.write(str(item) + ";")
            output_file.write("\n")
    print("List written to file", filename)

# populate table publication with hansards and create needed content
# in other tables, i.e. translation, translation_text, event,
# event_connection, event_occurrence
def create_hansard_publication(hansards):
    directory = XML_OUTPUT_FOLDER
    directory_path = create_directory(directory)
    insert_query = """INSERT INTO publication(publication_collection_id, published, genre, original_publication_date, original_language) VALUES(%s, %s, %s, %s, %s) RETURNING id"""
    for hansard in hansards:
        published = 1
        original_date = hansard[0]
        original_publication_date, date_uncertain, year = replace_date(original_date)
        # all of the hansards are in Swedish
        original_language = "sv"
        original_title = hansard[1]
        values_to_insert = (COLLECTION_ID, published, GENRE, original_publication_date, original_language)
        cursor.execute(insert_query, values_to_insert)
        publication_id = cursor.fetchone()[0]
        # the publication has a Swedish and a Finnish title,
        # kept in a separate table; each title contains the date
        # as it was originally recorded
        # translated titles are not yet available, so the Swedish title
        # is used for both languages for now
        title_swe, title_fin, translation_id = add_title(publication_id, original_date, date_uncertain, original_title)
        # these hansards contain texts and speeches by Mechelin,
        # but naturally also by other members of the Diet;
        # this is registered as a co-authorship: LM & the Diet
        register_author(publication_id)
        # each publication has two XML files, a Swedish and a Finnish one
        # these files contain a template which the editors will fill with content
        # update table translation_text with the file paths
        create_file(directory_path, original_publication_date, year, original_language, publication_id, translation_id, original_title, title_swe)
        hansard.extend((publication_id, title_swe))
        print(hansard)
    print("Table publication updated with the new publications.")
    conn_db.commit()
    return hansards

# the date has mainly been recorded as 1.1.1800
# make the date format uniform and get rid of other characters
# new format: YYYY-MM-DD, using XX if some part of the date is missing
# if no date can be extracted from original_date, the date stays XXXX-XX-XX
def replace_date(original_date):
    date = "XXXX-XX-XX"
    # default year, in case no year can be extracted
    year = "XXXX"
    original_date = original_date.replace("/", ".")
    original_date = original_date.replace("[", "")
    original_date = original_date.replace("]", "")
    match_string = re.search(r"\?", original_date)
    if match_string:
        original_date = original_date.replace("?", "")
        date_uncertain = True
    else:
        date_uncertain = False
    # full date, e.g. 28.3.1877
    search_string = re.compile(r"(\d{1,2})\.(\d{1,2})\.(\d{4})")
    match_string = re.search(search_string, original_date)
    if match_string:
        year = match_string.group(3)
        month = match_string.group(2).zfill(2)
        day = match_string.group(1).zfill(2)
        date = year + "-" + month + "-" + day
    # month and year only, e.g. 3.1877
    search_string = re.compile(r"^(\d{1,2})\.(\d{4})")
    match_string = re.search(search_string, original_date)
    if match_string:
        year = match_string.group(2)
        month = match_string.group(1).zfill(2)
        date = year + "-" + month + "-XX"
    # year only, e.g. 1877
    search_string = re.compile(r"(^\d{4})")
    match_string = re.search(search_string, original_date)
    if match_string:
        year = match_string.group(0)
        date = year + "-XX-XX"
    return date, date_uncertain, year
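
# Examples of the intended behaviour (the first input comes from the
# sample CSV at the end of this file, the second is hypothetical):
#   replace_date("28.3.1877") -> ("1877-03-28", False, "1877")
#   replace_date("1877?")     -> ("1877-XX-XX", True, "1877")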

# create the titles for the publication
# there's a Swedish and a Finnish title
# since the titles haven't been translated yet, we'll just use
# the Swedish title as the Finnish one too, and replace it later
def add_title(publication_id, original_date, date_uncertain, original_title):
    # make some slight changes to original_date, if needed, since it'll be part of a title
    # if there's some uncertainty about the date, add a standard phrase
    original_date = original_date.replace("/", ".")
    if date_uncertain is True:
        original_date = original_date.replace("?", "")
        title_swe = "ca " + original_date + " " + original_title
        title_fin = "n. " + original_date + " " + original_title
    else:
        title_swe = original_date + " " + original_title
        title_fin = original_date + " " + original_title
    translation_id = create_translation()
    field_name = "name"
    table_name = "publication"
    create_translation_text(translation_id, title_swe, title_fin, field_name, table_name)
    update_query = """UPDATE publication SET translation_id = %s WHERE id = %s"""
    values_to_insert = (translation_id, publication_id)
    cursor.execute(update_query, values_to_insert)
    return title_swe, title_fin, translation_id
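
# Example, matching the sample output at the end of this file:
# add_title(1748, "28.3.1877", False, "Lantdagen. Allmän värnplikt")
# stores "28.3.1877 Lantdagen. Allmän värnplikt" as both the Swedish
# and (for now) the Finnish title.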

# populate table translation
def create_translation():
    neutral_text = "No translation found"
    insert_query = """INSERT INTO translation(neutral_text) VALUES(%s) RETURNING id"""
    value_to_insert = (neutral_text,)
    cursor.execute(insert_query, value_to_insert)
    translation_id = cursor.fetchone()[0]
    return translation_id

# populate table translation_text with swe and fin titles or file paths
# for the publication
def create_translation_text(translation_id, text_swe, text_fin, field_name, table_name):
    insert_query = """INSERT INTO translation_text(translation_id, language, text, field_name, table_name) VALUES(%s, %s, %s, %s, %s)"""
    values_to_insert_swe = (translation_id, "sv", text_swe, field_name, table_name)
    values_to_insert_fin = (translation_id, "fi", text_fin, field_name, table_name)
    cursor.execute(insert_query, values_to_insert_swe)
    cursor.execute(insert_query, values_to_insert_fin)
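
# Each call inserts two rows sharing the same translation_id, one with
# language "sv" and one with "fi"; field_name is "name" for titles and
# "original_filename" for file paths (see add_file_path below).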

def register_author(publication_id):
    # Mechelin is registered as the co-author of these hansards
    # together with the subject id for the Diet (as a collective)
    LM_id = 1
    diet_subject_id = 7902
    event_connection_type = "contributed to hansard"
    event_id = create_event_and_connection(LM_id, diet_subject_id, event_connection_type)
    event_occurrence_type = "hansard"
    create_event_occurrence(publication_id, event_id, event_occurrence_type)

# create an event and connect it to the subjects
# (the authors of the text)
def create_event_and_connection(LM_id, diet_subject_id, event_connection_type):
    insert_query = """INSERT INTO event(type) VALUES(%s) RETURNING id"""
    event_type = "published"
    value_to_insert = (event_type,)
    cursor.execute(insert_query, value_to_insert)
    event_id = cursor.fetchone()[0]
    insert_query = """INSERT INTO event_connection(subject_id, event_id, type) VALUES(%s, %s, %s)"""
    values_to_insert_LM = (LM_id, event_id, event_connection_type)
    values_to_insert_diet = (diet_subject_id, event_id, event_connection_type)
    cursor.execute(insert_query, values_to_insert_LM)
    cursor.execute(insert_query, values_to_insert_diet)
    return event_id
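
# The net effect of register_author is thus one event row (type
# "published"), two event_connection rows (subject ids 1 and 7902, type
# "contributed to hansard") and one event_occurrence row linking the
# event to the publication.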

# create connection between publication and event
def create_event_occurrence(publication_id, event_id, event_occurrence_type):
    insert_query = """INSERT INTO event_occurrence(type, event_id, publication_id) VALUES(%s, %s, %s)"""
    values_to_insert = (event_occurrence_type, event_id, publication_id)
    cursor.execute(insert_query, values_to_insert)

# create a directory
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory

# each publication has two XML files, a Swedish and a Finnish one
# create files and directories
# and update table translation_text with the file paths
def create_file(directory_path, original_publication_date, year, original_language, publication_id, translation_id, original_title, title_swe):
    # files and directories contain the publication's date
    original_publication_date = original_publication_date.replace("-", "_")
    directory = directory_path + "/" + year
    year_directory_path = create_directory(directory)
    title_part = create_title_part_for_file(original_title)
    final_directory = year_directory_path + "/" + original_publication_date + "_" + title_part
    final_directory_path = create_directory(final_directory)
    # since the original language is Swedish there will be two files/file paths
    # for the publication
    file_name = original_publication_date + "_" + title_part + "_" + original_language + "_" + str(publication_id) + ".xml"
    file_path_swe = final_directory_path + "/" + file_name
    write_to_file(file_path_swe, title_swe)
    file_name = original_publication_date + "_" + title_part + "_fi_" + str(publication_id) + ".xml"
    file_path_fin = final_directory_path + "/" + file_name
    write_to_file(file_path_fin, title_swe)
    add_file_path(translation_id, file_path_swe, file_path_fin)
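
# Example, using the sample row at the end of this file (publication id 1748):
# the Swedish file is created at
# documents/Delutgava_1/Lantdagen/1877/1877_03_28_Lantdagen_Allman_varnplikt/1877_03_28_Lantdagen_Allman_varnplikt_sv_1748.xml
# and the Finnish file at the same path with "_fi_" instead of "_sv_".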

# file and directory names contain the hansard's title
# with certain replacements
def create_title_part_for_file(title_part):
    title_part = title_part.replace(". ", "_")
    title_part = title_part.replace(".", "")
    title_part = title_part.replace(" ", "_")
    title_part = title_part.replace("-", "_")
    title_part = title_part.replace("–", "_")
    title_part = re.sub(r",|\?|!|’|»|”|:|;|\(|\)|\[|\]|\'|\"", "", title_part)
    title_part = title_part.replace("ç", "c")
    title_part = title_part.replace("Ç", "C")
    title_part = title_part.replace("é", "e")
    title_part = title_part.replace("è", "e")
    title_part = title_part.replace("ê", "e")
    title_part = title_part.replace("Ê", "E")
    title_part = title_part.replace("É", "E")
    title_part = title_part.replace("á", "a")
    title_part = title_part.replace("à", "a")
    title_part = title_part.replace("À", "A")
    title_part = title_part.replace("ü", "u")
    title_part = title_part.replace("ú", "u")
    title_part = title_part.replace("Ü", "U")
    title_part = title_part.replace("ï", "i")
    title_part = title_part.replace("í", "i")
    title_part = title_part.replace("ô", "o")
    title_part = title_part.replace("ó", "o")
    title_part = title_part.replace("æ", "ae")
    title_part = title_part.replace("œ", "oe")
    title_part = title_part.replace("ß", "ss")
    title_part = title_part.replace("&", "et")
    title_part = title_part.replace("ø", "o")
    title_part = title_part.replace("Ö", "O")
    title_part = title_part.replace("ö", "o")
    title_part = title_part.replace("Å", "A")
    title_part = title_part.replace("å", "a")
    title_part = title_part.replace("Ä", "A")
    title_part = title_part.replace("ä", "a")
    # shorten long names of files and directories,
    # otherwise the file path may become too long
    if len(title_part) >= 40:
        title_part = title_part[0:39]
    return title_part
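
# Example, using the sample title at the end of this file:
# create_title_part_for_file("Lantdagen. Allmän värnplikt") -> "Lantdagen_Allman_varnplikt"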

# the XML files contain a template
def content_template():
    xml_template = '''
    <TEI xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.tei-c.org/ns/1.0" xsi:schemaLocation="">
      <teiHeader>
        <fileDesc>
          <titleStmt>
            <title></title>
            <respStmt>
              <resp/>
              <name/>
            </respStmt>
          </titleStmt>
          <publicationStmt>
            <publisher>Utgåvan Leo Mechelin / Leo Mechelin -editio</publisher>
          </publicationStmt>
          <sourceDesc>
            <p/>
          </sourceDesc>
        </fileDesc>
      </teiHeader>
      <text>
        <body xml:space="preserve">
          <div type="hansard">
          </div>
        </body>
      </text>
    </TEI>
    '''
    return BeautifulSoup(xml_template, "xml")

# create the file and its content
def write_to_file(file_path, title):
    with open(file_path, "w", encoding="utf-8-sig") as output_file:
        template_soup = content_template()
        # insert publication name as title
        template_title = template_soup.find("title")
        template_title.append(title)
        # write to file as string
        output_file.write(str(template_soup))

# add file paths to table translation_text
def add_file_path(translation_id, file_path_swe, file_path_fin):
    field_name = "original_filename"
    table_name = "publication"
    create_translation_text(translation_id, file_path_swe, file_path_fin, field_name, table_name)

def main():
    # info about publications of type "hansard"
    hansards = create_list_from_csv(CSV_IN)
    hansards_with_id = create_hansard_publication(hansards)
    write_list_to_csv(hansards_with_id, CSV_OUT)
    cursor.close()
    conn_db.close()

if __name__ == "__main__":
    main()
'''
sample input:
28.3.1877;Lantdagen. Allmän värnplikt
sample output:
28.3.1877;Lantdagen. Allmän värnplikt;1748;28.3.1877 Lantdagen. Allmän värnplikt;
'''