Skip to content

Commit

Permalink
#77
Browse files Browse the repository at this point in the history
  • Loading branch information
funderburkjim committed Sep 20, 2024
1 parent 83d115b commit 230fe31
Show file tree
Hide file tree
Showing 9 changed files with 1,060 additions and 0 deletions.
178 changes: 178 additions & 0 deletions pwgissues/issue77/analyze_between.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#-*- coding:utf-8 -*-
""" analyze_between.py
analyze lines between <LEND> and <L>
"""
import sys,re,codecs
## Force UTF-8 on standard output.
## Required when this program is run from a git bash script, to avoid the error
##   UnicodeEncodeError: 'charmap' codec cannot encode characters
## Reference:
## https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters

sys.stdout.reconfigure(encoding='utf-8')

def read_lines(filein):
    """Read the UTF-8 text file *filein* and return its lines as a list.

    Trailing carriage-return and line-feed characters are removed from each
    line; all other characters (including trailing spaces) are kept.
    The number of lines read is printed to standard output.
    """
    lines = []
    with codecs.open(filein, encoding='utf-8', mode='r') as handle:
        for raw in handle:
            lines.append(raw.rstrip('\r\n'))
    print("%s lines read from %s" % (len(lines), filein))
    return lines

def remove_between(lines):
    """Return a copy of *lines* without the lines lying outside entries.

    An entry opens at a line starting with '<L>' and closes at a line
    starting with '<LEND>'.  Every line that is not inside an entry is
    dropped; this covers the lines between a '<LEND>' and the next '<L>'
    as well as anything before the first '<L>'.  The closing line is always
    emitted as the bare string '<LEND>', whatever followed it on the line.
    The number of dropped lines is printed.
    """
    kept = []
    in_entry = False
    dropped = 0
    for line in lines:
        if line.startswith('<L>'):
            in_entry = True
            kept.append(line)
        elif line.startswith('<LEND>'):
            in_entry = False
            kept.append('<LEND>')
        elif in_entry:
            # ordinary body line of the open entry
            kept.append(line)
        else:
            # a 'between' line: discard it
            dropped += 1
    print('adjust: %s dropped between <LEND> and <L>' % dropped)
    return kept

def insert_between(lines):
    """Return a copy of *lines* with one blank line added after each '<LEND>'.

    The input is expected to contain nothing outside entries (an entry runs
    from a line starting with '<L>' to a line starting with '<LEND>').
    If a line is found outside an entry, its 1-based line number is printed
    and the program exits with status 1.  The closing line is always emitted
    as the bare string '<LEND>'.  The number of inserted blank lines is
    printed.
    """
    result = []
    in_entry = False
    ninserted = 0
    for lineno, line in enumerate(lines, start=1):
        if line.startswith('<L>'):
            in_entry = True
            result.append(line)
        elif line.startswith('<LEND>'):
            in_entry = False
            # the closing line followed by the extra blank line
            result.extend(('<LEND>', ''))
            ninserted += 1
        elif in_entry:
            # ordinary body line of the open entry
            result.append(line)
        else:
            # a line outside any entry is not expected here
            print('Unexpected line at line #', lineno)
            exit(1)
    print('adjust: %s blank lines inserted between <LEND> and <L>' % ninserted)
    return result

def analyze_after_lend(iline,metaline,lines,nlines):
    """Collect what follows the '<LEND>' at lines[iline] up to the next '<L>'.

    Parameters:
      iline    - 0-based index in *lines* of a '<LEND>' line
      metaline - the '<L>...' line of the entry closed by that '<LEND>'
      lines    - all lines of the digitization
      nlines   - number of lines to consider (the caller passes len(lines))

    Returns a list [L, nextmeta, between_1, between_2, ...] where
      L         is the text between '<L>' and the next '<' in *metaline*,
      nextmeta  is the next line starting with '<L>', or None when the end
                of the file is reached before another '<L>' line,
      between_i are the lines found after the '<LEND>' and before nextmeta
                (or before the end of the file).

    Raises ValueError when *metaline* is missing or has no '<L>...<' part.
    """
    assert lines[iline] == '<LEND>'
    m = re.search(r'<L>(.*?)<', metaline) if metaline else None
    if m is None:
        raise ValueError('cannot find <L> value in metaline %r for <LEND> at line # %s'
                         % (metaline, iline + 1))
    L = m.group(1)
    between = []
    # Bounded scan: lines after the last <LEND> (with no further <L>) are
    # returned with nextmeta = None instead of running off the end of the list.
    for j in range(iline + 1, nlines):
        nextline = lines[j]
        if nextline.startswith('<L>'):
            return [L, nextline] + between
        between.append(nextline)
    return [L, None] + between

def check_afters_page(afters):
    """Check '[Pagev-nnnn]' lines found between entries against the next entry.

    Each item of *afters* is a list [L, nextmeta, between_1, ...] as built by
    analyze_after_lend.  For every between-line that consists solely of a
    page marker '[Pagev-nnnn]' (one digit, '-', four digits), the page value
    is compared with the <pc> value of *nextmeta*.  A marker that disagrees
    is printed and counted; a metaline without a '<pc>...<' part counts as a
    disagreement.  Items whose nextmeta is None (nothing follows the entry)
    are skipped, since there is no page value to compare with.
    The number of problems is printed at the end.
    """
    page_marker = re.compile(r'^\[Page([0-9]-[0-9][0-9][0-9][0-9])\]$')
    nprob = 0
    for after in afters:
        nextmeta = after[1]
        if nextmeta is None:
            continue
        # The <pc> value belongs to the NEXT metaline; it must be read from
        # this match, not from the page-marker match.
        m1 = re.search(r'<pc>(.*?)<', nextmeta)
        pc1 = m1.group(1) if m1 is not None else None
        for c in after[2:]:
            m = page_marker.search(c)
            if m is None:
                continue
            if m.group(1) != pc1:
                print('%s' % c)
                nprob = nprob + 1
    print('check_afters_page finds %s problems' % nprob)

def summary_after(afters,fileout):
    """Print statistics about the gaps after entries and write the odd ones.

    Each item of *afters* is a list [L, nextmeta, between_1, ...].
    In order, this function:
      1. prints how many items have exactly one empty between-line;
      2. runs check_afters_page on all items;
      3. prints a dictionary mapping 'number of between-lines' to the
         number of items with that many;
      4. writes to *fileout* one line per item that has a nextmeta and whose
         between-lines are anything other than a single empty line; the line
         is the printed form of the list [L, between_1, ...].
    """
    nsingle = sum(1 for after in afters if after[2:] == [''])
    print('%s entries with single blank line between <LEND> and <L>' % nsingle)
    check_afters_page(afters)

    # histogram: count of between-lines -> number of entries
    histogram = {}
    for after in afters:
        size = len(after[2:])
        histogram[size] = histogram.get(size, 0) + 1
    print(histogram)

    outarr = []
    for after in afters:
        between = after[2:]
        if after[1] is None:
            # last entry: nothing follows it
            continue
        if between == ['']:
            # the normal case; not reported
            continue
        outarr.append('%s' % ([after[0]] + between))
    write(fileout, outarr)

def analyze_between(lines,fileout):
    """Analyze the lines found between each '<LEND>' and the following '<L>'.

    Walks *lines*, remembering the most recent line starting with '<L>'.
    At every line starting with '<LEND>', analyze_after_lend is called to
    gather what follows that entry.  The number of metalines and of gathered
    items is printed, then summary_after reports on them and writes its
    output to *fileout*.  Nothing is returned.
    """
    total = len(lines)
    afters = []
    nmeta = 0
    current_meta = None
    for pos, text in enumerate(lines):
        if text.startswith('<L>'):
            current_meta = text
            nmeta += 1
        elif text.startswith('<LEND>'):
            afters.append(analyze_after_lend(pos, current_meta, lines, total))
    print('# of metalines = %s' % nmeta)
    print('# afters = %s' % len(afters))
    summary_after(afters, fileout)

def write(fileout,lines):
    """Write *lines* to the file *fileout* as UTF-8, one per line.

    Each item is written followed by a single '\\n'.  The number of lines
    written and the file name are printed.
    """
    with codecs.open(fileout, "w", "utf-8") as out:
        out.writelines(text + '\n' for text in lines)
    print(len(lines),"written to",fileout)

if __name__ == "__main__":
    # usage: python analyze_between.py <filein> <fileout>
    source_path = sys.argv[1]  # xxx.txt (path to digitization of xxx)
    # output: one line per entry whose following gap is not a single blank line
    report_path = sys.argv[2]
    analyze_between(read_lines(source_path), report_path)

74 changes: 74 additions & 0 deletions pwgissues/issue77/analyze_between.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
['7764', '<H>{#A#}', '']
['9937', '<H>{#i#}', '']
['10465', '<H>{#I#}', '']
['10575', '<H>{#u#}', '']
['12698', '<H>{#U#}', '']
['12892', '<H>{#f#}', '']
['13188', '<H>{#F, x, X#}', '']
['13192', '<H>{#e#}', '']
['13603', '<H>{#E#}', '']
['13750', '<H>{#o#}', '']
['13854', '<H>{#O#}', '']
['14148', '<H>{#ka#}', '']
['20747', '<H>{#Ka#}', '']
['21317', '<H>{#ga#}', '']
['23877', '<H>{#Ga#}', '']
['24256', '<H>{#Na#}', '']
['24258', '<H>{#ca#}', '']
['26044', '<H>{#Ca#}', '']
['26305', '<H>{#ja#}', '']
['28080', '<H>{#Ja#}', '']
['28164', '<H>{#Ya#}', '']
['28165', '<H>{#wa#}', '']
['28233', '<H>{#Wa#}', '']
['28240', '<H>{#qa#}', '']
['28313', '<H>{#Qa#}', '']
['28327', '<H>{#Ra#}', '']
['28329', '<H>{#ta#}', '']
['31542', '<H>{#Ta#}', '']
['31558', '<H>{#da#}', '']
['35991', '<H>{#Da#}', '']
['37259', '<H>{#na#}', '']
['41153', '<H>{#pa#}', '']
['51386', '<H>{#Pa#}', '']
['51684', '<H>{#ba#}', '']
['53687', '<H>{#Ba#}', '']
['56101', '<H>{#ma#}', '']
['62403', '<H>VERBESSERUNGEN UND NACHTRÄGE ZU THEIL I-V.', '', '<H>{#a#}', '']
['66037', '<H>{#A#}', '']
['67139', '<H>{#i#}', '']
['67368', '<H>{#I#}', '']
['67454', '<H>{#u#}', '']
['68663', '<H>{#U#}', '']
['68750', '<H>{#f#}', '']
['68840', '<H>{#e#}', '']
['69021', '<H>{#E#}', '']
['69108', '<H>{#o#}', '']
['69147', '<H>{#O#}', '']
['69260', '<H>{#ka#}', '']
['72206', '<H>{#Ka#}', '']
['72390', '<H>{#ga#}', '']
['73247', '<H>{#Ga#}', '']
['73344', '<H>{#ca#}', '']
['73978', '<H>{#Ca#}', '']
['74072', '<H>{#ja#}', '']
['74625', '<H>{#Ja#}', '']
['74641', '<H>{#wa Wa qa Qa#}', '']
['74708', '<H>{#ta#}', '']
['75426', '<H>{#da#}', '']
['76427', '<H>{#Da#}', '']
['76722', '<H>{#na#}', '']
['77774', '<H>{#pa#}', '']
['79738', '<H>{#Pa#}', '']
['79777', '<H>{#ba#}', '']
['80043', '<H>{#Ba#}', '']
['80294', '<H>{#ma#}', '']
['80800', '<H>{#ya#}', '']
['82643', '<H>{#ra#}', '']
['85605', '<H>{#la#}', '']
['87082', '<H>{#va#}', '']
['96987', '<H>{#Sa#}', '']
['102408', '<H>{#za#}', '']
['102721', '<H>{#sa#}', '']
['115844', '<H>{#ha#}', '']
['117928', '<H>Verbesserungen und Nachträge zum ganzen Werke.', '']
103 changes: 103 additions & 0 deletions pwgissues/issue77/digentry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#-*- coding:utf-8 -*-
"""digentry.py
Module to read a digitization
and generate a list of Entry objects
Adapted for temp_pwkvn_22.txt
"""
from __future__ import print_function
import sys,re,codecs

class Entry(object):
    """One entry of a digitization: metaline, body lines and closing line.

    Attributes set by the constructor:
      metaline  - first line of the entry (the '<L>...' line)
      lend      - last line of the entry (the '<LEND>' line)
      datalines - the lines strictly between those two
      metad     - dictionary parsed from metaline by parseheadline
      linenum1, linenum2 - the two line numbers supplied by the caller
      lsarr     - a list, initially empty
    """
    # Registry of all entries constructed so far, keyed by the metaline's L
    # value.  Being a class attribute, it is shared by every instance.
    Ldict = {}

    def __init__(self,lines,linenum1,linenum2):
        """Build an entry from *lines* and register it under its L value.

        linenum1 and linenum2 are ints.  If an entry with the same L value
        was already constructed, a message is printed and the program exits
        with status 1.
        """
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        self.metaline = lines[0]
        self.datalines = lines[1:-1]  # the non-meta lines
        self.lend = lines[-1]  # the <LEND> line
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        key = self.metad['L']
        if key in self.Ldict:
            print("Entry init error: duplicate L",key,linenum1)
            exit(1)
        self.Ldict[key] = self
        self.lsarr = []

def init(filein):
    """Read the digitization *filein* and return its entries as Entry objects.

    The file is read as UTF-8 and split into entries, each running from a
    line starting with '<L>' to the next line starting with '<LEND>'.
    Lines outside entries are ignored.  Each Entry receives its lines and the
    1-based line numbers of its first and last line.

    The program exits with status 1 (after printing a message) when
      - a '<L>' line appears inside an open entry (Error 1),
      - a '<LEND>' line appears outside any entry (Error 2),
      - the file ends while an entry is still open (Error 3).
    """
    # slurp lines
    with codecs.open(filein,encoding='utf-8',mode='r') as f:
        lines = [raw.rstrip('\r\n') for raw in f]
    recs = []  # list of Entry objects
    start = None  # 0-based index of the open entry's <L> line; None = no open entry
    for idx, line in enumerate(lines):
        opens = line.startswith('<L>')
        closes = line.startswith('<LEND>')
        if start is None:
            # outside an entry: looking for '<L>'
            if opens:
                start = idx
            elif closes:  # error
                print('init_entries Error 2. Not expecting <LEND>')
                print("line # ",idx+1)
                print(line.encode('utf-8'))
                exit(1)
            continue
        # inside an entry: looking for '<LEND>'
        if closes:
            recs.append(Entry(lines[start:idx+1], start + 1, idx + 1))
            start = None  # prepare for next entry
        elif opens:  # error
            print('init_entries Error 1. Not expecting <L>')
            print("line # ",idx+1)
            print(line.encode('utf-8'))
            exit(1)
    # when all lines are read, no entry should remain open
    if start is not None:
        print('digentry.init Error 3. for file',filein)
        print('Last entry not closed. Open entry starts at line',start+1)
        exit(1)

    print(len(lines),"lines read from",filein)
    print(len(recs),"entries found")
    return recs

def parseheadline(headline):
    """Parse a 'metaline' into a dictionary of tag name -> value.

    Surrounding whitespace is ignored.  Every '<name>' in the line becomes a
    key whose value is the text up to the next '<' (possibly empty).  When a
    name occurs more than once, the last value is kept.

    Example:
      headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>
    returns
      {'L': '16850',
       'pc': '292-3',
       'k1': 'visarga',
       'k2': 'visarga',
       'h': '1',
       'e': ''}
    """
    # Each match yields a (name, value) pair, in order of appearance.
    pairs = re.findall('[<]([^>]*)[>]([^<]*)', headline.strip())
    return dict(pairs)

if __name__ == "__main__":
    # usage: python digentry.py <filein>
    # <filein> is xxx.txt (path to digitization of xxx)
    entries = init(sys.argv[1])
Loading

0 comments on commit 230fe31

Please sign in to comment.