Commit 230fe31 (1 parent: 83d115b). Showing 9 changed files with 1,060 additions and 0 deletions.
analyze_between.py
@@ -0,0 +1,178 @@
#-*- coding:utf-8 -*-
""" analyze_between.py
analyze lines between <LEND> and <L>
"""
import sys,re,codecs
## https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters
## Reconfigure stdout so that running under a git bash script does not raise
## UnicodeEncodeError: 'charmap' codec can't encode characters

sys.stdout.reconfigure(encoding='utf-8')

def read_lines(filein):
    with codecs.open(filein,encoding='utf-8',mode='r') as f:
        lines = [x.rstrip('\r\n') for x in f]
    print("%s lines read from %s" % (len(lines),filein))
    return lines


def remove_between(lines):
    # remove lines between <LEND> and <L>
    newlines = []  # returned
    metaline = None
    nchg = 0
    for iline,line in enumerate(lines):
        if line.startswith('<L>'):
            metaline = line
            newlines.append(line)
            continue
        if line.startswith('<LEND>'):
            metaline = None
            newlines.append('<LEND>')
            continue
        if metaline == None:
            # drop this 'between' line
            nchg = nchg + 1
            continue
        # keep line in body.
        newlines.append(line)
    print('adjust: %s dropped between <LEND> and <L>' % nchg)
    return newlines

def insert_between(lines):
    # insert single blank line after <LEND>
    newlines = []  # returned
    metaline = None
    nchg = 0
    for iline,line in enumerate(lines):
        if line.startswith('<L>'):
            metaline = line
            newlines.append(line)
            continue
        if line.startswith('<LEND>'):
            metaline = None
            newlines.append('<LEND>')
            newlines.append('')  # extra blank line
            nchg = nchg + 1
            continue
        if metaline == None:
            # a 'between' line is unexpected here
            print('Unexpected line at line #',iline+1)
            exit(1)
        # keep line in body.
        newlines.append(line)
    print('adjust: %s blank lines inserted between <LEND> and <L>' % nchg)
    return newlines

def analyze_after_lend(iline,metaline,lines,nlines):
    ans = []  # lines after <LEND> and before next <L>
    assert lines[iline] == '<LEND>'
    m = re.search(r'<L>(.*?)<',metaline)
    L = m.group(1)
    ans.append(L)
    if (iline + 1) == nlines:
        nextmeta = None
        ans.append(nextmeta)
        return ans

    b = []
    while True:
        iline = iline + 1
        nextline = lines[iline]
        if nextline.startswith('<L>'):
            ans.append(nextline)  # next meta line
            for c in b:
                ans.append(c)
            return ans
        else:
            b.append(nextline)

def check_afters_page(afters):
    nprob = 0
    for after in afters:
        L = after[0]
        nextmeta = after[1]
        b = after[2:]
        for c in b:
            m = re.search(r'^\[Page([0-9]-[0-9][0-9][0-9][0-9])\]$',c)
            if m == None:
                continue
            pc = m.group(1)
            m1 = re.search(r'<pc>(.*?)<',nextmeta)
            pc1 = m1.group(1)  # page code from the next metaline
            if pc != pc1:
                out = '%s' % c
                print(out)
                nprob = nprob + 1
    print('check_afters_page finds %s problems' % nprob)

def summary_after(afters,fileout):
    n = 0
    for after in afters:
        L = after[0]
        nextmeta = after[1]
        lines = after[2:]
        if lines == ['']:
            n = n + 1
    print('%s entries with single blank line between <LEND> and <L>' % n)
    d = {}
    check_afters_page(afters)

    for after in afters:
        L = after[0]
        nextmeta = after[1]
        b = after[2:]
        n = len(b)
        if n not in d:
            d[n] = 0
        d[n] = d[n] + 1
    print(d)
    # fileout = 'temp.txt'
    outarr = []
    for after in afters:
        L = after[0]
        nextmeta = after[1]
        if nextmeta == None:
            continue  # last entry
        b = after[2:]
        if b != ['']:
            x = [L] + b
            out = '%s' % x
            outarr.append(out)
    write(fileout,outarr)

def analyze_between(lines,fileout):
    # analyze the lines between <LEND> and the next <L>
    newlines = []
    metaline = None
    nchg = 0
    d = {}
    afters = []
    nmeta = 0
    nlines = len(lines)
    for iline,line in enumerate(lines):
        if line.startswith('<L>'):
            metaline = line
            nmeta = nmeta + 1
            continue
        if line.startswith('<LEND>'):
            #metaline = None
            newlines.append('<LEND>')
            after = analyze_after_lend(iline,metaline,lines,nlines)
            afters.append(after)
    print('# of metalines = %s' % nmeta)
    print('# afters = %s' % len(afters))
    summary_after(afters,fileout)

def write(fileout,lines):
    with codecs.open(fileout,"w","utf-8") as f:
        for line in lines:
            f.write(line + '\n')
    print(len(lines),"written to",fileout)


if __name__=="__main__":
    filein = sys.argv[1]   # xxx.txt (path to digitization of xxx)
    fileout = sys.argv[2]  # revised xxx.txt
    lines = read_lines(filein)
    analyze_between(lines,fileout)
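For reference, a minimal sketch of the record that analyze_after_lend builds for one <LEND> block; the toy metalines and body lines below are invented for illustration and are not part of the commit's data:

lines = ['<L>1<pc>1-1<k1>a<k2>a<e>', 'body line', '<LEND>',
         '', '<H>{#A#}', '',
         '<L>2<pc>1-1<k1>b<k2>b<e>', 'body line', '<LEND>']
after = analyze_after_lend(2, lines[0], lines, len(lines))
print(after)
# ['1', '<L>2<pc>1-1<k1>b<k2>b<e>', '', '<H>{#A#}', '']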
@@ -0,0 +1,74 @@
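Each record below is one line of the report written by write() from summary_after: the first element is the <L> number of an entry, and the remaining elements appear to be the lines found between that entry's <LEND> and the next <L> line, here blank lines and <H> section headings.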
['7764', '<H>{#A#}', '']
['9937', '<H>{#i#}', '']
['10465', '<H>{#I#}', '']
['10575', '<H>{#u#}', '']
['12698', '<H>{#U#}', '']
['12892', '<H>{#f#}', '']
['13188', '<H>{#F, x, X#}', '']
['13192', '<H>{#e#}', '']
['13603', '<H>{#E#}', '']
['13750', '<H>{#o#}', '']
['13854', '<H>{#O#}', '']
['14148', '<H>{#ka#}', '']
['20747', '<H>{#Ka#}', '']
['21317', '<H>{#ga#}', '']
['23877', '<H>{#Ga#}', '']
['24256', '<H>{#Na#}', '']
['24258', '<H>{#ca#}', '']
['26044', '<H>{#Ca#}', '']
['26305', '<H>{#ja#}', '']
['28080', '<H>{#Ja#}', '']
['28164', '<H>{#Ya#}', '']
['28165', '<H>{#wa#}', '']
['28233', '<H>{#Wa#}', '']
['28240', '<H>{#qa#}', '']
['28313', '<H>{#Qa#}', '']
['28327', '<H>{#Ra#}', '']
['28329', '<H>{#ta#}', '']
['31542', '<H>{#Ta#}', '']
['31558', '<H>{#da#}', '']
['35991', '<H>{#Da#}', '']
['37259', '<H>{#na#}', '']
['41153', '<H>{#pa#}', '']
['51386', '<H>{#Pa#}', '']
['51684', '<H>{#ba#}', '']
['53687', '<H>{#Ba#}', '']
['56101', '<H>{#ma#}', '']
['62403', '<H>VERBESSERUNGEN UND NACHTRÄGE ZU THEIL I-V.', '', '<H>{#a#}', '']
['66037', '<H>{#A#}', '']
['67139', '<H>{#i#}', '']
['67368', '<H>{#I#}', '']
['67454', '<H>{#u#}', '']
['68663', '<H>{#U#}', '']
['68750', '<H>{#f#}', '']
['68840', '<H>{#e#}', '']
['69021', '<H>{#E#}', '']
['69108', '<H>{#o#}', '']
['69147', '<H>{#O#}', '']
['69260', '<H>{#ka#}', '']
['72206', '<H>{#Ka#}', '']
['72390', '<H>{#ga#}', '']
['73247', '<H>{#Ga#}', '']
['73344', '<H>{#ca#}', '']
['73978', '<H>{#Ca#}', '']
['74072', '<H>{#ja#}', '']
['74625', '<H>{#Ja#}', '']
['74641', '<H>{#wa Wa qa Qa#}', '']
['74708', '<H>{#ta#}', '']
['75426', '<H>{#da#}', '']
['76427', '<H>{#Da#}', '']
['76722', '<H>{#na#}', '']
['77774', '<H>{#pa#}', '']
['79738', '<H>{#Pa#}', '']
['79777', '<H>{#ba#}', '']
['80043', '<H>{#Ba#}', '']
['80294', '<H>{#ma#}', '']
['80800', '<H>{#ya#}', '']
['82643', '<H>{#ra#}', '']
['85605', '<H>{#la#}', '']
['87082', '<H>{#va#}', '']
['96987', '<H>{#Sa#}', '']
['102408', '<H>{#za#}', '']
['102721', '<H>{#sa#}', '']
['115844', '<H>{#ha#}', '']
['117928', '<H>Verbesserungen und Nachträge zum ganzen Werke.', '']
digentry.py
@@ -0,0 +1,103 @@
#-*- coding:utf-8 -*-
"""digentry.py
Module to read a digitization
and generate a list of Entry objects
Adapted for temp_pwkvn_22.txt
"""
from __future__ import print_function
import sys,re,codecs


class Entry(object):
    Ldict = {}
    def __init__(self,lines,linenum1,linenum2):
        # linenum1,2 are int
        self.metaline = lines[0]
        self.lend = lines[-1]         # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L",L,linenum1)
            exit(1)
        self.Ldict[L] = self
        self.lsarr = []

||
def init(filein): | ||
# slurp lines | ||
with codecs.open(filein,encoding='utf-8',mode='r') as f: | ||
lines = [line.rstrip('\r\n') for line in f] | ||
recs=[] # list of Entry objects | ||
inentry = False | ||
idx1 = None | ||
idx2 = None | ||
for idx,line in enumerate(lines): | ||
if inentry: | ||
if line.startswith('<LEND>'): | ||
idx2 = idx | ||
entrylines = lines[idx1:idx2+1] | ||
linenum1 = idx1 + 1 | ||
linenum2 = idx2 + 1 | ||
entry = Entry(entrylines,linenum1,linenum2) | ||
recs.append(entry) | ||
# prepare for next entry | ||
idx1 = None | ||
idx2 = None | ||
inentry = False | ||
elif line.startswith('<L>'): # error | ||
print('init_entries Error 1. Not expecting <L>') | ||
print("line # ",idx+1) | ||
print(line.encode('utf-8')) | ||
exit(1) | ||
else: | ||
# keep looking for <LEND> | ||
continue | ||
else: | ||
# inentry = False. Looking for '<L>' | ||
if line.startswith('<L>'): | ||
idx1 = idx | ||
inentry = True | ||
elif line.startswith('<LEND>'): # error | ||
print('init_entries Error 2. Not expecting <LEND>') | ||
print("line # ",idx+1) | ||
print(line.encode('utf-8')) | ||
exit(1) | ||
else: | ||
# keep looking for <L> | ||
continue | ||
# when all lines are read, we should have inentry = False | ||
if inentry: | ||
print('digentry.init Error 3. for file',filein) | ||
print('Last entry not closed. Open entry starts at line',idx1+1) | ||
exit(1) | ||
|
||
print(len(lines),"lines read from",filein) | ||
print(len(recs),"entries found") | ||
return recs | ||
|
||
def parseheadline(headline): | ||
""" | ||
function to parse a 'metaline' and return a dictionary. | ||
Example: | ||
headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e> | ||
returns dictionary | ||
{'L': '16850', | ||
'pc': '292-3', | ||
'k1': 'visarga', | ||
'k2': 'visarga', | ||
'h': '1', | ||
'e': ''} | ||
""" | ||
headline = headline.strip() | ||
splits = re.split('[<]([^>]*)[>]([^<]*)',headline) | ||
result = {} | ||
for i in range(len(splits)): | ||
if i % 3 == 1: | ||
result[splits[i]] = splits[i+1] | ||
return result | ||
|
||
if __name__=="__main__": | ||
filein = sys.argv[1] # xxx.txt (path to digitization of xxx) | ||
entries = init(filein) |
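A minimal usage sketch, reusing the example metaline from the parseheadline docstring; the input file name comes from the module docstring and stands in for any digitization file with <L>/<LEND> markers:

metad = parseheadline('<L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>')
print(metad['L'], metad['pc'], metad['k1'])  # 16850 292-3 visarga

entries = init('temp_pwkvn_22.txt')          # placeholder digitization path
e = Entry.Ldict[entries[0].metad['L']]       # entries are also indexed by L
assert e is entries[0]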