Commit 230fe31 (1 parent: 83d115b). Showing 9 changed files with 1,060 additions and 0 deletions.
analyze_between.py
@@ -0,0 +1,178 @@
#-*- coding:utf-8 -*-
""" analyze_between.py
analyze lines between <LEND> and <L>
"""
import sys,re,codecs
## https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters
## Reconfigure stdout so that running under a git bash script does not raise
## UnicodeEncodeError: 'charmap' codec can't encode characters

sys.stdout.reconfigure(encoding='utf-8')

def read_lines(filein):
    with codecs.open(filein,encoding='utf-8',mode='r') as f:
        lines = [x.rstrip('\r\n') for x in f]
    print("%s lines read from %s" % (len(lines),filein))
    return lines


def remove_between(lines):
    # remove lines between <LEND> and <L>
    newlines = []  # returned
    metaline = None
    nchg = 0
    for iline,line in enumerate(lines):
        if line.startswith('<L>'):
            metaline = line
            newlines.append(line)
            continue
        if line.startswith('<LEND>'):
            metaline = None
            newlines.append('<LEND>')
            continue
        if metaline == None:
            # drop this 'between' line
            nchg = nchg + 1
            continue
        # keep line in body.
        newlines.append(line)
    print('adjust: %s dropped between <LEND> and <L>' % nchg)
    return newlines

def insert_between(lines):
    # insert single blank line after <LEND>
    newlines = []  # returned
    metaline = None
    nchg = 0
    for iline,line in enumerate(lines):
        if line.startswith('<L>'):
            metaline = line
            newlines.append(line)
            continue
        if line.startswith('<LEND>'):
            metaline = None
            newlines.append('<LEND>')
            newlines.append('')  # extra blank line
            nchg = nchg + 1
            continue
        if metaline == None:
            # a 'between' line is unexpected here
            print('Unexpected line at line #',iline+1)
            exit(1)
        # keep line in body.
        newlines.append(line)
    print('adjust: %s blank lines inserted between <LEND> and <L>' % nchg)
    return newlines

def analyze_after_lend(iline,metaline,lines,nlines):
    ans = []  # lines after <LEND> and before next <L>
    assert lines[iline] == '<LEND>'
    m = re.search(r'<L>(.*?)<',metaline)
    L = m.group(1)
    ans.append(L)
    if (iline + 1) == nlines:
        nextmeta = None
        ans.append(nextmeta)
        return ans

    b = []
    while True:
        iline = iline + 1
        nextline = lines[iline]
        if nextline.startswith('<L>'):
            ans.append(nextline)  # next meta line
            for c in b:
                ans.append(c)
            return ans
        else:
            b.append(nextline)

def check_afters_page(afters):
    nprob = 0
    for after in afters:
        L = after[0]
        nextmeta = after[1]
        b = after[2:]
        for c in b:
            m = re.search(r'^\[Page([0-9]-[0-9][0-9][0-9][0-9])\]$',c)
            if m == None:
                continue
            pc = m.group(1)
            m1 = re.search(r'<pc>(.*?)<',nextmeta)
            pc1 = m1.group(1)  # page code from the next metaline
            if pc != pc1:
                out = '%s' % c
                print(out)
                nprob = nprob + 1
    print('check_afters_page finds %s problems' % nprob)

def summary_after(afters,fileout):
    n = 0
    for after in afters:
        L = after[0]
        nextmeta = after[1]
        lines = after[2:]
        if lines == ['']:
            n = n + 1
    print('%s entries with single blank line between <LEND> and <L>' % n)
    d = {}
    check_afters_page(afters)

    for after in afters:
        L = after[0]
        nextmeta = after[1]
        b = after[2:]
        n = len(b)
        if n not in d:
            d[n] = 0
        d[n] = d[n] + 1
    print(d)
    # fileout = 'temp.txt'
    outarr = []
    for after in afters:
        L = after[0]
        nextmeta = after[1]
        if nextmeta == None:
            continue  # last entry
        b = after[2:]
        if b != ['']:
            x = [L] + b
            out = '%s' % x
            outarr.append(out)
    write(fileout,outarr)

def analyze_between(lines,fileout):
    # analyze the lines between <LEND> and the next <L>
    newlines = []
    metaline = None
    nchg = 0
    d = {}
    afters = []
    nmeta = 0
    nlines = len(lines)
    for iline,line in enumerate(lines):
        if line.startswith('<L>'):
            metaline = line
            nmeta = nmeta + 1
            continue
        if line.startswith('<LEND>'):
            #metaline = None
            newlines.append('<LEND>')
            after = analyze_after_lend(iline,metaline,lines,nlines)
            afters.append(after)
    print('# of metalines = %s' % nmeta)
    print('# afters = %s' % len(afters))
    summary_after(afters,fileout)

def write(fileout,lines):
    with codecs.open(fileout,"w","utf-8") as f:
        for line in lines:
            f.write(line + '\n')
    print(len(lines),"written to",fileout)


if __name__=="__main__":
    filein = sys.argv[1]   # xxx.txt (path to digitization of xxx)
    fileout = sys.argv[2]  # revised xxx.txt
    lines = read_lines(filein)
    analyze_between(lines,fileout)
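For reference, a minimal sketch of the record that analyze_after_lend builds for one <LEND> block; the toy metalines and body lines below are invented for illustration and are not part of the commit's data:

lines = ['<L>1<pc>1-1<k1>a<k2>a<e>', 'body line', '<LEND>',
         '', '<H>{#A#}', '',
         '<L>2<pc>1-1<k1>b<k2>b<e>', 'body line', '<LEND>']
after = analyze_after_lend(2, lines[0], lines, len(lines))
print(after)
# ['1', '<L>2<pc>1-1<k1>b<k2>b<e>', '', '<H>{#A#}', '']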
@@ -0,0 +1,74 @@
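Each record below is one line of the report written by write() from summary_after: the first element is the <L> number of an entry, and the remaining elements appear to be the lines found between that entry's <LEND> and the next <L> line, here blank lines and <H> section headings.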
['7764', '<H>{#A#}', '']
['9937', '<H>{#i#}', '']
['10465', '<H>{#I#}', '']
['10575', '<H>{#u#}', '']
['12698', '<H>{#U#}', '']
['12892', '<H>{#f#}', '']
['13188', '<H>{#F, x, X#}', '']
['13192', '<H>{#e#}', '']
['13603', '<H>{#E#}', '']
['13750', '<H>{#o#}', '']
['13854', '<H>{#O#}', '']
['14148', '<H>{#ka#}', '']
['20747', '<H>{#Ka#}', '']
['21317', '<H>{#ga#}', '']
['23877', '<H>{#Ga#}', '']
['24256', '<H>{#Na#}', '']
['24258', '<H>{#ca#}', '']
['26044', '<H>{#Ca#}', '']
['26305', '<H>{#ja#}', '']
['28080', '<H>{#Ja#}', '']
['28164', '<H>{#Ya#}', '']
['28165', '<H>{#wa#}', '']
['28233', '<H>{#Wa#}', '']
['28240', '<H>{#qa#}', '']
['28313', '<H>{#Qa#}', '']
['28327', '<H>{#Ra#}', '']
['28329', '<H>{#ta#}', '']
['31542', '<H>{#Ta#}', '']
['31558', '<H>{#da#}', '']
['35991', '<H>{#Da#}', '']
['37259', '<H>{#na#}', '']
['41153', '<H>{#pa#}', '']
['51386', '<H>{#Pa#}', '']
['51684', '<H>{#ba#}', '']
['53687', '<H>{#Ba#}', '']
['56101', '<H>{#ma#}', '']
['62403', '<H>VERBESSERUNGEN UND NACHTRÄGE ZU THEIL I-V.', '', '<H>{#a#}', '']
['66037', '<H>{#A#}', '']
['67139', '<H>{#i#}', '']
['67368', '<H>{#I#}', '']
['67454', '<H>{#u#}', '']
['68663', '<H>{#U#}', '']
['68750', '<H>{#f#}', '']
['68840', '<H>{#e#}', '']
['69021', '<H>{#E#}', '']
['69108', '<H>{#o#}', '']
['69147', '<H>{#O#}', '']
['69260', '<H>{#ka#}', '']
['72206', '<H>{#Ka#}', '']
['72390', '<H>{#ga#}', '']
['73247', '<H>{#Ga#}', '']
['73344', '<H>{#ca#}', '']
['73978', '<H>{#Ca#}', '']
['74072', '<H>{#ja#}', '']
['74625', '<H>{#Ja#}', '']
['74641', '<H>{#wa Wa qa Qa#}', '']
['74708', '<H>{#ta#}', '']
['75426', '<H>{#da#}', '']
['76427', '<H>{#Da#}', '']
['76722', '<H>{#na#}', '']
['77774', '<H>{#pa#}', '']
['79738', '<H>{#Pa#}', '']
['79777', '<H>{#ba#}', '']
['80043', '<H>{#Ba#}', '']
['80294', '<H>{#ma#}', '']
['80800', '<H>{#ya#}', '']
['82643', '<H>{#ra#}', '']
['85605', '<H>{#la#}', '']
['87082', '<H>{#va#}', '']
['96987', '<H>{#Sa#}', '']
['102408', '<H>{#za#}', '']
['102721', '<H>{#sa#}', '']
['115844', '<H>{#ha#}', '']
['117928', '<H>Verbesserungen und Nachträge zum ganzen Werke.', '']
digentry.py
@@ -0,0 +1,103 @@
#-*- coding:utf-8 -*-
"""digentry.py
Module to read a digitization
and generate a list of Entry objects
Adapted for temp_pwkvn_22.txt
"""
from __future__ import print_function
import sys,re,codecs


class Entry(object):
    Ldict = {}
    def __init__(self,lines,linenum1,linenum2):
        # linenum1,2 are int
        self.metaline = lines[0]
        self.lend = lines[-1]         # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L",L,linenum1)
            exit(1)
        self.Ldict[L] = self
        self.lsarr = []

||
def init(filein): | ||
# slurp lines | ||
with codecs.open(filein,encoding='utf-8',mode='r') as f: | ||
lines = [line.rstrip('\r\n') for line in f] | ||
recs=[] # list of Entry objects | ||
inentry = False | ||
idx1 = None | ||
idx2 = None | ||
for idx,line in enumerate(lines): | ||
if inentry: | ||
if line.startswith('<LEND>'): | ||
idx2 = idx | ||
entrylines = lines[idx1:idx2+1] | ||
linenum1 = idx1 + 1 | ||
linenum2 = idx2 + 1 | ||
entry = Entry(entrylines,linenum1,linenum2) | ||
recs.append(entry) | ||
# prepare for next entry | ||
idx1 = None | ||
idx2 = None | ||
inentry = False | ||
elif line.startswith('<L>'): # error | ||
print('init_entries Error 1. Not expecting <L>') | ||
print("line # ",idx+1) | ||
print(line.encode('utf-8')) | ||
exit(1) | ||
else: | ||
# keep looking for <LEND> | ||
continue | ||
else: | ||
# inentry = False. Looking for '<L>' | ||
if line.startswith('<L>'): | ||
idx1 = idx | ||
inentry = True | ||
elif line.startswith('<LEND>'): # error | ||
print('init_entries Error 2. Not expecting <LEND>') | ||
print("line # ",idx+1) | ||
print(line.encode('utf-8')) | ||
exit(1) | ||
else: | ||
# keep looking for <L> | ||
continue | ||
# when all lines are read, we should have inentry = False | ||
if inentry: | ||
print('digentry.init Error 3. for file',filein) | ||
print('Last entry not closed. Open entry starts at line',idx1+1) | ||
exit(1) | ||
|
||
print(len(lines),"lines read from",filein) | ||
print(len(recs),"entries found") | ||
return recs | ||
|
||
def parseheadline(headline): | ||
""" | ||
function to parse a 'metaline' and return a dictionary. | ||
Example: | ||
headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e> | ||
returns dictionary | ||
{'L': '16850', | ||
'pc': '292-3', | ||
'k1': 'visarga', | ||
'k2': 'visarga', | ||
'h': '1', | ||
'e': ''} | ||
""" | ||
headline = headline.strip() | ||
splits = re.split('[<]([^>]*)[>]([^<]*)',headline) | ||
result = {} | ||
for i in range(len(splits)): | ||
if i % 3 == 1: | ||
result[splits[i]] = splits[i+1] | ||
return result | ||
|
||
if __name__=="__main__": | ||
filein = sys.argv[1] # xxx.txt (path to digitization of xxx) | ||
entries = init(filein) |
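A minimal usage sketch, reusing the example metaline from the parseheadline docstring; the input file name comes from the module docstring and stands in for any digitization file with <L>/<LEND> markers:

metad = parseheadline('<L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>')
print(metad['L'], metad['pc'], metad['k1'])  # 16850 292-3 visarga

entries = init('temp_pwkvn_22.txt')          # placeholder digitization path
e = Entry.Ldict[entries[0].metad['L']]       # entries are also indexed by L
assert e is entries[0]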