Skip to content

Commit

Permalink
issue71 work. #71
Browse files Browse the repository at this point in the history
  • Loading branch information
funderburkjim committed Aug 1, 2024
1 parent f9d4266 commit 44aa22f
Show file tree
Hide file tree
Showing 26 changed files with 108,328 additions and 0 deletions.
15,183 changes: 15,183 additions & 0 deletions pwgissues/issue71/change_1.txt

Large diffs are not rendered by default.

1,168 changes: 1,168 additions & 0 deletions pwgissues/issue71/change_2.txt

Large diffs are not rendered by default.

1,150 changes: 1,150 additions & 0 deletions pwgissues/issue71/change_3.txt

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions pwgissues/issue71/diff_to_changes_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# coding=utf-8
""" diff_to_changes_dict.py
Generate change transactions from an 'old' and 'new' file
The two files should have same number of lines
ASSUME input file is a dictionary as in csl-orig/v02, e.g. mw.txt.
This structure identifies the metaline for each change;
and this is the only difference from diff_to_changes.py,
which ignores this structure, and is thus available for
generating changes for any two text files with same number of lines.
python diff_to_changes_dict.py old.txt new.txt changes.txt
Now:
python updateByLine.py old.txt changes.txt new1.txt
then new1.txt is same as new.txt.
"""
from __future__ import print_function
import sys, re,codecs

def read_lines(filein):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Trailing carriage-return / line-feed characters are removed from each
    line; all other whitespace is left untouched.

    filein: path of the file to read.
    Returns: list of lines, in file order.
    """
    result = []
    with codecs.open(filein,encoding='utf-8',mode='r') as f:
        for raw in f:
            # strip only the line terminator, never significant blanks
            result.append(raw.rstrip('\r\n'))
    return result

class Change(object):
    """One changed line, together with its rendered change transaction.

    Attributes set by the constructor:
      iline     - 0-based index of the changed line
      lnum      - 1-based line number (iline + 1)
      line1     - text of the line in the 'old' file
      line2     - text of the line in the 'new' file
      metaline1 - the metaline passed in by the caller (may be None)
      changeout - list of output lines making up the transaction
    """
    def __init__(self,iline,line1,line2,metaline1):
        self.iline = iline
        self.lnum = iline+1
        self.line1 = line1
        self.line2 = line2
        self.metaline1 = metaline1
        # The transaction: a comment with the metaline, the old line,
        # a separator comment, the new line, and a closing rule.
        self.changeout = [
            '; %s' %metaline1,
            '%s old %s' %(self.lnum,self.line1),
            ';',
            '%s new %s' %(self.lnum,self.line2),
            ';---------------------------------------------------',
        ]

def write_changes(fileout,changes):
    """Write the transactions of all changes to a UTF-8 file.

    fileout: path of the output file (created or overwritten).
    changes: iterable of objects having a 'changeout' list of strings.

    Each string is written on its own line.  A summary with the number
    of changes is printed to standard output.
    """
    # Flatten first, so the file is only opened once every row is known.
    rows = [row for chg in changes for row in chg.changeout]
    with codecs.open(fileout,"w","utf-8") as f:
        for row in rows:
            f.write(row+'\n')
    print(len(changes),"changes written to",fileout)

if __name__=="__main__":
    # Command-line driver:
    #   python diff_to_changes_dict.py old.txt new.txt changes.txt
    if len(sys.argv) != 4:
        print('Usage: python diff_to_changes_dict.py old.txt new.txt changes.txt')
        sys.exit(1)
    filein1 = sys.argv[1] # old.txt
    filein2 = sys.argv[2] # new.txt
    fileout = sys.argv[3] # changes.txt
    lines1 = read_lines(filein1)
    lines2 = read_lines(filein2)
    # Changes are line-for-line, so the files must be the same length.
    if len(lines1) != len(lines2):
        print('ERROR: files have different number of lines')
        # sys.exit, not the site-provided exit(), which is intended for
        # interactive sessions and is not always available.
        sys.exit(1)
    changes = []
    # Most recent '<L>' metaline seen in the old file; None before the first.
    metaline1 = None
    for iline,(line1,line2) in enumerate(zip(lines1,lines2)):
        if line1.startswith('<L>'):
            metaline1 = line1
        if line1 != line2:
            changes.append(Change(iline,line1,line2,metaline1))
    write_changes(fileout,changes)

103 changes: 103 additions & 0 deletions pwgissues/issue71/digentry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#-*- coding:utf-8 -*-
"""digentry.py
Module to read a digitization
and generate a list of Entry objects
Adapted for temp_pwkvn_22.txt
"""
from __future__ import print_function
import sys,re,codecs

class Entry(object):
    """One dictionary entry: a metaline, its data lines and the <LEND> line.

    Ldict is a class-level registry, shared by all instances, mapping the
    'L' value of each entry's metaline to the Entry object.  Constructing
    a second Entry with an already-registered L prints an error and
    terminates the program with exit status 1.

    Instance attributes:
      metaline  - first line of the entry
      lend      - last line of the entry (the <LEND> line)
      datalines - the lines between metaline and lend
      metad     - dictionary parsed from the metaline by parseheadline
      linenum1  - int line number passed by the caller for the metaline
      linenum2  - int line number passed by the caller for the <LEND> line
      lsarr     - initialized to an empty list; not filled in this module
    """
    Ldict = {}

    def __init__(self,lines,linenum1,linenum2):
        # linenum1,2 are int
        self.metaline = lines[0]
        self.lend = lines[-1]  # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L",L,linenum1)
            # sys.exit, not the site-provided exit(), which is intended
            # for interactive sessions and is not always available.
            sys.exit(1)
        # register in the shared class-level dictionary
        self.Ldict[L] = self
        self.lsarr = []

def init(filein):
    """Read a digitization file and return its entries as Entry objects.

    An entry starts at a line beginning with '<L>' and ends at the next
    line beginning with '<LEND>'.  Lines outside of entries are ignored.

    filein: path of the UTF-8 encoded digitization file.
    Returns: list of Entry objects in file order.

    The program is terminated with exit status 1 (SystemExit) when
      1. a '<L>' line occurs inside an open entry,
      2. a '<LEND>' line occurs outside of an entry, or
      3. the file ends while an entry is still open.
    Two summary lines (line count, entry count) are printed on success.
    """
    # slurp lines
    with codecs.open(filein,encoding='utf-8',mode='r') as f:
        lines = [line.rstrip('\r\n') for line in f]
    recs = []  # list of Entry objects
    inentry = False
    idx1 = None  # index of the '<L>' line of the currently open entry
    for idx,line in enumerate(lines):
        if inentry:
            if line.startswith('<LEND>'):
                # entry complete: lines idx1..idx inclusive
                entry = Entry(lines[idx1:idx+1],idx1 + 1,idx + 1)
                recs.append(entry)
                # prepare for next entry
                idx1 = None
                inentry = False
            elif line.startswith('<L>'):  # error
                print('init_entries Error 1. Not expecting <L>')
                print("line # ",idx+1)
                print(line.encode('utf-8'))
                # sys.exit, not the site-provided exit(), which is
                # intended for interactive use and may be unavailable.
                sys.exit(1)
            # any other line: keep looking for <LEND>
        elif line.startswith('<L>'):
            idx1 = idx
            inentry = True
        elif line.startswith('<LEND>'):  # error
            print('init_entries Error 2. Not expecting <LEND>')
            print("line # ",idx+1)
            print(line.encode('utf-8'))
            sys.exit(1)
        # any other line: keep looking for <L>
    # when all lines are read, we should have inentry = False
    if inentry:
        print('digentry.init Error 3. for file',filein)
        print('Last entry not closed. Open entry starts at line',idx1+1)
        sys.exit(1)

    print(len(lines),"lines read from",filein)
    print(len(recs),"entries found")
    return recs

def parseheadline(headline):
    """
    function to parse a 'metaline' and return a dictionary.
    Each '<key>value' pair of the line becomes one dictionary item;
    the value runs up to the next '<' (or the end of the line) and
    may be empty.  Text before the first '<' is ignored.  When a key
    occurs more than once, the last value wins.
    Example:
    headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>
    returns dictionary
    {'L': '16850',
     'pc': '292-3',
     'k1': 'visarga',
     'k2': 'visarga',
     'h': '1',
     'e': ''}
    """
    # findall yields the (key, value) pairs directly, which avoids the
    # index arithmetic needed to pick them out of re.split output.
    pairs = re.findall(r'<([^>]*)>([^<]*)',headline.strip())
    return dict(pairs)

if __name__ == "__main__":
    # Script use: python digentry.py xxx.txt
    # Parses the digitization and prints the line / entry counts.
    filein = sys.argv[1]  # xxx.txt (path to digitization of xxx)
    entries = init(filein)
Loading

0 comments on commit 44aa22f

Please sign in to comment.