-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f9d4266
commit 44aa22f
Showing
26 changed files
with
108,328 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# coding=utf-8 | ||
""" diff_to_changes_dict.py | ||
Generate change transactions from an 'old' and 'new' file | ||
The two files should have the same number of lines | ||
ASSUME input file is a dictionary as in csl-orig/v02, e.g. mw.txt. | ||
This structure identifies the metaline for each change; | ||
and this is the only difference from diff_to_changes.py, | ||
which ignores this structure, and is thus available for | ||
generating changes for any two text files with the same number of lines. | ||
python diff_to_changes_dict.py old.txt new.txt changes.txt | ||
Now: | ||
python updateByLine.py old.txt changes.txt new1.txt | ||
then new1.txt is the same as new.txt. | ||
""" | ||
from __future__ import print_function | ||
import sys, re,codecs | ||
|
||
def read_lines(filein):
    """Return the lines of the UTF-8 text file *filein* as a list.

    Trailing carriage-return / line-feed characters are removed from
    every line; all other whitespace is kept.
    """
    stripped = []
    with codecs.open(filein, mode='r', encoding='utf-8') as handle:
        for raw in handle:
            stripped.append(raw.rstrip('\r\n'))
    return stripped
|
||
class Change(object):
    """A single changed line, plus its rendering as a change transaction.

    iline      0-based index of the line in the input files
    lnum       1-based line number (iline + 1)
    line1      text of the line in the 'old' file
    line2      text of the line in the 'new' file
    metaline1  the metaline passed in by the caller
    changeout  list of output lines describing the change
    """

    def __init__(self, iline, line1, line2, metaline1):
        lnum = iline + 1
        self.iline = iline
        self.lnum = lnum
        self.line1 = line1
        self.line2 = line2
        self.metaline1 = metaline1
        # Lines starting with ';' are the comment/separator lines of the
        # transaction; the 'old' and 'new' lines carry the line number.
        self.changeout = [
            '; %s' % metaline1,
            '%s old %s' % (lnum, line1),
            ';',
            '%s new %s' % (lnum, line2),
            ';---------------------------------------------------',
        ]
|
||
def write_changes(fileout, changes):
    """Write the change transactions in *changes* to *fileout* (UTF-8).

    Every element of *changes* must have a ``changeout`` list of strings;
    those strings are written in order, one per line.  A summary count is
    printed when the file has been written.
    """
    # Flatten first, so nothing is opened if an element lacks 'changeout'.
    rows = [text for change in changes for text in change.changeout]
    with codecs.open(fileout, "w", "utf-8") as f:
        for row in rows:
            f.write(row + '\n')
    print(len(changes), "changes written to", fileout)
|
||
if __name__ == "__main__":
    # Usage: python diff_to_changes_dict.py old.txt new.txt changes.txt
    filein1 = sys.argv[1]  # old.txt
    filein2 = sys.argv[2]  # new.txt
    fileout = sys.argv[3]  # changes.txt
    lines1 = read_lines(filein1)
    lines2 = read_lines(filein2)
    # Lines are compared position by position, so the counts must agree.
    if len(lines1) != len(lines2):
        print('ERROR: files have different number of lines')
        # sys.exit rather than the site-provided exit(), which is meant
        # for interactive use and is absent under 'python -S'.
        sys.exit(1)
    changes = []
    # Most recent '<L>' line seen in the old file; None until the first one.
    metaline1 = None
    for iline, (line1, line2) in enumerate(zip(lines1, lines2)):
        if line1.startswith('<L>'):
            metaline1 = line1
        if line1 != line2:
            changes.append(Change(iline, line1, line2, metaline1))
    write_changes(fileout, changes)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
#-*- coding:utf-8 -*- | ||
"""digentry.py | ||
Module to read a digitization | ||
and generate a list of Entry objects | ||
Adapted for temp_pwkvn_22.txt | ||
""" | ||
from __future__ import print_function | ||
import sys,re,codecs | ||
|
||
class Entry(object):
    """One entry of a digitization: metaline, data lines and <LEND> line.

    Attributes:
      metaline   first line of the entry
      lend       last line of the entry (the <LEND> line)
      datalines  the lines between metaline and lend
      metad      dictionary returned by parseheadline(metaline)
      linenum1   line number (int) passed in for the first line
      linenum2   line number (int) passed in for the last line
      lsarr      list, created empty here

    Ldict is a class attribute: one registry shared by every Entry,
    mapping each metaline 'L' value to its Entry.  Because it is shared,
    building entries with the same L twice in one process is reported as
    a duplicate.
    """
    Ldict = {}

    def __init__(self, lines, linenum1, linenum2):
        # linenum1,2 are int
        self.metaline = lines[0]
        self.lend = lines[-1]  # the <LEND> line
        self.datalines = lines[1:-1]  # the non-meta lines
        # parse the meta line into a dictionary
        self.metad = parseheadline(self.metaline)
        self.linenum1 = linenum1
        self.linenum2 = linenum2
        # Raises KeyError when the metaline has no <L> field.
        L = self.metad['L']
        if L in self.Ldict:
            print("Entry init error: duplicate L", L, linenum1)
            # sys.exit rather than the site-provided exit(), which is
            # meant for interactive use and is absent under 'python -S'.
            sys.exit(1)
        self.Ldict[L] = self
        self.lsarr = []
|
||
def init(filein):
    """Read the digitization *filein* and return its list of Entry objects.

    An entry starts at a line beginning with '<L>' and ends at the next
    line beginning with '<LEND>'.  Lines outside of entries are ignored.
    Line numbers handed to Entry are 1-based.

    The program exits with status 1 (SystemExit) when
      * a '<L>' line appears inside an open entry,
      * a '<LEND>' line appears outside of an entry, or
      * the file ends while an entry is still open.
    """
    # slurp lines
    with codecs.open(filein, encoding='utf-8', mode='r') as f:
        lines = [line.rstrip('\r\n') for line in f]
    recs = []  # list of Entry objects
    inentry = False
    idx1 = None  # index of the '<L>' line of the entry being read
    for idx, line in enumerate(lines):
        if inentry:
            if line.startswith('<LEND>'):
                # entry covers lines[idx1] .. lines[idx], inclusive
                recs.append(Entry(lines[idx1:idx + 1], idx1 + 1, idx + 1))
                # prepare for next entry
                idx1 = None
                inentry = False
            elif line.startswith('<L>'):  # error
                print('init_entries Error 1. Not expecting <L>')
                print("line # ", idx + 1)
                # NOTE: under Python 3 this prints the bytes repr (b'...')
                print(line.encode('utf-8'))
                sys.exit(1)
            # any other line: keep looking for <LEND>
        elif line.startswith('<L>'):
            idx1 = idx
            inentry = True
        elif line.startswith('<LEND>'):  # error
            print('init_entries Error 2. Not expecting <LEND>')
            print("line # ", idx + 1)
            print(line.encode('utf-8'))
            sys.exit(1)
        # any other line: keep looking for <L>
    # when all lines are read, we should have inentry = False
    if inentry:
        print('digentry.init Error 3. for file', filein)
        print('Last entry not closed. Open entry starts at line', idx1 + 1)
        sys.exit(1)

    print(len(lines), "lines read from", filein)
    print(len(recs), "entries found")
    return recs
|
||
def parseheadline(headline):
    """Parse a 'metaline' and return a dictionary of its fields.

    Each '<key>value' pair of the line becomes one item; a value runs up
    to the next '<' and may be empty.  Text before the first '<' is
    ignored.  When a key occurs more than once, the last value is kept.

    Example:
     headline = <L>16850<pc>292-3<k1>visarga<k2>visarga<h>1<e>
    returns dictionary
     {'L': '16850',
      'pc': '292-3',
      'k1': 'visarga',
      'k2': 'visarga',
      'h': '1',
      'e': ''}
    """
    headline = headline.strip()
    # findall yields one (key, value) tuple per '<key>value' match.
    return dict(re.findall(r'[<]([^>]*)[>]([^<]*)', headline))
|
||
if __name__ == "__main__":
    # First command-line argument: xxx.txt (path to digitization of xxx)
    filein = sys.argv[1]
    entries = init(filein)
Oops, something went wrong.