-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdiff.lua
274 lines (242 loc) · 10.2 KB
/
diff.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
-----------------------------------------------------------------------------
-- Provides functions for diffing text.
--
-- (c) 2007, 2008 Yuri Takhteyev (yuri@freewisdom.org)
-- (c) 2007 Hisham Muhammad
--
-- License: MIT/X, see http://sputnik.freewisdom.org/en/License
-----------------------------------------------------------------------------
module(..., package.seeall)
SKIP_SEPARATOR = true -- a constant
IN = "in"; OUT = "out"; SAME = "same" -- token statuses
-----------------------------------------------------------------------------
-- Split a string into tokens. (Adapted from Gavin Kistner's split on
-- http://lua-users.org/wiki/SplitJoin.
--
-- @param text A string to be split.
-- @param separator [optional] the separator pattern (defaults to any
-- white space - %s+).
-- @param skip_separator [optional] don't include the sepator in the results.
-- @return A list of tokens.
-----------------------------------------------------------------------------
function split(text, separator, skip_separator)
separator = separator or "%s+"
local parts = {}
local start = 1
local split_start, split_end = text:find(separator, start)
while split_start do
table.insert(parts, text:sub(start, split_start-1))
if not skip_separator then
table.insert(parts, text:sub(split_start, split_end))
end
start = split_end + 1
split_start, split_end = text:find(separator, start)
end
if text:sub(start)~="" then
table.insert(parts, text:sub(start) )
end
return parts
end
-----------------------------------------------------------------------------
-- Derives the longest common subsequence of two strings. This is a faster
-- implementation than one provided by stdlib. Submitted by Hisham Muhammad.
-- The algorithm was taken from:
-- http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_subsequence
--
-- @param t1 the first string.
-- @param t2 the second string.
-- @return the least common subsequence as a matrix.
-----------------------------------------------------------------------------
function quick_LCS(t1, t2)
local m = #t1
local n = #t2
-- Build matrix on demand
local C = {}
local setmetatable = setmetatable
local mt_tbl = {
__index = function(t, k)
t[k] = 0
return 0
end
}
local mt_C = {
__index = function(t, k)
local tbl = {}
setmetatable(tbl, mt_tbl)
t[k] = tbl
return tbl
end
}
setmetatable(C, mt_C)
local max = math.max
for i = 1, m+1 do
local ci1 = C[i+1]
local ci = C[i]
for j = 1, n+1 do
if t1[i-1] == t2[j-1] then
ci1[j+1] = ci[j] + 1
else
ci1[j+1] = max(ci1[j], ci[j+1])
end
end
end
return C
end
-----------------------------------------------------------------------------
-- Escapes an HTML string.
--
-- @param text The string to be escaped.
-- @return Escaped string.
-----------------------------------------------------------------------------
function escape_html(text)
text = text:gsub("&", "&"):gsub(">",">"):gsub("<","<")
text = text:gsub("\"", """)
return text
end
-----------------------------------------------------------------------------
-- Formats an a diff into a table of HTML tags for subsequent rendering.
--
-- @param tokens a table of {token, status} pairs.
-- @return a Lua table of HTML strings for rendering via table.concat()
-----------------------------------------------------------------------------
function format_as_html(tokens)
local diff_buffer = ""
local token, status
diff_buffer = {}
table.insert(diff_buffer, '<table width="100%" cellspacing="0" cellpadding="0">')
local prev_field = ''
local prev_status = ''
for i, token_record in ipairs(tokens) do
token = escape_html(token_record[1])
status = token_record[2]
if token ~= '' and token ~= nil and token ~= "\n" and token ~= "\r" then
if status == "in" then
--if the previous status was also "in", we just need to append the item and not close the cell
if prev_status == "in" then
table.insert(diff_buffer, "<ins>"..token.."</ins>")
--if the previous status was "out", we need to close the old cell, create a new cell and add the content
elseif prev_status == "out" then
table.insert(diff_buffer, "</td><td class='in' valign='top' width='50%'><ins>"..token.."</ins>")
--if the previous status was "same", we need to add a new row with a blank first cell and start the second cell
elseif prev_status == "same" then
table.insert(diff_buffer, "<tr><td class='empty'> </td><td class='in' valign='top' width='50%'><ins>"..token.."</ins>")
end
elseif status == "out" then
--if the previous status was also "out", we just need to append the item and not close the cell
if prev_status == "out" then
table.insert(diff_buffer, "<del>"..token.."</del>")
--if the previous cell was "in", then we need to close the old cell/row and create a new row and cell
elseif prev_status == "in" then
table.insert(diff_buffer, "</td></tr><tr><td class='out' valign='top' width='50%'><del>"..token.."</del>")
--if the previous status was "same", we need to add a new row and populate its first cell
elseif prev_status == "same" then
table.insert(diff_buffer, "<tr><td class='out' valign='top' width='50%'><del>"..token.."</del>")
end
elseif status == "same" then
--if the previous cell was "out", we need to close the first cell, add a second blank cell and then close the previous row
if prev_status == "out" then
table.insert(diff_buffer, "</td><td class='empty'> </td></tr><tr><td class='same' width='50%'>"..token.."</td><td class='same' width='50%'>"..token.."</td></tr>")
--if the previous cell was "in", then we need to close its cell/row and add our new row
elseif prev_status == "in" then
table.insert(diff_buffer, "</td></tr><tr><td class='same' width='50%'>"..token.."</td><td class='same' width='50%'>"..token.."</td></tr>")
--if the previous status was "same", just add another row
elseif prev_status == "same" then
table.insert(diff_buffer, "<tr><td class='same' width='50%'>"..token.."</td><td class='same' width='50%'>"..token.."</td></tr>")
end
end
--diff_buffer = diff_buffer .. "</tr>"
prev_status = status
end
end
table.insert(diff_buffer, "</table>")
return diff_buffer
end
function format_as_html_original(tokens)
local diff_buffer = ""
local token, status
for i, token_record in ipairs(tokens) do
token = escape_html(token_record[1])
status = token_record[2]
if status == "in" then
diff_buffer = diff_buffer.."<ins>"..token.."</ins>"
elseif status == "out" then
diff_buffer = diff_buffer.."<del>"..token.."</del>"
else
diff_buffer = diff_buffer..token
end
end
return diff_buffer
end
-----------------------------------------------------------------------------
-- Returns a diff of two strings as a list of pairs, where the first value
-- represents a token and the second the token's status ("same", "in", "out").
--
-- @param old The "old" text string
-- @param new The "new" text string
-- @param separator [optional] the separator pattern (defaults ot any
-- white space).
-- @return A list of annotated tokens.
-----------------------------------------------------------------------------
function diff(old, new, separator)
assert(old); assert(new)
new = split(new, separator); old = split(old, separator)
-- First, compare the beginnings and ends of strings to remove the common
-- prefix and suffix. Chances are, there is only a small number of tokens
-- in the middle that differ, in which case we can save ourselves a lot
-- in terms of LCS computation.
local prefix = "" -- common text in the beginning
local suffix = "" -- common text in the end
while old[1] and old[1] == new[1] do
local token = table.remove(old, 1)
table.remove(new, 1)
prefix = prefix..token
end
while old[#old] and old[#old] == new[#new] do
local token = table.remove(old)
table.remove(new)
suffix = token..suffix
end
-- Setup a table that will store the diff (an upvalue for get_diff). We'll
-- store it in the reverse order to allow for tail calls. We'll also keep
-- in this table functions to handle different events.
local rev_diff = {
put = function(self, token, type) table.insert(self, {token,type}) end,
ins = function(self, token) self:put(token, IN) end,
del = function(self, token) self:put(token, OUT) end,
same = function(self, token) if token then self:put(token, SAME) end end,
}
-- Put the suffix as the first token (we are storing the diff in the
-- reverse order)
rev_diff:same(suffix)
-- Define a function that will scan the LCS matrix backwards and build the
-- diff output recursively.
local function get_diff(C, old, new, i, j)
local old_i = old[i]
local new_j = new[j]
if i >= 1 and j >= 1 and old_i == new_j then
rev_diff:same(old_i)
return get_diff(C, old, new, i-1, j-1)
else
local Cij1 = C[i][j-1]
local Ci1j = C[i-1][j]
if j >= 1 and (i == 0 or Cij1 >= Ci1j) then
rev_diff:ins(new_j)
return get_diff(C, old, new, i, j-1)
elseif i >= 1 and (j == 0 or Cij1 < Ci1j) then
rev_diff:del(old_i)
return get_diff(C, old, new, i-1, j)
end
end
end
-- Then call it.
get_diff(quick_LCS(old, new), old, new, #old + 1, #new + 1)
-- Put the prefix in at the end
rev_diff:same(prefix)
-- Reverse the diff.
local diff = {}
for i = #rev_diff, 1, -1 do
table.insert(diff, rev_diff[i])
end
diff.to_html = format_as_html
return diff
end