Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dramatically optimize algorithm in the common case by excluding match… #89

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions Dwifft/Dwifft.swift
Original file line number Diff line number Diff line change
Expand Up @@ -75,20 +75,47 @@ public enum SectionedDiffStep<Section, Value>: CustomDebugStringConvertible {
}
}

// Need to be able to count a sequence without materializing as an array in order to keep matchingEndsInfo below as fase as possible
private extension Sequence {
func count() -> Int {
var i = 0
for _ in self {
i += 1
}
return i
}
}

/// Namespace for the `diff` and `apply` functions.
public enum Dwifft {

internal static func matchingEndsInfo<Value: Equatable>(_ lhs: [Value], _ rhs: [Value]) -> (Int, ArraySlice<Value>, ArraySlice<Value>) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though it's an internal function, can you please add a docstring to this method to help with future debugging etc?

let minTotalCount = min(lhs.count, rhs.count)
let matchingHeadCount = zip(lhs, rhs).lazy.prefix() { $0.0 == $0.1 }.count()
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two lines feel a bit too clever for their own good - it's hard for me to understand what they're doing at a quick glance. I think using a couple of plain old for loops would be preferable here.

let matchingTailCount = matchingHeadCount == minTotalCount
? 0 // if the matching head consumed all of either of the arrays, there's no tail
: zip(lhs.reversed(), rhs.reversed()).prefix(minTotalCount - matchingHeadCount).lazy.prefix() { $0.0 == $0.1 }.count()

let matchingEndsCount = matchingHeadCount + matchingTailCount
let lhsMiddle = matchingEndsCount < lhs.count ? lhs[matchingHeadCount..<lhs.count - matchingTailCount] : []
let rhsMiddle = matchingEndsCount < rhs.count ? rhs[matchingHeadCount..<rhs.count - matchingTailCount] : []

return (matchingHeadCount, lhsMiddle, rhsMiddle)
}

/// Returns the sequence of `DiffStep`s required to transform one array into another.
///
/// - Parameters:
/// - lhs: an array
/// - rhs: another, uh, array
/// - Returns: the series of transformations that, when applied to `lhs`, will yield `rhs`.
public static func diff<Value: Equatable>(_ lhs: [Value], _ rhs: [Value]) -> [DiffStep<Value>] {
let (matchingHeadCount, lhs, rhs) = matchingEndsInfo(lhs, rhs)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpicking, but can you not shadow the lhs and rhs variable names here? Made it harder for me to understand how this compiled at first glance...


if lhs.isEmpty {
return rhs.enumerated().map(DiffStep.insert)
return rhs.enumerated().map { DiffStep.insert($0 + matchingHeadCount, $1) }
} else if rhs.isEmpty {
return lhs.enumerated().map(DiffStep.delete).reversed()
return lhs.enumerated().map { DiffStep.delete($0 + matchingHeadCount, $1) }.reversed()
}

let table = MemoizedSequenceComparison.buildTable(lhs, rhs)
Expand Down Expand Up @@ -241,8 +268,8 @@ public enum Dwifft {

private static func diffInternal<Value: Equatable>(
_ table: [[Int]],
_ x: [Value],
_ y: [Value],
_ x: ArraySlice<Value>,
_ y: ArraySlice<Value>,
_ i: Int,
_ j: Int,
_ currentResults: ([DiffStep<Value>], [DiffStep<Value>])
Expand All @@ -252,18 +279,19 @@ public enum Dwifft {
}
else {
return .call {
let prefixCount = x.startIndex
var nextResults = currentResults
if i == 0 {
nextResults.0 = [DiffStep.insert(j-1, y[j-1])] + nextResults.0
nextResults.0 = [DiffStep.insert(j-1+prefixCount, y[j-1+prefixCount])] + nextResults.0
return diffInternal(table, x, y, i, j-1, nextResults)
} else if j == 0 {
nextResults.1 = nextResults.1 + [DiffStep.delete(i-1, x[i-1])]
nextResults.1 = nextResults.1 + [DiffStep.delete(i-1+prefixCount, x[i-1+prefixCount])]
return diffInternal(table, x, y, i - 1, j, nextResults)
} else if table[i][j] == table[i][j-1] {
nextResults.0 = [DiffStep.insert(j-1, y[j-1])] + nextResults.0
nextResults.0 = [DiffStep.insert(j-1+prefixCount, y[j-1+prefixCount])] + nextResults.0
return diffInternal(table, x, y, i, j-1, nextResults)
} else if table[i][j] == table[i-1][j] {
nextResults.1 = nextResults.1 + [DiffStep.delete(i-1, x[i-1])]
nextResults.1 = nextResults.1 + [DiffStep.delete(i-1+prefixCount, x[i-1+prefixCount])]
return diffInternal(table, x, y, i - 1, j, nextResults)
} else {
return diffInternal(table, x, y, i-1, j-1, nextResults)
Expand All @@ -279,7 +307,7 @@ fileprivate enum Result<T>{
}

fileprivate struct MemoizedSequenceComparison<T: Equatable> {
static func buildTable(_ x: [T], _ y: [T]) -> [[Int]] {
static func buildTable(_ x: ArraySlice<T>, _ y: ArraySlice<T>) -> [[Int]] {
let n = x.count, m = y.count
var table = Array(repeating: Array(repeating: 0, count: m + 1), count: n + 1)
// using unsafe pointers lets us avoid swift array bounds-checking, which results in a considerable speed boost.
Expand Down
48 changes: 48 additions & 0 deletions DwifftTests/DwifftTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,44 @@ struct SectionedValuesWrapper: Arbitrary {
}
}


// Generator that makes arrays from 0-20 elements long where each element has the value 0...9
let smallishArrayGen = Gen<Int>.fromElements(in: 0...20).flatMap(Gen<Int>.fromElements(in: 0...9).proliferate(withSize:))

struct smallishArrayWrapper: Arbitrary {
let getArray: [Int]

static var arbitrary: Gen<smallishArrayWrapper> {
return smallishArrayGen.map {smallishArrayWrapper(getArray:$0)}
}

public static func shrink(_ x: smallishArrayWrapper) -> [smallishArrayWrapper] {
return Array<Int>.shrink(x.getArray).map {smallishArrayWrapper(getArray:$0)}
}
}

class DwifftSwiftCheckTests: XCTestCase {

func testMatchingEndsInfo() {
property("Confirming matching ends info is self consistent", arguments: CheckerArguments(maxAllowableSuccessfulTests: 5000)) <- forAll { (lhs : smallishArrayWrapper, rhs : smallishArrayWrapper) in
let lhs = lhs.getArray, rhs = rhs.getArray

let (matchingHeadCount, lhsInner, rhsInner) = Dwifft.matchingEndsInfo(lhs, rhs)

// Generate slices for the head and tail based on the results from matchingEndsInfo
let matchingHead = matchingHeadCount > 0 ? lhs[0..<matchingHeadCount] : ArraySlice<Int>()
let matchingTailCount = lhs.count - matchingHeadCount + lhsInner.count
let matchingTail = matchingTailCount > 0 ? lhs[(matchingHeadCount + lhsInner.count)...] : ArraySlice<Int>()

// Now reconstruct the input arrays using the matching head, innards, and tail
let reconstructedLhs = Array(matchingHead + lhsInner + matchingTail)
let reconstructedRhs = Array(matchingHead + rhsInner + matchingTail)

return (reconstructedLhs == lhs) <?> "Left identity"
^&&^
(reconstructedRhs == rhs) <?> "Right identity"
}
}
func testDiff() {
property("Diffing two arrays, then applying the diff to the first, yields the second") <- forAll { (a1 : ArrayOf<Int>, a2 : ArrayOf<Int>) in
let diff = Dwifft.diff(a1.getArray, a2.getArray)
Expand Down Expand Up @@ -115,6 +151,7 @@ class DwifftTests: XCTestCase {
TestCase("1234", "1224533324", "+2@2+4@3+5@4+3@6+3@7+2@8"),
TestCase("thisisatest", "testing123testing", "-a@6-s@5-i@2-h@1+e@1+t@3+n@5+g@6+1@7+2@8+3@9+i@14+n@15+g@16"),
TestCase("HUMAN", "CHIMPANZEE", "-U@1+C@0+I@2+P@4+Z@7+E@8+E@9"),
TestCase("1211", "11", "-1@2-2@1"), // Needed to verify matchingEndsInfo bug where tail match size was overstated
]

for test in tests {
Expand All @@ -126,6 +163,17 @@ class DwifftTests: XCTestCase {

}

func testMatchingEndsInfoBenchmark() {
let lhs = Gen<Int>.fromElements(in: 0...9).proliferate(withSize:1000).generate
let rhs = lhs + [666] + lhs

measure {
for _ in 0...10000 {
let _ = Dwifft.matchingEndsInfo(lhs, rhs)
}
}
}

func testDiffBenchmark() {
let a: [Int] = (0...1000).map({ _ in Int(arc4random_uniform(100)) }).filter({ _ in arc4random_uniform(2) == 0})
let b: [Int] = (0...1000).map({ _ in Int(arc4random_uniform(100)) }).filter({ _ in arc4random_uniform(2) == 0})
Expand Down