diff --git a/deltas/__init__.py b/deltas/__init__.py
index 89fda6f..b8b3e90 100644
--- a/deltas/__init__.py
+++ b/deltas/__init__.py
@@ -3,7 +3,7 @@
 from .algorithms.diff_engine import DiffEngine
 from .algorithms import segment_matcher, SegmentMatcher
 from .algorithms import sequence_matcher, SequenceMatcher
-from .tokenizers import Tokenizer, RegexTokenizer, text_split, wikitext_split
+from .tokenizers import Token, Tokenizer, RegexTokenizer, text_split, wikitext_split
 from .segmenters import Segmenter, Segment, MatchableSegment
 
 __version__ = "0.3.3"
diff --git a/deltas/algorithms/diff_engine.py b/deltas/algorithms/diff_engine.py
index 0b4fd6a..dc3a528 100644
--- a/deltas/algorithms/diff_engine.py
+++ b/deltas/algorithms/diff_engine.py
@@ -1,6 +1,5 @@
 import yamlconf
-"""
-"""
+
 
 class DiffEngine:
     """
@@ -13,13 +12,12 @@ class Processor:
         of a single text.
         """
 
-        def process(text):
+        def process(text, token_class=None):
             raise NotImplementedError()
 
-
-    def processor():
+    def processor(self):
         """
-        Configures and returns a new :class:`deltas.algorithms.Engine.Processor`
+        Configures and returns a new :class:`~deltas.DiffEngine.Processor`
         """
         raise NotImplementedError()
 
diff --git a/deltas/algorithms/segment_matcher.py b/deltas/algorithms/segment_matcher.py
index d2a3063..eee6f3e 100644
--- a/deltas/algorithms/segment_matcher.py
+++ b/deltas/algorithms/segment_matcher.py
@@ -27,6 +27,7 @@
 SEGMENTER = ParagraphsSentencesAndWhitespace()
 TOKENIZER = text_split
 
+
 def diff(a, b, segmenter=None):
     """
     Performs a diff comparison between two sequences of tokens (`a` and `b`)
@@ -68,6 +69,7 @@
 
     return diff_segments(a_segments, b_segments)
 
+
 def diff_segments(a_segments, b_segments):
     """
     Performs a diff comparison between two pre-clustered
@@ -96,7 +98,6 @@
                                      b_segment_tokens).expand())
 
 
-
 def process(texts, *args, **kwargs):
     """
     Processes a single sequence of texts with a
@@ -116,6 +117,7 @@
     for text in texts:
         yield processor.process(text)
 
+
 class SegmentMatcher(DiffEngine):
     """
     Constructs a segment matcher diff engine that preserves segmentation state
@@ -130,13 +132,16 @@
         >>> engine = SegmentMatcher(text_split)
         >>>
         >>> processor = engine.processor()
-        >>> ops, a, b = processor.process("This is a version. It has some text in it.")
+        >>> ops, a, b = processor.process("This is a version. It has some " +
+        ...                               "text in it.")
         >>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
         'This is a version. It has some text in it.'
-        >>> ops, a, b = processor.process("This is a version. However, it has different.")
+        >>> ops, a, b = processor.process("This is a version. However, it " +
+        ...                               "has different.")
         >>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
         'This is a version. ' '' 'However, it' ' has ' '' 'different' '.'
-        >>> ops, a, b = processor.process("Switching it up here. This is a version.")
+        >>> ops, a, b = processor.process("Switching it up here. This is a " +
+        ...                               "version.")
         >>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
         '' 'Switching' ' it ' '' 'up' ' ' '' 'here' '.' ' ' 'This is a version.'
         """
@@ -158,7 +163,7 @@ def update(self, last_text=None, last_tokens=None, last_segments=None):
                 self.last_tokens = self.last_segments.tokens()
             elif last_tokens is not None:
                 self.last_tokens = last_tokens
-                self.last_segments = self.segments.segment(last_tokens)
+                self.last_segments = self.segmenter.segment(last_tokens)
             elif last_text is not None:
                 self.last_tokens = self.tokenizer.tokenize(last_text)
                 self.last_segments = self.segmenter.segment(self.last_tokens)
@@ -166,8 +171,7 @@
                 self.last_tokens = []
                 self.last_segments = Segment()
 
-
-        def process(self, text):
+        def process(self, text, token_class=Token):
             """
             Processes a new version of a text and returns the delta.
 
@@ -179,7 +183,7 @@
                 A tuple of `operations`, `a_tokens`, `b_tokens`
             """
             # Tokenize and segment
-            tokens = self.tokenizer.tokenize(text)
+            tokens = self.tokenizer.tokenize(text, token_class=token_class)
             segments = self.segmenter.segment(tokens)
 
             return self.process_segments(segments, tokens=tokens)
diff --git a/deltas/algorithms/sequence_matcher.py b/deltas/algorithms/sequence_matcher.py
index 955b982..f859b55 100644
--- a/deltas/algorithms/sequence_matcher.py
+++ b/deltas/algorithms/sequence_matcher.py
@@ -7,11 +7,10 @@
 """
 from difflib import SequenceMatcher as SM
 
-from ..tokenizers import text_split
-from .diff_engine import DiffEngine
 from ..operations import Delete, Equal, Insert
-
+
+from ..tokenizers import Token, text_split
+from .diff_engine import DiffEngine
 
 TOKENIZER = text_split
@@ -23,6 +22,7 @@
     "equal": lambda a1, a2, b1, b2: [Equal(a1, a2, b1, b2)]
 }
 
+
 def diff(a, b):
     """
     Performs a longest common substring diff.
@@ -40,12 +40,14 @@
     opcodes = SM(None, a, b).get_opcodes()
     return parse_opcodes(opcodes)
 
+
 def process(texts, *args, **kwargs):
     processor = SequenceMatcher.Processor(*args, **kwargs)
 
     for text in texts:
         yield processor.process(text)
 
+
 class SequenceMatcher(DiffEngine):
     """
     Constructs a sequence matching diff engine that preserves version state
@@ -58,13 +60,16 @@
         >>> engine = SequenceMatcher()
         >>>
         >>> processor = engine.processor()
-        >>> ops, a, b = processor.process("This is a version. It has some text in it.")
+        >>> ops, a, b = processor.process("This is a version. It has some " +
+        ...                               "text in it.")
         >>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
         'This is a version. It has some text in it.'
-        >>> ops, a, b = processor.process("This is a version. However, it has different.")
+        >>> ops, a, b = processor.process("This is a version. However, it " +
+        ...                               "has different.")
         >>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
         'This is a version. ' '' 'However, it' ' has ' '' 'different' '.'
-        >>> ops, a, b = processor.process("Switching it up here. This is a version.")
+        >>> ops, a, b = processor.process("Switching it up here. This is " +
+        ...                               "a version.")
         >>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
         'Switching it up here. ' 'This is a version.' ''
         """
@@ -75,17 +80,19 @@ class Processor(DiffEngine.Processor):
         """
         A processor used by the SequenceMatcher difference engine to track
         the history of a single text.
         """
-        def __init__(self, tokenizer=None, last_text=None, last_tokens=None):
+        def __init__(self, tokenizer=None, last_text=None, last_tokens=None,
+                     token_class=None):
             self.tokenizer = tokenizer or TOKENIZER
             self.update(last_text, last_tokens)
+            self.token_class = token_class
 
-        def update(self, last_text=None, last_tokens=None):
+        def update(self, last_text=None, last_tokens=None, **kwargs):
             if last_text is not None:
-                self.last_tokens = tokenizer.tokenize(last_text)
+                self.last_tokens = self.tokenizer.tokenize(last_text, **kwargs)
             else:
                 self.last_tokens = last_tokens or []
 
-        def process(self, text):
+        def process(self, text, token_class=None):
             """
             Processes a new version of a text and returns the delta.
@@ -96,7 +103,8 @@
             :Returns:
                 A tuple of `operations`, `a_tokens`, `b_tokens`
             """
-            tokens = self.tokenizer.tokenize(text)
+            token_class = token_class or self.token_class
+            tokens = self.tokenizer.tokenize(text, token_class=token_class)
             operations = diff(self.last_tokens, tokens)
 
             a = self.last_tokens
@@ -118,6 +126,7 @@ def process(self, texts, *args, **kwargs):
     def from_config(cls, config, name, section_key="diff_engines"):
         return cls()
 
+
 def parse_opcodes(opcodes):
 
     for opcode in opcodes:
diff --git a/deltas/algorithms/tests/test_segment_matcher.py b/deltas/algorithms/tests/test_segment_matcher.py
index b0e14c6..86ab61f 100644
--- a/deltas/algorithms/tests/test_segment_matcher.py
+++ b/deltas/algorithms/tests/test_segment_matcher.py
@@ -1,10 +1,38 @@
+from nose.tools import eq_
+
+from ...apply import apply
+from ...operations import Delete, Equal, Insert
 from ...tests.diff_and_replay import diff_and_replay
 from ...tests.diff_sequence import diff_sequence
+from ...tokenizers import text_split, wikitext_split
 from ..segment_matcher import diff, process
-from ...tokenizers import text_split
+
 
 def test_diff_and_replay():
     return diff_and_replay(diff)
 
+
 def test_engine():
     return diff_sequence(process)
+
+
+def test_easy_diff():
+    a = "Apples are red."
+    b = "Apples are tasty and red."
+
+    operation_tokens = process([a, b], tokenizer=wikitext_split)
+
+    # Apples are red.
+    operations, a, b = next(operation_tokens)
+
+    # Apples are tasty and red.
+    operations, a, b = next(operation_tokens)
+
+    eq_(
+        list(operations),
+        [
+            Equal(0, 4, 0, 4),
+            Insert(4, 4, 4, 8),
+            Equal(4, 6, 8, 10)
+        ]
+    )
diff --git a/deltas/tests/diff_and_replay.py b/deltas/tests/diff_and_replay.py
index 401adcc..60f4ddf 100644
--- a/deltas/tests/diff_and_replay.py
+++ b/deltas/tests/diff_and_replay.py
@@ -4,7 +4,7 @@
 from ..tokenizers import text_split
 
 
-def diff_and_replay(diff, tokenizer=None):
+def diff_and_replay(diff):
     a = """
     This sentence is going to get copied.
     This sentence is going to go away.
diff --git a/deltas/tokenizers/tests/test_wikitext_split.py b/deltas/tokenizers/tests/test_wikitext_split.py
index 77c07da..a666dc7 100644
--- a/deltas/tokenizers/tests/test_wikitext_split.py
+++ b/deltas/tokenizers/tests/test_wikitext_split.py
@@ -5,7 +5,6 @@
 
 
 def test_wikitext_split():
-
     input = "As a sentence, this includes punctuation. \n" + \
             "\n" + \
             "==Header!==\n" + \
diff --git a/deltas/tokenizers/tokenizer.py b/deltas/tokenizers/tokenizer.py
index 2be6fb4..235fa18 100644
--- a/deltas/tokenizers/tokenizer.py
+++ b/deltas/tokenizers/tokenizer.py
@@ -10,7 +10,7 @@ class Tokenizer:
     """
     Constructs a tokenization strategy.
     """
-    def tokenize(self, text):
+    def tokenize(self, text, token_class=Token):
         """
         Tokenizes a text.
         """
@@ -34,16 +34,17 @@ def __init__(self, lexicon):
         self.regex = re.compile('|'.join('(?P<{0}>{1})'.format(name, pattern)
                                          for name, pattern in lexicon))
 
-    def tokenize(self, text):
-        return [t for t in self._tokenize(text)]
+    def tokenize(self, text, token_class=None):
+        return [t for t in self._tokenize(text, token_class=token_class)]
 
-    def _tokenize(self, text):
+    def _tokenize(self, text, token_class=None):
         """
         Tokenizes a text
 
         :Returns:
             A `list` of tokens
         """
+        token_class = token_class or Token
         tokens = {}
 
         for i, match in enumerate(self.regex.finditer(text)):
@@ -53,7 +54,7 @@
                 token = tokens[value]
             except KeyError:
                 type = match.lastgroup
-                token = Token(value, type)
+                token = token_class(value, type=type)
                 tokens[value] = token
 
             yield token