Commit

Adds token_class option to tokenization and propagates the param to diffengines.
halfak committed Sep 7, 2015
1 parent 266f024 commit 3bfb7b0
Showing 8 changed files with 73 additions and 34 deletions.
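
The core of the change is a new token_class keyword on the tokenizers, which the diff engines now accept and pass through, so every token can be built as a caller-supplied Token subclass. A minimal sketch of the tokenizer-level usage; AnnotatedToken is a hypothetical subclass, not part of this commit:

from deltas import Token, wikitext_split

class AnnotatedToken(Token):
    """Hypothetical Token subclass, used only to illustrate the new hook."""

tokens = wikitext_split.tokenize("As a sentence, this includes punctuation.",
                                 token_class=AnnotatedToken)
# RegexTokenizer constructs each distinct token value with token_class.
assert all(isinstance(t, AnnotatedToken) for t in tokens)
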
2 changes: 1 addition & 1 deletion deltas/__init__.py
@@ -3,7 +3,7 @@
from .algorithms.diff_engine import DiffEngine
from .algorithms import segment_matcher, SegmentMatcher
from .algorithms import sequence_matcher, SequenceMatcher
from .tokenizers import Tokenizer, RegexTokenizer, text_split, wikitext_split
from .tokenizers import Token, Tokenizer, RegexTokenizer, text_split, wikitext_split
from .segmenters import Segmenter, Segment, MatchableSegment

__version__ = "0.3.3"
10 changes: 4 additions & 6 deletions deltas/algorithms/diff_engine.py
@@ -1,6 +1,5 @@
import yamlconf
"""
"""


class DiffEngine:
"""
@@ -13,13 +12,12 @@ class Processor:
of a single text.
"""

def process(text):
def process(text, token_class=None):
raise NotImplementedError()


def processor():
def processor(self):
"""
Configures and returns a new :class:`deltas.algorithms.Engine.Processor`
Configures and returns a new :class:`~deltas.DiffEngine.Processor`
"""
raise NotImplementedError()

20 changes: 12 additions & 8 deletions deltas/algorithms/segment_matcher.py
@@ -27,6 +27,7 @@
SEGMENTER = ParagraphsSentencesAndWhitespace()
TOKENIZER = text_split


def diff(a, b, segmenter=None):
"""
Performs a diff comparison between two sequences of tokens (`a` and `b`)
@@ -68,6 +69,7 @@ def diff(a, b, segmenter=None):

return diff_segments(a_segments, b_segments)


def diff_segments(a_segments, b_segments):
"""
Performs a diff comparison between two pre-clustered
@@ -96,7 +98,6 @@ def diff_segments(a_segments, b_segments):
b_segment_tokens).expand())



def process(texts, *args, **kwargs):
"""
Processes a single sequence of texts with a
@@ -116,6 +117,7 @@ def process(texts, *args, **kwargs):
for text in texts:
yield processor.process(text)


class SegmentMatcher(DiffEngine):
"""
Constructs a segment matcher diff engine that preserves segmentation state
@@ -130,13 +132,16 @@ class SegmentMatcher(DiffEngine):
>>> engine = SegmentMatcher(text_split)
>>>
>>> processor = engine.processor()
>>> ops, a, b = processor.process("This is a version. It has some text in it.")
>>> ops, a, b = processor.process("This is a version. It has some " +
"text in it.")
>>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
'This is a version. It has some text in it.'
>>> ops, a, b = processor.process("This is a version. However, it has different.")
>>> ops, a, b = processor.process("This is a version. However, it " +
"has different.")
>>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
'This is a version. ' '' 'However, it' ' has ' '' 'different' '.'
>>> ops, a, b = processor.process("Switching it up here. This is a version.")
>>> ops, a, b = processor.process("Switching it up here. This is a " +
"version.")
>>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
'' 'Switching' ' it ' '' 'up' ' ' '' 'here' '.' ' ' 'This is a version.'
"""
@@ -158,16 +163,15 @@ def update(self, last_text=None, last_tokens=None, last_segments=None):
self.last_tokens = self.last_segments.tokens()
elif last_tokens is not None:
self.last_tokens = last_tokens
self.last_segments = self.segments.segment(last_tokens)
self.last_segments = self.segmenter.segment(last_tokens)
elif last_text is not None:
self.last_tokens = self.tokenizer.tokenize(last_text)
self.last_segments = self.segmenter.segment(self.last_tokens)
else:
self.last_tokens = []
self.last_segments = Segment()


def process(self, text):
def process(self, text, token_class=Token):
"""
Processes a new version of a text and returns the delta.
@@ -179,7 +183,7 @@ def process(self, text):
A tuple of `operations`, `a_tokens`, `b_tokens`
"""
# Tokenize and segment
tokens = self.tokenizer.tokenize(text)
tokens = self.tokenizer.tokenize(text, token_class=token_class)
segments = self.segmenter.segment(tokens)

return self.process_segments(segments, tokens=tokens)
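
With the change above, a SegmentMatcher processor forwards token_class straight to its tokenizer, so each call to process() can choose the token type. A rough sketch building on the docstring example; AnnotatedToken is a hypothetical subclass, not part of the library:

from deltas import SegmentMatcher, Token, text_split

class AnnotatedToken(Token):
    """Hypothetical Token subclass, for illustration only."""

engine = SegmentMatcher(text_split)
processor = engine.processor()
ops, a, b = processor.process("This is a version.", token_class=AnnotatedToken)
# b holds the tokens of the new revision, each built via token_class.
assert all(isinstance(t, AnnotatedToken) for t in b)
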
31 changes: 20 additions & 11 deletions deltas/algorithms/sequence_matcher.py
@@ -7,11 +7,10 @@
"""

from difflib import SequenceMatcher as SM
from ..tokenizers import text_split
from .diff_engine import DiffEngine

from ..operations import Delete, Equal, Insert

from ..tokenizers import Token, text_split
from .diff_engine import DiffEngine

TOKENIZER = text_split

@@ -23,6 +22,7 @@
"equal": lambda a1, a2, b1, b2: [Equal(a1, a2, b1, b2)]
}


def diff(a, b):
"""
Performs a longest common substring diff.
Expand All @@ -40,12 +40,14 @@ def diff(a, b):
opcodes = SM(None, a, b).get_opcodes()
return parse_opcodes(opcodes)


def process(texts, *args, **kwargs):
processor = SequenceMatcher.Processor(*args, **kwargs)

for text in texts:
yield processor.process(text)


class SequenceMatcher(DiffEngine):
"""
Constructs a sequence matching diff engine that preserves version state
@@ -58,13 +60,16 @@ class SequenceMatcher(DiffEngine):
>>> engine = SequenceMatcher()
>>>
>>> processor = engine.processor()
>>> ops, a, b = processor.process("This is a version. It has some text in it.")
>>> ops, a, b = processor.process("This is a version. It has some " +
... "text in it.")
>>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
'This is a version. It has some text in it.'
>>> ops, a, b = processor.process("This is a version. However, it has different.")
>>> ops, a, b = processor.process("This is a version. However, it " +
... "has different.")
>>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
'This is a version. ' '' 'However, it' ' has ' '' 'different' '.'
>>> ops, a, b = processor.process("Switching it up here. This is a version.")
>>> ops, a, b = processor.process("Switching it up here. This is " +
... "a version.")
>>> print(" ".join(repr(''.join(b[op.b1:op.b2])) for op in ops))
'Switching it up here. ' 'This is a version.' ''
"""
@@ -75,17 +80,19 @@ class Processor(DiffEngine.Processor):
A processor used by the SequenceMatcher difference engine to track the
history of a single text.
"""
def __init__(self, tokenizer=None, last_text=None, last_tokens=None):
def __init__(self, tokenizer=None, last_text=None, last_tokens=None,
token_class=None):
self.tokenizer = tokenizer or TOKENIZER
self.update(last_text, last_tokens)
self.token_class = token_class

def update(self, last_text=None, last_tokens=None):
def update(self, last_text=None, last_tokens=None, **kwargs):
if last_text is not None:
self.last_tokens = tokenizer.tokenize(last_text)
self.last_tokens = self.tokenizer.tokenize(last_text, **kwargs)
else:
self.last_tokens = last_tokens or []

def process(self, text):
def process(self, text, token_class=None):
"""
Processes a new version of a text and returns the delta.
@@ -96,7 +103,8 @@ def process(self, text):
:Returns:
A tuple of `operations`, `a_tokens`, `b_tokens`
"""
tokens = self.tokenizer.tokenize(text)
token_class = token_class or self.token_class
tokens = self.tokenizer.tokenize(text, token_class=token_class)
operations = diff(self.last_tokens, tokens)

a = self.last_tokens
@@ -118,6 +126,7 @@ def process(self, texts, *args, **kwargs):
def from_config(cls, config, name, section_key="diff_engines"):
return cls()


def parse_opcodes(opcodes):

for opcode in opcodes:
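
Unlike the segment matcher, the sequence matcher's processor can also be given the token class once at construction and falls back to it whenever process() is called without one. A hedged sketch; MyToken is a hypothetical subclass:

from deltas import SequenceMatcher, Token

class MyToken(Token):
    """Hypothetical Token subclass, for illustration only."""

processor = SequenceMatcher.Processor(token_class=MyToken)
# No token_class here: process() falls back to the one set above.
ops, a, b = processor.process("This is a version.")
assert all(isinstance(t, MyToken) for t in b)
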
30 changes: 29 additions & 1 deletion deltas/algorithms/tests/test_segment_matcher.py
@@ -1,10 +1,38 @@
from nose.tools import eq_

from ...apply import apply
from ...operations import Delete, Equal, Insert
from ...tests.diff_and_replay import diff_and_replay
from ...tests.diff_sequence import diff_sequence
from ...tokenizers import text_split, wikitext_split
from ..segment_matcher import diff, process
from ...tokenizers import text_split


def test_diff_and_replay():
return diff_and_replay(diff)


def test_engine():
return diff_sequence(process)


def test_easy_diff():
a = "Apples are red."
b = "Apples are tasty and red."

operation_tokens = process([a, b], tokenizer=wikitext_split)

# Apples are red.
operations, a, b = next(operation_tokens)

# Apples are tasty and red.
operations, a, b = next(operation_tokens)

eq_(
list(operations),
[
Equal(0, 4, 0, 4),
Insert(4, 4, 4, 8),
Equal(4, 6, 8, 10)
]
)
2 changes: 1 addition & 1 deletion deltas/tests/diff_and_replay.py
@@ -4,7 +4,7 @@
from ..tokenizers import text_split


def diff_and_replay(diff, tokenizer=None):
def diff_and_replay(diff):
a = """
This sentence is going to get copied. This sentence is going to go away.
1 change: 0 additions & 1 deletion deltas/tokenizers/tests/test_wikitext_split.py
@@ -5,7 +5,6 @@

def test_wikitext_split():


input = "As a sentence, this includes punctuation. \n" + \
"\n" + \
"==Header!==\n" + \
11 changes: 6 additions & 5 deletions deltas/tokenizers/tokenizer.py
@@ -10,7 +10,7 @@ class Tokenizer:
"""
Constructs a tokenization strategy.
"""
def tokenize(self, text):
def tokenize(self, text, token_class=Token):
"""
Tokenizes a text.
"""
@@ -34,16 +34,17 @@ def __init__(self, lexicon):
self.regex = re.compile('|'.join('(?P<{0}>{1})'.format(name, pattern)
for name, pattern in lexicon))

def tokenize(self, text):
return [t for t in self._tokenize(text)]
def tokenize(self, text, token_class=None):
return [t for t in self._tokenize(text, token_class=token_class)]

def _tokenize(self, text):
def _tokenize(self, text, token_class=None):
"""
Tokenizes a text
:Returns:
A `list` of tokens
"""
token_class = token_class or Token
tokens = {}

for i, match in enumerate(self.regex.finditer(text)):
@@ -53,7 +54,7 @@ def _tokenize(self, text):
token = tokens[value]
except KeyError:
type = match.lastgroup
token = Token(value, type)
token = token_class(value, type=type)
tokens[value] = token

yield token
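
As _tokenize() above shows, whatever is passed as token_class gets called as token_class(value, type=type), once per distinct token value. Any custom class therefore needs to accept that signature; subclassing Token, as in the hypothetical sketches earlier, is the simplest way to satisfy it:

from deltas import Token

class TaggedToken(Token):
    """Hypothetical subclass; inherits Token's (value, type=...) constructor."""

# This mirrors the call _tokenize() makes for each new token value;
# "word" is just an illustrative type name from a lexicon.
token = TaggedToken("Apples", type="word")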
