Skip to content

Commit

Permalink
Custom spellchecker removed.
Browse files Browse the repository at this point in the history
Although the lucene spellcheckers support a minimum prefix, they are based on edit distances, not optimized for autocompletion. The lucene suggesters would work, but require building first. Term enumeration supports seeking, so the in-line version seems comparably fast, as does `DirectSpellChecker`.
  • Loading branch information
coady committed Jul 6, 2024
1 parent 097845f commit 103fe79
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 76 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

### Removed
* Spatial field
* Cached custom spellchecker

## [3.1](https://pypi.org/project/lupyne/3.1/) - 2023-11-22
### Changed
Expand Down
33 changes: 10 additions & 23 deletions lupyne/engine/indexers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextlib
import heapq
import itertools
import operator
from collections.abc import Iterator, Mapping
Expand All @@ -14,7 +15,7 @@
from .analyzers import Analyzer
from .queries import Query, DocValues, SpellParser
from .documents import Field, Document, Hits, GroupingSearch, Groups
from .utils import suppress, Atomic, SpellChecker
from .utils import suppress, Atomic


class closing(set):
Expand Down Expand Up @@ -151,6 +152,11 @@ def suggest(self, name: str, value, count: int = 1, **attrs) -> list:
words = checker.suggestSimilar(index.Term(name, value), count, self.indexReader)
return [word.string for word in words]

def complete(self, name: str, prefix: str, count: int) -> list[str]:
    """Return autocomplete suggestions for word prefix.

    Args:
        name: indexed field to enumerate terms from
        prefix: word prefix to match
        count: maximum number of suggestions

    Terms are ranked by their document counts, highest first.
    """
    frequencies = {term: freq for term, freq in self.terms(name, prefix, counts=True)}
    return heapq.nlargest(count, frequencies, key=frequencies.get)

def sortfield(self, name: str, type=None, reverse=False) -> search.SortField:
"""Return lucene SortField, deriving the the type from FieldInfos if necessary.
Expand Down Expand Up @@ -310,7 +316,6 @@ def __init__(self, directory, analyzer=None):
self.shared = closing()
super().__init__(self.shared.reader(directory))
self.analyzer = self.shared.analyzer(analyzer)
self.spellcheckers = {}

def __del__(self):
if hash(self): # pragma: no branch
Expand All @@ -319,23 +324,17 @@ def __del__(self):
def openIfChanged(self):
    """Return a refreshed lucene DirectoryReader, or None if the index is unchanged.

    NOTE(review): `cast_` narrows the held IndexReader to a DirectoryReader for
    the static `openIfChanged` call; callers rely on the None-when-unchanged result.
    """
    return index.DirectoryReader.openIfChanged(index.DirectoryReader.cast_(self.indexReader))

def reopen(self) -> 'IndexSearcher':
    """Return current [IndexSearcher][lupyne.engine.indexers.IndexSearcher].

    Only creates a new one if necessary.
    """
    refreshed = self.openIfChanged()
    if refreshed is None:
        return self  # index unchanged; keep the existing searcher
    searcher = type(self)(refreshed, self.analyzer)
    searcher.decRef()
    searcher.shared = self.shared
    return searcher

def __getitem__(self, id: int) -> Document:
Expand Down Expand Up @@ -472,18 +471,6 @@ def groupby(
using a [GroupingSearch][lupyne.engine.documents.GroupingSearch]."""
return GroupingSearch(field, **attrs).search(self, self.parse(query), count, start)

def spellchecker(self, field: str) -> SpellChecker:
    """Return and cache spellchecker for given field."""
    if field in self.spellcheckers:
        return self.spellcheckers[field]
    # setdefault keeps the first cached instance if one was stored meanwhile
    return self.spellcheckers.setdefault(field, SpellChecker(self.terms(field, counts=True)))

def complete(self, field: str, prefix: str, count: Optional[int] = None) -> list:
    """Return ordered suggested words for prefix."""
    checker = self.spellchecker(field)
    return checker.complete(prefix, count)

def match(self, document: Mapping, *queries) -> Iterator[float]:
"""Generate scores for all queries against a given document mapping."""
searcher = index.memory.MemoryIndex()
Expand Down
27 changes: 0 additions & 27 deletions lupyne/engine/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import abc
import bisect
import contextlib
import heapq
import itertools
from collections.abc import Iterable
from typing import Optional
import lucene
from java.lang import Double, Float, Number, Object
from org.apache.lucene import analysis, util
Expand All @@ -22,29 +18,6 @@ def __subclasshook__(cls, other):
Atomic.register(cls)


class SpellChecker(dict):
    """Correct spellings and suggest words for queries.

    Supply a vocabulary mapping words to (reverse) sort keys, such as document frequencies.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # sorted vocabulary supports binary-search prefix ranges
        self.words = sorted(self)
        alphabet = ''.join(set(itertools.chain.from_iterable(self)))
        # sentinel suffix: lexicographically >= any stored word sharing a prefix
        self.suffix = alphabet and max(alphabet) * max(len(word) for word in self)

    def complete(self, prefix: str, count: Optional[int] = None) -> list:
        """Return suggested words for prefix, ordered by descending sort key."""
        lo = bisect.bisect_left(self.words, prefix)
        hi = bisect.bisect_right(self.words, prefix + self.suffix, lo)
        matches = self.words[lo:hi]
        if count is not None and count < len(matches):
            return heapq.nlargest(count, matches, key=self.__getitem__)
        return sorted(matches, key=self.__getitem__, reverse=True)


@contextlib.contextmanager
def suppress(exception):
"""Suppress specific lucene exception."""
Expand Down
4 changes: 2 additions & 2 deletions lupyne/services/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ def index(self) -> dict:
return {str(reader.directory()): reader.numDocs() for reader in searcher.indexReaders}
return {str(searcher.directory): len(searcher)}

def refresh(self, spellcheckers: bool = False) -> dict:
def refresh(self) -> dict:
"""Refresh index version."""
self._searcher = self.searcher.reopen(spellcheckers=spellcheckers)
self._searcher = self.searcher.reopen()
self.updated = time.time()
return self.index()

Expand Down
10 changes: 5 additions & 5 deletions lupyne/services/graphql.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ def terms(self, info: Info) -> IndexedFields:
fields = {}
for name, selected in selections(*info.selected_fields).items():
if 'counts' in selected:
values, counts = zip(*root.searcher.terms(name, counts=True))
fields[name] = Terms(values=values, counts=counts)
terms = dict(root.searcher.terms(name, counts=True))
fields[name] = Terms(values=terms, counts=terms.values())
else:
fields[name] = Terms(values=root.searcher.terms(name))
return IndexedFields(**fields)
Expand Down Expand Up @@ -144,10 +144,10 @@ def search(self, info: Info, q: str, count: Optional[int] = None, sort: list[str

@doc_type
class Mutation:
@doc_field(spellcheckers="refresh cached spellcheckers")
def index(self, spellcheckers: bool = False) -> Index:
@doc_field
def index(self) -> Index:
"""Refresh index."""
index = root.refresh(spellcheckers=spellcheckers)
index = root.refresh()
return Index(directories=list(index), counts=index.values())


Expand Down
21 changes: 7 additions & 14 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,15 +120,12 @@ def test_searcher(tempdir, fields, constitution):
for doc in constitution:
indexer.add(doc)
indexer.commit()
assert indexer.spellcheckers == {}
assert indexer.complete('amendment', '')
assert list(indexer.spellcheckers) == ['amendment']
assert indexer.complete('amendment', '', 1)
indexer.delete('amendment', doc['amendment'])
indexer.add(doc)
reader = indexer.indexReader
indexer.commit(spellcheckers=True)
indexer.commit()
assert reader.refCount == 0
assert list(indexer.spellcheckers) == ['amendment']
analyzer = engine.Analyzer.standard()
doc = {'text': doc['text'], 'amendment': analyzer.tokens(doc['amendment'])}
scores = list(indexer.match(doc, 'text:congress', 'text:law', 'amendment:27'))
Expand Down Expand Up @@ -213,15 +210,11 @@ def test_spellcheck(tempdir, fields, constitution):
for doc in constitution:
indexer.add(doc)
indexer.commit()
assert indexer.complete('missing', '') == []
assert {'shall', 'states'} <= set(indexer.complete('text', '')[:8])
assert indexer.complete('text', 'con')[:2] == ['congress', 'constitution']
assert (
indexer.complete('text', 'congress')
== indexer.complete('text', 'con', count=1)
== ['congress']
)
assert indexer.complete('text', 'congresses') == []
assert indexer.complete('missing', '', 1) == []
assert ['the', 'shall'] == indexer.complete('text', '', 2)
assert indexer.complete('text', 'con', 2) == ['congress', 'constitution']
assert indexer.complete('text', 'congress', 2) == ['congress']
assert indexer.complete('text', 'congresses', 1) == []
assert indexer.suggest('text', 'write') == ['writs']
assert indexer.suggest('text', 'write', 3) == ['writs', 'writ', 'written']
assert indexer.suggest('text', 'write', 3, maxEdits=1) == ['writs', 'writ']
Expand Down
6 changes: 2 additions & 4 deletions tests/test_graphql.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,8 @@ def test_index(client):
(directory,) = index['directories']
assert 'Directory@' in directory
assert index['counts'] == [35]
data = client.execute('mutation { index { directories } }')
assert data == {'index': {'directories': [directory]}}
data = client.execute('mutation { index(spellcheckers: true) { counts } }')
assert data == {'index': {'counts': index['counts']}}
data = client.execute('mutation { index { directories counts } }')
assert data == {'index': index}


def test_terms(client):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_index(client):
assert resp.json() == result
assert float(resp.headers['x-response-time']) > 0.0
assert int(resp.headers['age']) == 0
assert not client.post('/', params={'spellcheckers': True}).is_error
assert not client.post('/').is_error


def test_terms(client):
Expand Down

0 comments on commit 103fe79

Please sign in to comment.