-
Notifications
You must be signed in to change notification settings - Fork 0
/
document_transformer.py
37 lines (27 loc) · 1.22 KB
/
document_transformer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import abc
from abc import ABC
from documents import InputDocument, TransformedDocument
from tokenizer import Tokenizer
class DocumentTransformer(ABC):
    """
    Interface for the text-transformation stage of the index process.

    Implementations are expected to perform text normalization and
    tokenization as part of converting a document for indexing.
    """
    @abc.abstractmethod
    def transform_document(self, doc: InputDocument) -> TransformedDocument:
        """Convert the given InputDocument into a TransformedDocument."""
        ...
class NaiveSearchDocumentTransformer(DocumentTransformer):
    """DocumentTransformer that simply runs a tokenizer over the document text."""

    def __init__(self, tokenizer: Tokenizer):
        """
        :param tokenizer: Tokenizer used to split document text into tokens.
        """
        self.tokenizer = tokenizer

    def transform_document(self, doc: InputDocument) -> TransformedDocument:
        """
        Tokenize the document's text and package the result.

        The tokenizer supplied at construction time is applied to ``doc.text``;
        the document id is carried over unchanged.

        :param doc: The InputDocument to be transformed.
        :return: A TransformedDocument holding the same doc_id and the tokens.
        """
        token_list = self.tokenizer.tokenize(doc.text)
        return TransformedDocument(doc_id=doc.doc_id, tokens=token_list)