From 1cc2b1e2d8647a99b6aa77a118f493d381969f84 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Sun, 17 Nov 2024 10:55:49 -0500
Subject: [PATCH] fix: pdfplumber caching considered harmful (also remove dead code)

---
 alexi/convert.py | 1 +
 alexi/extract.py | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/alexi/convert.py b/alexi/convert.py
index bde5ee2..0a0cd8e 100644
--- a/alexi/convert.py
+++ b/alexi/convert.py
@@ -157,3 +157,4 @@ def extract_words(self, pages: Optional[Iterable[int]] = None) -> Iterator[T_obj
                 feats = get_word_features(word, page, chars, elmap)
                 feats["path"] = str(self.path)
                 yield feats
+            page.close()
diff --git a/alexi/extract.py b/alexi/extract.py
index 2d9f39d..89b914d 100644
--- a/alexi/extract.py
+++ b/alexi/extract.py
@@ -407,9 +407,6 @@ def __call__(self, path: Path) -> Union[Document, None]:
         if self.crf_n is not None:
             crf = self.crf_n
         iob = list(self.crf_s(crf(feats)))
-        if conv is None and pdf_path.exists():
-            conv = Converteur(pdf_path)
-        assert conv is not None
         doc = self.analyse(iob, pdf_path)
         if self.pdfdata:
             doc.pdfurl = self.pdfdata.get(pdf_path.name, {}).get("url", None)