fix: close each pdfplumber page after extracting its words, because pdfplumber's per-page caching is harmful (also remove dead code)

dhdaines · Nov 17, 2024 · 1cc2b1e
1 parent 6cf7ca5
commit 1cc2b1e
Show file tree

Hide file tree

Showing 2 changed files with 1 addition and 3 deletions.
diff --git a/alexi/convert.py b/alexi/convert.py
@@ -157,3 +157,4 @@ def extract_words(self, pages: Optional[Iterable[int]] = None) -> Iterator[T_obj
                 feats = get_word_features(word, page, chars, elmap)
                 feats["path"] = str(self.path)
                 yield feats
+            page.close()
diff --git a/alexi/extract.py b/alexi/extract.py
@@ -407,9 +407,6 @@ def __call__(self, path: Path) -> Union[Document, None]:
                     if self.crf_n is not None:
                         crf = self.crf_n
                 iob = list(self.crf_s(crf(feats)))
-        if conv is None and pdf_path.exists():
-            conv = Converteur(pdf_path)
-        assert conv is not None
         doc = self.analyse(iob, pdf_path)
         if self.pdfdata:
             doc.pdfurl = self.pdfdata.get(pdf_path.name, {}).get("url", None)