From 1cc2b1e2d8647a99b6aa77a118f493d381969f84 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhd@ecolingui.ca>
Date: Sun, 17 Nov 2024 10:55:49 -0500
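Subject: [PATCH] fix: pdfplumber caching considered harmful (also remove dead
 code)

Call page.close() in extract_words (alexi/convert.py) after the words of
a page have been yielded.  pdfplumber caches layout and object data on
each Page; closing the page flushes that cache so the memory can be
released.

Also remove the fallback construction of a Converteur, and the assert
that followed it, from __call__ in alexi/extract.py (the "dead code"
mentioned in the subject).
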
---
 alexi/convert.py | 1 +
 alexi/extract.py | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/alexi/convert.py b/alexi/convert.py
index bde5ee2..0a0cd8e 100644
--- a/alexi/convert.py
+++ b/alexi/convert.py
@@ -157,3 +157,4 @@ def extract_words(self, pages: Optional[Iterable[int]] = None) -> Iterator[T_obj
                 feats = get_word_features(word, page, chars, elmap)
                 feats["path"] = str(self.path)
                 yield feats
+            page.close()
diff --git a/alexi/extract.py b/alexi/extract.py
index 2d9f39d..89b914d 100644
--- a/alexi/extract.py
+++ b/alexi/extract.py
@@ -407,9 +407,6 @@ def __call__(self, path: Path) -> Union[Document, None]:
                     if self.crf_n is not None:
                         crf = self.crf_n
                 iob = list(self.crf_s(crf(feats)))
-        if conv is None and pdf_path.exists():
-            conv = Converteur(pdf_path)
-        assert conv is not None
         doc = self.analyse(iob, pdf_path)
         if self.pdfdata:
             doc.pdfurl = self.pdfdata.get(pdf_path.name, {}).get("url", None)