Skip to content

Commit

Permalink
fix: pdfplumber caching considered harmful (also remove dead code)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Nov 17, 2024
1 parent 6cf7ca5 commit 1cc2b1e
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 3 deletions.
1 change: 1 addition & 0 deletions alexi/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,4 @@ def extract_words(self, pages: Optional[Iterable[int]] = None) -> Iterator[T_obj
  feats = get_word_features(word, page, chars, elmap)
  feats["path"] = str(self.path)
  yield feats
+ page.close()
3 changes: 0 additions & 3 deletions alexi/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,6 @@ def __call__(self, path: Path) -> Union[Document, None]:
  if self.crf_n is not None:
  crf = self.crf_n
  iob = list(self.crf_s(crf(feats)))
- if conv is None and pdf_path.exists():
- conv = Converteur(pdf_path)
- assert conv is not None
  doc = self.analyse(iob, pdf_path)
  if self.pdfdata:
  doc.pdfurl = self.pdfdata.get(pdf_path.name, {}).get("url", None)
Expand Down

0 comments on commit 1cc2b1e

Please sign in to comment.