From 32a7a84e92142e81c696505977c9a74b7902ac91 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Thu, 4 Jul 2024 15:48:36 -0400
Subject: [PATCH 1/5] feat: switch to lunr.py for indexing

---
 alexi/index.py  | 148 ++++++++++++++++++++++++++++++++++--------------
 alexi/search.py |  28 ++++-----
 pyproject.toml  |   2 +-
 3 files changed, 119 insertions(+), 59 deletions(-)

diff --git a/alexi/index.py b/alexi/index.py
index da76ab3..2a879ad 100644
--- a/alexi/index.py
+++ b/alexi/index.py
@@ -4,56 +4,116 @@
 import json
 import logging
-import os
+import re
 from pathlib import Path
+from dataclasses import dataclass
 
-from whoosh.analysis import CharsetFilter, StemmingAnalyzer  # type: ignore
-from whoosh.fields import ID, NUMERIC, TEXT, Schema  # type: ignore
-from whoosh.index import create_in  # type: ignore
-from whoosh.support.charset import charset_table_to_dict  # type: ignore
-from whoosh.support.charset import default_charset
-from whoosh.writing import IndexWriter  # type: ignore
+from bs4 import BeautifulSoup
+from lunr import lunr, get_default_builder
+from lunr.pipeline import Pipeline
+from unidecode import unidecode
 
 LOGGER = logging.getLogger("index")
-CHARMAP = charset_table_to_dict(default_charset)
-ANALYZER = StemmingAnalyzer() | CharsetFilter(CHARMAP)
-
-
-def add_from_dir(writer: IndexWriter, document: str, docdir: Path) -> dict:
-    with open(docdir / "index.json") as infh:
-        element = json.load(infh)
-    titre = f'{element["type"]} {element["numero"]}: {element["titre"]}'
-    page = element.get("page", 1)
-    LOGGER.info("Indexing %s: %s", docdir, element["titre"])
-    with open(docdir / "index.md") as infh:
-        writer.add_document(
-            document=document, page=page, titre=titre, contenu=infh.read()
-        )
-    return element
+
+
+@dataclass
+class Document:
+    url: str
+    titre: str
+    texte: str
+
+
+def body_text(soup: BeautifulSoup):
+    body = soup.div(id="body")[0]
+    for header in body(class_="header"):
+        header.extract()
+    for img in body("img"):
+        alt = soup.new_tag("p")
+        alt.string = img["alt"]
+        img.replace_with(alt)
+    return re.sub("\n\n+", "\n\n", soup.text.strip())
+
+
+def unifold(token, _idx=None, _tokens=None):
+    def wrap_unidecode(text, _metadata):
+        return unidecode(text)
+
+    return token.update(wrap_unidecode)
+
+
+Pipeline.register_function(unifold, "unifold")
 
 
 def index(indir: Path, outdir: Path) -> None:
-    outdir.mkdir(exist_ok=True)
-    schema = Schema(
-        document=ID(stored=True),
-        page=NUMERIC(stored=True),
-        titre=TEXT(ANALYZER, stored=True),
-        contenu=TEXT(ANALYZER, stored=True),
-    )
-    ix = create_in(outdir, schema)
-    writer = ix.writer()
-    for docdir in indir.iterdir():
-        if not docdir.is_dir():
+    """
+    Generer l'index a partir des fichiers HTML.
+    """
+    # Metadata (use to index specific zones, etc)
+    # with open(indir / "index.json", "rt") as infh:
+    #     metadata = json.load(infh)
+
+    # lunr does not do storage so we store plaintext here
+    textes = {}
+
+    # Use index.html to find things (as in the js version)
+    LOGGER.info("Traitement: %s", indir / "index.html")
+    with open(indir / "index.html", "rt") as infh:
+        soup = BeautifulSoup(infh, features="lxml")
+    for section in soup.select("li.node"):
+        summary = section.summary
+        if summary is None:
+            LOGGER.error("<summary> non trouvé dans %s", section)
             continue
-        if not (docdir / "index.json").exists():
+        title = summary.text
+        if "Document" in section["class"]:
+            LOGGER.info("Texte complet de %s ne sera pas indexé", title)
             continue
-        document = docdir.with_suffix(".pdf").name
-        add_from_dir(writer, document, docdir)
-        for subdir in docdir.iterdir():
-            if not docdir.is_dir():
-                continue
-            for dirpath, _, filenames in os.walk(subdir, topdown=True):
-                if "index.json" not in filenames:
-                    continue
-                add_from_dir(writer, document, Path(dirpath))
-    writer.commit()
+        url = section.a["href"]
+        # Assume it is a relative URL (we made it)
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+    for text in soup.select("li.leaf"):
+        title = text.a.text
+        url = text.a["href"]
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+
+    outdir.mkdir(exist_ok=True)
+    with open(outdir / "textes.json", "wt", encoding="utf-8") as outfh:
+        json.dump(textes, outfh, indent=2, ensure_ascii=False)
+
+    builder = get_default_builder("fr")
+    # Skip the trimmer for titles (FIXME: instead we should add some
+    # missing characters to it so it will match zones, usages, etc)
+    for funcname in ("lunr-multi-trimmer-fr",):
+        builder.pipeline.skip(
+            builder.pipeline.registered_functions[funcname], ["titre"]
+        )
+    # Add a missing pipeline function for search (don't add the
+    # trimmer as it will strip out zones, usages, etc)
+    for funcname in ("stopWordFilter-fr",):
+        builder.search_pipeline.before(
+            builder.search_pipeline.registered_functions["stemmer-fr"],
+            builder.search_pipeline.registered_functions[funcname],
+        )
+    builder.pipeline.add(unifold)
+    builder.metadata_whitelist.append("position")
+    LOGGER.info("pipeline: %s", builder.pipeline)
+    LOGGER.info("search pipeline: %s", builder.search_pipeline)
+
+    index = lunr(
+        ref="url",
+        fields=[{"field_name": "titre", "boost": 2}, "texte"],
+        documents=[
+            {"url": url, "titre": doc["titre"], "texte": doc["texte"]}
+            for url, doc in textes.items()
+        ],
+        languages="fr",
+        builder=builder,
+    )
+    with open(outdir / "index.json", "wt", encoding="utf-8") as outfh:
+        json.dump(index.serialize(), outfh, indent=2, ensure_ascii=False)
+ """ + # Metadata (use to index specific zones, etc) + # with open(indir / "index.json", "rt") as infh: + # metadata = json.load(infh) + + # lunr does not do storage so we store plaintext here + textes = {} + + # Use index.html to find things (as in the js version) + LOGGER.info("Traitement: %s", indir / "index.html") + with open(indir / "index.html", "rt") as infh: + soup = BeautifulSoup(infh, features="lxml") + for section in soup.select("li.node"): + summary = section.summary + if summary is None: + LOGGER.error(" non trouvé dans %s", section) continue - if not (docdir / "index.json").exists(): + title = summary.text + if "Document" in section["class"]: + LOGGER.info("Texte complet de %s ne sera pas indexé", title) continue - document = docdir.with_suffix(".pdf").name - add_from_dir(writer, document, docdir) - for subdir in docdir.iterdir(): - if not docdir.is_dir(): - continue - for dirpath, _, filenames in os.walk(subdir, topdown=True): - if "index.json" not in filenames: - continue - add_from_dir(writer, document, Path(dirpath)) - writer.commit() + url = section.a["href"] + # Assume it is a relative URL (we made it) + LOGGER.info("Traitement: %s: %s", title, indir / url) + with open(indir / url, "rt") as infh: + subsoup = BeautifulSoup(infh, features="lxml") + textes[url] = {"titre": title, "texte": body_text(subsoup)} + for text in soup.select("li.leaf"): + title = text.a.text + url = text.a["href"] + LOGGER.info("Traitement: %s: %s", title, indir / url) + with open(indir / url, "rt") as infh: + subsoup = BeautifulSoup(infh, features="lxml") + textes[url] = {"titre": title, "texte": body_text(subsoup)} + + outdir.mkdir(exist_ok=True) + with open(outdir / "textes.json", "wt", encoding="utf-8") as outfh: + json.dump(textes, outfh, indent=2, ensure_ascii=False) + + builder = get_default_builder("fr") + # Skip the trimmer for titles (FIXME: instead we should add some + # missing characters to it so it will match zones, usages, etc) + for funcname in ("lunr-multi-trimmer-fr",): + builder.pipeline.skip( + builder.pipeline.registered_functions[funcname], ["titre"] + ) + # Add a missing pipeline function for search (don't add the + # trimmer as it will strip out zones, usages, etc) + for funcname in ("stopWordFilter-fr",): + builder.search_pipeline.before( + builder.search_pipeline.registered_functions["stemmer-fr"], + builder.search_pipeline.registered_functions[funcname], + ) + builder.pipeline.add(unifold) + builder.metadata_whitelist.append("position") + LOGGER.info("pipeline: %s", builder.pipeline) + LOGGER.info("search pipeline: %s", builder.pipeline) + + index = lunr( + ref="url", + fields=[{"field_name": "titre", "boost": 2}, "texte"], + documents=[ + {"url": url, "titre": doc["titre"], "texte": doc["texte"]} + for url, doc in textes.items() + ], + languages="fr", + builder=builder, + ) + with open(outdir / "index.json", "wt", encoding="utf-8") as outfh: + json.dump(index.serialize(), outfh, indent=2, ensure_ascii=False) diff --git a/alexi/search.py b/alexi/search.py index 9a92a8c..dabf11f 100644 --- a/alexi/search.py +++ b/alexi/search.py @@ -2,23 +2,23 @@ Lancer des recherches dans l'index de données. 
""" +import json from pathlib import Path from typing import List -from whoosh.index import open_dir # type: ignore -from whoosh.qparser import MultifieldParser, OrGroup # type: ignore +from lunr.languages import get_nltk_builder +from lunr.index import Index +from alexi.index import unifold + +# This is just here to register the necessary pipeline functions +get_nltk_builder(["fr"]) def search(indexdir: Path, terms: List[str]) -> None: - ix = open_dir(indexdir) - parser = MultifieldParser( - ["titre", "contenu"], ix.schema, group=OrGroup.factory(0.9) - ) - query = parser.parse(" ".join(terms)) - with ix.searcher() as searcher: - results = searcher.search(query) - for r in results: - print( - f'https://ville.sainte-adele.qc.ca/upload/documents/{r["document"]}' - f'#page={r["page"]} {r["titre"]}' - ) + with open(indexdir / "index.json", "rt", encoding="utf-8") as infh: + index = Index.load(json.load(infh)) + index.pipeline.add(unifold) + results = index.search(" ".join(terms)) + for r in results: + print(r) + print(r["match_data"].metadata) diff --git a/pyproject.toml b/pyproject.toml index 4bcc627..8c7e7d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "pdfplumber", "scikit-learn", "sklearn-crfsuite", - "whoosh", + "lunr[languages]", ] [project.optional-dependencies] dev = [ From fb99aea4944636dec5cf0a352c66ee8316807bdc Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 5 Jul 2024 15:06:31 -0400 Subject: [PATCH 2/5] fix: scrap the broken french trimmer entirely --- alexi/index.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/alexi/index.py b/alexi/index.py index 2a879ad..9673d0a 100644 --- a/alexi/index.py +++ b/alexi/index.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from bs4 import BeautifulSoup -from lunr import lunr, get_default_builder +from lunr import lunr, get_default_builder, trimmer from lunr.pipeline import Pipeline from unidecode import unidecode @@ -87,19 +87,22 @@ def index(indir: Path, outdir: Path) -> None: json.dump(textes, outfh, indent=2, ensure_ascii=False) builder = get_default_builder("fr") - # Skip the trimmer for titles (FIXME: instead we should add some - # missing characters to it so it will match zones, usages, etc) - for funcname in ("lunr-multi-trimmer-fr",): - builder.pipeline.skip( - builder.pipeline.registered_functions[funcname], ["titre"] - ) - # Add a missing pipeline function for search (don't add the - # trimmer as it will strip out zones, usages, etc) - for funcname in ("stopWordFilter-fr",): - builder.search_pipeline.before( - builder.search_pipeline.registered_functions["stemmer-fr"], - builder.search_pipeline.registered_functions[funcname], - ) + # DO NOT USE the French trimmer as it is seriously defective + builder.pipeline.remove( + builder.pipeline.registered_functions["lunr-multi-trimmer-fr"] + ) + builder.pipeline.before( + builder.pipeline.registered_functions["stopWordFilter-fr"], trimmer.trimmer + ) + # Missing pipeline functions for search + builder.search_pipeline.before( + builder.search_pipeline.registered_functions["stemmer-fr"], + builder.search_pipeline.registered_functions["stopWordFilter-fr"], + ) + builder.search_pipeline.before( + builder.search_pipeline.registered_functions["stopWordFilter-fr"], + trimmer.trimmer, + ) builder.pipeline.add(unifold) builder.metadata_whitelist.append("position") LOGGER.info("pipeline: %s", builder.pipeline) From 3f043f776e61f1b089782e7a4cf1b3a66a6e200a Mon Sep 17 00:00:00 2001 From: David 
From 3f043f776e61f1b089782e7a4cf1b3a66a6e200a Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Fri, 5 Jul 2024 15:26:14 -0400
Subject: [PATCH 3/5] fix: add unifold to search

---
 alexi/index.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/alexi/index.py b/alexi/index.py
index 9673d0a..b1fa0d1 100644
--- a/alexi/index.py
+++ b/alexi/index.py
@@ -104,6 +104,7 @@ def index(indir: Path, outdir: Path) -> None:
         trimmer.trimmer,
     )
     builder.pipeline.add(unifold)
+    builder.search_pipeline.add(unifold)
     builder.metadata_whitelist.append("position")
     LOGGER.info("pipeline: %s", builder.pipeline)
     LOGGER.info("search pipeline: %s", builder.search_pipeline)

From da3bddc7b7f1204d6f2e7e39aff4ba394580c992 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Fri, 5 Jul 2024 15:42:09 -0400
Subject: [PATCH 4/5] feat: include the index with the documents

---
 .github/workflows/analyse.yml | 3 +++
 alexi/__init__.py             | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/analyse.yml b/.github/workflows/analyse.yml
index 4943d37..98aed66 100644
--- a/.github/workflows/analyse.yml
+++ b/.github/workflows/analyse.yml
@@ -50,6 +50,9 @@ jobs:
       - name: Extract
         run: |
           alexi -v extract -m download/index.json download/*.pdf
+      - name: Index
+        run: |
+          alexi -v index export
       - name: Setup Pages
         uses: actions/configure-pages@v5
       - name: Upload artifact
diff --git a/alexi/__init__.py b/alexi/__init__.py
index af2607b..614feca 100644
--- a/alexi/__init__.py
+++ b/alexi/__init__.py
@@ -199,7 +199,7 @@ def make_argparse() -> argparse.ArgumentParser:
         "--outdir",
         help="Repertoire destination pour l'index",
         type=Path,
-        default="indexdir",
+        default="export/_idx",
     )
     index.add_argument("indir", help="Repertoire avec les fichiers extraits", type=Path)
     index.set_defaults(func=index_main)
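
With the index now published alongside the documents, the serialized index can be loaded back and queried the same way search.py does it. A hypothetical smoke test against the export/_idx default introduced above ("stationnement" is an arbitrary query term):

    import json

    from lunr.index import Index
    from lunr.languages import get_nltk_builder

    from alexi.index import unifold  # registers the "unifold" pipeline function

    get_nltk_builder(["fr"])  # registers the French pipeline functions by name

    with open("export/_idx/index.json", "rt", encoding="utf-8") as infh:
        index = Index.load(json.load(infh))
    index.pipeline.add(unifold)
    for r in index.search("stationnement"):
        print(r["ref"], r["score"])
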
{"titre": title, "texte": body_text(subsoup)} for text in soup.select("li.leaf"): - title = text.a.text - url = text.a["href"] + assert text is not None + a = text.a + assert a is not None + title = a.text + url = a["href"] + assert not isinstance(url, list) LOGGER.info("Traitement: %s: %s", title, indir / url) with open(indir / url, "rt") as infh: subsoup = BeautifulSoup(infh, features="lxml") diff --git a/alexi/search.py b/alexi/search.py index dabf11f..9f62d78 100644 --- a/alexi/search.py +++ b/alexi/search.py @@ -6,8 +6,9 @@ from pathlib import Path from typing import List -from lunr.languages import get_nltk_builder -from lunr.index import Index +from lunr.index import Index # type: ignore +from lunr.languages import get_nltk_builder # type: ignore + from alexi.index import unifold # This is just here to register the necessary pipeline functions