Switch to lunr.py and build the index for SÈRAFIM #27

Merged · 5 commits · Jul 5, 2024

This PR replaces the Whoosh-based indexer with lunr.py: the index is now built from the exported HTML, written as a serialized lunr index plus a plaintext store under export/_idx, and an Index step is added to the CI workflow.

Changes from all commits

3 changes: 3 additions & 0 deletions .github/workflows/analyse.yml
@@ -50,6 +50,9 @@ jobs:
       - name: Extract
         run: |
           alexi -v extract -m download/index.json download/*.pdf
+      - name: Index
+        run: |
+          alexi -v index export
       - name: Setup Pages
         uses: actions/configure-pages@v5
       - name: Upload artifact
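
For reference, the new CI step is roughly what the library call does directly; a minimal sketch using the index() signature from the diff below (the real command also goes through argument parsing and logging in alexi/__init__.py):

    from pathlib import Path

    from alexi.index import index

    # Same effect as `alexi -v index export` in the workflow: read the HTML
    # extracted under export/ and write the lunr index to the default
    # output directory, export/_idx.
    index(Path("export"), Path("export/_idx"))
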
2 changes: 1 addition & 1 deletion alexi/__init__.py
@@ -199,7 +199,7 @@ def make_argparse() -> argparse.ArgumentParser:
         "--outdir",
         help="Repertoire destination pour l'index",
         type=Path,
-        default="indexdir",
+        default="export/_idx",
     )
     index.add_argument("indir", help="Repertoire avec les fichiers extraits", type=Path)
     index.set_defaults(func=index_main)
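
Since --outdir now defaults to a directory inside export/, the index is presumably published alongside the extracted HTML by the Pages steps above. A trimmed-down, hypothetical reproduction of just this option (not the real ALEXI parser) to show the resulting paths:

    import argparse
    from pathlib import Path

    # Hypothetical minimal parser: only the option changed in this file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        help="Repertoire destination pour l'index",
        type=Path,
        default="export/_idx",
    )
    parser.add_argument("indir", help="Repertoire avec les fichiers extraits", type=Path)
    args = parser.parse_args(["export"])
    print(args.indir, args.outdir)  # export export/_idx
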
156 changes: 114 additions & 42 deletions alexi/index.py
@@ -4,56 +4,128 @@
 
 import json
 import logging
-import os
+import re
+from dataclasses import dataclass
 from pathlib import Path
 
-from whoosh.analysis import CharsetFilter, StemmingAnalyzer  # type: ignore
-from whoosh.fields import ID, NUMERIC, TEXT, Schema  # type: ignore
-from whoosh.index import create_in  # type: ignore
-from whoosh.support.charset import charset_table_to_dict  # type: ignore
-from whoosh.support.charset import default_charset
-from whoosh.writing import IndexWriter  # type: ignore
+from bs4 import BeautifulSoup
+from lunr import get_default_builder, lunr, trimmer  # type: ignore
+from lunr.pipeline import Pipeline  # type: ignore
+from unidecode import unidecode  # type: ignore
 
 LOGGER = logging.getLogger("index")
-CHARMAP = charset_table_to_dict(default_charset)
-ANALYZER = StemmingAnalyzer() | CharsetFilter(CHARMAP)


-def add_from_dir(writer: IndexWriter, document: str, docdir: Path) -> dict:
-    with open(docdir / "index.json") as infh:
-        element = json.load(infh)
-    titre = f'{element["type"]} {element["numero"]}: {element["titre"]}'
-    page = element.get("page", 1)
-    LOGGER.info("Indexing %s: %s", docdir, element["titre"])
-    with open(docdir / "index.md") as infh:
-        writer.add_document(
-            document=document, page=page, titre=titre, contenu=infh.read()
-        )
-    return element
+@dataclass
+class Document:
+    url: str
+    titre: str
+    texte: str
+
+
+def body_text(soup: BeautifulSoup):
+    body = soup.find_all("div", id="body")
+    assert body is not None
+    for header in body[0](class_="header"):
+        header.extract()
+    for img in body[0]("img"):
+        alt = soup.new_tag("p")
+        alt.string = img["alt"]
+        img.replace_with(alt)
+    return re.sub("\n\n+", "\n\n", soup.text.strip())
+
+
+def unifold(token, _idx=None, _tokens=None):
+    def wrap_unidecode(text, _metadata):
+        return unidecode(text)
+
+    return token.update(wrap_unidecode)
+
+
+Pipeline.register_function(unifold, "unifold")


 def index(indir: Path, outdir: Path) -> None:
-    outdir.mkdir(exist_ok=True)
-    schema = Schema(
-        document=ID(stored=True),
-        page=NUMERIC(stored=True),
-        titre=TEXT(ANALYZER, stored=True),
-        contenu=TEXT(ANALYZER, stored=True),
-    )
-    ix = create_in(outdir, schema)
-    writer = ix.writer()
-    for docdir in indir.iterdir():
-        if not docdir.is_dir():
+    """
+    Generer l'index a partir des fichiers HTML.
+    """
+    # Metadata (use to index specific zones, etc)
+    # with open(indir / "index.json", "rt") as infh:
+    #     metadata = json.load(infh)
+
+    # lunr does not do storage so we store plaintext here
+    textes = {}
+
+    # Use index.html to find things (as in the js version)
+    LOGGER.info("Traitement: %s", indir / "index.html")
+    with open(indir / "index.html", "rt") as infh:
+        soup = BeautifulSoup(infh, features="lxml")
+    for section in soup.select("li.node"):
+        summary = section.summary
+        if summary is None:
+            LOGGER.error("<summary> non trouvé dans %s", section)
             continue
-        if not (docdir / "index.json").exists():
+        title = summary.text
+        if "Document" in section["class"]:
+            LOGGER.info("Texte complet de %s ne sera pas indexé", title)
             continue
-        document = docdir.with_suffix(".pdf").name
-        add_from_dir(writer, document, docdir)
-        for subdir in docdir.iterdir():
-            if not docdir.is_dir():
-                continue
-            for dirpath, _, filenames in os.walk(subdir, topdown=True):
-                if "index.json" not in filenames:
-                    continue
-                add_from_dir(writer, document, Path(dirpath))
-    writer.commit()
+        a = section.a
+        assert a is not None
+        url = a["href"]
+        assert not isinstance(url, list)
+        # Assume it is a relative URL (we made it)
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+    for text in soup.select("li.leaf"):
+        assert text is not None
+        a = text.a
+        assert a is not None
+        title = a.text
+        url = a["href"]
+        assert not isinstance(url, list)
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+
+    outdir.mkdir(exist_ok=True)
+    with open(outdir / "textes.json", "wt", encoding="utf-8") as outfh:
+        json.dump(textes, outfh, indent=2, ensure_ascii=False)
+
+    builder = get_default_builder("fr")
+    # DO NOT USE the French trimmer as it is seriously defective
+    builder.pipeline.remove(
+        builder.pipeline.registered_functions["lunr-multi-trimmer-fr"]
+    )
+    builder.pipeline.before(
+        builder.pipeline.registered_functions["stopWordFilter-fr"], trimmer.trimmer
+    )
+    # Missing pipeline functions for search
+    builder.search_pipeline.before(
+        builder.search_pipeline.registered_functions["stemmer-fr"],
+        builder.search_pipeline.registered_functions["stopWordFilter-fr"],
+    )
+    builder.search_pipeline.before(
+        builder.search_pipeline.registered_functions["stopWordFilter-fr"],
+        trimmer.trimmer,
+    )
+    builder.pipeline.add(unifold)
+    builder.search_pipeline.add(unifold)
+    builder.metadata_whitelist.append("position")
+    LOGGER.info("pipeline: %s", builder.pipeline)
LOGGER.info("search pipeline: %s", builder.pipeline)

+    index = lunr(
+        ref="url",
+        fields=[{"field_name": "titre", "boost": 2}, "texte"],
+        documents=[
+            {"url": url, "titre": doc["titre"], "texte": doc["texte"]}
+            for url, doc in textes.items()
+        ],
+        languages="fr",
+        builder=builder,
+    )
+    with open(outdir / "index.json", "wt", encoding="utf-8") as outfh:
+        json.dump(index.serialize(), outfh, indent=2, ensure_ascii=False)
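
The unifold step registered above is added to both the build and search pipelines, evidently so that a query typed without accents still matches accented tokens in the index (and vice versa). A quick illustration of the underlying unidecode behaviour it relies on:

    from unidecode import unidecode

    # Index-time and query-time tokens go through the same folding,
    # so "reglement" and "règlement" reduce to the same term.
    print(unidecode("Règlement de zonage"))  # -> "Reglement de zonage"
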
29 changes: 15 additions & 14 deletions alexi/search.py
@@ -2,23 +2,24 @@
 Lancer des recherches dans l'index de données.
 """
 
+import json
 from pathlib import Path
 from typing import List
 
-from whoosh.index import open_dir  # type: ignore
-from whoosh.qparser import MultifieldParser, OrGroup  # type: ignore
+from lunr.index import Index  # type: ignore
+from lunr.languages import get_nltk_builder  # type: ignore
+
+from alexi.index import unifold
+
+# This is just here to register the necessary pipeline functions
+get_nltk_builder(["fr"])
 
 
 def search(indexdir: Path, terms: List[str]) -> None:
-    ix = open_dir(indexdir)
-    parser = MultifieldParser(
-        ["titre", "contenu"], ix.schema, group=OrGroup.factory(0.9)
-    )
-    query = parser.parse(" ".join(terms))
-    with ix.searcher() as searcher:
-        results = searcher.search(query)
-        for r in results:
-            print(
-                f'https://ville.sainte-adele.qc.ca/upload/documents/{r["document"]}'
-                f'#page={r["page"]} {r["titre"]}'
-            )
+    with open(indexdir / "index.json", "rt", encoding="utf-8") as infh:
+        index = Index.load(json.load(infh))
+    index.pipeline.add(unifold)
+    results = index.search(" ".join(terms))
+    for r in results:
+        print(r)
+        print(r["match_data"].metadata)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "pdfplumber",
     "scikit-learn",
     "sklearn-crfsuite",
-    "whoosh",
+    "lunr[languages]",
 ]
 [project.optional-dependencies]
 dev = [
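
The languages extra matters here: it pulls in the NLTK-backed language support that get_default_builder("fr") and get_nltk_builder(["fr"]) rely on. A small sanity check after installation (a sketch; the function and pipeline names are the ones referenced in the diffs above):

    from lunr import get_default_builder

    # After installing lunr[languages], the French builder should be
    # constructible and its pipeline should contain the components used
    # above, e.g. "stopWordFilter-fr" and "stemmer-fr".
    builder = get_default_builder("fr")
    print(builder.pipeline)
    print("lunr-multi-trimmer-fr" in builder.pipeline.registered_functions)  # True
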