From 32a7a84e92142e81c696505977c9a74b7902ac91 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Thu, 4 Jul 2024 15:48:36 -0400
Subject: [PATCH 1/5] feat: switch to lunr.py for indexing

---
 alexi/index.py  | 148 ++++++++++++++++++++++++++++++++++--------------
 alexi/search.py |  28 ++++-----
 pyproject.toml  |   2 +-
 3 files changed, 119 insertions(+), 59 deletions(-)

diff --git a/alexi/index.py b/alexi/index.py
index da76ab3..2a879ad 100644
--- a/alexi/index.py
+++ b/alexi/index.py
@@ -4,56 +4,116 @@
 import json
 import logging
-import os
+import re
 from pathlib import Path
+from dataclasses import dataclass
 
-from whoosh.analysis import CharsetFilter, StemmingAnalyzer  # type: ignore
-from whoosh.fields import ID, NUMERIC, TEXT, Schema  # type: ignore
-from whoosh.index import create_in  # type: ignore
-from whoosh.support.charset import charset_table_to_dict  # type: ignore
-from whoosh.support.charset import default_charset
-from whoosh.writing import IndexWriter  # type: ignore
+from bs4 import BeautifulSoup
+from lunr import lunr, get_default_builder
+from lunr.pipeline import Pipeline
+from unidecode import unidecode
 
 LOGGER = logging.getLogger("index")
-CHARMAP = charset_table_to_dict(default_charset)
-ANALYZER = StemmingAnalyzer() | CharsetFilter(CHARMAP)
-
-
-def add_from_dir(writer: IndexWriter, document: str, docdir: Path) -> dict:
-    with open(docdir / "index.json") as infh:
-        element = json.load(infh)
-    titre = f'{element["type"]} {element["numero"]}: {element["titre"]}'
-    page = element.get("page", 1)
-    LOGGER.info("Indexing %s: %s", docdir, element["titre"])
-    with open(docdir / "index.md") as infh:
-        writer.add_document(
-            document=document, page=page, titre=titre, contenu=infh.read()
-        )
-    return element
+
+
+@dataclass
+class Document:
+    url: str
+    titre: str
+    texte: str
+
+
+def body_text(soup: BeautifulSoup):
+    body = soup.div(id="body")[0]
+    for header in body(class_="header"):
+        header.extract()
+    for img in body("img"):
+        alt = soup.new_tag("p")
+        alt.string = img["alt"]
+        img.replace_with(alt)
+    return re.sub("\n\n+", "\n\n", soup.text.strip())
+
+
+def unifold(token, _idx=None, _tokens=None):
+    def wrap_unidecode(text, _metadata):
+        return unidecode(text)
+
+    return token.update(wrap_unidecode)
+
+
+Pipeline.register_function(unifold, "unifold")
 
 
 def index(indir: Path, outdir: Path) -> None:
-    outdir.mkdir(exist_ok=True)
-    schema = Schema(
-        document=ID(stored=True),
-        page=NUMERIC(stored=True),
-        titre=TEXT(ANALYZER, stored=True),
-        contenu=TEXT(ANALYZER, stored=True),
-    )
-    ix = create_in(outdir, schema)
-    writer = ix.writer()
-    for docdir in indir.iterdir():
-        if not docdir.is_dir():
+    """
+    Generer l'index a partir des fichiers HTML.
+    """
+    # Metadata (use to index specific zones, etc)
+    # with open(indir / "index.json", "rt") as infh:
+    #     metadata = json.load(infh)
+
+    # lunr does not do storage so we store plaintext here
+    textes = {}
+
+    # Use index.html to find things (as in the js version)
+    LOGGER.info("Traitement: %s", indir / "index.html")
+    with open(indir / "index.html", "rt") as infh:
+        soup = BeautifulSoup(infh, features="lxml")
+    for section in soup.select("li.node"):
+        summary = section.summary
+        if summary is None:
+            LOGGER.error("<summary> non trouvé dans %s", section)
             continue
-        if not (docdir / "index.json").exists():
+        title = summary.text
+        if "Document" in section["class"]:
+            LOGGER.info("Texte complet de %s ne sera pas indexé", title)
             continue
-        document = docdir.with_suffix(".pdf").name
-        add_from_dir(writer, document, docdir)
-        for subdir in docdir.iterdir():
-            if not docdir.is_dir():
-                continue
-            for dirpath, _, filenames in os.walk(subdir, topdown=True):
-                if "index.json" not in filenames:
-                    continue
-                add_from_dir(writer, document, Path(dirpath))
-    writer.commit()
+        url = section.a["href"]
+        # Assume it is a relative URL (we made it)
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+    for text in soup.select("li.leaf"):
+        title = text.a.text
+        url = text.a["href"]
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+
+    outdir.mkdir(exist_ok=True)
+    with open(outdir / "textes.json", "wt", encoding="utf-8") as outfh:
+        json.dump(textes, outfh, indent=2, ensure_ascii=False)
+
+    builder = get_default_builder("fr")
+    # Skip the trimmer for titles (FIXME: instead we should add some
+    # missing characters to it so it will match zones, usages, etc)
+    for funcname in ("lunr-multi-trimmer-fr",):
+        builder.pipeline.skip(
+            builder.pipeline.registered_functions[funcname], ["titre"]
+        )
+    # Add a missing pipeline function for search (don't add the
+    # trimmer as it will strip out zones, usages, etc)
+    for funcname in ("stopWordFilter-fr",):
+        builder.search_pipeline.before(
+            builder.search_pipeline.registered_functions["stemmer-fr"],
+            builder.search_pipeline.registered_functions[funcname],
+        )
+    builder.pipeline.add(unifold)
+    builder.metadata_whitelist.append("position")
+    LOGGER.info("pipeline: %s", builder.pipeline)
+    LOGGER.info("search pipeline: %s", builder.search_pipeline)
+
+    index = lunr(
+        ref="url",
+        fields=[{"field_name": "titre", "boost": 2}, "texte"],
+        documents=[
+            {"url": url, "titre": doc["titre"], "texte": doc["texte"]}
+            for url, doc in textes.items()
+        ],
+        languages="fr",
+        builder=builder,
+    )
+    with open(outdir / "index.json", "wt", encoding="utf-8") as outfh:
+        json.dump(index.serialize(), outfh, indent=2, ensure_ascii=False)
+ """ + # Metadata (use to index specific zones, etc) + # with open(indir / "index.json", "rt") as infh: + # metadata = json.load(infh) + + # lunr does not do storage so we store plaintext here + textes = {} + + # Use index.html to find things (as in the js version) + LOGGER.info("Traitement: %s", indir / "index.html") + with open(indir / "index.html", "rt") as infh: + soup = BeautifulSoup(infh, features="lxml") + for section in soup.select("li.node"): + summary = section.summary + if summary is None: + LOGGER.error(" non trouvé dans %s", section) continue - if not (docdir / "index.json").exists(): + title = summary.text + if "Document" in section["class"]: + LOGGER.info("Texte complet de %s ne sera pas indexé", title) continue - document = docdir.with_suffix(".pdf").name - add_from_dir(writer, document, docdir) - for subdir in docdir.iterdir(): - if not docdir.is_dir(): - continue - for dirpath, _, filenames in os.walk(subdir, topdown=True): - if "index.json" not in filenames: - continue - add_from_dir(writer, document, Path(dirpath)) - writer.commit() + url = section.a["href"] + # Assume it is a relative URL (we made it) + LOGGER.info("Traitement: %s: %s", title, indir / url) + with open(indir / url, "rt") as infh: + subsoup = BeautifulSoup(infh, features="lxml") + textes[url] = {"titre": title, "texte": body_text(subsoup)} + for text in soup.select("li.leaf"): + title = text.a.text + url = text.a["href"] + LOGGER.info("Traitement: %s: %s", title, indir / url) + with open(indir / url, "rt") as infh: + subsoup = BeautifulSoup(infh, features="lxml") + textes[url] = {"titre": title, "texte": body_text(subsoup)} + + outdir.mkdir(exist_ok=True) + with open(outdir / "textes.json", "wt", encoding="utf-8") as outfh: + json.dump(textes, outfh, indent=2, ensure_ascii=False) + + builder = get_default_builder("fr") + # Skip the trimmer for titles (FIXME: instead we should add some + # missing characters to it so it will match zones, usages, etc) + for funcname in ("lunr-multi-trimmer-fr",): + builder.pipeline.skip( + builder.pipeline.registered_functions[funcname], ["titre"] + ) + # Add a missing pipeline function for search (don't add the + # trimmer as it will strip out zones, usages, etc) + for funcname in ("stopWordFilter-fr",): + builder.search_pipeline.before( + builder.search_pipeline.registered_functions["stemmer-fr"], + builder.search_pipeline.registered_functions[funcname], + ) + builder.pipeline.add(unifold) + builder.metadata_whitelist.append("position") + LOGGER.info("pipeline: %s", builder.pipeline) + LOGGER.info("search pipeline: %s", builder.pipeline) + + index = lunr( + ref="url", + fields=[{"field_name": "titre", "boost": 2}, "texte"], + documents=[ + {"url": url, "titre": doc["titre"], "texte": doc["texte"]} + for url, doc in textes.items() + ], + languages="fr", + builder=builder, + ) + with open(outdir / "index.json", "wt", encoding="utf-8") as outfh: + json.dump(index.serialize(), outfh, indent=2, ensure_ascii=False) diff --git a/alexi/search.py b/alexi/search.py index 9a92a8c..dabf11f 100644 --- a/alexi/search.py +++ b/alexi/search.py @@ -2,23 +2,23 @@ Lancer des recherches dans l'index de données. 
""" +import json from pathlib import Path from typing import List -from whoosh.index import open_dir # type: ignore -from whoosh.qparser import MultifieldParser, OrGroup # type: ignore +from lunr.languages import get_nltk_builder +from lunr.index import Index +from alexi.index import unifold + +# This is just here to register the necessary pipeline functions +get_nltk_builder(["fr"]) def search(indexdir: Path, terms: List[str]) -> None: - ix = open_dir(indexdir) - parser = MultifieldParser( - ["titre", "contenu"], ix.schema, group=OrGroup.factory(0.9) - ) - query = parser.parse(" ".join(terms)) - with ix.searcher() as searcher: - results = searcher.search(query) - for r in results: - print( - f'https://ville.sainte-adele.qc.ca/upload/documents/{r["document"]}' - f'#page={r["page"]} {r["titre"]}' - ) + with open(indexdir / "index.json", "rt", encoding="utf-8") as infh: + index = Index.load(json.load(infh)) + index.pipeline.add(unifold) + results = index.search(" ".join(terms)) + for r in results: + print(r) + print(r["match_data"].metadata) diff --git a/pyproject.toml b/pyproject.toml index 4bcc627..8c7e7d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "pdfplumber", "scikit-learn", "sklearn-crfsuite", - "whoosh", + "lunr[languages]", ] [project.optional-dependencies] dev = [ From fb99aea4944636dec5cf0a352c66ee8316807bdc Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 5 Jul 2024 15:06:31 -0400 Subject: [PATCH 2/5] fix: scrap the broken french trimmer entirely --- alexi/index.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/alexi/index.py b/alexi/index.py index 2a879ad..9673d0a 100644 --- a/alexi/index.py +++ b/alexi/index.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from bs4 import BeautifulSoup -from lunr import lunr, get_default_builder +from lunr import lunr, get_default_builder, trimmer from lunr.pipeline import Pipeline from unidecode import unidecode @@ -87,19 +87,22 @@ def index(indir: Path, outdir: Path) -> None: json.dump(textes, outfh, indent=2, ensure_ascii=False) builder = get_default_builder("fr") - # Skip the trimmer for titles (FIXME: instead we should add some - # missing characters to it so it will match zones, usages, etc) - for funcname in ("lunr-multi-trimmer-fr",): - builder.pipeline.skip( - builder.pipeline.registered_functions[funcname], ["titre"] - ) - # Add a missing pipeline function for search (don't add the - # trimmer as it will strip out zones, usages, etc) - for funcname in ("stopWordFilter-fr",): - builder.search_pipeline.before( - builder.search_pipeline.registered_functions["stemmer-fr"], - builder.search_pipeline.registered_functions[funcname], - ) + # DO NOT USE the French trimmer as it is seriously defective + builder.pipeline.remove( + builder.pipeline.registered_functions["lunr-multi-trimmer-fr"] + ) + builder.pipeline.before( + builder.pipeline.registered_functions["stopWordFilter-fr"], trimmer.trimmer + ) + # Missing pipeline functions for search + builder.search_pipeline.before( + builder.search_pipeline.registered_functions["stemmer-fr"], + builder.search_pipeline.registered_functions["stopWordFilter-fr"], + ) + builder.search_pipeline.before( + builder.search_pipeline.registered_functions["stopWordFilter-fr"], + trimmer.trimmer, + ) builder.pipeline.add(unifold) builder.metadata_whitelist.append("position") LOGGER.info("pipeline: %s", builder.pipeline) From 3f043f776e61f1b089782e7a4cf1b3a66a6e200a Mon Sep 17 00:00:00 2001 From: David 
From 3f043f776e61f1b089782e7a4cf1b3a66a6e200a Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Fri, 5 Jul 2024 15:26:14 -0400
Subject: [PATCH 3/5] fix: add unifold to search

---
 alexi/index.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/alexi/index.py b/alexi/index.py
index 9673d0a..b1fa0d1 100644
--- a/alexi/index.py
+++ b/alexi/index.py
@@ -104,6 +104,7 @@ def index(indir: Path, outdir: Path) -> None:
         trimmer.trimmer,
     )
     builder.pipeline.add(unifold)
+    builder.search_pipeline.add(unifold)
     builder.metadata_whitelist.append("position")
     LOGGER.info("pipeline: %s", builder.pipeline)
     LOGGER.info("search pipeline: %s", builder.search_pipeline)

From da3bddc7b7f1204d6f2e7e39aff4ba394580c992 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Fri, 5 Jul 2024 15:42:09 -0400
Subject: [PATCH 4/5] feat: include the index with the documents

---
 .github/workflows/analyse.yml | 3 +++
 alexi/__init__.py             | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/analyse.yml b/.github/workflows/analyse.yml
index 4943d37..98aed66 100644
--- a/.github/workflows/analyse.yml
+++ b/.github/workflows/analyse.yml
@@ -50,6 +50,9 @@ jobs:
       - name: Extract
         run: |
           alexi -v extract -m download/index.json download/*.pdf
+      - name: Index
+        run: |
+          alexi -v index export
       - name: Setup Pages
         uses: actions/configure-pages@v5
       - name: Upload artifact
diff --git a/alexi/__init__.py b/alexi/__init__.py
index af2607b..614feca 100644
--- a/alexi/__init__.py
+++ b/alexi/__init__.py
@@ -199,7 +199,7 @@ def make_argparse() -> argparse.ArgumentParser:
         "--outdir",
         help="Repertoire destination pour l'index",
         type=Path,
-        default="indexdir",
+        default="export/_idx",
     )
     index.add_argument("indir", help="Repertoire avec les fichiers extraits", type=Path)
     index.set_defaults(func=index_main)
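
With the index now published alongside the documents, the serialized index can be loaded back and queried the same way search.py does it. A hypothetical smoke test against the export/_idx default introduced above ("stationnement" is an arbitrary query term):

    import json

    from lunr.index import Index
    from lunr.languages import get_nltk_builder

    from alexi.index import unifold  # registers the "unifold" pipeline function

    get_nltk_builder(["fr"])  # registers the French pipeline functions by name

    with open("export/_idx/index.json", "rt", encoding="utf-8") as infh:
        index = Index.load(json.load(infh))
    index.pipeline.add(unifold)
    for r in index.search("stationnement"):
        print(r["ref"], r["score"])
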
{"titre": title, "texte": body_text(subsoup)} for text in soup.select("li.leaf"): - title = text.a.text - url = text.a["href"] + assert text is not None + a = text.a + assert a is not None + title = a.text + url = a["href"] + assert not isinstance(url, list) LOGGER.info("Traitement: %s: %s", title, indir / url) with open(indir / url, "rt") as infh: subsoup = BeautifulSoup(infh, features="lxml") diff --git a/alexi/search.py b/alexi/search.py index dabf11f..9f62d78 100644 --- a/alexi/search.py +++ b/alexi/search.py @@ -6,8 +6,9 @@ from pathlib import Path from typing import List -from lunr.languages import get_nltk_builder -from lunr.index import Index +from lunr.index import Index # type: ignore +from lunr.languages import get_nltk_builder # type: ignore + from alexi.index import unifold # This is just here to register the necessary pipeline functions