Switch to lunr.py and build the index for SÈRAFIM #27

Merged · 5 commits · Jul 5, 2024

This PR replaces the Whoosh-based indexer with lunr.py: the index is now built from the exported HTML, written as a serialized lunr index plus a plaintext store under export/_idx, and an Index step is added to the CI workflow.

Changes from all commits

3 changes: 3 additions & 0 deletions .github/workflows/analyse.yml
@@ -50,6 +50,9 @@ jobs:
       - name: Extract
         run: |
           alexi -v extract -m download/index.json download/*.pdf
+      - name: Index
+        run: |
+          alexi -v index export
       - name: Setup Pages
         uses: actions/configure-pages@v5
       - name: Upload artifact
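
For reference, the new CI step is roughly what the library call does directly; a minimal sketch using the index() signature from the diff below (the real command also goes through argument parsing and logging in alexi/__init__.py):

    from pathlib import Path

    from alexi.index import index

    # Same effect as `alexi -v index export` in the workflow: read the HTML
    # extracted under export/ and write the lunr index to the default
    # output directory, export/_idx.
    index(Path("export"), Path("export/_idx"))
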
2 changes: 1 addition & 1 deletion alexi/__init__.py
@@ -199,7 +199,7 @@ def make_argparse() -> argparse.ArgumentParser:
         "--outdir",
         help="Repertoire destination pour l'index",
         type=Path,
-        default="indexdir",
+        default="export/_idx",
     )
     index.add_argument("indir", help="Repertoire avec les fichiers extraits", type=Path)
     index.set_defaults(func=index_main)
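
Since --outdir now defaults to a directory inside export/, the index is presumably published alongside the extracted HTML by the Pages steps above. A trimmed-down, hypothetical reproduction of just this option (not the real ALEXI parser) to show the resulting paths:

    import argparse
    from pathlib import Path

    # Hypothetical minimal parser: only the option changed in this file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        help="Repertoire destination pour l'index",
        type=Path,
        default="export/_idx",
    )
    parser.add_argument("indir", help="Repertoire avec les fichiers extraits", type=Path)
    args = parser.parse_args(["export"])
    print(args.indir, args.outdir)  # export export/_idx
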
156 changes: 114 additions & 42 deletions alexi/index.py
@@ -4,56 +4,128 @@
 
 import json
 import logging
-import os
+import re
+from dataclasses import dataclass
 from pathlib import Path
 
-from whoosh.analysis import CharsetFilter, StemmingAnalyzer  # type: ignore
-from whoosh.fields import ID, NUMERIC, TEXT, Schema  # type: ignore
-from whoosh.index import create_in  # type: ignore
-from whoosh.support.charset import charset_table_to_dict  # type: ignore
-from whoosh.support.charset import default_charset
-from whoosh.writing import IndexWriter  # type: ignore
+from bs4 import BeautifulSoup
+from lunr import get_default_builder, lunr, trimmer  # type: ignore
+from lunr.pipeline import Pipeline  # type: ignore
+from unidecode import unidecode  # type: ignore
 
 LOGGER = logging.getLogger("index")
-CHARMAP = charset_table_to_dict(default_charset)
-ANALYZER = StemmingAnalyzer() | CharsetFilter(CHARMAP)


-def add_from_dir(writer: IndexWriter, document: str, docdir: Path) -> dict:
-    with open(docdir / "index.json") as infh:
-        element = json.load(infh)
-    titre = f'{element["type"]} {element["numero"]}: {element["titre"]}'
-    page = element.get("page", 1)
-    LOGGER.info("Indexing %s: %s", docdir, element["titre"])
-    with open(docdir / "index.md") as infh:
-        writer.add_document(
-            document=document, page=page, titre=titre, contenu=infh.read()
-        )
-    return element
+@dataclass
+class Document:
+    url: str
+    titre: str
+    texte: str
+
+
+def body_text(soup: BeautifulSoup):
+    body = soup.find_all("div", id="body")
+    assert body is not None
+    for header in body[0](class_="header"):
+        header.extract()
+    for img in body[0]("img"):
+        alt = soup.new_tag("p")
+        alt.string = img["alt"]
+        img.replace_with(alt)
+    return re.sub("\n\n+", "\n\n", soup.text.strip())
+
+
+def unifold(token, _idx=None, _tokens=None):
+    def wrap_unidecode(text, _metadata):
+        return unidecode(text)
+
+    return token.update(wrap_unidecode)
+
+
+Pipeline.register_function(unifold, "unifold")


 def index(indir: Path, outdir: Path) -> None:
-    outdir.mkdir(exist_ok=True)
-    schema = Schema(
-        document=ID(stored=True),
-        page=NUMERIC(stored=True),
-        titre=TEXT(ANALYZER, stored=True),
-        contenu=TEXT(ANALYZER, stored=True),
-    )
-    ix = create_in(outdir, schema)
-    writer = ix.writer()
-    for docdir in indir.iterdir():
-        if not docdir.is_dir():
+    """
+    Generer l'index a partir des fichiers HTML.
+    """
+    # Metadata (use to index specific zones, etc)
+    # with open(indir / "index.json", "rt") as infh:
+    #     metadata = json.load(infh)
+
+    # lunr does not do storage so we store plaintext here
+    textes = {}
+
+    # Use index.html to find things (as in the js version)
+    LOGGER.info("Traitement: %s", indir / "index.html")
+    with open(indir / "index.html", "rt") as infh:
+        soup = BeautifulSoup(infh, features="lxml")
+    for section in soup.select("li.node"):
+        summary = section.summary
+        if summary is None:
+            LOGGER.error("<summary> non trouvé dans %s", section)
             continue
-        if not (docdir / "index.json").exists():
+        title = summary.text
+        if "Document" in section["class"]:
+            LOGGER.info("Texte complet de %s ne sera pas indexé", title)
             continue
-        document = docdir.with_suffix(".pdf").name
-        add_from_dir(writer, document, docdir)
-        for subdir in docdir.iterdir():
-            if not docdir.is_dir():
-                continue
-            for dirpath, _, filenames in os.walk(subdir, topdown=True):
-                if "index.json" not in filenames:
-                    continue
-                add_from_dir(writer, document, Path(dirpath))
-    writer.commit()
+        a = section.a
+        assert a is not None
+        url = a["href"]
+        assert not isinstance(url, list)
+        # Assume it is a relative URL (we made it)
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+    for text in soup.select("li.leaf"):
+        assert text is not None
+        a = text.a
+        assert a is not None
+        title = a.text
+        url = a["href"]
+        assert not isinstance(url, list)
+        LOGGER.info("Traitement: %s: %s", title, indir / url)
+        with open(indir / url, "rt") as infh:
+            subsoup = BeautifulSoup(infh, features="lxml")
+        textes[url] = {"titre": title, "texte": body_text(subsoup)}
+
+    outdir.mkdir(exist_ok=True)
+    with open(outdir / "textes.json", "wt", encoding="utf-8") as outfh:
+        json.dump(textes, outfh, indent=2, ensure_ascii=False)
+
+    builder = get_default_builder("fr")
+    # DO NOT USE the French trimmer as it is seriously defective
+    builder.pipeline.remove(
+        builder.pipeline.registered_functions["lunr-multi-trimmer-fr"]
+    )
+    builder.pipeline.before(
+        builder.pipeline.registered_functions["stopWordFilter-fr"], trimmer.trimmer
+    )
+    # Missing pipeline functions for search
+    builder.search_pipeline.before(
+        builder.search_pipeline.registered_functions["stemmer-fr"],
+        builder.search_pipeline.registered_functions["stopWordFilter-fr"],
+    )
+    builder.search_pipeline.before(
+        builder.search_pipeline.registered_functions["stopWordFilter-fr"],
+        trimmer.trimmer,
+    )
+    builder.pipeline.add(unifold)
+    builder.search_pipeline.add(unifold)
+    builder.metadata_whitelist.append("position")
+    LOGGER.info("pipeline: %s", builder.pipeline)
LOGGER.info("search pipeline: %s", builder.pipeline)

+    index = lunr(
+        ref="url",
+        fields=[{"field_name": "titre", "boost": 2}, "texte"],
+        documents=[
+            {"url": url, "titre": doc["titre"], "texte": doc["texte"]}
+            for url, doc in textes.items()
+        ],
+        languages="fr",
+        builder=builder,
+    )
+    with open(outdir / "index.json", "wt", encoding="utf-8") as outfh:
+        json.dump(index.serialize(), outfh, indent=2, ensure_ascii=False)
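
The unifold step registered above is added to both the build and search pipelines, evidently so that a query typed without accents still matches accented tokens in the index (and vice versa). A quick illustration of the underlying unidecode behaviour it relies on:

    from unidecode import unidecode

    # Index-time and query-time tokens go through the same folding,
    # so "reglement" and "règlement" reduce to the same term.
    print(unidecode("Règlement de zonage"))  # -> "Reglement de zonage"
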
29 changes: 15 additions & 14 deletions alexi/search.py
@@ -2,23 +2,24 @@
 Lancer des recherches dans l'index de données.
 """
 
+import json
 from pathlib import Path
 from typing import List
 
-from whoosh.index import open_dir  # type: ignore
-from whoosh.qparser import MultifieldParser, OrGroup  # type: ignore
+from lunr.index import Index  # type: ignore
+from lunr.languages import get_nltk_builder  # type: ignore
+
+from alexi.index import unifold
+
+# This is just here to register the necessary pipeline functions
+get_nltk_builder(["fr"])
 
 
 def search(indexdir: Path, terms: List[str]) -> None:
-    ix = open_dir(indexdir)
-    parser = MultifieldParser(
-        ["titre", "contenu"], ix.schema, group=OrGroup.factory(0.9)
-    )
-    query = parser.parse(" ".join(terms))
-    with ix.searcher() as searcher:
-        results = searcher.search(query)
-        for r in results:
-            print(
-                f'https://ville.sainte-adele.qc.ca/upload/documents/{r["document"]}'
-                f'#page={r["page"]} {r["titre"]}'
-            )
+    with open(indexdir / "index.json", "rt", encoding="utf-8") as infh:
+        index = Index.load(json.load(infh))
+    index.pipeline.add(unifold)
+    results = index.search(" ".join(terms))
+    for r in results:
+        print(r)
+        print(r["match_data"].metadata)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "pdfplumber",
     "scikit-learn",
     "sklearn-crfsuite",
-    "whoosh",
+    "lunr[languages]",
 ]
 [project.optional-dependencies]
 dev = [
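
The languages extra matters here: it pulls in the NLTK-backed language support that get_default_builder("fr") and get_nltk_builder(["fr"]) rely on. A small sanity check after installation (a sketch; the function and pipeline names are the ones referenced in the diffs above):

    from lunr import get_default_builder

    # After installing lunr[languages], the French builder should be
    # constructible and its pipeline should contain the components used
    # above, e.g. "stopWordFilter-fr" and "stemmer-fr".
    builder = get_default_builder("fr")
    print(builder.pipeline)
    print("lunr-multi-trimmer-fr" in builder.pipeline.registered_functions)  # True
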