Merge remote-tracking branch 'origin/main'

dhdaines · Feb 12, 2024 · 9d42645
2 parents 24b28cc + 058bc15
commit 9d42645
Show file tree

Hide file tree

Showing 7 changed files with 154 additions and 69 deletions.
diff --git a/.github/workflows/analyse.yml b/.github/workflows/analyse.yml
@@ -1,4 +1,4 @@
-name: Téléchargement, analyse et indexation des règlements d'urbanisme
+name: Téléchargement et analyse des règlements d'urbanisme
 
 on:
   workflow_dispatch:
@@ -35,7 +35,7 @@ jobs:
       run: |
         alexi -v download --exclude=Plan --exclude=/derogation \
                  --exclude='\d-[aA]dopt' --exclude='Z-\d' \
-                 --exclude=Reso
+                 --exclude='-[rR]eso'
         for d in download/*.pdf; do
             bn=$(basename $d .pdf)
             for dd in data/train data/dev data/test; do
@@ -47,6 +47,9 @@ jobs:
     - name: Extract
       run: |
         alexi -v extract -m download/index.json download/*.pdf
+    - name: Link
+      run: |
+        alexi -v link
     - name: Setup Pages
       uses: actions/configure-pages@v4
     - name: Upload artifact

diff --git a/.github/workflows/entrainement.yml b/.github/workflows/entrainement.yml
@@ -17,15 +17,15 @@ jobs:
       run: |
         sh results/run.sh
     - name: Upload results
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: results
         path: results/*.csv
     - name: Train
       run: |
         sh scripts/retrain.sh
     - name: Upload models
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: models
         path: alexi/models/
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -10,7 +10,7 @@ jobs:
     - uses: actions/checkout@v4
 
     - name: Set up Python 3.10
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: "3.10"
 
@@ -46,7 +46,7 @@ jobs:
     - uses: actions/checkout@v4
 
     - name: Set up Python 3.10
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: "3.10"
 
@@ -68,4 +68,4 @@ jobs:
         python -m coverage html
 
     - name: Upload code coverage
-      uses: codecov/codecov-action@v3
+      uses: codecov/codecov-action@v4
diff --git a/alexi/__init__.py b/alexi/__init__.py
@@ -15,7 +15,7 @@
 from pathlib import Path
 from typing import Any, Iterable, TextIO
 
-from . import download, extract
+from . import download, extract, link
 from .analyse import Analyseur, Bloc
 from .convert import FIELDNAMES, Converteur, merge_overlaps
 from .format import format_dict, format_html, format_xml
@@ -213,6 +213,13 @@ def make_argparse() -> argparse.ArgumentParser:
     extract.add_arguments(extract_command)
     extract_command.set_defaults(func=extract.main)
 
+    link_command = subp.add_parser(
+        "link",
+        help="Ajouter des hyperliens aux documents HTML",
+    )
+    link.add_arguments(link_command)
+    link_command.set_defaults(func=link.main)
+
     index = subp.add_parser(
         "index", help="Générer un index Whoosh sur les documents extraits"
     )

diff --git a/alexi/extract.py b/alexi/extract.py
@@ -1,10 +1,9 @@
 """
-Convertir les règlements en HTML, texte, et/ou JSON structuré.
+Convertir les règlements en HTML
 """
 
 import argparse
 import csv
-import dataclasses
 import itertools
 import json
 import logging
@@ -17,7 +16,7 @@
 
 from alexi.analyse import Analyseur, Document, Element
 from alexi.convert import Converteur
-from alexi.format import format_dict, format_html, format_text
+from alexi.format import format_html
 from alexi.label import DEFAULT_MODEL as DEFAULT_LABEL_MODEL
 from alexi.label import Extracteur
 from alexi.segment import DEFAULT_MODEL as DEFAULT_SEGMENT_MODEL
@@ -45,12 +44,6 @@ def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
     parser.add_argument(
         "--label-model", help="Modele CRF", type=Path, default=DEFAULT_LABEL_MODEL
     )
-    parser.add_argument(
-        "-s",
-        "--serafim",
-        help="Générer le format JSON attendu par SÈRAFIM",
-        action="store_true",
-    )
     parser.add_argument(
         "-m",
         "--metadata",
@@ -68,30 +61,6 @@ def read_csv(path: Path) -> list[dict[str, Any]]:
         return list(csv.DictReader(infh))
 
 
-def extract_serafim(args, path, iob, conv):
-    docdir = args.outdir / "data"
-    imgdir = args.outdir / "public" / "img" / path.stem
-    LOGGER.info("Génération de fichiers SÈRAFIM sous %s", docdir)
-    docdir.mkdir(parents=True, exist_ok=True)
-    analyseur = Analyseur(path.stem, iob)
-    if not args.no_images:
-        LOGGER.info("Extraction d'images sous %s", imgdir)
-        imgdir.mkdir(parents=True, exist_ok=True)
-    if conv and not args.no_images:
-        LOGGER.info("Extraction d'images de %s", path)
-        images = conv.extract_images()
-        analyseur.add_images(images)
-        save_images_from_pdf(analyseur.blocs, conv, imgdir)
-    LOGGER.info("Analyse de la structure de %s", path)
-    doc = analyseur()
-    with open(docdir / f"{path.stem}.json", "wt") as outfh:
-        LOGGER.info("Génération de %s/%s.json", docdir, path.stem)
-        docdict = format_dict(doc, imgdir=path.stem)
-        pdf_path = path.with_suffix(".pdf")
-        docdict["fichier"] = pdf_path.name
-        json.dump(docdict, outfh, indent=2, ensure_ascii=False)
-
-
 HTML_GLOBAL_HEADER = """<!DOCTYPE html>
 <html lang="fr">
   <head>
@@ -159,7 +128,10 @@ def extract_serafim(args, path, iob, conv):
 
 
 def extract_element(
-    doc: Document, el: Element, outdir: Path, imgdir: Path, fragment=True
+    doc: Document,
+    el: Element,
+    outdir: Path,
+    imgdir: Path,
 ):
     """Extract the various constituents, referencing images in the
     generated image directory."""
@@ -189,12 +161,8 @@ def extract_element(
     LOGGER.info("%s %s", outdir, el.titre)
     with open(outdir / "index.html", "wt") as outfh:
         outfh.write(HTML_HEADER)
-        outfh.write(format_html(doc, element=el, imgdir=rel_imgdir, fragment=fragment))
+        outfh.write(format_html(doc, element=el, imgdir=rel_imgdir, fragment=True))
         outfh.write(HTML_FOOTER)
-    with open(outdir / "index.md", "wt") as outfh:
-        outfh.write(format_text(doc, element=el))
-    with open(outdir / "index.json", "wt") as outfh:
-        json.dump(dataclasses.asdict(el), outfh)
 
 
 def make_index_html(
@@ -348,7 +316,7 @@ def make_sub_index(el: Element, path: Path, titre: str):
             (subel, parent / el.type / el.numero) for subel in reversed(el.sub)
         )
     # And do a full extraction (which might crash your browser)
-    extract_element(doc, doc.structure, docdir, imgdir, fragment=False)
+    extract_element(doc, doc.structure, docdir, imgdir)
     return doc
 
 
@@ -378,21 +346,23 @@ def make_doc_subtree(doc: Document, outfh: TextIO):
             else:
                 eltitre = f"{el.type} {el.numero}"
         while level < prev_level:
-            outfh.write("</ul></li>\n")
+            outfh.write("</ul></details></li>\n")
             prev_level -= 1
         if el.sub:
-            outfh.write(f'<li class="node"><details><summary>{eltitre}</summary><ul>\n')
+            outfh.write(
+                f'<li class="{el.type} node"><details><summary>{eltitre}</summary><ul>\n'
+            )
             link = f'<a target="_blank" href="{eldir}/index.html">Texte intégral</a>'
             pdflink = f'<a target="_blank" href="{doc.pdfurl}#page={el.page}">PDF</a>'
             outfh.write(f'<li class="text">{link} ({pdflink})</li>\n')
         else:
             link = f'<a target="_blank" href="{eldir}/index.html">{eltitre}</a>'
             pdflink = f'<a target="_blank" href="{doc.pdfurl}#page={el.page}">PDF</a>'
-            outfh.write(f'<li class="leaf">{link} ({pdflink})</li>\n')
+            outfh.write(f'<li class="{el.type} leaf">{link} ({pdflink})</li>\n')
         d.extendleft((subel, eldir, level + 1) for subel in reversed(el.sub))
         prev_level = level
     while prev_level > 1:
-        outfh.write("</ul></li>\n")
+        outfh.write("</ul></details></li>\n")
         prev_level -= 1
     outfh.write("</ul>\n")
 
@@ -422,10 +392,10 @@ def make_doc_tree(docs: list[Document], outdir: Path):
         LOGGER.info("Génération de %s", outdir / "index.html")
         outfh.write(HTML_HEADER)
         for doc in docs:
-            outfh.write('<li class="node"><details>\n')
+            outfh.write('<li class="Document node"><details>\n')
             outfh.write(f"<summary>{doc.numero}: {doc.titre}</summary>\n")
             make_doc_subtree(doc, outfh)
-            outfh.write("</li>\n")
+            outfh.write("</details></li>\n")
         outfh.write(HTML_FOOTER)
     with open(outdir / "style.css", "wt") as outfh:
         outfh.write(STYLE_CSS)
@@ -478,6 +448,10 @@ def main(args):
             metadata = json.load(infh)
     docs = []
     for path in args.docs:
+        pdf_path = path.with_suffix(".pdf")
+        if metadata and pdf_path.name not in metadata:
+            LOGGER.warning("Non-traitement de %s car absent des metadonnées", path)
+            continue
         conv = None
         if path.suffix == ".csv":
             LOGGER.info("Lecture de %s", path)
@@ -501,23 +475,18 @@ def main(args):
                     crf = Segmenteur(args.segment_model)
                 iob = list(extracteur(crf(feats)))
 
-        pdf_path = path.with_suffix(".pdf")
         pdf_url = metadata.get(
             pdf_path.name,
             f"https://ville.sainte-adele.qc.ca/upload/documents/{pdf_path.name}",
         )
         if conv is None and pdf_path.exists():
             conv = Converteur(pdf_path)
-        if args.serafim:
-            extract_serafim(args, path, iob, conv)
-        else:
-            doc = extract_html(args, path, iob, conv)
-            doc.pdfurl = pdf_url
-            docs.append(doc)
-            if "zonage" in doc.titre.lower():
-                extract_zonage(doc, args.outdir / "zonage.json")
-    if not args.serafim:
-        make_doc_tree(docs, args.outdir)
+        doc = extract_html(args, path, iob, conv)
+        doc.pdfurl = pdf_url
+        docs.append(doc)
+        if "zonage" in doc.titre.lower():
+            extract_zonage(doc, args.outdir / "zonage.json")
+    make_doc_tree(docs, args.outdir)
 
 
 if __name__ == "__main__":

diff --git a/alexi/format.py b/alexi/format.py
@@ -117,7 +117,9 @@ def element_html(el: Element, indent: int = 2, offset: int = 0) -> list[str]:
         sp = " " * indent
         tag = TAG[el.type]
         header = HEADER[el.type]
-        lines = [f'{off}<{tag} class="{el.type}">']
+        lines = []
+        if tag != "body":
+            lines.append(f'{off}<{tag} class="{el.type}">')
         if el.numero and offset:
             lines.append(
                 f'{off}{sp}<a class="anchor" name="{el.type}/{el.numero}"></a>'
@@ -147,7 +149,8 @@ def element_html(el: Element, indent: int = 2, offset: int = 0) -> list[str]:
                 if html:
                     lines.append(off + sp + html)
                 idx += 1
-        lines.append(off + f"</{tag}>")
+        if tag != "body":
+            lines.append(off + f"</{tag}>")
         return lines
 
     if element is None:
@@ -161,8 +164,9 @@ def element_html(el: Element, indent: int = 2, offset: int = 0) -> list[str]:
 <html>
   <head>
     <title>{doc.titre}</title>
-  </head>"""
-        doc_footer = "</html>"
+  </head>
+  <body>"""
+        doc_footer = "</body></html>"
         return "\n".join((doc_header, doc_body, doc_footer))