Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files: browse the repository at this point in the history
  • Loading branch information
dhdaines committed Feb 12, 2024
2 parents: 24b28cc + 058bc15 — commit 9d42645
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 69 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/analyse.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Téléchargement, analyse et indexation des règlements d'urbanisme
name: Téléchargement et analyse des règlements d'urbanisme

on:
workflow_dispatch:
Expand Down Expand Up @@ -35,7 +35,7 @@ jobs:
run: |
alexi -v download --exclude=Plan --exclude=/derogation \
--exclude='\d-[aA]dopt' --exclude='Z-\d' \
--exclude=Reso
--exclude='-[rR]eso'
for d in download/*.pdf; do
bn=$(basename $d .pdf)
for dd in data/train data/dev data/test; do
Expand All @@ -47,6 +47,9 @@ jobs:
- name: Extract
run: |
alexi -v extract -m download/index.json download/*.pdf
- name: Link
run: |
alexi -v link
- name: Setup Pages
uses: actions/configure-pages@v4
- name: Upload artifact
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/entrainement.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ jobs:
run: |
sh results/run.sh
- name: Upload results
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: results
path: results/*.csv
- name: Train
run: |
sh scripts/retrain.sh
- name: Upload models
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: models
path: alexi/models/
6 changes: 3 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- uses: actions/checkout@v4

- name: Set up Python 3.10
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.10"

Expand Down Expand Up @@ -46,7 +46,7 @@ jobs:
- uses: actions/checkout@v4

- name: Set up Python 3.10
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.10"

Expand All @@ -68,4 +68,4 @@ jobs:
python -m coverage html
- name: Upload code coverage
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
9 changes: 8 additions & 1 deletion alexi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pathlib import Path
from typing import Any, Iterable, TextIO

from . import download, extract
from . import download, extract, link
from .analyse import Analyseur, Bloc
from .convert import FIELDNAMES, Converteur, merge_overlaps
from .format import format_dict, format_html, format_xml
Expand Down Expand Up @@ -213,6 +213,13 @@ def make_argparse() -> argparse.ArgumentParser:
extract.add_arguments(extract_command)
extract_command.set_defaults(func=extract.main)

link_command = subp.add_parser(
"link",
help="Ajouter des hyperliens aux documents HTML",
)
link.add_arguments(link_command)
link_command.set_defaults(func=link.main)

index = subp.add_parser(
"index", help="Générer un index Whoosh sur les documents extraits"
)
Expand Down
83 changes: 26 additions & 57 deletions alexi/extract.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
"""
Convertir les règlements en HTML, texte, et/ou JSON structuré.
Convertir les règlements en HTML
"""

import argparse
import csv
import dataclasses
import itertools
import json
import logging
Expand All @@ -17,7 +16,7 @@

from alexi.analyse import Analyseur, Document, Element
from alexi.convert import Converteur
from alexi.format import format_dict, format_html, format_text
from alexi.format import format_html
from alexi.label import DEFAULT_MODEL as DEFAULT_LABEL_MODEL
from alexi.label import Extracteur
from alexi.segment import DEFAULT_MODEL as DEFAULT_SEGMENT_MODEL
Expand Down Expand Up @@ -45,12 +44,6 @@ def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument(
"--label-model", help="Modele CRF", type=Path, default=DEFAULT_LABEL_MODEL
)
parser.add_argument(
"-s",
"--serafim",
help="Générer le format JSON attendu par SÈRAFIM",
action="store_true",
)
parser.add_argument(
"-m",
"--metadata",
Expand All @@ -68,30 +61,6 @@ def read_csv(path: Path) -> list[dict[str, Any]]:
return list(csv.DictReader(infh))


def extract_serafim(args, path, iob, conv):
docdir = args.outdir / "data"
imgdir = args.outdir / "public" / "img" / path.stem
LOGGER.info("Génération de fichiers SÈRAFIM sous %s", docdir)
docdir.mkdir(parents=True, exist_ok=True)
analyseur = Analyseur(path.stem, iob)
if not args.no_images:
LOGGER.info("Extraction d'images sous %s", imgdir)
imgdir.mkdir(parents=True, exist_ok=True)
if conv and not args.no_images:
LOGGER.info("Extraction d'images de %s", path)
images = conv.extract_images()
analyseur.add_images(images)
save_images_from_pdf(analyseur.blocs, conv, imgdir)
LOGGER.info("Analyse de la structure de %s", path)
doc = analyseur()
with open(docdir / f"{path.stem}.json", "wt") as outfh:
LOGGER.info("Génération de %s/%s.json", docdir, path.stem)
docdict = format_dict(doc, imgdir=path.stem)
pdf_path = path.with_suffix(".pdf")
docdict["fichier"] = pdf_path.name
json.dump(docdict, outfh, indent=2, ensure_ascii=False)


HTML_GLOBAL_HEADER = """<!DOCTYPE html>
<html lang="fr">
<head>
Expand Down Expand Up @@ -159,7 +128,10 @@ def extract_serafim(args, path, iob, conv):


def extract_element(
doc: Document, el: Element, outdir: Path, imgdir: Path, fragment=True
doc: Document,
el: Element,
outdir: Path,
imgdir: Path,
):
"""Extract the various constituents, referencing images in the
generated image directory."""
Expand Down Expand Up @@ -189,12 +161,8 @@ def extract_element(
LOGGER.info("%s %s", outdir, el.titre)
with open(outdir / "index.html", "wt") as outfh:
outfh.write(HTML_HEADER)
outfh.write(format_html(doc, element=el, imgdir=rel_imgdir, fragment=fragment))
outfh.write(format_html(doc, element=el, imgdir=rel_imgdir, fragment=True))
outfh.write(HTML_FOOTER)
with open(outdir / "index.md", "wt") as outfh:
outfh.write(format_text(doc, element=el))
with open(outdir / "index.json", "wt") as outfh:
json.dump(dataclasses.asdict(el), outfh)


def make_index_html(
Expand Down Expand Up @@ -348,7 +316,7 @@ def make_sub_index(el: Element, path: Path, titre: str):
(subel, parent / el.type / el.numero) for subel in reversed(el.sub)
)
# And do a full extraction (which might crash your browser)
extract_element(doc, doc.structure, docdir, imgdir, fragment=False)
extract_element(doc, doc.structure, docdir, imgdir)
return doc


Expand Down Expand Up @@ -378,21 +346,23 @@ def make_doc_subtree(doc: Document, outfh: TextIO):
else:
eltitre = f"{el.type} {el.numero}"
while level < prev_level:
outfh.write("</ul></li>\n")
outfh.write("</ul></details></li>\n")
prev_level -= 1
if el.sub:
outfh.write(f'<li class="node"><details><summary>{eltitre}</summary><ul>\n')
outfh.write(
f'<li class="{el.type} node"><details><summary>{eltitre}</summary><ul>\n'
)
link = f'<a target="_blank" href="{eldir}/index.html">Texte intégral</a>'
pdflink = f'<a target="_blank" href="{doc.pdfurl}#page={el.page}">PDF</a>'
outfh.write(f'<li class="text">{link} ({pdflink})</li>\n')
else:
link = f'<a target="_blank" href="{eldir}/index.html">{eltitre}</a>'
pdflink = f'<a target="_blank" href="{doc.pdfurl}#page={el.page}">PDF</a>'
outfh.write(f'<li class="leaf">{link} ({pdflink})</li>\n')
outfh.write(f'<li class="{el.type} leaf">{link} ({pdflink})</li>\n')
d.extendleft((subel, eldir, level + 1) for subel in reversed(el.sub))
prev_level = level
while prev_level > 1:
outfh.write("</ul></li>\n")
outfh.write("</ul></details></li>\n")
prev_level -= 1
outfh.write("</ul>\n")

Expand Down Expand Up @@ -422,10 +392,10 @@ def make_doc_tree(docs: list[Document], outdir: Path):
LOGGER.info("Génération de %s", outdir / "index.html")
outfh.write(HTML_HEADER)
for doc in docs:
outfh.write('<li class="node"><details>\n')
outfh.write('<li class="Document node"><details>\n')
outfh.write(f"<summary>{doc.numero}: {doc.titre}</summary>\n")
make_doc_subtree(doc, outfh)
outfh.write("</li>\n")
outfh.write("</details></li>\n")
outfh.write(HTML_FOOTER)
with open(outdir / "style.css", "wt") as outfh:
outfh.write(STYLE_CSS)
Expand Down Expand Up @@ -478,6 +448,10 @@ def main(args):
metadata = json.load(infh)
docs = []
for path in args.docs:
pdf_path = path.with_suffix(".pdf")
if metadata and pdf_path.name not in metadata:
LOGGER.warning("Non-traitement de %s car absent des metadonnées", path)
continue
conv = None
if path.suffix == ".csv":
LOGGER.info("Lecture de %s", path)
Expand All @@ -501,23 +475,18 @@ def main(args):
crf = Segmenteur(args.segment_model)
iob = list(extracteur(crf(feats)))

pdf_path = path.with_suffix(".pdf")
pdf_url = metadata.get(
pdf_path.name,
f"https://ville.sainte-adele.qc.ca/upload/documents/{pdf_path.name}",
)
if conv is None and pdf_path.exists():
conv = Converteur(pdf_path)
if args.serafim:
extract_serafim(args, path, iob, conv)
else:
doc = extract_html(args, path, iob, conv)
doc.pdfurl = pdf_url
docs.append(doc)
if "zonage" in doc.titre.lower():
extract_zonage(doc, args.outdir / "zonage.json")
if not args.serafim:
make_doc_tree(docs, args.outdir)
doc = extract_html(args, path, iob, conv)
doc.pdfurl = pdf_url
docs.append(doc)
if "zonage" in doc.titre.lower():
extract_zonage(doc, args.outdir / "zonage.json")
make_doc_tree(docs, args.outdir)


if __name__ == "__main__":
Expand Down
12 changes: 8 additions & 4 deletions alexi/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ def element_html(el: Element, indent: int = 2, offset: int = 0) -> list[str]:
sp = " " * indent
tag = TAG[el.type]
header = HEADER[el.type]
lines = [f'{off}<{tag} class="{el.type}">']
lines = []
if tag != "body":
lines.append(f'{off}<{tag} class="{el.type}">')
if el.numero and offset:
lines.append(
f'{off}{sp}<a class="anchor" name="{el.type}/{el.numero}"></a>'
Expand Down Expand Up @@ -147,7 +149,8 @@ def element_html(el: Element, indent: int = 2, offset: int = 0) -> list[str]:
if html:
lines.append(off + sp + html)
idx += 1
lines.append(off + f"</{tag}>")
if tag != "body":
lines.append(off + f"</{tag}>")
return lines

if element is None:
Expand All @@ -161,8 +164,9 @@ def element_html(el: Element, indent: int = 2, offset: int = 0) -> list[str]:
<html>
<head>
<title>{doc.titre}</title>
</head>"""
doc_footer = "</html>"
</head>
<body>"""
doc_footer = "</body></html>"
return "\n".join((doc_header, doc_body, doc_footer))


Expand Down
Loading

0 comments on commit 9d42645

Please sign in to comment.