Skip to content

Commit

Permalink
fix: ameliorer un peu la reconnaissance des reglements
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Sep 5, 2024
1 parent 2e420c0 commit cb98ee5
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 12 deletions.
31 changes: 21 additions & 10 deletions alexi/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
r"\b(?P<sec>article|chapitre|section|sous-section|annexe) (?P<num>[\d\.]+)",
re.IGNORECASE,
)
# Matches the word "règlement" (case-insensitively) optionally followed by
# non-digit text and a bylaw number, captured as "reg".  Only the keyword is
# case-insensitive: the number's A-Z class must stay case-sensitive, otherwise
# ordinary lowercase words after "règlement" get captured as a number.
# "reg" is None when no number follows.
REG_RE = re.compile(r"(?i:règlement)(?:[^\d]+(?P<reg>[\d\.A-Z-]+))?")
# Matches the MILIEU pattern followed by whitespace and the MTYPE pattern,
# the latter captured as "mtype" (case-insensitive, verbose regex syntax).
MILIEU_RE = re.compile(rf"{MILIEU}\s+(?P<mtype>{MTYPE})", re.IGNORECASE | re.VERBOSE)
# Maps each entry of PALIERS to its position in that sequence.
PALIER_IDX = {palier: idx for idx, palier in enumerate(PALIERS)}

Expand All @@ -40,14 +40,22 @@ def locate_article(numero: str, doc: Document) -> list[str]:
def normalize_title(title: str) -> str:
    """Reduce a bylaw title to a canonical form for substring matching.

    The title is lowercased and its whitespace collapsed.  A leading
    "règlement", together with its connecting words ("de", "sur les",
    "relatif aux", "afin de", ...), is dropped.  "piia" is expanded to its
    full name, curly apostrophes become straight ones, ", " becomes a
    single space and a trailing parenthetical is removed.  Finally,
    surrounding spaces, punctuation and quotation marks are stripped.

    Args:
        title: Title, or free text mentioning a title, to normalize.

    Returns:
        The normalized title; empty when the input held nothing but
        "règlement".
    """
    title = title.lower()
    title = re.sub(r"\s+", " ", title).strip()
    # Strip the leading "règlement <preposition> <article>" in one pass.
    # The word boundaries keep the short alternatives from eating the start
    # of the next word (e.g. " le" in " lecture", " des" in " design").
    title = re.sub(
        r"^règlement\b"
        r"(?: (?:des|de|sur|concernant|relatif|afin de)\b)?"
        r"(?: (?:aux|au|à la|les|des|de la|de|du|le|la)\b)?",
        "",
        title,
    )
    title = re.sub(
        r"\bpiia\b",
        r"plans d'implantation et d'intégration architecturale",
        title,
    )
    # Use a single apostrophe form so titles compare equal.
    title = re.sub(r"[‘’]", "'", title)
    title = re.sub(r", ", " ", title)
    # Drop a parenthetical only when it ends the title.
    title = re.sub(r"\([^\)]+\)$", "", title)
    return title.strip(r""" .,;'«»"“”""")


class Resolver:
Expand All @@ -58,7 +66,10 @@ def __init__(self, metadata: Optional[dict] = None):
self.urls: set[str] = set()
for docpath, info in self.metadata["docs"].items():
self.numeros[info["numero"]] = docpath
self.titles[normalize_title(info["titre"])] = docpath
normtitle = normalize_title(info["titre"])
if normtitle != "":
self.titles[normtitle] = docpath
LOGGER.info("%s:%s => %s", info["numero"], normtitle, docpath)

def __call__(
self, text: str, srcpath: str = "", doc: Optional[Document] = None
Expand Down Expand Up @@ -98,15 +109,15 @@ def resolve_internal(
"""
docpath = None
text = re.sub(r"\s+", " ", text).strip()
# NOTE: This really matches anything starting with "règlement"
if m := REG_RE.search(text):
numero = m.group("reg").strip(" .,;")
if numero is None:
return None
docpath = self.numeros.get(numero)
numero = m.group("reg")
if numero is not None:
numero = numero.strip(" .,;")
docpath = self.numeros.get(numero)
if docpath is None:
normtext = normalize_title(text)
for title in self.titles:
if title in text.lower():
if title in normtext:
docpath = self.titles[title]
break
if docpath is None:
Expand Down
38 changes: 36 additions & 2 deletions test/test_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from alexi.analyse import Document, match_links
from alexi.link import Resolver, locate_article
from alexi.link import Resolver, locate_article, normalize_title

DATADIR = Path(__file__).parent / "data"
TRAINDIR = Path(__file__).parent.parent / "data"
Expand Down Expand Up @@ -124,10 +124,18 @@ def test_laws(test_input, expected):
"Règlement de zonage 1314-2021-Z",
"../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z",
),
(
"règlement foo bar baz 1314-2021-Z",
"../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z",
),
(
"Règlement sur les permis et certificats 1314-2021-PC",
"../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013",
),
(
"RÈGLEMENT 1314-2021-PC",
"../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013",
),
(
"chapitre 5 du Règlement de zonage 1314-2021-Z",
"../20231213-Codification-administrative-Rgl-1314-2021-Z/Chapitre/5/index.html",
Expand All @@ -141,6 +149,10 @@ def test_laws(test_input, expected):
"Règlement de zonage",
"../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z",
),
(
"Règlement sur les permis et certificats",
"../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013",
),
(
"section 3 du chapitre 5 du Règlement de zonage 1314-2021-Z",
"../20231213-Codification-administrative-Rgl-1314-2021-Z/Chapitre/5/Section/3/index.html",
Expand All @@ -156,7 +168,8 @@ def test_laws(test_input, expected):
@pytest.mark.parametrize("test_input,expected", BYLAWS)
def test_bylaws(test_input, expected):
    """Check that the resolver returns the expected value for each BYLAWS entry."""
    r = Resolver(METADATA)
    # Resolve once and keep the result in a local so that a failing case
    # shows the actual value; "." is passed as the source path.
    found = r(test_input, ".")
    assert found == expected


with open(DATADIR / "lotissement.json", "rt") as infh:
Expand Down Expand Up @@ -315,3 +328,24 @@ def test_match_multiples(text, before, multi, after):
for link, ref in zip(links, multi):
assert text[link.start : link.end] == ref
assert link.alt == f"{before} {ref} {after}"


# (raw title, expected normalize_title() result) pairs consumed by
# test_normalize_title.
TITLES = [
    ("RÈGLEMENT ", ""),
    (
        "règlement relatif aux projets particuliers de construction, de modification ou d'occupation d'un immeuble.",
        "projets particuliers de construction de modification ou d'occupation d'un immeuble",
    ),
    (
        "RÈGLEMENT AFIN DE DÉCRÉTER DES DISPOSITIONS CONCERNANT L’OCCUPATION DU DOMAINE PUBLIC.",
        "décréter des dispositions concernant l'occupation du domaine public",
    ),
    ("Règlement de construction ", "construction"),
    ("Règlement sur les dérogations mineures ", "dérogations mineures"),
]


@pytest.mark.parametrize("text,after", TITLES)
def test_normalize_title(text, after):
    """Check the normalization of bylaw titles."""
    assert normalize_title(text) == after

0 comments on commit cb98ee5

Please sign in to comment.