From cb98ee54458a390d6f6b2dd8ae67757f7ca3740b Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 5 Sep 2024 15:51:49 -0400 Subject: [PATCH] fix: ameliorer un peu la reconnaissance des reglements --- alexi/link.py | 31 +++++++++++++++++++++---------- test/test_link.py | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/alexi/link.py b/alexi/link.py index d5ced61..d840f92 100644 --- a/alexi/link.py +++ b/alexi/link.py @@ -22,7 +22,7 @@ r"\b(?Particle|chapitre|section|sous-section|annexe) (?P[\d\.]+)", re.IGNORECASE, ) -REG_RE = re.compile(r"règlement[^\d]+(?P[\d\.A-Z-]+)", re.IGNORECASE) +REG_RE = re.compile(r"(?i:règlement)(?:[^\d]+(?P[\d\.A-Z-]+))?") MILIEU_RE = re.compile(rf"{MILIEU}\s+(?P{MTYPE})", re.IGNORECASE | re.VERBOSE) PALIER_IDX = {palier: idx for idx, palier in enumerate(PALIERS)} @@ -40,14 +40,22 @@ def locate_article(numero: str, doc: Document) -> list[str]: def normalize_title(title: str): title = title.lower() title = re.sub(r"\s+", " ", title).strip() - title = re.sub(r"^règlement (?:de|sur|concernant|relatif aux) ", "", title) + title = re.sub( + r"^règlement" + r"(?: (?:des|de|sur|concernant|relatif|afin de))?" + r"(?: (?:aux|au|à la|les|des|de la|de|du|le|la))?", + "", + title, + ) title = re.sub( r"\bpiia\b", r"plans d'implantation et d'intégration architecturale", title, ) + title = re.sub(r"[‘’]", "'", title) + title = re.sub(r", ", " ", title) title = re.sub(r"\([^\)]+\)$", "", title) - return title + return title.strip(r""" .,;'«»"“”""") class Resolver: @@ -58,7 +66,10 @@ def __init__(self, metadata: Optional[dict] = None): self.urls: set[str] = set() for docpath, info in self.metadata["docs"].items(): self.numeros[info["numero"]] = docpath - self.titles[normalize_title(info["titre"])] = docpath + normtitle = normalize_title(info["titre"]) + if normtitle != "": + self.titles[normtitle] = docpath + LOGGER.info("%s:%s => %s", info["numero"], normtitle, docpath) def __call__( self, text: str, srcpath: str = "", doc: Optional[Document] = None @@ -98,15 +109,15 @@ def resolve_internal( """ docpath = None text = re.sub(r"\s+", " ", text).strip() - # NOTE: This really matches anything starting with "règlement" if m := REG_RE.search(text): - numero = m.group("reg").strip(" .,;") - if numero is None: - return None - docpath = self.numeros.get(numero) + numero = m.group("reg") + if numero is not None: + numero = numero.strip(" .,;") + docpath = self.numeros.get(numero) if docpath is None: + normtext = normalize_title(text) for title in self.titles: - if title in text.lower(): + if title in normtext: docpath = self.titles[title] break if docpath is None: diff --git a/test/test_link.py b/test/test_link.py index c9be1a0..da280a3 100644 --- a/test/test_link.py +++ b/test/test_link.py @@ -4,7 +4,7 @@ import pytest from alexi.analyse import Document, match_links -from alexi.link import Resolver, locate_article +from alexi.link import Resolver, locate_article, normalize_title DATADIR = Path(__file__).parent / "data" TRAINDIR = Path(__file__).parent.parent / "data" @@ -124,10 +124,18 @@ def test_laws(test_input, expected): "Règlement de zonage 1314-2021-Z", "../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z", ), + ( + "règlement foo bar baz 1314-2021-Z", + "../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z", + ), ( "Règlement sur les permis et certificats 1314-2021-PC", "../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013", ), + ( + "RÈGLEMENT 1314-2021-PC", + "../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013", + ), ( "chapitre 5 du Règlement de zonage 1314-2021-Z", "../20231213-Codification-administrative-Rgl-1314-2021-Z/Chapitre/5/index.html", @@ -141,6 +149,10 @@ def test_laws(test_input, expected): "Règlement de zonage", "../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z", ), + ( + "Règlement sur les permis et certificats", + "../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013", + ), ( "section 3 du chapitre 5 du Règlement de zonage 1314-2021-Z", "../20231213-Codification-administrative-Rgl-1314-2021-Z/Chapitre/5/Section/3/index.html", @@ -156,7 +168,8 @@ def test_laws(test_input, expected): @pytest.mark.parametrize("test_input,expected", BYLAWS) def test_bylaws(test_input, expected): r = Resolver(METADATA) - assert r(test_input, ".") == expected + found = r(test_input, ".") + assert found == expected with open(DATADIR / "lotissement.json", "rt") as infh: @@ -315,3 +328,24 @@ def test_match_multiples(text, before, multi, after): for link, ref in zip(links, multi): assert text[link.start : link.end] == ref assert link.alt == f"{before} {ref} {after}" + + +TITLES = [ + ("RÈGLEMENT ", ""), + ( + "règlement relatif aux projets particuliers de construction, de modification ou d'occupation d'un immeuble.", + "projets particuliers de construction de modification ou d'occupation d'un immeuble", + ), + ( + "RÈGLEMENT AFIN DE DÉCRÉTER DES DISPOSITIONS CONCERNANT L’OCCUPATION DU DOMAINE PUBLIC.", + "décréter des dispositions concernant l'occupation du domaine public", + ), + ("Règlement de construction ", "construction"), + ("Règlement sur les dérogations mineures ", "dérogations mineures"), +] + + +@pytest.mark.parametrize("text,after", TITLES) +def test_normalize_title(text, after): + """Verifier la normalisation des titres de règlements""" + assert normalize_title(text) == after