fix: améliorer un peu la reconnaissance des règlements

dhdaines · Sep 5, 2024 · cb98ee5 · cb98ee5
1 parent 2e420c0
commit cb98ee5
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 12 deletions.
diff --git a/alexi/link.py b/alexi/link.py
@@ -22,7 +22,7 @@
     r"\b(?P<sec>article|chapitre|section|sous-section|annexe) (?P<num>[\d\.]+)",
     re.IGNORECASE,
 )
-REG_RE = re.compile(r"règlement[^\d]+(?P<reg>[\d\.A-Z-]+)", re.IGNORECASE)
+REG_RE = re.compile(r"(?i:règlement)(?:[^\d]+(?P<reg>[\d\.A-Z-]+))?")
 MILIEU_RE = re.compile(rf"{MILIEU}\s+(?P<mtype>{MTYPE})", re.IGNORECASE | re.VERBOSE)
 PALIER_IDX = {palier: idx for idx, palier in enumerate(PALIERS)}
 
@@ -40,14 +40,22 @@ def locate_article(numero: str, doc: Document) -> list[str]:
 def normalize_title(title: str):
     title = title.lower()
     title = re.sub(r"\s+", " ", title).strip()
-    title = re.sub(r"^règlement (?:de|sur|concernant|relatif aux) ", "", title)
+    title = re.sub(
+        r"^règlement"
+        r"(?: (?:des|de|sur|concernant|relatif|afin de))?"
+        r"(?: (?:aux|au|à la|les|des|de la|de|du|le|la))?",
+        "",
+        title,
+    )
     title = re.sub(
         r"\bpiia\b",
         r"plans d'implantation et d'intégration architecturale",
         title,
     )
+    title = re.sub(r"[‘’]", "'", title)
+    title = re.sub(r", ", " ", title)
     title = re.sub(r"\([^\)]+\)$", "", title)
-    return title
+    return title.strip(r""" .,;'«»"“”""")
 
 
 class Resolver:
@@ -58,7 +66,10 @@ def __init__(self, metadata: Optional[dict] = None):
         self.urls: set[str] = set()
         for docpath, info in self.metadata["docs"].items():
             self.numeros[info["numero"]] = docpath
-            self.titles[normalize_title(info["titre"])] = docpath
+            normtitle = normalize_title(info["titre"])
+            if normtitle != "":
+                self.titles[normtitle] = docpath
+            LOGGER.info("%s:%s => %s", info["numero"], normtitle, docpath)
 
     def __call__(
         self, text: str, srcpath: str = "", doc: Optional[Document] = None
@@ -98,15 +109,15 @@ def resolve_internal(
         """
         docpath = None
         text = re.sub(r"\s+", " ", text).strip()
-        # NOTE: This really matches anything starting with "règlement"
         if m := REG_RE.search(text):
-            numero = m.group("reg").strip(" .,;")
-            if numero is None:
-                return None
-            docpath = self.numeros.get(numero)
+            numero = m.group("reg")
+            if numero is not None:
+                numero = numero.strip(" .,;")
+                docpath = self.numeros.get(numero)
             if docpath is None:
+                normtext = normalize_title(text)
                 for title in self.titles:
-                    if title in text.lower():
+                    if title in normtext:
                         docpath = self.titles[title]
                         break
             if docpath is None:

diff --git a/test/test_link.py b/test/test_link.py
@@ -4,7 +4,7 @@
 import pytest
 
 from alexi.analyse import Document, match_links
-from alexi.link import Resolver, locate_article
+from alexi.link import Resolver, locate_article, normalize_title
 
 DATADIR = Path(__file__).parent / "data"
 TRAINDIR = Path(__file__).parent.parent / "data"
@@ -124,10 +124,18 @@ def test_laws(test_input, expected):
         "Règlement de zonage 1314-2021-Z",
         "../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z",
     ),
+    (
+        "règlement foo bar baz 1314-2021-Z",
+        "../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z",
+    ),
     (
         "Règlement sur les permis et certificats 1314-2021-PC",
         "../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013",
     ),
+    (
+        "RÈGLEMENT 1314-2021-PC",
+        "../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013",
+    ),
     (
         "chapitre 5 du Règlement de zonage 1314-2021-Z",
         "../20231213-Codification-administrative-Rgl-1314-2021-Z/Chapitre/5/index.html",
@@ -141,6 +149,10 @@ def test_laws(test_input, expected):
         "Règlement de zonage",
         "../index.html#20231213-Codification-administrative-Rgl-1314-2021-Z",
     ),
+    (
+        "Règlement sur les permis et certificats",
+        "../index.html#Rgl-1314-2021-PC-version-en-vigueur-20231013",
+    ),
     (
         "section 3 du chapitre 5 du Règlement de zonage 1314-2021-Z",
         "../20231213-Codification-administrative-Rgl-1314-2021-Z/Chapitre/5/Section/3/index.html",
@@ -156,7 +168,8 @@ def test_laws(test_input, expected):
 @pytest.mark.parametrize("test_input,expected", BYLAWS)
 def test_bylaws(test_input, expected):
     r = Resolver(METADATA)
-    assert r(test_input, ".") == expected
+    found = r(test_input, ".")
+    assert found == expected
 
 
 with open(DATADIR / "lotissement.json", "rt") as infh:
@@ -315,3 +328,24 @@ def test_match_multiples(text, before, multi, after):
     for link, ref in zip(links, multi):
         assert text[link.start : link.end] == ref
         assert link.alt == f"{before} {ref} {after}"
+
+
+TITLES = [
+    ("RÈGLEMENT ", ""),
+    (
+        "règlement  relatif aux projets particuliers de construction, de modification ou d'occupation d'un immeuble.",
+        "projets particuliers de construction de modification ou d'occupation d'un immeuble",
+    ),
+    (
+        "RÈGLEMENT  AFIN DE DÉCRÉTER DES DISPOSITIONS CONCERNANT L’OCCUPATION DU DOMAINE PUBLIC.",
+        "décréter des dispositions concernant l'occupation du domaine public",
+    ),
+    ("Règlement de construction ", "construction"),
+    ("Règlement sur les dérogations mineures ", "dérogations mineures"),
+]
+
+
+@pytest.mark.parametrize("text,after", TITLES)
+def test_normalize_title(text, after):
+    """Verifier la normalisation des titres de règlements"""
+    assert normalize_title(text) == after