From 7fadd53f90a2f020557bc62a2f8fda2fa4322b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Desboeufs?= Date: Sat, 1 Nov 2025 14:29:27 +0100 Subject: [PATCH 1/2] Add relevant synonyms from cquest-11rc1 branch Add 12 new synonyms to improve street type recognition: - cd => chemin departemental - clef => cle, clefs => cles - dept => departement - gir => giratoire - habit => habitation - periph => peripherique - prl => parc residentiel de loisirs - prm => promenade - rd => route departementale - rn => route nationale - rdpt => rond point Also normalize the spacing of the existing "ch,che,chem => chemin" entry (chem was already mapped to chemin). Note: "cvo => chemin vicinal" from that branch is not included in this diff. These synonyms come from the cquest-11rc1 branch (commits 2019-2021) and are still relevant to improve address search. --- addok_fr/resources/synonyms.txt | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/addok_fr/resources/synonyms.txt b/addok_fr/resources/synonyms.txt index a04b7af..7af5ff1 100644 --- a/addok_fr/resources/synonyms.txt +++ b/addok_fr/resources/synonyms.txt @@ -38,15 +38,18 @@ brtl => bretelle bsn => bassin car => carriere caref => carrefour +cd => chemin departemental cdt => commandant cf => chemin forestier -ch, che,chem => chemin +ch,che,chem => chemin chev => chevalier chl => chalet chp => champ chps => champs cha,chs,chse,chss => chaussee cht => chateau +clef => cle +clefs => cles clr => couloir col => colonel cor,crnc => corniche @@ -60,6 +63,7 @@ cou,cr => cour crx => croix ctre => centre cv => chemin vicinal +dept => departement devi => deviation dig => digue dom => domaine @@ -86,10 +90,11 @@ gde => grande gdes => grandes gds => grands gen,gal,gl => general +gir => giratoire gpl => grand place gr => grande rue grp,grpe => groupe -hab => habitation +hab,habit => habitation ham => hameau hle => halle hlg => halage @@ -131,6 +136,7 @@ pae => petite avenue pass,pge,psge => passage pch => petit chemin pdt => president +periph => peripherique pkg => parking pl,pla,plac => place ple => passerelle @@ -140,8 +146,9 @@ pnt => 
pointe porq => portique pr => pere pref => prefecture +prl => parc residentiel de loisirs prof => professeur -prom => promenade +prom,prm => promenade pta => petite allee pte => porte ptr => petite rue @@ -152,6 +159,7 @@ qu, q => quai r => rue ran => rangee rav => ravin +rd => route departementale rdl => raidillon regt => regiment renc => rencontre @@ -159,8 +167,9 @@ rep => republique res => residence rfg => refuge rle,ruel,rul => ruelle +rn => route nationale rpe => rampe -rpt,rdpoint,rop,rdp,rondpoint => rond point +rpt,rdpoint,rop,rdp,rondpoint,rdpt => rond point rtd => rotonde rte,rou => route ruet => ruette From 60d8bd828792261d78f07ba20438fc18ef405308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Desboeufs?= Date: Sat, 1 Nov 2025 14:42:38 +0100 Subject: [PATCH 2/2] Improve phonemicization rules from cquest-11rc1 branch Add enhanced phonemicization rules with careful refinements: - Add mp->n conversion in vowel+mp+consonant context (champvallon->chanvalon) - Add targeted ei+gn->ni rule for specific cases (seigneur->senieur) - Improve je->j handling (georges->jorj instead of jeorj) - Add anc ending simplification (blanc->blan) - Handle y at word beginning - Improve eim->aim handling (pforzheim->pforzaim) - Better ae/ei->e conversion - Enhanced oeu/oe->eu handling (boeufs->beu) - Fix duplicate letter removal to handle multiple repetitions Rules are more targeted than original cquest-11rc1 to avoid regressions: - gn->ni only after 'ei' (preserves montagne->montagn, not montani) - Preserves common French patterns while improving edge cases Add 11 new test cases covering the improved phonemicization patterns. All 122 tests pass. 
--- addok_fr/utils.py | 15 +++++++++++---- tests/test_utils.py | 23 +++++++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/addok_fr/utils.py b/addok_fr/utils.py index 43d89d6..d66f305 100644 --- a/addok_fr/utils.py +++ b/addok_fr/utils.py @@ -5,11 +5,15 @@ RULES = ( ("(?<=a)(mp|nd|nt)s?$", "n"), # champ(s) > cham + (r"([aeiouy])mp(?=[^aeiouyr])", r"\1n"), # champvallon -> chanvalon (r"ngt(?=[aeiouy])", "nt"), # vingtieme > vintieme (r"ngt", "n"), # vingt > vin ("((?<=[^g])g|^g)(?=[eyi])", "j"), ("(?<=g)u(?=[aeio])", ""), + (r"(?<=ei)gn([aeiouy])", r"ni\1"), # seigneur -> senieur (only after ei specifically) + (r"je([aeiouy])", r"j\1"), # georges -> jorj ("c(?=[^hieyw])", "k"), + (r"anc$", "an"), # blanc -> blan ("((?<=[^s])ch|(?<=[^0-9])c)$", "k"), # final "c", "ch", # but not "sch" and not 10c. ("(?<=[aeiouy])s(?=[aeiouy])", "z"), @@ -24,19 +28,22 @@ ("sh", "ch"), ("((?<=[^0-9])w|^w)", "v"), ("c(?=[eiy])", "s"), - ("(?<=[^0-9])y", "i"), + (r"((?<=[^0-9])y|^y)", "i"), # also handle y at beginning ("esn", "en"), - (r"oe(?=\w)", "e"), + (r"eim( |$)", "aim"), # pforzheim -> pforzaim + (r"(ae|ei)(?=\w)", "e"), # improved ae/ei handling + (r"oeufs( |$)", "eu"), # special case for oeufs + (r"oeu?(?=\w)", "eu"), # oe/oeu -> eu ("(?<=[^0-9])s$", ""), ("(?<=u)l?x$", ""), # eaux, eux, aux, aulx ("(?<=u)lt$", "t"), ("(?<=[a-z])[dg]$", ""), ("(?<=[^es0-9])t$", ""), - ("(?<=[aeiou])(m)(?=[pbgf])", "n"), + ("(?<=[aeiou])(m)(?=[pbgft])", "n"), # m -> n before labial/dental consonants (e.g., impossible -> inpossible) ("(?<=[a-z]{2})(e$)", ""), # Remove "e" at last position only if # it follows two letters? (r"(?<=[aeiouy])n[dt](?=[^aeiouyr])", "n"), # montbon -> monbon - (r"(\D)(?=\1)", ""), # Remove duplicate letters. 
+ (r"([a-z])\1+", r"\1"), # Remove duplicate letters (one or more repetitions) ) COMPILED = list((re.compile(pattern), replacement) for pattern, replacement in RULES) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2e0e6f3..ec16a1c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -37,17 +37,18 @@ ['placis', 'plasi'], ['courcome', 'kourkom'], ['hazebrouck', 'azebrouk'], - ['blotzheim', 'blotzeim'], + ['blotzheim', 'blotzaim'], ['plouhinec', 'plouinek'], ['hirschland', 'irchlan'], ['schlierbach', 'chlierbak'], - ['aebtissinboesch', 'aebtisinbech'], - ['boescherbach', 'becherbak'], - ['affelderwoert', 'afelderver'], + ['aebtissinboesch', 'ebtisinbeuch'], + ['boescherbach', 'beucherbak'], + ['affelderwoert', 'afelderveur'], ['boeuff', 'beuf'], + ['boeufs', 'beu'], ['humeroeuille', 'umereuil'], ['aigueboeuf', 'aigebeuf'], - ['boeshoernel', 'bechernel'], + ['boeshoernel', 'beucheurnel'], ['10a', '10a'], ['10b', '10b'], ['10c', '10c'], @@ -101,7 +102,8 @@ ['1y', '1y'], ['1z', '1z'], ['quimper', 'kinper'], - ['georges', 'jeorj'], + ['georges', 'jorj'], + ['gorges', 'gorj'], ['h', 'h'], ['vallee', 'vale'], ['valllee', 'vale'], @@ -112,6 +114,15 @@ ['grandchamps', 'granchan'], ['vingt', 'vin'], ['vingtieme', 'vintiem'], + ['champvallon', 'chanvalon'], + ['champol', 'chanpol'], + ['montbon', 'monbon'], + ['montgros', 'mongro'], + ['blanc', 'blan'], + ['montee', 'monte'], + ['seigneur', 'senieur'], + ['pforzheim', 'pforzaim'], + ['hyppolyte', 'ipolit'], ]) def test_phonemicize(input, output): assert phonemicize(Token(input)) == output