Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions addok_fr/resources/synonyms.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,18 @@ brtl => bretelle
bsn => bassin
car => carriere
caref => carrefour
cd => chemin departemental
cdt => commandant
cf => chemin forestier
ch, che,chem => chemin
ch,che,chem => chemin
chev => chevalier
chl => chalet
chp => champ
chps => champs
cha,chs,chse,chss => chaussee
cht => chateau
clef => cle
clefs => cles
clr => couloir
col => colonel
cor,crnc => corniche
Expand All @@ -60,6 +63,7 @@ cou,cr => cour
crx => croix
ctre => centre
cv => chemin vicinal
dept => departement
devi => deviation
dig => digue
dom => domaine
Expand All @@ -86,10 +90,11 @@ gde => grande
gdes => grandes
gds => grands
gen,gal,gl => general
gir => giratoire
gpl => grand place
gr => grande rue
grp,grpe => groupe
hab => habitation
hab,habit => habitation
ham => hameau
hle => halle
hlg => halage
Expand Down Expand Up @@ -131,6 +136,7 @@ pae => petite avenue
pass,pge,psge => passage
pch => petit chemin
pdt => president
periph => peripherique
pkg => parking
pl,pla,plac => place
ple => passerelle
Expand All @@ -140,8 +146,9 @@ pnt => pointe
porq => portique
pr => pere
pref => prefecture
prl => parc residentiel de loisirs
prof => professeur
prom => promenade
prom,prm => promenade
pta => petite allee
pte => porte
ptr => petite rue
Expand All @@ -152,15 +159,17 @@ qu, q => quai
r => rue
ran => rangee
rav => ravin
rd => route departementale
rdl => raidillon
regt => regiment
renc => rencontre
rep => republique
res => residence
rfg => refuge
rle,ruel,rul => ruelle
rn => route nationale
rpe => rampe
rpt,rdpoint,rop,rdp,rondpoint => rond point
rpt,rdpoint,rop,rdp,rondpoint,rdpt => rond point
rtd => rotonde
rte,rou => route
ruet => ruette
Expand Down
15 changes: 11 additions & 4 deletions addok_fr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@

RULES = (
("(?<=a)(mp|nd|nt)s?$", "n"), # champ(s) > cham
(r"([aeiouy])mp(?=[^aeiouyr])", r"\1n"), # champvallon -> chanvalon
(r"ngt(?=[aeiouy])", "nt"), # vingtieme > vintieme
(r"ngt", "n"), # vingt > vin
("((?<=[^g])g|^g)(?=[eyi])", "j"),
("(?<=g)u(?=[aeio])", ""),
(r"(?<=ei)gn([aeiouy])", r"ni\1"), # seigneur -> senieur (only after ei specifically)
(r"je([aeiouy])", r"j\1"), # georges -> jorj
("c(?=[^hieyw])", "k"),
(r"anc$", "an"), # blanc -> blan
("((?<=[^s])ch|(?<=[^0-9])c)$", "k"), # final "c", "ch",
# but not "sch" and not 10c.
("(?<=[aeiouy])s(?=[aeiouy])", "z"),
Expand All @@ -24,19 +28,22 @@
("sh", "ch"),
("((?<=[^0-9])w|^w)", "v"),
("c(?=[eiy])", "s"),
("(?<=[^0-9])y", "i"),
(r"((?<=[^0-9])y|^y)", "i"), # also handle y at beginning
("esn", "en"),
(r"oe(?=\w)", "e"),
(r"eim( |$)", "aim"), # pforzheim -> pforzaim
(r"(ae|ei)(?=\w)", "e"), # improved ae/ei handling
(r"oeufs( |$)", "eu"), # special case for oeufs
(r"oeu?(?=\w)", "eu"), # oe/oeu -> eu
("(?<=[^0-9])s$", ""),
("(?<=u)l?x$", ""), # eaux, eux, aux, aulx
("(?<=u)lt$", "t"),
("(?<=[a-z])[dg]$", ""),
("(?<=[^es0-9])t$", ""),
("(?<=[aeiou])(m)(?=[pbgf])", "n"),
("(?<=[aeiou])(m)(?=[pbgft])", "n"), # m -> n before labial/dental consonants (e.g., impossible -> inpossible)
("(?<=[a-z]{2})(e$)", ""), # Remove "e" at last position only if
# it is preceded by at least two lowercase letters.
(r"(?<=[aeiouy])n[dt](?=[^aeiouyr])", "n"), # montbon -> monbon
(r"(\D)(?=\1)", ""), # Remove duplicate letters.
(r"([a-z])\1+", r"\1"), # Remove duplicate letters (one or more repetitions)
)
# Compile every rule's pattern once, keeping its replacement alongside it.
COMPILED = [(re.compile(pattern), replacement) for pattern, replacement in RULES]

Expand Down
23 changes: 17 additions & 6 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,18 @@
['placis', 'plasi'],
['courcome', 'kourkom'],
['hazebrouck', 'azebrouk'],
['blotzheim', 'blotzeim'],
['blotzheim', 'blotzaim'],
['plouhinec', 'plouinek'],
['hirschland', 'irchlan'],
['schlierbach', 'chlierbak'],
['aebtissinboesch', 'aebtisinbech'],
['boescherbach', 'becherbak'],
['affelderwoert', 'afelderver'],
['aebtissinboesch', 'ebtisinbeuch'],
['boescherbach', 'beucherbak'],
['affelderwoert', 'afelderveur'],
['boeuff', 'beuf'],
['boeufs', 'beu'],
['humeroeuille', 'umereuil'],
['aigueboeuf', 'aigebeuf'],
['boeshoernel', 'bechernel'],
['boeshoernel', 'beucheurnel'],
['10a', '10a'],
['10b', '10b'],
['10c', '10c'],
Expand Down Expand Up @@ -101,7 +102,8 @@
['1y', '1y'],
['1z', '1z'],
['quimper', 'kinper'],
['georges', 'jeorj'],
['georges', 'jorj'],
['gorges', 'gorj'],
['h', 'h'],
['vallee', 'vale'],
['valllee', 'vale'],
Expand All @@ -112,6 +114,15 @@
['grandchamps', 'granchan'],
['vingt', 'vin'],
['vingtieme', 'vintiem'],
['champvallon', 'chanvalon'],
['champol', 'chanpol'],
['montbon', 'monbon'],
['montgros', 'mongro'],
['blanc', 'blan'],
['montee', 'monte'],
['seigneur', 'senieur'],
['pforzheim', 'pforzaim'],
['hyppolyte', 'ipolit'],
])
def test_phonemicize(input, output):
assert phonemicize(Token(input)) == output