From 13842221b3a710b2f473b49c2ea3933cfbdb67bb Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Sun, 15 Oct 2023 17:53:43 +0530 Subject: [PATCH 01/14] Resolved parity issue with normalize-punctuation.perl --- sacremoses/normalize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index b29d316..9e35b4c 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -29,6 +29,7 @@ class MosesPunctNormalizer: NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')] # lines 33 - 34 NORMALIZE_UNICODE = [ # lines 37 - 50 + ("‘([^‘’]*?)’", r''''\1"'''), ("„", r'"'), ("“", r'"'), ("”", r'"'), From ac07ddb5bcd1c057922156d5e18bd9dc9c7bb9e8 Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Tue, 17 Oct 2023 22:27:32 +0530 Subject: [PATCH 02/14] to convert all double quotes before s to 's --- sacremoses/normalize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index 9e35b4c..af441d1 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -30,6 +30,7 @@ class MosesPunctNormalizer: NORMALIZE_UNICODE = [ # lines 37 - 50 ("‘([^‘’]*?)’", r''''\1"'''), + ('"s ', r"'s "), ("„", r'"'), ("“", r'"'), ("”", r'"'), From a020b7638d1c160cfec43e189adcaef3ca3dff8a Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Thu, 19 Oct 2023 19:08:18 +0530 Subject: [PATCH 03/14] Fixes most of the differences only 4 left in dev and 8 left in devtest --- sacremoses/normalize.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index af441d1..e3d133b 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -29,8 +29,6 @@ class MosesPunctNormalizer: NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')] # lines 33 - 34 NORMALIZE_UNICODE = [ # lines 37 - 50 - ("‘([^‘’]*?)’", r''''\1"'''), - ('"s ', r"'s "), ("„", r'"'), ("“", r'"'), ("”", r'"'), @@ -42,7 +40,7 @@ class MosesPunctNormalizer: 
("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"), ("‘", r"'"), ("‚", r"'"), - ("’", r"'"), + ("’", r'''"'''), (r"''", r'"'), ("´´", r'"'), ("…", r"..."), From 876635d13095a714c86fc8b1e8f1899c98e8c73d Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Thu, 19 Oct 2023 21:12:54 +0530 Subject: [PATCH 04/14] decreases diff 1 for dev and one for devtest --- sacremoses/normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index e3d133b..38c96f5 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -47,10 +47,10 @@ class MosesPunctNormalizer: ] FRENCH_QUOTES = [ # lines 52 - 57 - ("\u00A0«\u00A0", r'"'), + ("\u00A0«\u00A0", r' "'), ("«\u00A0", r'"'), ("«", r'"'), - ("\u00A0»\u00A0", r'"'), + ("\u00A0»\u00A0", r'" '), ("\u00A0»", r'"'), ("»", r'"'), ] From 9411cca180bf70112c43bbe649eccd813e5ab05a Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Thu, 19 Oct 2023 22:04:02 +0530 Subject: [PATCH 05/14] changed quotes --- sacremoses/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index 38c96f5..8c7729a 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -40,7 +40,7 @@ class MosesPunctNormalizer: ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"), ("‘", r"'"), ("‚", r"'"), - ("’", r'''"'''), + ("’", r'"'), (r"''", r'"'), ("´´", r'"'), ("…", r"..."), From 6d28db5f31af6bed8db11e399f13edcf444fc61a Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Fri, 20 Oct 2023 12:00:10 +0530 Subject: [PATCH 06/14] option added to switch between parity and previous regex --- sacremoses/normalize.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index 8c7729a..f4ce3a5 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -40,17 +40,17 @@ class MosesPunctNormalizer: ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"), ("‘", r"'"), ("‚", 
r"'"), - ("’", r'"'), + ("’", r"'"), (r"''", r'"'), ("´´", r'"'), ("…", r"..."), ] FRENCH_QUOTES = [ # lines 52 - 57 - ("\u00A0«\u00A0", r' "'), + ("\u00A0«\u00A0", r'"'), ("«\u00A0", r'"'), ("«", r'"'), - ("\u00A0»\u00A0", r'" '), + ("\u00A0»\u00A0", r'"'), ("\u00A0»", r'"'), ("»", r'"'), ] @@ -132,6 +132,7 @@ def __init__( norm_numbers=True, pre_replace_unicode_punct=False, post_remove_control_chars=False, + pearl_parity=False ): """ :param language: The two-letter language code. @@ -150,6 +151,11 @@ def __init__( self.HANDLE_PSEUDO_SPACES, ] + if pearl_parity == True: + self.substitutions[1][11] = ("’", r'"') + self.substitutions[2][0] = ("\u00A0«\u00A0", r' "') + self.substitutions[2][3] = ("\u00A0»\u00A0", r'" ') + if penn: # Adds the penn substitutions after extra_whitespace regexes. self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN) @@ -196,4 +202,4 @@ def replace_unicode_punct(self, text): return text def remove_control_chars(self, text): - return regex.sub(r"\p{C}", "", text) + return regex.sub(r"\p{C}", "", text) \ No newline at end of file From fa2d4b04f43f750aaecf538b078fea713482b06e Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Fri, 20 Oct 2023 21:18:11 +0530 Subject: [PATCH 07/14] changes according to comments --- .../__pycache__/normalize.cpython-310.pyc | Bin 0 -> 3995 bytes sacremoses/__pycache__/tokenize.cpython-310.pyc | Bin 0 -> 16674 bytes sacremoses/normalize.py | 15 ++++++++------- sacremoses/test_normalize.py | 12 ++++++++++++ 4 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 sacremoses/__pycache__/normalize.cpython-310.pyc create mode 100644 sacremoses/__pycache__/tokenize.cpython-310.pyc create mode 100644 sacremoses/test_normalize.py diff --git a/sacremoses/__pycache__/normalize.cpython-310.pyc b/sacremoses/__pycache__/normalize.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6020a67fa2db0f3352dff6a847f831d9ef5c78d GIT binary patch literal 3995 
zcmaJ@-ESMm5x?C#Qb$VEha)F-UAt!@j&-p_Ca#?}vKm#9s7QrPDiQ^!6xA!mJ5#Jl z9WbxzU*cuU&R{dL0 zgD1I}cQp8Cw57}Lrd`s?Hgh%mqULT|ni!{*abP9m=0RHW#%mXuT@)rZUAOE^Mx(jB zyII+YWbIp(bKC`*Y1K$JJ}=}utI5F8(WJUo18v7zOGmOdOA1t;wvDK z(_>nb;3!#3>rJ9V&p(znb?tc7FO-f0hia2eGATM|o8+uco8%`=aspa<;slHgYx5;kzC12Q~;y5B^a=PxX@~nef0(Ff^etAL#4!A%&)Kj|~V^fz*}8)-n^- znTgc8et+v$7c~2Yn2hN09b7uaOulgp`$?um|`LIY!O19EqXXT-6YdpbgW5^ z0m67R>Sq}qByBj}s8{KDXRoSicx-JvvVIa0OYB7$Gi5wnrQvbt4UMHEC-v}H+IfEM z$5W7f{b8Zi-mr&g7c;I=M_O%d=XhJnj)Jt!i?n#pww>7Be!odzf(izGXX}?>zeyvV z2<&WKW#0w?8%8n&q&@8ZjlXw7gDczJ{%kM&oPDdSytDOQXCqO0XY1F!aCiG%rgoKg zx397zz3>L>>xIAX4dKq#Z+cDbY`xzLceg*}qe7IwJq*!z1t-CQMQ8jVUw(R;G-s4Z-d?KU$Mhrf>W!MoX=uNI}i3TS!dyO z{t2Yk$3{$2lYR;%WD05$^oXfI&nJ5RDG6GNa>Ve;Ii)5UN{zDT04a5!*$1uv4kc1e zX45rds+TA`Eo3NB?+_xD`hg#KO9DBLL4E-FHH9TX5(LS4PZAU(P^YEYkJue>L&*6+ zx&~gr{NSH4Sn~rNGW($j4P-WnB#CnzijFo}&ag~TD`ZN!N+xGbLK(6&_icNkV7X8n zJb{LFBhnq_UM^cgIH@@<>+)OX%2uf~sd*(%TLP>lnPqKcN|#{3<$|TXS+QMF%A7~d zwnlH*cB|aG{pOnAJHfhW#WsuVXYwr7eqYd@o& z_8xmWWd|X9#5eY!A9}#Rq5`EY%wQi79}+mMJ-}>zfK2uRN@Pw=3v$!p^di%J>@-Tw z0O2%Sh1X#(0f_6vd?cP&K0kAAF%>gbKR?5tor@W<6=Qz+|7s1k zv}RMrEb4M!BDoM-=1L(CnTB`3Qciz2xS7LGCH5a0r zcF~GDdDDa`+|B5m;@a+FvYdrak-yaIqRJ=ALz|>v7arHK-BsJk(?t3Zo+qLZPtV>|C8{ Tt^5@%Cyt6P$SN722ju?&1{e#z literal 0 HcmV?d00001 diff --git a/sacremoses/__pycache__/tokenize.cpython-310.pyc b/sacremoses/__pycache__/tokenize.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43326779b708e1a8a6bdde9cd9ba5fc40250a7e6 GIT binary patch literal 16674 zcmb_@Yjj&zb{_7z7f*r|Nl_&AvLT8V2vHSaAF2_hkp775Y>WJw@M2;#jWA%XyP zFKAg1IO7>9vE6hMdnTP^GMJ~u?Von7@mft+fAlqJo8BgA>!fX*cBW~k zlO~+8)Nk*LHz_AwtAx(E``c%q_j{kMY-6LxhQHtV?OyGT&)aOjPoVlw0k{Zf!8UPS0=paDi_in2!;oh_n4M(2QDg5(gR^FSAXjxo9j6a+fX=n~K{&uvOp_96O8Tw=|Evd22c)Y zlF=&AYd}+s?gLSKzQX8rAex8Mj2;wGgi#*oA;O}J3P5W>*BE^c=<`6YGI|5(3qaQy zeG%vp&<#e9fqnw$CLX$iX;@;BY+N?IGCA3=$_E?f->T|!%+^epjw z^}X2gf=z3ESJHOE`QQ$~b-^8k>xO#)?hu?Gt_SWgTrb=axIVakxBxJz)ua3gRbxXW-?;6~xb;9iEi3O5cn 
z0T+gwgqwO-iec(St%us(xegCm7mQz1ae;MCZ^_%Lq@O@4FYwCyi-dF#+ z`YZUpuKud}H}HL+{!R7Q@Xf1#PyO5KuRoSz58ZzYzu$2GRrT+ye@Feh;1u4pDYn=e zcJO~-Hcey=&4^}~v`i|m>G`jEeE5qlrVJncC4Ey5a*NrFZzZeeeAzp`+@j{=7`}28 z-yJ=>%&``8xfSEgz`#N(x43${e?GfBuuQ4<8Ou3~QnmT4s_6r_)7jet%Sj^#+L+f< zD>-8zS1Ln=>R-XJf|E9FraWeZP3M>qOlMY?O=Zl8%(I`hm@A7(BF3}#Q42Y4^lnPN ze8Wuq=+UEn@uSC1%?3nX_)ix9xmc+VCKkd-i~aJ`0^gmMQ+{> zMtqQ@9-^2C=+4V?w{GR-Tep0ua#!9pcMGz&kfU;oB*mN`fj*?tGeea;Hyh~6JCHC% zXejfLL~{icQ=n=hk)f)%el8TEI^zBG%X@u^(D?Y+RAelY@R^0e>%;8K8j!07|=)Y zzh(jYC>ei_LinTj$H!TWK8k-j#DG4E|5XdnM+y6*6vCfIjH==J*+4Rr$s#2W23<1a zW4ZxBBPxpgKtu5n;P(;b1^ddvY#_C~4CDqswd^CZ6p+nil2!l~CL+rLxm+nA=OdB= z#?!L_t(=r*B}Jv0Se^}JbMU=P1zMlkgGoMMy#5A4u8fJB-;!p6@U^>d>bBP;z&1%5wSL^n{my`e)+z<9+c2t^Gye9uqm`mcwUBN>CXXz_AcTX|4_2oa<-iGmhF*P7aKOoRsfxlRh)Gn z+j7dfy?~qK2=2e&ItoI6dBH|h&RGzu{7C%F!8N%ckJx5A*OY>??%J@?S3d*Vt>QLq zfO^Yz*YTR9pP}!%rvR>GyVrFc8U_WwN6;^1T!S_vbRT^B-Gt*aj2&fgQLJMA4WMP! z@tRHlUDdf}XXFA}F4%Kk)w$l7Yg*qyw`aQ7pRyGtJdpTYjRfDBt0cUT_*RWXVawh_3w82}j6>PJPHK`!2+cs?PihCkr`_M*n z+{oeV1a*q&KWZw8%vw=dTw!GL^GCh*_8n(X%_V4vN2#J z=XK3Gq20;qOI5gh1nXaUV0zA_Q$`M_xs~(%D5pWg&tY>(j;HzF+S6lyB9Y3Za*0I$ zxpa0unKsUYD8)4;eN#HPrKWbIwiC|ihQ~E+TEkFX$1vWIbb&fU^@GcUw7XbyEL3!c z)2Yc&_2$K-k-(Wi(^K<_Y$lEJ>MN*kDwoP;AbSjArgZ;9$y5w2eaC8`6O_P7c;~1> zUJwqO5H3l4>P58ziYfAorns9l#ZuOkp|UET74?f0F$m9ej3zS+t7+tLzRZ)s1LH!O z;Tlcp$%SMlX}Yc^bIB!8dNYN#kk?8c)m`63-ahj7!wYnxXIF9d)uH>cnXXc+no1gZ znr)eiaxJk^)UKKK`&nHz%(n1k_|kMJcy%m%B{4M}x;%Cx6fu?Tik8ubC^e6fL+cp# zaYi>C$rbQbb65DzgdYTV|ba$-E1mjdg&xyQt+Afd5kDSZ=?c~qRte$ zW;$tvq|%z{z+hTV=FA;a!I4CGGMX5R1jmts*%7>QWjYjzjJ+C4jNY6Y4TTetsqwL> z**X@EP6wltSBWzojEsgN0f+t)B|l2Z`{q}TTy{CJUBwf3C{Otg3aO-18O?N~yAoVL z^tn5cm`^7SBazV0QF7ezS3vbp$A>{Q35Vp6U6SjmVs}Vh36#_%c|eIb#EEyhY;7~{zH>q`-kP@||2kqpg){*np-LZUi?B5dc+0FDKQnQ+=*f7*slD>BVmFp%f0t=p-#? 
zdeqdNJDRR#<~5y~`9opNmicW;>`WPHx)d)^IVr1lDc@c}N>v`DG=S)o!Q~S8 zVjrh+J;IRFQwYSGDJL@z^t+%M}F4XNDDX45?ItuEiDg!*YDyId+- zvy^|nu4wgpPfY>oh(-%Ick6F~u=ejYYCaTjP0nF{Kf=*$_C{Q+DQoteysn`5cFbx= z!J(@K$2tyB818uLC^&J%IaIK%I|_J`Fs>Aw=mO#$Wy)<+AXfQOO} z+ionOh~=(eRoRXGR3E+PEylFzbCzsDUINXfuvg?agg!^Is-we3+;g^jbX4;cP%<>K zE1Y-F_257ri1&AYg0`~h3*-@%ch1f5@f@0$eoOPxfURvMoywVVTFaPjjd$)OMhmS# z+!3aILC>zP=qtpyO5P-S%jCTRFW{<-b<@FGBg6C{okWqAQl_0rvzhBwb3(J^6|^C=x;+AuUiL`hX`2vFgx#Zf zrB)yzdxdzzBW*b%eO4Yd!-tN5rTO;*9{@VshQZ>&U=cX8V+$zI!jEC30AAHTRH~a#*A}$bS4=Nfegj9QRi3bQOP)tF=94R0 z;`Qb9*U$(*pz|N?eahCUT9Le5b7P@;lZ1(FCaWm|XS^I!#OYFS5$ehriF%V@_$Aj4!7{bDhjA`FN<13e=ZWlPft* z_1%6@QFoS8Tu#m};*^1S$&8UsQhWJu@L9&$DEFYhEQaID6sl>qS$#1T3WrO*kQn5C z(AS<7KBOB63fQaq$)jtzRXvlyflU7b0`)JFSJzLCILl?MNq~C%B8^wt8(fYifo)ij zk0oqh-4a$4DLPM`Tl%AOG4+}ZXZ!)u%i-quNP3$_BJ~{JQi{FeSaV{r!XovSZ9}|m zyXPs`^$XSwv|y*Z-zP+^Q>LQtLR@8n}JYBGeXW z-QW8Qwq8TQ!>#vEksEHO+y;vNM?kxZNY=mq7Um6Fx1HyYE4dS0T^`BLd;lm&_2j~Wt{Fz^H7%hgjl~4-DG;Ar;Tf6BVuGd?7BHJB*vG*>eHDwlZ@b){l^h)r z%GqzZy3$;U(A?MqkFTz7KD^@ulEduS5Y|d&aJWC#OTh!APilL#?{H zTelRtp8jK+#B^rcW81Qkl-hs0yLL@cY5LMMrkiHXEt}3GGzc(o9emf2pk0vfxkS4h7yBYI73@F$E>^P-cb0;_*i5VWe5$AUB;z3I5ioGPEX=KYwo^0ofwO58`Ob| z`E~);KG1>t@7Orq*>NqYN>N{=GCv}ZHwACirqI6qsMX)38ak}4*lgX>Q$}S6F`Fx) z)`Zg^P*!j(+59wxQn#9}((wB=>f1T6TrF+bVHDnLgm64*cC_M<;F5*# z?C^rmmm^tG6dWA9pElV&G94nsQ|wIE*NMxM>=Dh_s~|5o;~a&3OU5}%G>a`4>r(!h zKr~=rg>*TfT)rF)2R=s{lT311OC<2(lvvKHnD?N)iNw9tWV#sQOeECoJoEyzPxHZ2 zpC*bXwC*O4=8R5roOK*}15vakS{EAHS}Rjp{|dpMg=e}a#zqnt5MxuOXFPNnqk-v$ z>9H%LMWDY*(&aH}`ldru;~3H0`PavyqX}q5Les%JiZM@snlQt^usqmzjUh8L3H?nnJbgW-{J z+#PD!o~jkD5outp;&Fqj*TA7#c&rv4uZ1US;mKNfsusRj3s0L(HN`kuhX?EMP#r#2 zhmY6c6Lt7x9X?fuU#!EY&7H&Jp&$(@w3W46(d637#N^t_JZ3f*FK@vKjMNA^#cU{Y zrpBhQ%@hGIFG23XOX!oUp{Tj17-X$BwP8DpVb#5BmYBsc!6TuZinV^dat(7JQOaR& zO%9RTz(&^T7B@x=k7|hFQ4KLXsv(9)H9VpJ0?pUvx(yW@bsYyAbsY!ISI0T7f1Q@S z_sIJOCA<4lFoK=CeDJ`8j$*+Lqm!N)H1zDCW6usc{_LO=`cF`Sev&*G_}a|QXOt>p zwqFlj+FII(PMT~!(OBmM){(3bj}%vYvarq$ZxMVSp4nJbh|g>3G&aA=6_U1a>)=T` 
zD|%ruTsV2UY>L$85K@<f5xkn2dS~NsFRWzLlWQm_*+w?^ew46*>+aRD^J=L%1_7wWu~9i~l5i;y;Q5;y;Kk@q6N+_+8O0en-3@{=GPav#?+MJJBP4TLi?v z6^F&Y5xwGHizDKXJpa zM{8L3s<6dYov;^dWCVL=)eY3d2=?4+1JDjeUZ6&xW=445Rd)cjFlq*B0cvH`3Pcvw zI~kQMs@oWqEUI@gDp^#wGunfAlppM}QL?>2WQpC$Xde(+V(->EG}wM?En26xTWi&J zYHivst$m>(wg+~7`(f9&7vJab-G}e<`0nTUzV+wS2H2=ms`RDw=(~RdFBcmU?6N7( z1MAO`6+5txaThSZlE)4(K3K+GjJwPDAmbMhvzsYcYR6ts2U#1_qXuAMct}00_QGP& zuMVkwYX4&?)&pulJqjwomV$?&Rl}mnilY&&bnw>mU+1gwXX@BEj;|`x|9@*iwNfI> zUXV-054P69ou4Ubq>_=*MVNcQGEeG*WeHh!SZ~P(5xOZA^cV2e=^?N;@yaVv^_CvRm`dRwuN%|~lxwC2$I5;~)^0qp)J3(XfjQVt!ennI{_tV^JO3#DosHSvZrDx2Sc!}( z^DgoMV46gdv-!PqGq4ryk6(x%h@Xj{jmP6J#Mj6)5b;)5%Wz&tCf{{F3gNNbt&ymM(eZt&h>9wzgi-*DbR|fqJ60V#sSZCl} z%~T!;zJ>DWk0F)cU%#;QSIb4`sSu?}^yr7*nu0eq1?px6 zic=YiQ#)SL?CVOw0iz|nu@z+fIjBDEWWuzj;DNaGQ{J=Vl})Uc~p5x(XS&^0`2U?%bV(Asp|#A zfH9Lh;{fFQNgYyHPCH24e3hrnIq*tQWGsN##Zzev{L0EJ$h-UJAiyyaO2ZDdR>`%Og&) zZdLg&bB|Zc zZxc#2+Q&J6m-E|Sm*00$%8qh7eYd7Btx;ZGg3`F}teKx~%ukHs_1&0xc&$d={)s^J)VxPf+#W;> zRFb#e=4ziRV3hrE%~xfpX4gN(lEvqUyts91Q@WKGe!q1}xkw>8yicCuPk$Fwz8Skp z)s8Z2?HZ<-H)t>7NA>Pco&JX0(rKcP)Ue+S;G3*OvJ-^Xu! z1-q7aE}iNw%q=HV>HfpvRZ?Tl%~+e$$8UUtwkIFOp{)qq;`co+eO`e9Y&R71sk}JT z9oS>3F<*hCr9@@@`YnQFC&&tGomKJ*KOoRLZt2il@xK9vuT4X-HheQ->F2Dce-{F# zu7!uAtXVbN0ZZ!kDus`pLrUf@1YV{jMiU5|j#@e?vxQY)kp!zzgX2@9LH%tKS2AS7 z)QCuvc#X9*UZ%VNW12?j1jD4!=Vx-$JsB=OF!0kspwZe_?^EgDq?BBx?a6xT_yPrd zkpjGAsjVgEv#Xh$VK&0ZfUN8AHU--#v$--h)danIRjJyWGGsBA#;dIg6?L|5-{gz? zuhRDY9puvNa^VIF1v#t571*V@q;}jf_pu=YEZA_TBflMH3NBC{*tU^*f(zDdM3X!& zxlfyLy~PFZN4W2S52ZV}uw{GVB`Y|&6<7hrHCk3a?R0n~2d>)+F5LcQht}x)D(o8lpsy?) 
zxfzZIZ@{863{}|#>(ME1^T20GZ~@HLvPeavSxLCQa*6TKmEiDA(_fDDjI2w7J)b3L zHG!3+-bc;aTAJ_^lhae9kx&$BzpcEXZFtA5CGP@_n^WYSChrV+XUTh!yqCzMBe%5- z($Lc{6YtN6stmV^#*I$$`)PvT2EiIPON+LhCd_&*D{Hk%`WyNxKXf&@PQyOQOGd0P z(Eha9(G0azGZqZ8|8OZDyY2I>d-&!M*kP@)Efp(p{Et0-W!2D85}QsNfVrb85Vk>S zo#v$3T*al=;rXhkG9KS!B<0B7zKbVfRvdATCZ61qby`Pm4Za+>&RGw2=h0F&!w1g= a6Dlt(Dw`z-*0E;U_Ic~q+)7{X!2bo4>{W{Z literal 0 HcmV?d00001 diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index f4ce3a5..1159a42 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -132,7 +132,7 @@ def __init__( norm_numbers=True, pre_replace_unicode_punct=False, post_remove_control_chars=False, - pearl_parity=False + perl_parity=False ): """ :param language: The two-letter language code. @@ -144,18 +144,19 @@ def __init__( :param norm_numbers: Normalize numbers :type norm_numbers: bool """ + + if perl_parity: + self.NORMALIZE_UNICODE[11] = ("’", r'"') + self.FRENCH_QUOTES[0] = ("\u00A0«\u00A0", r' "') + self.FRENCH_QUOTES[3] = ("\u00A0»\u00A0", r'" ') + self.substitutions = [ self.EXTRA_WHITESPACE, self.NORMALIZE_UNICODE, self.FRENCH_QUOTES, self.HANDLE_PSEUDO_SPACES, ] - - if pearl_parity == True: - self.substitutions[1][11] = ("’", r'"') - self.substitutions[2][0] = ("\u00A0«\u00A0", r' "') - self.substitutions[2][3] = ("\u00A0»\u00A0", r'" ') - + if penn: # Adds the penn substitutions after extra_whitespace regexes. 
self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN) diff --git a/sacremoses/test_normalize.py b/sacremoses/test_normalize.py new file mode 100644 index 0000000..f9fee2d --- /dev/null +++ b/sacremoses/test_normalize.py @@ -0,0 +1,12 @@ +from normalize import MosesPunctNormalizer +import sys + +def test(fileName): + a = MosesPunctNormalizer(perl_parity = True) + file_path = fileName + + with open(file_path, "r") as file: + text = file.read() + print(a.normalize(text)) + +test(sys.argv[1]) \ No newline at end of file From 1f7479553842e921244f6e193f0b59820e0afe59 Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Fri, 20 Oct 2023 21:19:05 +0530 Subject: [PATCH 08/14] changes according to comments --- .../__pycache__/normalize.cpython-310.pyc | Bin 3995 -> 0 bytes sacremoses/__pycache__/tokenize.cpython-310.pyc | Bin 16674 -> 0 bytes sacremoses/test_normalize.py | 12 ------------ 3 files changed, 12 deletions(-) delete mode 100644 sacremoses/__pycache__/normalize.cpython-310.pyc delete mode 100644 sacremoses/__pycache__/tokenize.cpython-310.pyc delete mode 100644 sacremoses/test_normalize.py diff --git a/sacremoses/__pycache__/normalize.cpython-310.pyc b/sacremoses/__pycache__/normalize.cpython-310.pyc deleted file mode 100644 index e6020a67fa2db0f3352dff6a847f831d9ef5c78d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3995 zcmaJ@-ESMm5x?C#Qb$VEha)F-UAt!@j&-p_Ca#?}vKm#9s7QrPDiQ^!6xA!mJ5#Jl z9WbxzU*cuU&R{dL0 zgD1I}cQp8Cw57}Lrd`s?Hgh%mqULT|ni!{*abP9m=0RHW#%mXuT@)rZUAOE^Mx(jB zyII+YWbIp(bKC`*Y1K$JJ}=}utI5F8(WJUo18v7zOGmOdOA1t;wvDK z(_>nb;3!#3>rJ9V&p(znb?tc7FO-f0hia2eGATM|o8+uco8%`=aspa<;slHgYx5;kzC12Q~;y5B^a=PxX@~nef0(Ff^etAL#4!A%&)Kj|~V^fz*}8)-n^- znTgc8et+v$7c~2Yn2hN09b7uaOulgp`$?um|`LIY!O19EqXXT-6YdpbgW5^ z0m67R>Sq}qByBj}s8{KDXRoSicx-JvvVIa0OYB7$Gi5wnrQvbt4UMHEC-v}H+IfEM z$5W7f{b8Zi-mr&g7c;I=M_O%d=XhJnj)Jt!i?n#pww>7Be!odzf(izGXX}?>zeyvV z2<&WKW#0w?8%8n&q&@8ZjlXw7gDczJ{%kM&oPDdSytDOQXCqO0XY1F!aCiG%rgoKg 
zx397zz3>L>>xIAX4dKq#Z+cDbY`xzLceg*}qe7IwJq*!z1t-CQMQ8jVUw(R;G-s4Z-d?KU$Mhrf>W!MoX=uNI}i3TS!dyO z{t2Yk$3{$2lYR;%WD05$^oXfI&nJ5RDG6GNa>Ve;Ii)5UN{zDT04a5!*$1uv4kc1e zX45rds+TA`Eo3NB?+_xD`hg#KO9DBLL4E-FHH9TX5(LS4PZAU(P^YEYkJue>L&*6+ zx&~gr{NSH4Sn~rNGW($j4P-WnB#CnzijFo}&ag~TD`ZN!N+xGbLK(6&_icNkV7X8n zJb{LFBhnq_UM^cgIH@@<>+)OX%2uf~sd*(%TLP>lnPqKcN|#{3<$|TXS+QMF%A7~d zwnlH*cB|aG{pOnAJHfhW#WsuVXYwr7eqYd@o& z_8xmWWd|X9#5eY!A9}#Rq5`EY%wQi79}+mMJ-}>zfK2uRN@Pw=3v$!p^di%J>@-Tw z0O2%Sh1X#(0f_6vd?cP&K0kAAF%>gbKR?5tor@W<6=Qz+|7s1k zv}RMrEb4M!BDoM-=1L(CnTB`3Qciz2xS7LGCH5a0r zcF~GDdDDa`+|B5m;@a+FvYdrak-yaIqRJ=ALz|>v7arHK-BsJk(?t3Zo+qLZPtV>|C8{ Tt^5@%Cyt6P$SN722ju?&1{e#z diff --git a/sacremoses/__pycache__/tokenize.cpython-310.pyc b/sacremoses/__pycache__/tokenize.cpython-310.pyc deleted file mode 100644 index 43326779b708e1a8a6bdde9cd9ba5fc40250a7e6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16674 zcmb_@Yjj&zb{_7z7f*r|Nl_&AvLT8V2vHSaAF2_hkp775Y>WJw@M2;#jWA%XyP zFKAg1IO7>9vE6hMdnTP^GMJ~u?Von7@mft+fAlqJo8BgA>!fX*cBW~k zlO~+8)Nk*LHz_AwtAx(E``c%q_j{kMY-6LxhQHtV?OyGT&)aOjPoVlw0k{Zf!8UPS0=paDi_in2!;oh_n4M(2QDg5(gR^FSAXjxo9j6a+fX=n~K{&uvOp_96O8Tw=|Evd22c)Y zlF=&AYd}+s?gLSKzQX8rAex8Mj2;wGgi#*oA;O}J3P5W>*BE^c=<`6YGI|5(3qaQy zeG%vp&<#e9fqnw$CLX$iX;@;BY+N?IGCA3=$_E?f->T|!%+^epjw z^}X2gf=z3ESJHOE`QQ$~b-^8k>xO#)?hu?Gt_SWgTrb=axIVakxBxJz)ua3gRbxXW-?;6~xb;9iEi3O5cn z0T+gwgqwO-iec(St%us(xegCm7mQz1ae;MCZ^_%Lq@O@4FYwCyi-dF#+ z`YZUpuKud}H}HL+{!R7Q@Xf1#PyO5KuRoSz58ZzYzu$2GRrT+ye@Feh;1u4pDYn=e zcJO~-Hcey=&4^}~v`i|m>G`jEeE5qlrVJncC4Ey5a*NrFZzZeeeAzp`+@j{=7`}28 z-yJ=>%&``8xfSEgz`#N(x43${e?GfBuuQ4<8Ou3~QnmT4s_6r_)7jet%Sj^#+L+f< zD>-8zS1Ln=>R-XJf|E9FraWeZP3M>qOlMY?O=Zl8%(I`hm@A7(BF3}#Q42Y4^lnPN ze8Wuq=+UEn@uSC1%?3nX_)ix9xmc+VCKkd-i~aJ`0^gmMQ+{> zMtqQ@9-^2C=+4V?w{GR-Tep0ua#!9pcMGz&kfU;oB*mN`fj*?tGeea;Hyh~6JCHC% zXejfLL~{icQ=n=hk)f)%el8TEI^zBG%X@u^(D?Y+RAelY@R^0e>%;8K8j!07|=)Y zzh(jYC>ei_LinTj$H!TWK8k-j#DG4E|5XdnM+y6*6vCfIjH==J*+4Rr$s#2W23<1a 
zW4ZxBBPxpgKtu5n;P(;b1^ddvY#_C~4CDqswd^CZ6p+nil2!l~CL+rLxm+nA=OdB= z#?!L_t(=r*B}Jv0Se^}JbMU=P1zMlkgGoMMy#5A4u8fJB-;!p6@U^>d>bBP;z&1%5wSL^n{my`e)+z<9+c2t^Gye9uqm`mcwUBN>CXXz_AcTX|4_2oa<-iGmhF*P7aKOoRsfxlRh)Gn z+j7dfy?~qK2=2e&ItoI6dBH|h&RGzu{7C%F!8N%ckJx5A*OY>??%J@?S3d*Vt>QLq zfO^Yz*YTR9pP}!%rvR>GyVrFc8U_WwN6;^1T!S_vbRT^B-Gt*aj2&fgQLJMA4WMP! z@tRHlUDdf}XXFA}F4%Kk)w$l7Yg*qyw`aQ7pRyGtJdpTYjRfDBt0cUT_*RWXVawh_3w82}j6>PJPHK`!2+cs?PihCkr`_M*n z+{oeV1a*q&KWZw8%vw=dTw!GL^GCh*_8n(X%_V4vN2#J z=XK3Gq20;qOI5gh1nXaUV0zA_Q$`M_xs~(%D5pWg&tY>(j;HzF+S6lyB9Y3Za*0I$ zxpa0unKsUYD8)4;eN#HPrKWbIwiC|ihQ~E+TEkFX$1vWIbb&fU^@GcUw7XbyEL3!c z)2Yc&_2$K-k-(Wi(^K<_Y$lEJ>MN*kDwoP;AbSjArgZ;9$y5w2eaC8`6O_P7c;~1> zUJwqO5H3l4>P58ziYfAorns9l#ZuOkp|UET74?f0F$m9ej3zS+t7+tLzRZ)s1LH!O z;Tlcp$%SMlX}Yc^bIB!8dNYN#kk?8c)m`63-ahj7!wYnxXIF9d)uH>cnXXc+no1gZ znr)eiaxJk^)UKKK`&nHz%(n1k_|kMJcy%m%B{4M}x;%Cx6fu?Tik8ubC^e6fL+cp# zaYi>C$rbQbb65DzgdYTV|ba$-E1mjdg&xyQt+Afd5kDSZ=?c~qRte$ zW;$tvq|%z{z+hTV=FA;a!I4CGGMX5R1jmts*%7>QWjYjzjJ+C4jNY6Y4TTetsqwL> z**X@EP6wltSBWzojEsgN0f+t)B|l2Z`{q}TTy{CJUBwf3C{Otg3aO-18O?N~yAoVL z^tn5cm`^7SBazV0QF7ezS3vbp$A>{Q35Vp6U6SjmVs}Vh36#_%c|eIb#EEyhY;7~{zH>q`-kP@||2kqpg){*np-LZUi?B5dc+0FDKQnQ+=*f7*slD>BVmFp%f0t=p-#? 
zdeqdNJDRR#<~5y~`9opNmicW;>`WPHx)d)^IVr1lDc@c}N>v`DG=S)o!Q~S8 zVjrh+J;IRFQwYSGDJL@z^t+%M}F4XNDDX45?ItuEiDg!*YDyId+- zvy^|nu4wgpPfY>oh(-%Ick6F~u=ejYYCaTjP0nF{Kf=*$_C{Q+DQoteysn`5cFbx= z!J(@K$2tyB818uLC^&J%IaIK%I|_J`Fs>Aw=mO#$Wy)<+AXfQOO} z+ionOh~=(eRoRXGR3E+PEylFzbCzsDUINXfuvg?agg!^Is-we3+;g^jbX4;cP%<>K zE1Y-F_257ri1&AYg0`~h3*-@%ch1f5@f@0$eoOPxfURvMoywVVTFaPjjd$)OMhmS# z+!3aILC>zP=qtpyO5P-S%jCTRFW{<-b<@FGBg6C{okWqAQl_0rvzhBwb3(J^6|^C=x;+AuUiL`hX`2vFgx#Zf zrB)yzdxdzzBW*b%eO4Yd!-tN5rTO;*9{@VshQZ>&U=cX8V+$zI!jEC30AAHTRH~a#*A}$bS4=Nfegj9QRi3bQOP)tF=94R0 z;`Qb9*U$(*pz|N?eahCUT9Le5b7P@;lZ1(FCaWm|XS^I!#OYFS5$ehriF%V@_$Aj4!7{bDhjA`FN<13e=ZWlPft* z_1%6@QFoS8Tu#m};*^1S$&8UsQhWJu@L9&$DEFYhEQaID6sl>qS$#1T3WrO*kQn5C z(AS<7KBOB63fQaq$)jtzRXvlyflU7b0`)JFSJzLCILl?MNq~C%B8^wt8(fYifo)ij zk0oqh-4a$4DLPM`Tl%AOG4+}ZXZ!)u%i-quNP3$_BJ~{JQi{FeSaV{r!XovSZ9}|m zyXPs`^$XSwv|y*Z-zP+^Q>LQtLR@8n}JYBGeXW z-QW8Qwq8TQ!>#vEksEHO+y;vNM?kxZNY=mq7Um6Fx1HyYE4dS0T^`BLd;lm&_2j~Wt{Fz^H7%hgjl~4-DG;Ar;Tf6BVuGd?7BHJB*vG*>eHDwlZ@b){l^h)r z%GqzZy3$;U(A?MqkFTz7KD^@ulEduS5Y|d&aJWC#OTh!APilL#?{H zTelRtp8jK+#B^rcW81Qkl-hs0yLL@cY5LMMrkiHXEt}3GGzc(o9emf2pk0vfxkS4h7yBYI73@F$E>^P-cb0;_*i5VWe5$AUB;z3I5ioGPEX=KYwo^0ofwO58`Ob| z`E~);KG1>t@7Orq*>NqYN>N{=GCv}ZHwACirqI6qsMX)38ak}4*lgX>Q$}S6F`Fx) z)`Zg^P*!j(+59wxQn#9}((wB=>f1T6TrF+bVHDnLgm64*cC_M<;F5*# z?C^rmmm^tG6dWA9pElV&G94nsQ|wIE*NMxM>=Dh_s~|5o;~a&3OU5}%G>a`4>r(!h zKr~=rg>*TfT)rF)2R=s{lT311OC<2(lvvKHnD?N)iNw9tWV#sQOeECoJoEyzPxHZ2 zpC*bXwC*O4=8R5roOK*}15vakS{EAHS}Rjp{|dpMg=e}a#zqnt5MxuOXFPNnqk-v$ z>9H%LMWDY*(&aH}`ldru;~3H0`PavyqX}q5Les%JiZM@snlQt^usqmzjUh8L3H?nnJbgW-{J z+#PD!o~jkD5outp;&Fqj*TA7#c&rv4uZ1US;mKNfsusRj3s0L(HN`kuhX?EMP#r#2 zhmY6c6Lt7x9X?fuU#!EY&7H&Jp&$(@w3W46(d637#N^t_JZ3f*FK@vKjMNA^#cU{Y zrpBhQ%@hGIFG23XOX!oUp{Tj17-X$BwP8DpVb#5BmYBsc!6TuZinV^dat(7JQOaR& zO%9RTz(&^T7B@x=k7|hFQ4KLXsv(9)H9VpJ0?pUvx(yW@bsYyAbsY!ISI0T7f1Q@S z_sIJOCA<4lFoK=CeDJ`8j$*+Lqm!N)H1zDCW6usc{_LO=`cF`Sev&*G_}a|QXOt>p zwqFlj+FII(PMT~!(OBmM){(3bj}%vYvarq$ZxMVSp4nJbh|g>3G&aA=6_U1a>)=T` 
zD|%ruTsV2UY>L$85K@<f5xkn2dS~NsFRWzLlWQm_*+w?^ew46*>+aRD^J=L%1_7wWu~9i~l5i;y;Q5;y;Kk@q6N+_+8O0en-3@{=GPav#?+MJJBP4TLi?v z6^F&Y5xwGHizDKXJpa zM{8L3s<6dYov;^dWCVL=)eY3d2=?4+1JDjeUZ6&xW=445Rd)cjFlq*B0cvH`3Pcvw zI~kQMs@oWqEUI@gDp^#wGunfAlppM}QL?>2WQpC$Xde(+V(->EG}wM?En26xTWi&J zYHivst$m>(wg+~7`(f9&7vJab-G}e<`0nTUzV+wS2H2=ms`RDw=(~RdFBcmU?6N7( z1MAO`6+5txaThSZlE)4(K3K+GjJwPDAmbMhvzsYcYR6ts2U#1_qXuAMct}00_QGP& zuMVkwYX4&?)&pulJqjwomV$?&Rl}mnilY&&bnw>mU+1gwXX@BEj;|`x|9@*iwNfI> zUXV-054P69ou4Ubq>_=*MVNcQGEeG*WeHh!SZ~P(5xOZA^cV2e=^?N;@yaVv^_CvRm`dRwuN%|~lxwC2$I5;~)^0qp)J3(XfjQVt!ennI{_tV^JO3#DosHSvZrDx2Sc!}( z^DgoMV46gdv-!PqGq4ryk6(x%h@Xj{jmP6J#Mj6)5b;)5%Wz&tCf{{F3gNNbt&ymM(eZt&h>9wzgi-*DbR|fqJ60V#sSZCl} z%~T!;zJ>DWk0F)cU%#;QSIb4`sSu?}^yr7*nu0eq1?px6 zic=YiQ#)SL?CVOw0iz|nu@z+fIjBDEWWuzj;DNaGQ{J=Vl})Uc~p5x(XS&^0`2U?%bV(Asp|#A zfH9Lh;{fFQNgYyHPCH24e3hrnIq*tQWGsN##Zzev{L0EJ$h-UJAiyyaO2ZDdR>`%Og&) zZdLg&bB|Zc zZxc#2+Q&J6m-E|Sm*00$%8qh7eYd7Btx;ZGg3`F}teKx~%ukHs_1&0xc&$d={)s^J)VxPf+#W;> zRFb#e=4ziRV3hrE%~xfpX4gN(lEvqUyts91Q@WKGe!q1}xkw>8yicCuPk$Fwz8Skp z)s8Z2?HZ<-H)t>7NA>Pco&JX0(rKcP)Ue+S;G3*OvJ-^Xu! z1-q7aE}iNw%q=HV>HfpvRZ?Tl%~+e$$8UUtwkIFOp{)qq;`co+eO`e9Y&R71sk}JT z9oS>3F<*hCr9@@@`YnQFC&&tGomKJ*KOoRLZt2il@xK9vuT4X-HheQ->F2Dce-{F# zu7!uAtXVbN0ZZ!kDus`pLrUf@1YV{jMiU5|j#@e?vxQY)kp!zzgX2@9LH%tKS2AS7 z)QCuvc#X9*UZ%VNW12?j1jD4!=Vx-$JsB=OF!0kspwZe_?^EgDq?BBx?a6xT_yPrd zkpjGAsjVgEv#Xh$VK&0ZfUN8AHU--#v$--h)danIRjJyWGGsBA#;dIg6?L|5-{gz? zuhRDY9puvNa^VIF1v#t571*V@q;}jf_pu=YEZA_TBflMH3NBC{*tU^*f(zDdM3X!& zxlfyLy~PFZN4W2S52ZV}uw{GVB`Y|&6<7hrHCk3a?R0n~2d>)+F5LcQht}x)D(o8lpsy?) 
zxfzZIZ@{863{}|#>(ME1^T20GZ~@HLvPeavSxLCQa*6TKmEiDA(_fDDjI2w7J)b3L zHG!3+-bc;aTAJ_^lhae9kx&$BzpcEXZFtA5CGP@_n^WYSChrV+XUTh!yqCzMBe%5- z($Lc{6YtN6stmV^#*I$$`)PvT2EiIPON+LhCd_&*D{Hk%`WyNxKXf&@PQyOQOGd0P z(Eha9(G0azGZqZ8|8OZDyY2I>d-&!M*kP@)Efp(p{Et0-W!2D85}QsNfVrb85Vk>S zo#v$3T*al=;rXhkG9KS!B<0B7zKbVfRvdATCZ61qby`Pm4Za+>&RGw2=h0F&!w1g= a6Dlt(Dw`z-*0E;U_Ic~q+)7{X!2bo4>{W{Z diff --git a/sacremoses/test_normalize.py b/sacremoses/test_normalize.py deleted file mode 100644 index f9fee2d..0000000 --- a/sacremoses/test_normalize.py +++ /dev/null @@ -1,12 +0,0 @@ -from normalize import MosesPunctNormalizer -import sys - -def test(fileName): - a = MosesPunctNormalizer(perl_parity = True) - file_path = fileName - - with open(file_path, "r") as file: - text = file.read() - print(a.normalize(text)) - -test(sys.argv[1]) \ No newline at end of file From 0b55f137cd11cf75417c3341b9c1faa0014efb8b Mon Sep 17 00:00:00 2001 From: NIXBLACK11 Date: Fri, 20 Oct 2023 21:22:45 +0530 Subject: [PATCH 09/14] changes made to indentation --- sacremoses/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index 1159a42..277bc73 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -156,7 +156,7 @@ def __init__( self.FRENCH_QUOTES, self.HANDLE_PSEUDO_SPACES, ] - + if penn: # Adds the penn substitutions after extra_whitespace regexes. 
self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN) From 4903ad8b18419a4a721a70221bf5bc68cd82cf58 Mon Sep 17 00:00:00 2001 From: Siddharth Singh Rana <91743459+NIXBLACK11@users.noreply.github.com> Date: Fri, 20 Oct 2023 21:27:04 +0530 Subject: [PATCH 10/14] Update normalize.py --- sacremoses/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index 277bc73..e4e04d3 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -203,4 +203,4 @@ def replace_unicode_punct(self, text): return text def remove_control_chars(self, text): - return regex.sub(r"\p{C}", "", text) \ No newline at end of file + return regex.sub(r"\p{C}", "", text) From 06d181e96d6ca2696106f30284dfbc7f7e9ee5e7 Mon Sep 17 00:00:00 2001 From: Siddharth Singh Rana <91743459+NIXBLACK11@users.noreply.github.com> Date: Fri, 20 Oct 2023 22:00:45 +0530 Subject: [PATCH 11/14] Added comments --- sacremoses/normalize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index e4e04d3..deee082 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -143,6 +143,8 @@ def __init__( :type norm_quote_commas: bool :param norm_numbers: Normalize numbers :type norm_numbers: bool + :param perl_parity: exact parity with perl script + :type: bool """ if perl_parity: From a57726d9f5d8e91fe24c29c268c0a0b9389a9dca Mon Sep 17 00:00:00 2001 From: Siddharth Singh Rana <91743459+NIXBLACK11@users.noreply.github.com> Date: Fri, 20 Oct 2023 22:01:41 +0530 Subject: [PATCH 12/14] Added comments --- sacremoses/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py index deee082..21e1e4f 100644 --- a/sacremoses/normalize.py +++ b/sacremoses/normalize.py @@ -144,7 +144,7 @@ def __init__( :param norm_numbers: Normalize numbers :type norm_numbers: bool :param perl_parity: exact parity with perl script - 
:type: bool + :type perl_parity: bool """ if perl_parity: From c57e7777a7bb6c8a91c13f43a2d9d3b98a8e1d5e Mon Sep 17 00:00:00 2001 From: Siddharth Singh Rana <91743459+NIXBLACK11@users.noreply.github.com> Date: Thu, 26 Oct 2023 17:01:14 +0530 Subject: [PATCH 13/14] Update test_normalizer.py to include test for perl parity --- sacremoses/test/test_normalizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sacremoses/test/test_normalizer.py b/sacremoses/test/test_normalizer.py index 74e1ff4..e06a319 100644 --- a/sacremoses/test/test_normalizer.py +++ b/sacremoses/test/test_normalizer.py @@ -70,3 +70,9 @@ def test_normalization_pipeline(self): text = "0《123》 456% '' 【789】" expected = '0"123" 456% " [789]' assert moses_norm_unicode.normalize(text) == expected + + def test_moses_normalize_with_perl_parity(self): + moses_perl_parity = MosesPunctNormalizer(perl_parity=True) + text = 'from the ‘bad bank’, Northern, wala « dox ci jawwu Les « wagonways » étaient construits' + expected = '''from the 'bad bank," Northern, wala "dox ci jawwu Les "wagonways" étaient construits''' + assert moses_perl_parity.normalize(text) == expected From 4164186292ed9dd5ea84876c17cc1d5a8cb76bea Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 26 Oct 2023 16:41:49 +0100 Subject: [PATCH 14/14] Make non-breaking spaces that are in there more explicit --- sacremoses/test/test_normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacremoses/test/test_normalizer.py b/sacremoses/test/test_normalizer.py index e06a319..c6a0152 100644 --- a/sacremoses/test/test_normalizer.py +++ b/sacremoses/test/test_normalizer.py @@ -73,6 +73,6 @@ def test_normalization_pipeline(self): def test_moses_normalize_with_perl_parity(self): moses_perl_parity = MosesPunctNormalizer(perl_parity=True) - text = 'from the ‘bad bank’, Northern, wala « dox ci jawwu Les « wagonways » étaient construits' + text = 'from the ‘bad bank’, Northern, wala\u00A0«\u00A0dox ci jawwu Les 
«\u00A0wagonways\u00A0»\u00A0étaient construits' expected = '''from the 'bad bank," Northern, wala "dox ci jawwu Les "wagonways" étaient construits''' assert moses_perl_parity.normalize(text) == expected