From 0212271a2379c1dd3939cd127fb17dbd9aba66c8 Mon Sep 17 00:00:00 2001 From: padre Date: Sun, 5 Nov 2023 15:57:27 +0100 Subject: [PATCH] fix small bugs with docx reader such as non-integer sizes in docx style and filename with dots and spaces --- .../docx_reader/properties_extractor.py | 16 +++++++----- dedoc/utils/utils.py | 7 +++++ tests/api_tests/test_api_format_docx.py | 10 ++++++++ ...Known -Nik O'Tinn -Ireland 2023- DRAFT.doc | Bin 0 -> 8704 bytes tests/data/docx/broken_properties.docx | Bin 0 -> 4934 bytes tests/unit_tests/test_utils.py | 24 ++++++++++++++++++ 6 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 tests/data/docx/Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc create mode 100644 tests/data/docx/broken_properties.docx create mode 100644 tests/unit_tests/test_utils.py diff --git a/dedoc/readers/docx_reader/properties_extractor.py b/dedoc/readers/docx_reader/properties_extractor.py index 016930a8..0e92b80a 100644 --- a/dedoc/readers/docx_reader/properties_extractor.py +++ b/dedoc/readers/docx_reader/properties_extractor.py @@ -74,7 +74,10 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None: if not tree.ind: return - attributes = {attribute: 0 for attribute in ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]} + attributes = { + attribute: 0 for attribute in + ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"] + } for attribute in attributes: attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0)) @@ -106,7 +109,8 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None: :param tree: BeautifulSoup tree with properties """ if tree.sz: - old_properties.size = int(tree.sz.get("w:val", old_properties.size)) + new_size = float(tree.sz.get("w:val", old_properties.size)) + old_properties.size = int(new_size) def change_jc(old_properties: BaseProperties, tree: Tag) -> None: @@ -176,19 +180,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None: if not before_autospacing: before_lines = tree.spacing.get("w:beforeLines", False) - before_lines = int(before_lines) if before_lines else before_lines + before_lines = int(float(before_lines)) if before_lines else before_lines if not before_lines: before_tag = tree.spacing.get("w:before", False) - before = int(before_tag) if before_tag else before + before = int(float(before_tag)) if before_tag else before else: before = before_lines if not after_autospacing: after_lines = tree.spacing.get("w:afterLines", False) - after_lines = int(after_lines) if after_lines else after_lines + after_lines = int(float(after_lines)) if after_lines else after_lines if not after_lines: after_tag = tree.spacing.get("w:after", False) - after = int(after_tag) if after_tag else after + after = int(float(after_tag)) if after_tag else after else: after = after_lines diff --git a/dedoc/utils/utils.py b/dedoc/utils/utils.py index c8f74605..5ab6521a 100644 --- a/dedoc/utils/utils.py +++ b/dedoc/utils/utils.py @@ -63,6 +63,13 @@ def splitext_(path: str) -> Tuple[str, str]: """ get extensions with several dots """ + if len(path.split()) > 1: + first, second = path.rsplit(maxsplit=1) + sep = path[len(first)] + name, ext = splitext(second) + if len(ext) == 0: + name, ext = ext, name + return first + sep + name, ext if len(path.split(".")) > 2: return path.split(".")[0], "." + ".".join(path.split(".")[-2:]) return splitext(path) diff --git a/tests/api_tests/test_api_format_docx.py b/tests/api_tests/test_api_format_docx.py index 2894db6d..779100cc 100644 --- a/tests/api_tests/test_api_format_docx.py +++ b/tests/api_tests/test_api_format_docx.py @@ -118,6 +118,16 @@ def test_docx_heading_new(self) -> None: data = dict(structure_type="tree", return_format="html") _ = self._send_request(file_name, data=data) + def test_properties_extractor(self) -> None: + file_name = "broken_properties.docx" + result = self._send_request(file_name, data={}) + content = result["content"]["structure"] + self.assertEqual("FonFfff", get_by_tree_path(content, "0.0")["text"].strip()) + + def test_name_with_apostrophe(self) -> None: + file_name = "Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc" + _ = self._send_request(file_name, data={}) + def __check_doc_like(self, result: dict) -> None: content = result["content"]["structure"] self.assertEqual("", get_by_tree_path(content, "0")["text"]) diff --git a/tests/data/docx/Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc b/tests/data/docx/Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc new file mode 100644 index 0000000000000000000000000000000000000000..d803e55b0882127fa30b269f03be6c4933db9bae GIT binary patch literal 8704 zcmeI1PfQ$D7{I?-77Dadx>lrAW!tVTwy-T=b+J{KLR+ZCVrfznP0Rv|U}2{&ON@yJ zqcQPfQlp6%6YIf~#?X57V7+)Vdg-B_+LJNypvHq}3jV$~GqOvU-GP#(n0cGuyf^QC zGv9mPd;9);^WzU&u77%|>4q!`honO07iy#`FW(|v=iS$fgh*@N{KCS5mrXJV>&OB( z-MVTYhUvsg&_3S))ldVqun{)FJ)p9i-FO`}HK=M43!E04UrwTOR5HX_d7Ym19~u}j z`LFf!Wd3~NN4}g)pnuT<M3T+UF{cr%xxnizztdV$NVkB!%+he)tSvx!0)@z@enM!AJW=$9v(!5@K#Eb+@$GI`(@r3-@ zb@^52MWwEcKSyaPp%P^gMq(|ykQdB|bOhVaOv94q7G1(il zx*2Yb(sQCwruw#pryS7B}%i>@*1&D;M%whk00bceylmfBtIp?JgFw>-0e?$ zH)fo0%{YN3OaJ?2!bzSOrlwh*%aOLJXH0(5_!1l?on`l9pG^RNj< zLHFC|K@IcEO=s=Yw61$ekd5bbX?4~L zSNJXRjZ@+n)1A-iq$g!hrZR~lCawuL$bK^eIuCmDT!DhT8$C7X{HPVtv}XtBG?6Dr z@?CSAb2GiTP^R!sWwsV(mAq38YUK~z*c1MiM6{@_JPCjCMSo(qNZmm9uy(N3oH;=;I6X385w6Y9%U<@;z(f2j5%ENvtdtjIM8Fo zD5Y6SUZyN<_9J&y1A@xjo(0@}-tE04czT^$!18<*bL+GLLA6(E0Zv^NF-gUwJH}D0 zO%5NBzoe%HNYHWS_nyDntHG3cbcr9O$6UfBo$E#^@2pWA3?Z9ob^n z7WAu6M}0Yd^{Y2Sy&4+M8eOA&d$DE2)Fe#}a!#R-Ytt&)uk(uMx2n8F^UCazScU%Q U(C@#6<&9uE-o2EUF8u!YFW|1aasU7T literal 0 HcmV?d00001 diff --git a/tests/data/docx/broken_properties.docx b/tests/data/docx/broken_properties.docx new file mode 100644 index 0000000000000000000000000000000000000000..a2f332b3f74ea4e0f417e31cb5797509d893c95d GIT binary patch literal 4934 zcmb7HcRbYpA3yH!J)2a7GD|jH4riRbSERlnS*Pp~vN|L?8ClmCml25w zk?c(fzteKQt?%#daqExI^Ios#^Yebbo-dd(5JU$c1pEbf?hMx@>!E2;CjbD9i2#5@ z049LpMF*rS9O-JJ<>d%>G2-{Mx6{@F1AwIW6yPaY3UD`1G5`VS0}ufC{Pq;ps@fn( zEw{cMiZMi06iNNXIUCeMTg7E-F{5a6>qK#O*0xVGLW`BIpN+KD^Zn2n7ryMI0+mEG zi|njj6E;k!wN5F!#GZQ%Qv0H?&sCUvf{{un!smb?odg#hs0q%CTd3*LV2Sn0Taq@q zCNf7;@NiSuXh=;bvUoa#Z1&yjb&x}%0KmS>a?)3Camq}u>+q%JOHL$cmc)>2RL@|* z0M*2W(Z((EC{)bjVc$$!Fl>g(pGqLtMNOU>*5D*tyd&1pQ+ikGxbnqlLX8g6mr*kw zN3~AOVT1a2**{QjGi>HRjbTLdvdViWl++iP2bYmd+^RsZruc`V=cRpBFqNK&k(EO; z52>sGw=y@v=)5QVgVmou*&QDP!f@)40=_(0zkYuC0OQoLaJUL{c5rkN_^ipo;_j~E zkDBa3>)WP$9p_ox6rZl2pri`7OQ<-dM zZwy7+3WVf0mdy~=54?&L3HN@ecK%6*!^~O3lmw_)5LJId((SI>9HyDt{U|Xno|L~` zqCs@>A^pcN&lw2yml(^|dIJ?GSVKUEoHbahh_a-0I&MWaKMIL?x*A@`<)=97U&tx0 z?`2)b!ff>#^$c9k1-tl)KvDS$(_$?QK0fYmrkP|k=A^{FMP&{l_ zwa90}hL5N@d?9vcVZk{(?Xhb#wrkNet9_)zCsoa(dmz)Quj!O;kSz2$F%^kX{zzQs zg&CQllKl4>Pat6vHXkP+m2$Bz&}7)xm-Xi7BgxkEzycid=Tn9HHJ?7$xw<+P94wh) zEcxD3-FqZ&zDQNB(+{{yc#!o$sAQxDj`dI+>ooYRUv+SX|A}^c4I{M%7COImT}X8~ zivP^b5~KT;YWzabj75r(f{SFFQmOQ zV=A4>d=ldEw%klf`U)F~kQkr&F^ich{z=R0XWyg_NAz@03)=9y)pAqjv=L>$AzTjU zbMd8=;j44v38&&Y(L}~JoG_V3v31&>v9_AxZpf6V$(ae=k#G^(#%)%MDOTQ*vp*62`cx6)#4&M(+xO%(s`)k~xl)#Bg_WgSPJ(S;hB|2I~ z87NJDbAJ2q#koN{o?95~8k>v*+bT(L{LEYjmgEcqy*ngLS_KPDYpM2HUSR67$L^%5 zCWVDbH{>Ctodi3p1Z_MABm_>Fb_-PLUWk0D6n}*~Zz%Y_8LeW(t)|N}xey_zoJ3Yj zHH{W=bGFCXQBH$p>u~|XX{kazy*3!bCxoS2mp+M#cpoX|2tH|P{NPIA<8%CnRlO`a zO*6c5oi=PBvE-gc*4sYm=g6@Onh=E-(gj|N`seh@%44~*4COs`5fBI8*6J3!PUF$A z_WKR58CO#G*}dum*Tpw2A{;v&4$c-|{7USB0(|fFuNks`IRJ3b^7hF6eEGi=Qu&kV zZ3qa~HQ@?5gj0y=APCMv-RyA#ZtoBtlI% zN-nDx0YB&R_5p?(aE(42|Kv9>Bw%8)b;qcK#6J8QuVP@jdRf;48b_}3ZqA2ciZv~& z8dtz*_rNB`L!c5_WRn++Z%vq^H$5b#f74il*RLU2>(pzO)U3&%Ej3~~;K>1P!cPVN zT#FU`ySB*Q$o2+fK33w`v8drdB=gNns2XY7g`5}+!3BQFt!!U=irA&Tk%9XB)%M0u zg_2VD7ncE$-rk@c4`#kTf zmk-~ae~F;_OHPHGqY=C+)kcWYKH?PYu-pY%M;j<_3J~ON|{Y+#@A6?UR!8G1f_$NN$f@xHv)hS8xen*d){qod zC!sAf@3_nFR5S=x1l?3p&r8tw+qIK3Spr82^v4cAvw$9r3BkOk$O#QghegvE|CK?xjNJ-2jj7 z%MPLjyTtoPYtQ*az zw1OsJYETKtJBBAzEMx>WoM?I4lh|%uVZzJNlNIem={<7}9=n?r0iH+1q3ZVZ7^bXc zken7RSKLGAmF0zVh8Y+=Ua~gnE~eyMX{toEb$cBE0u(Fa3_=KF)HC3gmsE&E&eY!` z24*GGP&1%7*&*ayvLBdwdh>YiT@EC%@Q$SxC$ zp@cl(pjCdl7UdY-YQONq%!r;A>$ea{UN+cSxtwtNZTM2?>GBMP=6DH{V<*Ct<}c{< z*tYLPc)0m@z1StzBL_$q&&PtoBi;;h=qip4gfY?BzqtXGaazl$@^#-7o|%pB&S_VT zoA~ftxkcS~(UNF-R?%ux&KsDWHEY-JGU`NrW!n4dZ`fJqxP%!>A^A#!GcxH^#;^tLYdw+jp;n4 zbd)BrXE&2qk6L8culb?k5<7z3`0V4|XR9V0`t1D95vO~zj_yF3-kpEC@p%>4&+!T@ z0j!I-k>-eVnC~~23*6NefwcOgzrZ6UkPU+5xH9edL@;{x{Y7Ex`Bn;n4}*TNHo z^8L1o5~psr$&Aa01Yc1LKNfct?Ir^B#&F7!-(N+E7m0UktD&mmFs4}Hsbv4WJoK9B zbsPBj(I|ljP;+5=y%{^6ta+KgC2rq&;V32~CygAI9TKnQh2Fm0B1TDnjT0kKhiD`c z9AjTFNItSmNW4I6JJ}!Rc6Y*K3Kjx6xg~%qKdJs8MVX=V<$CY18t>}F)T+Nmq3G(S z?yM+GnSk&(aG$+{YcBogjDjN)P!hQJ^`}cO(U+z7hp@5-{HqP^3-I2ewJn{3!^hj# zaBzG3+E3y3>pZxv;nz8W5BGg{`(2Uys0X(Zd{kZBtomN${%-O+*q`@K48ZSw1qAq> zs{dQ#2QxK(i4dHde#w7d? None: + name_extension = "name.doc" + name, extension = splitext_(name_extension) + self.assertEqual("name", name) + self.assertEqual(".doc", extension) + + def test_splitext_apostrophe_name(self) -> None: + name_extension = "Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc" + name, extension = splitext_(name_extension) + self.assertEqual("Well. Known -Nik O'Tinn -Ireland 2023- DRAFT", name) + self.assertEqual(".doc", extension) + + def test_splitext_space_name(self) -> None: + name_extension = "some file .doc" + name, extension = splitext_(name_extension) + self.assertEqual("some file ", name) + self.assertEqual(".doc", extension)