Skip to content

Commit

Permalink
fix small bugs with docx reader such as non-integer sizes in docx style and filename with dots and spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaKozlov committed Nov 5, 2023
1 parent f203aef commit 0212271
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 6 deletions.
16 changes: 10 additions & 6 deletions dedoc/readers/docx_reader/properties_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,10 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None:
if not tree.ind:
return

attributes = {attribute: 0 for attribute in ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]}
attributes = {
attribute: 0 for attribute in
["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]
}
for attribute in attributes:
attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0))

Expand Down Expand Up @@ -106,7 +109,8 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None:
:param tree: BeautifulSoup tree with properties
"""
if tree.sz:
old_properties.size = int(tree.sz.get("w:val", old_properties.size))
new_size = float(tree.sz.get("w:val", old_properties.size))
old_properties.size = int(new_size)


def change_jc(old_properties: BaseProperties, tree: Tag) -> None:
Expand Down Expand Up @@ -176,19 +180,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None:

if not before_autospacing:
before_lines = tree.spacing.get("w:beforeLines", False)
before_lines = int(before_lines) if before_lines else before_lines
before_lines = int(float(before_lines)) if before_lines else before_lines
if not before_lines:
before_tag = tree.spacing.get("w:before", False)
before = int(before_tag) if before_tag else before
before = int(float(before_tag)) if before_tag else before
else:
before = before_lines

if not after_autospacing:
after_lines = tree.spacing.get("w:afterLines", False)
after_lines = int(after_lines) if after_lines else after_lines
after_lines = int(float(after_lines)) if after_lines else after_lines
if not after_lines:
after_tag = tree.spacing.get("w:after", False)
after = int(after_tag) if after_tag else after
after = int(float(after_tag)) if after_tag else after
else:
after = after_lines

Expand Down
7 changes: 7 additions & 0 deletions dedoc/utils/utils.py
Original file line number Diff line number Diff line change
def splitext_(path: str) -> Tuple[str, str]:
    """
    Split a path into (name, extension), keeping two-part extensions together.

    Unlike ``os.path.splitext`` this handles:

    * names with whitespace: only the last whitespace-separated token is searched for the extension,
      so dots earlier in the name (``"Well. Known file.doc"``) stay in the name; a last token that is
      only an extension (``"some file .doc"``) is returned as the extension. Whitespace inside the name
      is kept verbatim, trailing whitespace after the last token is dropped;
    * names without whitespace and with more than one dot: the last two dot-separated parts form
      the extension (``"a.tar.gz"`` -> ``("a", ".tar.gz")``).
      NOTE: only the part before the first dot is returned as the name, parts in between are not returned.

    :param path: file name or path to split
    :return: tuple (name, extension); the extension is either empty or starts with a dot
    """
    trimmed = path.rstrip()
    tokens = trimmed.rsplit(maxsplit=1)
    if len(tokens) == 2:
        last_token = tokens[1]
        # take the prefix from the original string so that runs of whitespace are preserved as is
        prefix = trimmed[:len(trimmed) - len(last_token)]
        name, ext = splitext(last_token)
        if not ext and name.startswith("."):
            # splitext treats a leading dot as part of the name (".doc" -> (".doc", "")),
            # here such a token is the extension of the name that precedes the whitespace
            name, ext = "", name
        return prefix + name, ext

    parts = path.split(".")
    if len(parts) > 2:
        return parts[0], "." + ".".join(parts[-2:])
    return splitext(path)
Expand Down
10 changes: 10 additions & 0 deletions tests/api_tests/test_api_format_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,16 @@ def test_docx_heading_new(self) -> None:
data = dict(structure_type="tree", return_format="html")
_ = self._send_request(file_name, data=data)

def test_properties_extractor(self) -> None:
    """
    Send "broken_properties.docx" with empty parameters and check that the text of the node
    at tree path "0.0" equals "FonFfff" once surrounding whitespace is stripped.
    """
    response = self._send_request("broken_properties.docx", data={})
    structure = response["content"]["structure"]
    node_text = get_by_tree_path(structure, "0.0")["text"]
    self.assertEqual("FonFfff", node_text.strip())

def test_name_with_apostrophe(self) -> None:
    """
    Send a .doc file whose name contains dots, spaces and an apostrophe.

    The response is discarded: the test only relies on whatever checks _send_request performs itself.
    """
    self._send_request("Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc", data={})

def __check_doc_like(self, result: dict) -> None:
content = result["content"]["structure"]
self.assertEqual("", get_by_tree_path(content, "0")["text"])
Expand Down
Binary file not shown.
Binary file added tests/data/docx/broken_properties.docx
Binary file not shown.
24 changes: 24 additions & 0 deletions tests/unit_tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

from dedoc.utils.utils import splitext_


class TestRecognizedTable(unittest.TestCase):
    """
    Unit tests for splitext_ (imported from dedoc.utils.utils).

    NOTE(review): the class name mentions tables, but every test here exercises splitext_ only.
    """

    def _assert_split(self, file_name: str, expected_name: str, expected_extension: str) -> None:
        """Split file_name with splitext_ and compare both parts with the expected values."""
        stem, suffix = splitext_(file_name)
        self.assertEqual(expected_name, stem)
        self.assertEqual(expected_extension, suffix)

    def test_splitext_simple_name(self) -> None:
        # a single dot and no whitespace
        self._assert_split("name.doc", "name", ".doc")

    def test_splitext_apostrophe_name(self) -> None:
        # dots, spaces and an apostrophe inside the name: only the trailing ".doc" is the extension
        self._assert_split(
            "Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc",
            "Well. Known -Nik O'Tinn -Ireland 2023- DRAFT",
            ".doc",
        )

    def test_splitext_space_name(self) -> None:
        # whitespace right before the extension stays in the name
        self._assert_split("some file .doc", "some file ", ".doc")

0 comments on commit 0212271

Please sign in to comment.