diff --git a/dedoc/readers/docx_reader/properties_extractor.py b/dedoc/readers/docx_reader/properties_extractor.py index 016930a8..0e92b80a 100644 --- a/dedoc/readers/docx_reader/properties_extractor.py +++ b/dedoc/readers/docx_reader/properties_extractor.py @@ -74,7 +74,10 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None: if not tree.ind: return - attributes = {attribute: 0 for attribute in ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]} + attributes = { + attribute: 0 for attribute in + ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"] + } for attribute in attributes: attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0)) @@ -106,7 +109,8 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None: :param tree: BeautifulSoup tree with properties """ if tree.sz: - old_properties.size = int(tree.sz.get("w:val", old_properties.size)) + new_size = float(tree.sz.get("w:val", old_properties.size)) + old_properties.size = int(new_size) def change_jc(old_properties: BaseProperties, tree: Tag) -> None: @@ -176,19 +180,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None: if not before_autospacing: before_lines = tree.spacing.get("w:beforeLines", False) - before_lines = int(before_lines) if before_lines else before_lines + before_lines = int(float(before_lines)) if before_lines else before_lines if not before_lines: before_tag = tree.spacing.get("w:before", False) - before = int(before_tag) if before_tag else before + before = int(float(before_tag)) if before_tag else before else: before = before_lines if not after_autospacing: after_lines = tree.spacing.get("w:afterLines", False) - after_lines = int(after_lines) if after_lines else after_lines + after_lines = int(float(after_lines)) if after_lines else after_lines if not after_lines: after_tag = tree.spacing.get("w:after", False) - after = int(after_tag) if after_tag else 
def splitext_(path: str) -> Tuple[str, str]:
    """
    Split a path into a name and an extension, keeping two-part extensions (e.g. ".tar.gz") together.

    If the path contains whitespace, only the last whitespace-delimited token is inspected for an extension;
    everything before that token (including the exact whitespace that precedes it) is kept in the name.
    Trailing whitespace of the path is ignored.

    :param path: file name or path to split
    :return: tuple (name, extension); the extension is either empty or starts with a dot
    """
    tokens = path.split()
    if len(tokens) > 1:
        last_token = tokens[-1]
        stripped = path.rstrip()
        # keep the original whitespace run before the last token instead of a single separator character
        prefix = stripped[:len(stripped) - len(last_token)]
        name, ext = splitext(last_token)
        # os.path.splitext treats a token made of leading dots (".doc") as a name without an extension;
        # here such a token is the extension of a name that ends with whitespace ("some file .doc").
        # A token without any dot ("file") stays a part of the name.
        if not ext and name.startswith("."):
            name, ext = "", name
        return prefix + name, ext

    parts = path.split(".")
    if len(parts) > 2:
        # NOTE(review): with more than three dot-separated parts the middle ones are dropped from the result
        # ("a.b.c.d" -> ("a", ".c.d")), and this rule is not applied to paths containing whitespace - confirm intended
        return parts[0], "." + ".".join(parts[-2:])
    return splitext(path)
class TestRecognizedTable(unittest.TestCase):
    """
    Unit tests for splitext_: every case splits one file name and checks both returned parts.
    """

    def _assert_split(self, file_name: str, expected_name: str, expected_extension: str) -> None:
        # unpack first, so a result that is not a pair fails before any comparison is made
        actual_name, actual_extension = splitext_(file_name)
        self.assertEqual(expected_name, actual_name)
        self.assertEqual(expected_extension, actual_extension)

    def test_splitext_simple_name(self) -> None:
        # plain "name.extension" input
        self._assert_split("name.doc", "name", ".doc")

    def test_splitext_apostrophe_name(self) -> None:
        # spaces, a dot in the middle, dashes and an apostrophe inside the name
        self._assert_split(
            "Well. Known -Nik O'Tinn -Ireland 2023- DRAFT.doc",
            "Well. Known -Nik O'Tinn -Ireland 2023- DRAFT",
            ".doc",
        )

    def test_splitext_space_name(self) -> None:
        # whitespace directly before the extension stays in the name
        self._assert_split("some file .doc", "some file ", ".doc")