Skip to content

Commit

Permalink
raise human-friendly Sparv errors when parsing of odt or docx fails
Browse files (browse the repository at this point in the history)
  • Loading branch information
anne17 committed Oct 3, 2023
1 parent 1ceacf0 commit ccd06ef
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
7 changes: 5 additions & 2 deletions sparv/modules/docx_import/docx_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from docx2python import docx2python
from docx2python.iterators import iter_at_depth

from sparv.api import Config, SourceFilename, Output, Source, SourceStructure, Text, importer, util
from sparv.api import Config, SourceFilename, Output, Source, SourceStructure, Text, importer, util, SparvErrorMessage


@importer("docx import", file_extension="docx", outputs=["text"], text_annotation="text", config=[
Expand Down Expand Up @@ -41,7 +41,10 @@ def parse(source_file: SourceFilename = SourceFilename(),
'NFC' is used by default.
"""
source_file_path = source_dir.get_path(source_file, ".docx")
d = docx2python(source_file_path)
try:
d = docx2python(source_file_path)
except Exception as e:
raise SparvErrorMessage(f"Failed to parse docx file '{source_file}'. {type(e).__name__}: {e}")

# Extract all text from the body, ignoring headers and footers
text = "\n\n".join(iter_at_depth(d.body, 4))
Expand Down
8 changes: 6 additions & 2 deletions sparv/modules/odt_import/odt_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import zipfile
from typing import Optional

from sparv.api import Config, SourceFilename, Output, Source, SourceStructure, Text, get_logger, importer, util
from sparv.api import (Config, Output, Source, SourceFilename, SourceStructure, SparvErrorMessage, Text, get_logger,
importer, util)

logger = get_logger(__name__)

Expand Down Expand Up @@ -44,7 +45,10 @@ def parse(source_file: SourceFilename = SourceFilename(),
source_file_path = str(source_dir.get_path(source_file, ".odt"))

# Parse odt and extract all text content
text = OdtParser(source_file_path).text
try:
text = OdtParser(source_file_path).text
except Exception as e:
raise SparvErrorMessage(f"Failed to parse odt file '{source_file}'. {type(e).__name__}: {e}")

if not keep_control_chars:
text = util.misc.remove_control_characters(text)
Expand Down

0 comments on commit ccd06ef

Please sign in to comment.