Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion document_converter/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from fastapi import HTTPException

from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult, ImageData
from document_converter.utils import handle_csv_file
from document_converter.utils import handle_csv_file, handle_xlsx_file

logging.basicConfig(level=logging.INFO)
IMAGE_RESOLUTION_SCALE = 4
Expand Down Expand Up @@ -109,6 +109,13 @@ def convert(
if error:
return ConversionResult(filename=filename, error=error)


if filename.lower().endswith('.xlsx'):
file, error = handle_xlsx_file(file)
if error:
return ConversionResult(filename=filename, error=error)


conv_res = doc_converter.convert(DocumentStream(name=filename, stream=file), raises_on_error=False)
doc_filename = conv_res.input.file.stem

Expand Down
27 changes: 26 additions & 1 deletion document_converter/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import filetype

import openpyxl


class InputFormat(str, Enum):
DOCX = "docx"
Expand All @@ -15,6 +17,7 @@ class InputFormat(str, Enum):
ASCIIDOC = "asciidoc"
MD = "md"
CSV = "csv"
XLSX = "xlsx"


class OutputFormat(str, Enum):
Expand All @@ -33,6 +36,7 @@ class OutputFormat(str, Enum):
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
InputFormat.XLSX: ["xlsx"], # "xltx", "xlsm", "xltm", "xlam", "xlsb"
}

FormatToMimeType: Dict[InputFormat, List[str]] = {
Expand All @@ -57,6 +61,7 @@ class OutputFormat(str, Enum):
InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"],
InputFormat.CSV: ["text/csv"],
InputFormat.XLSX: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
}
# Reverse lookup table: MIME type string -> InputFormat, built by inverting
# FormatToMimeType. If the same MIME type is listed under more than one
# format, the format that appears later in FormatToMimeType wins.
MimeTypeToFormat = {mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes}

Expand Down Expand Up @@ -93,7 +98,7 @@ def guess_format(obj: bytes, filename: str = None):
return InputFormat.CSV

mime = filetype.guess_mime(content)
if mime is None:
if mime is None: # mime type is not found
ext = filename.rsplit(".", 1)[-1] if ("." in filename and not filename.startswith(".")) else ""
mime = mime_from_extension(ext)

Expand All @@ -118,6 +123,26 @@ def handle_csv_file(file: BytesIO) -> Tuple[BytesIO, Optional[str]]:
continue
return file, f"Could not decode CSV file. Supported encodings: {', '.join(SUPPORTED_CSV_ENCODINGS)}"

def handle_xlsx_file(file: BytesIO) -> Tuple[BytesIO, Optional[str]]:
    """Re-save an XLSX workbook so formula cells carry values, not formulas.

    The workbook is loaded with ``data_only=True`` and written to a fresh
    in-memory buffer, so downstream conversion sees cell values rather than
    formula text.

    NOTE: with ``data_only=True`` openpyxl returns the value that was cached
    in the file the last time a spreadsheet application calculated it. A
    workbook that was never calculated (e.g. generated programmatically) has
    no cached values, so its formula cells are expected to come back empty.

    Args:
        file: In-memory XLSX content.

    Returns:
        Tuple[BytesIO, Optional[str]]: ``(converted file, None)`` on success,
        with the converted stream rewound to its start; on failure
        ``(original file, error message)``, where the error message is
        always a non-empty string.
    """
    converted = BytesIO()  # destination for the values-only workbook
    try:
        workbook = openpyxl.load_workbook(filename=file, data_only=True)
        workbook.save(converted)
    except Exception as err:
        # Deliberately broad: this is the boundary that turns any load/save
        # failure into an error string for the caller. Hand back the caller's
        # own stream (not the empty/partially written buffer), matching
        # handle_csv_file, and never return an empty message -- callers test
        # the error with a plain truthiness check.
        return file, str(err) or type(err).__name__

    converted.seek(0)  # rewind so the caller reads from the beginning
    return converted, None



def mime_from_extension(ext):
mime = None
Expand Down