Fix reading temp file for pypandoc conversion (#78)

* Fix reading temp file for pypandoc conversion
* Use flush instead of seek
dataiku · Mar 8, 2024 · 2a0b13e · 2a0b13e
1 parent 2562173
commit 2a0b13e
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## [Version 2.3.2](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.3.2) - Patch release - 2024-03
+
+- Fix reading temporary file for pypandoc conversion
+
 ## [Version 2.3.1](https://github.com/dataiku/dss-plugin-tesseract-ocr/releases/tag/v2.3.1) - Patch release - 2024-01
 
 - Fix text extraction from html files with line wraps when chunking

diff --git a/plugin.json b/plugin.json
@@ -1,6 +1,6 @@
 {
     "id": "tesseract-ocr",
-    "version": "2.3.1",
+    "version": "2.3.2",
     "meta": {
         "label": "Text extraction and OCR",
         "description": "Extract text from documents & images.",

diff --git a/python-lib/text_extraction/__init__.py b/python-lib/text_extraction/__init__.py
@@ -45,6 +45,7 @@ def extract_text_content(file_bytes, extension, with_pandoc):
                 temporary_job_folder = os.getcwd()
                 with tempfile.NamedTemporaryFile(dir=temporary_job_folder, suffix=".{}".format(extension)) as tmp:
                     tmp.write(file_bytes)
+                    tmp.flush()
                     text = pypandoc.convert_file(tmp.name, to="plain", format=extension)
 
             except Exception as e:
@@ -87,6 +88,7 @@ def extract_text_chunks(filename, file_bytes, extension, with_pandoc, metadata_a
             temporary_job_folder = os.getcwd()
             with tempfile.NamedTemporaryFile(dir=temporary_job_folder, suffix=".{}".format(extension)) as tmp:
                 tmp.write(file_bytes)
+                tmp.flush()
                 # 'gfm' is for markdown_github, a simplified form of markdown for more consistent results across OSes
                 # We use the `--wrap none` argument to avoid line wraps in the conversion output, that could lead to
                 # incorrect markdown chunk extraction