-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #71 from dataiku/feature/text-extraction-pandoc
Text extraction with pandoc
- Loading branch information
Showing
19 changed files
with
245 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"meta": { | ||
"label": "Text extraction", | ||
"description": "Extract text content from various file types (PDF, Docx, HTML, etc.) into a dataset with columns file, text and error_message", | ||
"icon": "icon-file-text-alt", | ||
"displayOrderRank": 2 | ||
}, | ||
|
||
"kind": "PYTHON", | ||
"selectableFromFolder": "input_folder", | ||
|
||
"inputRoles" : [ | ||
{ | ||
"name": "input_folder", | ||
"label": "Input folder", | ||
"arity": "UNARY", | ||
"required": true, | ||
"description": "Folder of input files", | ||
"acceptsManagedFolder": true, | ||
"acceptsDataset": false | ||
} | ||
], | ||
|
||
"outputRoles" : [ | ||
{ | ||
"name": "output_dataset", | ||
"label": "Output dataset", | ||
"arity": "UNARY", | ||
"required": true, | ||
"description": "Dataset of extracted text", | ||
"acceptsManagedFolder": false, | ||
"acceptsDataset": true | ||
} | ||
], | ||
"params": [ | ||
{ | ||
"name": "description", | ||
"label": "", | ||
"type": "SEPARATOR", | ||
"description": "The Text extraction recipe is ready to use 'out of the box' - no settings required." | ||
} | ||
], | ||
|
||
"resourceKeys": [] | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import logging
import os
import pandas as pd
from time import perf_counter

from text_extraction_ocr_utils.recipes_io_utils import get_input_output
from text_extraction import extract_text_content
from text_extraction import download_pandoc_binaries

# Recipe script: read every file of the input folder, extract its text and
# write one row per file (file, text, error_message) to the output dataset.

# Columns of the output dataset. Declared explicitly so the dataset keeps the
# same columns even when the input folder contains no files.
OUTPUT_COLUMNS = ['file', 'text', 'error_message']

# Download the pandoc binaries up front; the returned value is forwarded
# unchanged to extract_text_content for every file.
with_pandoc = download_pandoc_binaries()

logger = logging.getLogger(__name__)

input_folder, output_dataset = get_input_output('folder', 'dataset')

input_filenames = input_folder.list_paths_in_partition()
total_files = len(input_filenames)

rows = []

for i, sample_file in enumerate(input_filenames):
    # Extension without the leading dot, lower-cased so that e.g. ".PDF" and
    # ".pdf" are handled the same way.
    _, suffix = os.path.splitext(sample_file)
    suffix = suffix[1:].lower()

    # The timer starts before the download, so the reported duration covers
    # both reading the file and extracting its text.
    start = perf_counter()

    logger.info("Extracting text from file %s ...", sample_file)

    with input_folder.get_download_stream(sample_file) as stream:
        file_bytes = stream.read()

    try:
        extracted_text = extract_text_content(file_bytes, suffix, with_pandoc)

        if not extracted_text.strip():
            logger.warning("Extracted text is empty")

        rows.append({'file': sample_file, 'text': extracted_text, 'error_message': ""})
        logger.info(
            "Extracted text from %d/%d files (in %.2f seconds)",
            i + 1, total_files, perf_counter() - start,
        )
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        # The error is stored as text so the column holds only strings,
        # and the traceback is kept in the job log.
        rows.append({'file': sample_file, 'text': "", 'error_message': str(e)})
        logger.exception("Failed extracting text from file %s because: %s", sample_file, e)

df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)

output_dataset.write_with_schema(df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.