Skip to content

Commit 9b1668d

Browse files
drobison00 and edknv authored
Extend docx and pptx extractors to attempt to extract tables/charts from images (#334)
Co-authored-by: Edward Kim <109497216+edknv@users.noreply.github.com>
1 parent 0e44b01 commit 9b1668d

18 files changed

+1147
-389
lines changed

src/nv_ingest/extraction_workflows/docx/docx_helper.py

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,14 @@
3636
logger = logging.getLogger(__name__)
3737

3838

39-
def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images: bool, extract_tables: bool, **kwargs):
39+
def python_docx(
40+
docx: Union[str, Path, IO],
41+
extract_text: bool,
42+
extract_images: bool,
43+
extract_tables: bool,
44+
extract_charts: bool,
45+
**kwargs
46+
):
4047
"""
4148
Helper function that uses python-docx to extract text from a bytestream document
4249
@@ -57,6 +64,8 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
5764
Specifies whether to extract images.
5865
extract_tables : bool
5966
Specifies whether to extract tables.
67+
extract_charts : bool
68+
Specifies whether to extract charts.
6069
**kwargs
6170
The keyword arguments are used for additional extraction parameters.
6271
@@ -73,10 +82,12 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
7382
source_id = row_data["source_id"]
7483
# get text_depth
7584
text_depth = kwargs.get("text_depth", "document")
76-
text_depth = TextTypeEnum[text_depth.upper()]
85+
text_depth = TextTypeEnum(text_depth)
7786
# get base metadata
7887
metadata_col = kwargs.get("metadata_column", "metadata")
7988

89+
docx_extractor_config = kwargs.get("docx_extraction_config", {})
90+
8091
base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}
8192

8293
# get base source_metadata
@@ -103,7 +114,9 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
103114
}
104115

105116
# Extract data from the document using python-docx
106-
doc = DocxReader(docx, source_metadata)
107-
extracted_data = doc.extract_data(base_unified_metadata, text_depth, extract_text, extract_tables, extract_images)
117+
doc = DocxReader(docx, source_metadata, extraction_config=docx_extractor_config)
118+
extracted_data = doc.extract_data(
119+
base_unified_metadata, text_depth, extract_text, extract_charts, extract_tables, extract_images
120+
)
108121

109122
return extracted_data

0 commit comments

Comments
 (0)