36
36
logger = logging .getLogger (__name__ )
37
37
38
38
39
- def python_docx (docx : Union [str , Path , IO ], extract_text : bool , extract_images : bool , extract_tables : bool , ** kwargs ):
39
+ def python_docx (
40
+ docx : Union [str , Path , IO ],
41
+ extract_text : bool ,
42
+ extract_images : bool ,
43
+ extract_tables : bool ,
44
+ extract_charts : bool ,
45
+ ** kwargs
46
+ ):
40
47
"""
41
48
Helper function that uses python-docx to extract text from a bytestream document
42
49
@@ -57,6 +64,8 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
57
64
Specifies whether to extract images.
58
65
extract_tables : bool
59
66
Specifies whether to extract tables.
67
+ extract_charts : bool
68
+ Specifies whether to extract charts.
60
69
**kwargs
61
70
The keyword arguments are used for additional extraction parameters.
62
71
@@ -73,10 +82,12 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
73
82
source_id = row_data ["source_id" ]
74
83
# get text_depth
75
84
text_depth = kwargs .get ("text_depth" , "document" )
76
- text_depth = TextTypeEnum [ text_depth . upper ()]
85
+ text_depth = TextTypeEnum ( text_depth )
77
86
# get base metadata
78
87
metadata_col = kwargs .get ("metadata_column" , "metadata" )
79
88
89
+ docx_extractor_config = kwargs .get ("docx_extraction_config" , {})
90
+
80
91
base_unified_metadata = row_data [metadata_col ] if metadata_col in row_data .index else {}
81
92
82
93
# get base source_metadata
@@ -103,7 +114,9 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
103
114
}
104
115
105
116
# Extract data from the document using python-docx
106
- doc = DocxReader (docx , source_metadata )
107
- extracted_data = doc .extract_data (base_unified_metadata , text_depth , extract_text , extract_tables , extract_images )
117
+ doc = DocxReader (docx , source_metadata , extraction_config = docx_extractor_config )
118
+ extracted_data = doc .extract_data (
119
+ base_unified_metadata , text_depth , extract_text , extract_charts , extract_tables , extract_images
120
+ )
108
121
109
122
return extracted_data
0 commit comments