From bdfd975115c2f4154404eccb6f6eeecdefea492a Mon Sep 17 00:00:00 2001
From: Filip Knefel <158048836+ds-filipknefel@users.noreply.github.com>
Date: Fri, 22 Mar 2024 11:08:49 +0100
Subject: [PATCH] chore: change table extraction defaults (#2588)

Change default values for table extraction - works in pair with
[this](https://github.com/Unstructured-IO/unstructured-api/pull/370)
`unstructured-api` PR

We want to move away from `pdf_infer_table_structure` parameter, in this
PR:
- We change how it's treated wrt `skip_infer_table_types` parameter.
Whether to extract tables from pdf now follows from the rule:
`pdf_infer_table_structure && "pdf" not in skip_infer_table_types`
- We set it to `pdf_infer_table_structure=True` and
`skip_infer_table_types=[]` by default
- We remove it from the examples in documentation
- We describe it as deprecated in favor of `skip_infer_table_types` in
documentation

More detailed description of how we want parameters to interact
- if `pdf_infer_table_structure` is False tables will never be extracted
from pdf
- if `pdf_infer_table_structure` is True tables will be extracted from
pdf unless it's skipped via `skip_infer_table_types`
- by default `pdf_infer_table_structure=True` and
`skip_infer_table_types=[]`

---------

Co-authored-by: Filip Knefel <filip@unstructured.io>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: ds-filipknefel <ds-filipknefel@users.noreply.github.com>
Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
---
 CHANGELOG.md                                  |  6 +++--
 docs/source/apis/api_parameters.rst           |  2 +-
 docs/source/apis/usage_methods.rst            |  2 +-
 .../best_practices/table_extraction_pdf.rst   |  8 +------
 docs/source/core/partition.rst                |  2 +-
 docs/source/examples/databricks.rst           |  1 -
 docs/source/examples/dict_to_elements.rst     |  1 -
 .../ingest/configs/partition_config.rst       |  2 +-
 test_unstructured/partition/test_auto.py      |  2 +-
 .../Shared Documents/stanley-cups.xlsx.json   | 12 ++++++----
 .../Shared Documents/stanley-cups.xlsx.json   | 12 ++++++----
 .../gcs/nested-2/stanley-cups.xlsx.json       | 12 ++++++----
 .../tests-example.xls.json                    |  6 +++--
 unstructured/__version__.py                   |  2 +-
 unstructured/ingest/interfaces.py             |  2 +-
 unstructured/partition/auto.py                | 24 ++++++++++++-------
 16 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5e7e796a9..daa7dfba61 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
-## 0.12.7-dev9
+## 0.13.0-dev10
 
-### Enhancements 
+### Enhancements
 
 * **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks.
 * **Add `compound_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element level row and column index and content accuracy scores
@@ -13,6 +13,7 @@
 ### Fixes
 
 * **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
+* **Change table extraction defaults** Change table extraction defaults in favor of using `skip_infer_table_types` parameter and reflect these changes in documentation.
 * **Fix OneDrive dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint
 * **Adds tracking for AstraDB** Adds tracking info so AstraDB can see what source called their api.
 * **Support AWS Bedrock Embeddings in ingest CLI** The configs required to instantiate the bedrock embedding class are now exposed in the api and the version of boto being used meets the minimum requirement to introduce the bedrock runtime required to hit the service.
@@ -66,6 +67,7 @@
 * **Rename `OpenAiEmbeddingConfig` to `OpenAIEmbeddingConfig`.**
 * **Fix partition_json() doesn't chunk.** The `@add_chunking_strategy` decorator was missing from `partition_json()` such that pre-partitioned documents serialized to JSON did not chunk when a chunking-strategy was specified.
 
+
 ## 0.12.4
 
 ### Enhancements
diff --git a/docs/source/apis/api_parameters.rst b/docs/source/apis/api_parameters.rst
index c5358765c0..7e401b8877 100644
--- a/docs/source/apis/api_parameters.rst
+++ b/docs/source/apis/api_parameters.rst
@@ -60,7 +60,7 @@ languages
 pdf_infer_table_structure
 -------------------------
 - **Type**: boolean
-- **Description**: If True and strategy=hi_res, any Table Elements extracted from a PDF will include an additional metadata field, 'text_as_html'.
+- **Description**: Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents.
 
 skip_infer_table_types
 ----------------------
diff --git a/docs/source/apis/usage_methods.rst b/docs/source/apis/usage_methods.rst
index 2ab6436c92..c755014ffd 100644
--- a/docs/source/apis/usage_methods.rst
+++ b/docs/source/apis/usage_methods.rst
@@ -24,7 +24,7 @@ Method 1: Partition via API (``partition_via_api``)
 
       filename = "example-docs/DA-1p.pdf"
       elements = partition_via_api(
-        filename=filename, api_key="MY_API_KEY", strategy="auto", pdf_infer_table_structure="true"
+        filename=filename, api_key="MY_API_KEY", strategy="auto"
       )
 
   - **Self-Hosting or Local API**::
diff --git a/docs/source/best_practices/table_extraction_pdf.rst b/docs/source/best_practices/table_extraction_pdf.rst
index aacf6ea6bc..e85c7291c8 100644
--- a/docs/source/best_practices/table_extraction_pdf.rst
+++ b/docs/source/best_practices/table_extraction_pdf.rst
@@ -33,7 +33,7 @@ To extract the tables from PDF files using the `partition_pdf <https://unstructu
 Method 2: Using Auto Partition or Unstructured API
 --------------------------------------------------
 
-By default, table extraction from ``pdf``, ``jpg``, ``png``, ``xls``, and ``xlsx`` file types is disabled. To enable table extraction from PDFs and other file types using `Auto Partition <https://unstructured-io.github.io/unstructured/core/partition.html#partition>`__ or `Unstructured API parameters <https://unstructured-io.github.io/unstructured/apis/api_parameters.html>`__ , you can set the ``skip_infer_table_types`` parameter to ``'[]'`` and ``strategy`` parameter to ``hi_res``.
+By default, table extraction from all file types is enabled. To extract tables from PDFs and images using `Auto Partition <https://unstructured-io.github.io/unstructured/core/partition.html#partition>`__ or `Unstructured API parameters <https://unstructured-io.github.io/unstructured/apis/api_parameters.html>`__, simply set the ``strategy`` parameter to ``hi_res``.
 
 
 **Usage: Auto Partition**
@@ -46,7 +46,6 @@ By default, table extraction from ``pdf``, ``jpg``, ``png``, ``xls``, and ``xlsx
 
     elements = partition(filename=filename,
                          strategy='hi_res',
-                         skip_infer_table_types='[]', # don't forget to include apostrophe around the square bracket
                )
 
     tables = [el for el in elements if el.category == "Table"]
@@ -65,9 +64,4 @@ By default, table extraction from ``pdf``, ``jpg``, ``png``, ``xls``, and ``xlsx
           -H 'Content-Type: multipart/form-data' \
           -F 'files=@sample-docs/layout-parser-paper-with-table.jpg' \
           -F 'strategy=hi_res' \
-          -F 'skip_infer_table_types=[]' \
           | jq -C . | less -R
-
-.. warning::
-
-    You may get a warning when the ``pdf_infer_table_structure`` parameter is set to **True** AND **pdf** is included in the list of ``skip_infer_table_types`` parameter. However, this function will still extract the tables from PDF despite the conflict.
diff --git a/docs/source/core/partition.rst b/docs/source/core/partition.rst
index 27bb516c2d..198fbb24ef 100644
--- a/docs/source/core/partition.rst
+++ b/docs/source/core/partition.rst
@@ -872,7 +872,7 @@ settings supported by the API.
   filename = "example-docs/DA-1p.pdf"
 
   elements = partition_via_api(
-    filename=filename, api_key=api_key, strategy="auto", pdf_infer_table_structure="true"
+    filename=filename, api_key=api_key, strategy="auto"
   )
 
 If you are using the `Unstructured SaaS API <https://unstructured-io.github.io/unstructured/apis/saas_api.html>`__, you can use the ``api_url`` kwarg to point the ``partition_via_api`` function at your Unstructured SaaS API URL.
diff --git a/docs/source/examples/databricks.rst b/docs/source/examples/databricks.rst
index c6a862d498..94df7385f6 100644
--- a/docs/source/examples/databricks.rst
+++ b/docs/source/examples/databricks.rst
@@ -47,7 +47,6 @@ Extracting PDF Using Unstructured Python SDK
        ),
        # Other partition params
        strategy="hi_res",
-       pdf_infer_table_structure=True,
        chunking_strategy="by_title",
    )
 
diff --git a/docs/source/examples/dict_to_elements.rst b/docs/source/examples/dict_to_elements.rst
index daa198441e..11850ecadc 100644
--- a/docs/source/examples/dict_to_elements.rst
+++ b/docs/source/examples/dict_to_elements.rst
@@ -74,7 +74,6 @@ Configure and run the S3Runner for processing the data.
             api_key=UNSTRUCTURED_API_KEY,
             strategy="hi_res",
             hi_res_model_name="yolox",
-            pdf_infer_table_structure=True,
         ),
         fsspec_config=FsspecConfig(
             remote_url=S3_URL,
diff --git a/docs/source/ingest/configs/partition_config.rst b/docs/source/ingest/configs/partition_config.rst
index a1c4572ce3..7aa1fa411e 100644
--- a/docs/source/ingest/configs/partition_config.rst
+++ b/docs/source/ingest/configs/partition_config.rst
@@ -9,7 +9,7 @@ responsible for coordinating data after processing, including the dynamic metada
 Configs for Partitioning
 -------------------------
 
-* ``pdf_infer_table_structure``: If True and strategy=hi_res, any Table Elements extracted from a PDF will include an additional metadata field, "text_as_html," where the value (string) is a just a transformation of the data into an HTML <table>. The "text" field for a partitioned Table Element is always present, whether True or False.
+* ``pdf_infer_table_structure``: Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents.
 * ``skip_infer_table_types``: List of document types that you want to skip table extraction with.
 * ``strategy (default auto)``: The strategy to use for partitioning PDF/image. Uses a layout detection model if set to 'hi_res', otherwise partition simply extracts the text from the document and processes it.
 * ``ocr_languages``: The languages present in the document, for use in partitioning and/or OCR. For partitioning image or pdf documents with Tesseract, you'll first need to install the appropriate Tesseract language pack if running via local unstructured library. For other partitions, language is detected using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be in either language.
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 2f3769bbfc..c5b4958785 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -356,7 +356,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
         languages=None,
         metadata_filename=None,
         include_page_breaks=False,
-        infer_table_structure=False,
+        infer_table_structure=True,
         extract_images_in_pdf=False,
         extract_image_block_types=None,
         extract_image_block_output_dir=None,
diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/stanley-cups.xlsx.json
index 928c97ff9d..cc8b8ee0c5 100644
--- a/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/stanley-cups.xlsx.json	
+++ b/test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/stanley-cups.xlsx.json	
@@ -18,7 +18,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1
+      "page_number": 1,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "Stanley Cups",
     "type": "Title"
@@ -42,7 +43,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1
+      "page_number": 1,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n",
     "type": "Table"
@@ -66,7 +68,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2
+      "page_number": 2,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "Stanley Cups Since 67",
     "type": "Title"
@@ -90,7 +93,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2
+      "page_number": 2,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n",
     "type": "Table"
diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json
index 928c97ff9d..cc8b8ee0c5 100644
--- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json	
+++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json	
@@ -18,7 +18,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1
+      "page_number": 1,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "Stanley Cups",
     "type": "Title"
@@ -42,7 +43,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1
+      "page_number": 1,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n",
     "type": "Table"
@@ -66,7 +68,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2
+      "page_number": 2,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "Stanley Cups Since 67",
     "type": "Title"
@@ -90,7 +93,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2
+      "page_number": 2,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n",
     "type": "Table"
diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
index 2c31243319..95a8f1164f 100644
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
@@ -17,7 +17,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1
+      "page_number": 1,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "Stanley Cups",
     "type": "Title"
@@ -40,7 +41,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups",
-      "page_number": 1
+      "page_number": 1,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n",
     "type": "Table"
@@ -63,7 +65,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2
+      "page_number": 2,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "Stanley Cups Since 67",
     "type": "Title"
@@ -86,7 +89,8 @@
         "eng"
       ],
       "page_name": "Stanley Cups Since 67",
-      "page_number": 2
+      "page_number": 2,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n",
     "type": "Table"
diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
index 33efd0f6b2..d20538af83 100644
--- a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
+++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
@@ -17,7 +17,8 @@
         "eng"
       ],
       "page_name": "Example Test",
-      "page_number": 1
+      "page_number": 1,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>MC</td>\n      <td>What is 2+2?</td>\n      <td>4</td>\n      <td>correct</td>\n      <td>3</td>\n      <td>incorrect</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>What C datatypes are 8 bits? (assume i386)</td>\n      <td>int</td>\n      <td></td>\n      <td>float</td>\n      <td></td>\n      <td>double</td>\n      <td></td>\n      <td>char</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>Bagpipes are awesome.</td>\n      <td>true</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Rank the following in their order of operation.</td>\n      <td>Parentheses</td>\n      <td>Exponents</td>\n      <td>Division</td>\n      <td>Addition</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>The student activities fee is</td>\n      <td>95</td>\n      <td>dollars for students enrolled in</td>\n      <td>19</td>\n      <td>units or more,</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Match the lower-case greek letter with its capital form.</td>\n      <td>λ</td>\n      <td>Λ</td>\n      <td>α</td>\n      <td>γ</td>\n      <td>Γ</td>\n      <td>φ</td>\n      <td>Φ</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n",
     "type": "Table"
@@ -86,7 +87,8 @@
         "eng"
       ],
       "page_name": "Format Abbr.",
-      "page_number": 2
+      "page_number": 2,
+      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Abbreviation</td>\n      <td>Question Type</td>\n    </tr>\n    <tr>\n      <td>MC</td>\n      <td>Multiple Choice</td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>Multiple Answer</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>True/False</td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>Essay</td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Ordering</td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Matching</td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>FIL</td>\n      <td>File response</td>\n    </tr>\n    <tr>\n      <td>NUM</td>\n      <td>Numeric Response</td>\n    </tr>\n    <tr>\n      <td>SR</td>\n      <td>Short response</td>\n    </tr>\n    <tr>\n      <td>OP</td>\n      <td>Opinion</td>\n    </tr>\n    <tr>\n      <td>FIB_PLUS</td>\n      <td>Multiple Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>JUMBLED_SENTENCE</td>\n      <td>Jumbled Sentence</td>\n    </tr>\n    <tr>\n      <td>QUIZ_BOWL</td>\n      <td>Quiz Bowl</td>\n    </tr>\n  </tbody>\n</table>"
     },
     "text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n",
     "type": "Table"
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index f9c076205d..c29cef9d38 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.12.7-dev9"  # pragma: no cover
+__version__ = "0.13.0-dev10"  # pragma: no cover
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
index 3f1d9f369a..2249073cb1 100644
--- a/unstructured/ingest/interfaces.py
+++ b/unstructured/ingest/interfaces.py
@@ -81,7 +81,7 @@ class RetryStrategyConfig(BaseConfig):
 @dataclass
 class PartitionConfig(BaseConfig):
     # where to write structured data outputs
-    pdf_infer_table_structure: bool = False
+    pdf_infer_table_structure: bool = True
     strategy: str = "auto"
     ocr_languages: t.Optional[t.List[str]] = None
     encoding: t.Optional[str] = None
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index 5d0db7b1ea..1345872bc6 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -139,12 +139,12 @@ def partition(
     encoding: Optional[str] = None,
     paragraph_grouper: Optional[Callable[[str], str]] = None,
     headers: Dict[str, str] = {},
-    skip_infer_table_types: List[str] = ["pdf", "jpg", "png", "xls", "xlsx", "heic"],
+    skip_infer_table_types: List[str] = [],
     ssl_verify: bool = True,
     ocr_languages: Optional[str] = None,  # changing to optional for deprecation
     languages: Optional[List[str]] = None,
     detect_language_per_element: bool = False,
-    pdf_infer_table_structure: bool = False,
+    pdf_infer_table_structure: bool = True,
     extract_images_in_pdf: bool = False,
     extract_image_block_types: Optional[List[str]] = None,
     extract_image_block_output_dir: Optional[str] = None,
@@ -200,6 +200,8 @@ def partition(
             detect_language_per_element
                 Detect language per element instead of at the document level.
     pdf_infer_table_structure
+        Deprecated! Use `skip_infer_table_types` to opt out of table extraction for any document
+        type.
         If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
         additional metadata field, "text_as_html," where the value (string) is a just a
         transformation of the data into an HTML <table>.
@@ -259,6 +261,12 @@ def partition(
     kwargs.setdefault("metadata_filename", metadata_filename)
     kwargs.setdefault("date_from_file_object", date_from_file_object)
 
+    if not pdf_infer_table_structure:
+        logger.warning(
+            "The pdf_infer_table_structure kwarg is deprecated. Please use skip_infer_table_types "
+            "instead."
+        )
+
     languages = check_language_args(languages or [], ocr_languages)
 
     if url is not None:
@@ -560,12 +568,10 @@ def decide_table_extraction(
     doc_type = filetype.name.lower() if filetype else None
 
     if doc_type == "pdf":
-        if doc_type in skip_infer_table_types and pdf_infer_table_structure:
-            logger.warning(
-                f"Conflict between variables skip_infer_table_types: {skip_infer_table_types} "
-                f"and pdf_infer_table_structure: {pdf_infer_table_structure}, "
-                "please reset skip_infer_table_types to turn on table extraction for PDFs.",
-            )
-        return doc_type not in skip_infer_table_types or pdf_infer_table_structure
+        # For backwards compatibility. Ultimately we want to remove pdf_infer_table_structure
+        # completely and rely exclusively on `skip_infer_table_types` for all file types.
+        # Until then for pdf files we first check pdf_infer_table_structure and then update
+        # based on skip_infer_table_types.
+        return pdf_infer_table_structure and doc_type not in skip_infer_table_types
 
     return doc_type not in skip_infer_table_types