run-llama · anoopshrma · Feb 12, 2024 · Feb 8, 2024 · Feb 9, 2024 · Feb 9, 2024
diff --git a/llama_hub/docugami/docugami.ipynb b/llama_hub/docugami/docugami.ipynb
@@ -81,16 +81,16 @@
    "source": [
     "from base import DocugamiReader\n",
     "\n",
-    "docset_id=\"tjwrr2ekqkc3\"\n",
-    "docset_name=\"SEC 10-Q reports\"\n",
-    "document_ids=[\"ui7pkriyckwi\", \"1be3o7ch10iy\"]\n",
+    "docset_id = \"tjwrr2ekqkc3\"\n",
+    "docset_name = \"SEC 10-Q reports\"\n",
+    "document_ids = [\"ui7pkriyckwi\", \"1be3o7ch10iy\"]\n",
     "\n",
     "reader = DocugamiReader()\n",
     "chunks = reader.load_data(docset_id=docset_id, document_ids=document_ids)\n",
     "\n",
     "for chunk in chunks[:5]:\n",
     "    print(chunk)\n",
-    "    print(\"*\"*32)"
+    "    print(\"*\" * 32)"
    ]
   },
   {
@@ -164,7 +164,7 @@
     }
    ],
    "source": [
-    "reader.min_text_length = 1024 * 4 # ~1k tokens\n",
+    "reader.min_text_length = 1024 * 4  # ~1k tokens\n",
     "reader.max_text_length = 1024 * 24  # ~6k tokens\n",
     "reader.include_xml_tags = True\n",
     "chunks = reader.load_data(docset_id=docset_id)\n",
@@ -236,7 +236,9 @@
    ],
    "source": [
     "# Try out the query engine with example query\n",
-    "response = query_engine.query(\"How much did Microsoft spend for opex in the latest quarter?\")\n",
+    "response = query_engine.query(\n",
+    "    \"How much did Microsoft spend for opex in the latest quarter?\"\n",
+    ")\n",
     "print(response.response)"
    ]
   },
@@ -317,7 +319,9 @@
     "response = query_engine.query(\n",
     "    \"What was Microsoft's weighted average discount rate for operating leases as of March 2023?\"\n",
     ")\n",
-    "print(response.response)  # the correct answer should be 2.7%, listed on page 24 of \"2023 Q2 MSFT.pdf\""
+    "print(\n",
+    "    response.response\n",
+    ")  # the correct answer should be 2.7%, listed on page 24 of \"2023 Q2 MSFT.pdf\""
    ]
   },
   {
@@ -428,7 +432,11 @@
    "outputs": [],
    "source": [
     "from llama_index.indices.vector_store.retrievers import VectorIndexAutoRetriever\n",
-    "from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo, VectorStoreQueryMode\n",
+    "from llama_index.vector_stores.types import (\n",
+    "    MetadataInfo,\n",
+    "    VectorStoreInfo,\n",
+    "    VectorStoreQueryMode,\n",
+    ")\n",
     "from llama_index.query_engine import RetrieverQueryEngine\n",
     "\n",
     "EXCLUDE_KEYS = [\"id\", \"xpath\", \"structure\", \"name\", \"tag\"]\n",

diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py
@@ -3,7 +3,7 @@
 import os
 import logging
 
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 import tempfile
 
 import requests
@@ -28,6 +28,8 @@ def __init__(
         client_id: str,
         client_secret: str,
         tenant_id: str,
+        filename_as_id: bool = False,
+        file_extractor: Optional[Dict[str, BaseReader]] = None,
     ) -> None:
         """
         Initializes an instance of SharePoint reader.
@@ -37,11 +39,16 @@ def __init__(
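-                       The application must alse be configured with MS Graph permissions "Files.ReadAll", "Sites.ReadAll" and BrowserSiteLists.Read.All.
+                       The application must also be configured with MS Graph permissions "Files.ReadAll", "Sites.ReadAll" and BrowserSiteLists.Read.All.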
             client_secret: The application secret for the app registered in Azure.
             tenant_id: Unique identifier of the Azure Active Directory Instance.
+            file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
+                extension to a BaseReader class that specifies how to convert that file
+                to text. See `SimpleDirectoryReader` for more details.
         """
         self.client_id = (client_id,)
         self.client_secret = (client_secret,)
         self.tenant_id = tenant_id
         self._authorization_headers = None
+        self.file_extractor = file_extractor
+        self.filename_as_id = filename_as_id
 
     def _get_access_token(self) -> str:
         """
@@ -343,7 +350,11 @@ def get_metadata(filename: str) -> Any:
             simple_directory_reader = download_loader("SimpleDirectoryReader")
 
         simple_loader = simple_directory_reader(
-            download_dir, file_metadata=get_metadata, recursive=recursive
+            download_dir,
+            file_metadata=get_metadata,
+            recursive=recursive,
+            filename_as_id=self.filename_as_id,
+            file_extractor=self.file_extractor,
         )
         documents = simple_loader.load_data()
         return documents