Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Added handling of filename_as_id and file_extractor to SharePointReader #934

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions llama_hub/docugami/docugami.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,16 @@
"source": [
"from base import DocugamiReader\n",
"\n",
"docset_id=\"tjwrr2ekqkc3\"\n",
"docset_name=\"SEC 10-Q reports\"\n",
"document_ids=[\"ui7pkriyckwi\", \"1be3o7ch10iy\"]\n",
"docset_id = \"tjwrr2ekqkc3\"\n",
"docset_name = \"SEC 10-Q reports\"\n",
"document_ids = [\"ui7pkriyckwi\", \"1be3o7ch10iy\"]\n",
"\n",
"reader = DocugamiReader()\n",
"chunks = reader.load_data(docset_id=docset_id, document_ids=document_ids)\n",
"\n",
"for chunk in chunks[:5]:\n",
" print(chunk)\n",
" print(\"*\"*32)"
" print(\"*\" * 32)"
]
},
{
Expand Down Expand Up @@ -164,7 +164,7 @@
}
],
"source": [
"reader.min_text_length = 1024 * 4 # ~1k tokens\n",
"reader.min_text_length = 1024 * 4 # ~1k tokens\n",
"reader.max_text_length = 1024 * 24 # ~6k tokens\n",
"reader.include_xml_tags = True\n",
"chunks = reader.load_data(docset_id=docset_id)\n",
Expand Down Expand Up @@ -236,7 +236,9 @@
],
"source": [
"# Try out the query engine with example query\n",
"response = query_engine.query(\"How much did Microsoft spend for opex in the latest quarter?\")\n",
"response = query_engine.query(\n",
" \"How much did Microsoft spend for opex in the latest quarter?\"\n",
")\n",
"print(response.response)"
]
},
Expand Down Expand Up @@ -317,7 +319,9 @@
"response = query_engine.query(\n",
" \"What was Microsoft's weighted average discount rate for operating leases as of March 2023?\"\n",
")\n",
"print(response.response) # the correct answer should be 2.7%, listed on page 24 of \"2023 Q2 MSFT.pdf\""
"print(\n",
" response.response\n",
") # the correct answer should be 2.7%, listed on page 24 of \"2023 Q2 MSFT.pdf\""
]
},
{
Expand Down Expand Up @@ -428,7 +432,11 @@
"outputs": [],
"source": [
"from llama_index.indices.vector_store.retrievers import VectorIndexAutoRetriever\n",
"from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo, VectorStoreQueryMode\n",
"from llama_index.vector_stores.types import (\n",
" MetadataInfo,\n",
" VectorStoreInfo,\n",
" VectorStoreQueryMode,\n",
")\n",
"from llama_index.query_engine import RetrieverQueryEngine\n",
"\n",
"EXCLUDE_KEYS = [\"id\", \"xpath\", \"structure\", \"name\", \"tag\"]\n",
Expand Down
15 changes: 13 additions & 2 deletions llama_hub/microsoft_sharepoint/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import logging

from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
import tempfile

import requests
Expand All @@ -28,6 +28,8 @@ def __init__(
client_id: str,
client_secret: str,
tenant_id: str,
filename_as_id: bool = False,
file_extractor: Optional[Dict[str, BaseReader]] = None,
) -> None:
"""
Initializes an instance of SharePoint reader.
Expand All @@ -37,11 +39,16 @@ def __init__(
The application must also be configured with MS Graph permissions "Files.Read.All", "Sites.Read.All" and "BrowserSiteLists.Read.All".
client_secret: The application secret for the app registered in Azure.
tenant_id: Unique identifier of the Azure Active Directory Instance.
file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
extension to a BaseReader class that specifies how to convert that file
to text. See `SimpleDirectoryReader` for more details.
"""
self.client_id = (client_id,)
self.client_secret = (client_secret,)
self.tenant_id = tenant_id
self._authorization_headers = None
self.file_extractor = file_extractor
self.filename_as_id = filename_as_id

def _get_access_token(self) -> str:
"""
Expand Down Expand Up @@ -343,7 +350,11 @@ def get_metadata(filename: str) -> Any:
simple_directory_reader = download_loader("SimpleDirectoryReader")

simple_loader = simple_directory_reader(
download_dir, file_metadata=get_metadata, recursive=recursive
download_dir,
file_metadata=get_metadata,
recursive=recursive,
filename_as_id=self.filename_as_id,
file_extractor=self.file_extractor,
)
documents = simple_loader.load_data()
return documents
Expand Down
Loading