superagent-ai · homanp · Aug 10, 2023 · Aug 8, 2023 · Aug 8, 2023 · Aug 9, 2023
diff --git a/app/api/documents.py b/app/api/documents.py
@@ -90,6 +90,7 @@ async def read_document(documentId: str, token=Depends(JWTBearer())):
 async def delete_document(documentId: str, token=Depends(JWTBearer())):
     """Delete a document"""
     try:
+        prisma.agentdocument.delete_many(where={"documentId": documentId})
         prisma.document.delete(where={"id": documentId})
         VectorStoreBase().get_database().delete(namespace=documentId)
         return {"success": True, "data": None}

diff --git a/app/lib/documents.py b/app/lib/documents.py
@@ -16,9 +16,11 @@
 from langchain.embeddings.openai import OpenAIEmbeddings
 from llama_index.readers.schema.base import Document
 
+
 from app.lib.parsers import CustomPDFPlumberLoader
 from app.lib.splitters import TextSplitters
 from app.lib.vectorstores.base import VectorStoreBase
+from app.lib.loaders.sitemap import SitemapLoader
 
 valid_ingestion_types = [
     "TXT",
@@ -29,9 +31,18 @@
     "FIRESTORE",
     "PSYCHIC",
     "GITHUB_REPOSITORY",
+    "WEBPAGE",
+    "STRIPE",
+    "AIRTABLE",
+    "SITEMAP",
 ]
 
 
+def chunkify(lst, size):
+    """Split ``lst`` into consecutive slices holding at most ``size`` items."""
+    return [lst[start : start + size] for start in range(0, len(lst), size)]
+
+
 def upsert_document(
     type: str,
     document_id: str,
@@ -52,6 +63,66 @@ def upsert_document(
 
     embeddings = OpenAIEmbeddings()
 
+    if type == "STRIPE":
+        pass
+
+    if type == "AIRTABLE":
+        from langchain.document_loaders import AirtableLoader
+
+        api_key = metadata["api_key"]
+        base_id = metadata["base_id"]
+        table_id = metadata["table_id"]
+        loader = AirtableLoader(api_key, table_id, base_id)
+        documents = loader.load()
+        newDocuments = [
+            document.metadata.update({"namespace": document_id}) or document
+            for document in documents
+        ]
+        docs = TextSplitters(newDocuments, text_splitter).document_splitter()
+
+        VectorStoreBase().get_database().from_documents(
+            docs, embeddings, index_name=INDEX_NAME, namespace=document_id
+        )
+
+    if type == "SITEMAP":
+        filter_urls = metadata["filter_urls"].split(",")
+        loader = SitemapLoader(sitemap_url=url, filter_urls=filter_urls)
+        documents = loader.load()
+        newDocuments = [
+            document.metadata.update({"namespace": document_id}) or document
+            for document in documents
+        ]
+        docs = TextSplitters(newDocuments, text_splitter).document_splitter()
+
+        chunk_size = 100
+        chunks = chunkify(docs, chunk_size)
+
+        for chunk in chunks:
+            VectorStoreBase().get_database().from_documents(
+                chunk, embeddings, index_name=INDEX_NAME, namespace=document_id
+            )
+
+    if type == "WEBPAGE":
+        from llama_index import download_loader
+
+        RemoteDepthReader = download_loader("RemoteDepthReader")
+        depth = int(metadata["depth"])
+        loader = RemoteDepthReader(depth=depth)
+        documents = loader.load_data(url=url)
+        langchain_documents = [d.to_langchain_format() for d in documents]
+        newDocuments = [
+            document.metadata.update({"namespace": document_id}) or document
+            for document in langchain_documents
+        ]
+        docs = TextSplitters(newDocuments, text_splitter).document_splitter()
+        chunk_size = 100
+        chunks = chunkify(docs, chunk_size)
+
+        for chunk in chunks:
+            VectorStoreBase().get_database().from_documents(
+                chunk, embeddings, index_name=INDEX_NAME, namespace=document_id
+            )
+
     if type == "TXT":
         file_response = content
         if content is None:

diff --git a/app/lib/loaders/__init__.py b/app/lib/loaders/__init__.py
diff --git a/app/lib/loaders/sitemap.py b/app/lib/loaders/sitemap.py
@@ -0,0 +1,65 @@
+import re
+import requests
+from xml.etree import ElementTree
+from bs4 import BeautifulSoup
+from langchain.schema import Document
+
+
+class SitemapLoader:
+    """Load the pages listed in a sitemap.xml as langchain Documents."""
+
+    SITEMAP_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
+    REQUEST_TIMEOUT = 30  # seconds; requests applies no timeout by default
+
+    def __init__(self, sitemap_url, filter_urls=None):
+        self.sitemap_url = sitemap_url
+        self.filter_urls = filter_urls if filter_urls else []
+
+    def fetch(self, url):
+        """Fetch content of a URL using requests; raise on HTTP errors."""
+        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
+        response.raise_for_status()
+        return response.text
+
+    def fetch_text(self, url):
+        """Return the page's visible text with whitespace runs collapsed."""
+        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "html.parser")
+        raw_text = soup.get_text(separator=" ").strip()
+        return re.sub(r"\s+", " ", raw_text)
+
+    def matches_any_pattern(self, url):
+        """Check if the URL matches any of the given (regex) patterns."""
+        is_match = any(re.search(pattern, url) for pattern in self.filter_urls)
+
+        if is_match:
+            print(f"Matched URL: {url}")
+
+        return is_match
+
+    def fetch_sitemap_urls(self):
+        """Fetch URLs from a sitemap.xml file and filter based on patterns.
+
+        Only ``<url><loc>`` entries are read; empty ``<loc>`` elements are
+        skipped and surrounding whitespace is stripped from each URL.
+        """
+        root = ElementTree.fromstring(self.fetch(self.sitemap_url))
+        loc_path = f"{self.SITEMAP_NAMESPACE}url/{self.SITEMAP_NAMESPACE}loc"
+        # ``.text`` is None for an empty <loc/> and is often padded with
+        # newlines/indentation, neither of which is a usable URL as-is.
+        urls = [
+            loc.text.strip()
+            for loc in root.findall(loc_path)
+            if loc.text and loc.text.strip()
+        ]
+        if self.filter_urls:
+            urls = [url for url in urls if self.matches_any_pattern(url)]
+        return urls
+
+    def load(self):
+        """Fetch content of each URL listed in a sitemap.xml file."""
+        return [
+            Document(page_content=self.fetch_text(url), metadata={"url": url})
+            for url in self.fetch_sitemap_urls()
+        ]