From f755b8ec3c30c532347b456b49a1d5d48f80cf77 Mon Sep 17 00:00:00 2001
From: komalg1
Date: Thu, 18 Apr 2024 18:37:54 +0530
Subject: [PATCH] [Integrated Vectorization] Upload url content to blob storage (#692)

* downloading the contents of url & uploading blob
* changes for downloading url content & uploading to blob
* Adding url as the metadata to the blob
* PR only for adding environment var
* removing changes unrelated to env variable
* Grouping integrated vec. with azure search
* updating description to match the usage
* URL content as blob
* code review comments
* copied main.json from main branch
* updating the success message
* code review comments
* Code review changes

---------

Co-authored-by: Ross Smith
---
 code/backend/pages/01_Ingest_Data.py | 28 +++++++++++++++++++++++++++-
 infra/main.json                      |  2 +-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/code/backend/pages/01_Ingest_Data.py b/code/backend/pages/01_Ingest_Data.py
index fb8ead85c..7f905d7fd 100644
--- a/code/backend/pages/01_Ingest_Data.py
+++ b/code/backend/pages/01_Ingest_Data.py
@@ -1,4 +1,6 @@
+import io
 from os import path
+from bs4 import BeautifulSoup
 import streamlit as st
 from typing import Optional
 import mimetypes
@@ -15,11 +17,13 @@
 )
 import urllib.parse
 import sys
+import logging
 from batch.utilities.helpers.ConfigHelper import ConfigHelper
 from batch.utilities.helpers.EnvHelper import EnvHelper
 
 sys.path.append(path.join(path.dirname(__file__), ".."))
 env_helper: EnvHelper = EnvHelper()
+logger = logging.getLogger(__name__)
 
 st.set_page_config(
     page_title="Ingest Data",
@@ -75,11 +79,31 @@ def remote_convert_files_and_add_embeddings(process_all=False):
 
 
 def add_urls():
+    urls = st.session_state["urls"].split("\n")
+    if env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION:
+        download_url_and_upload_to_blob(urls)
+    else:
+        add_url_embeddings(urls)
+
+
+def download_url_and_upload_to_blob(urls: list[str]):
+    for url in urls:
+        try:
+            response = requests.get(url)
+            parsed_data = BeautifulSoup(response.content, "html.parser")
+            with io.BytesIO(parsed_data.get_text().encode("utf-8")) as stream:
+                upload_file(stream, url)
+            st.success(f"Url {url} added to knowledge base")
+        except Exception:
+            logger.error(traceback.format_exc())
+            st.error(f"Exception occurred while adding {url} to the knowledge base.")
+
+
+def add_url_embeddings(urls: list[str]):
     params = {}
     if env_helper.FUNCTION_KEY is not None:
         params["code"] = env_helper.FUNCTION_KEY
         params["clientId"] = "clientKey"
-    urls = st.session_state["urls"].split("\n")
     for url in urls:
         body = {"url": url}
         backend_url = urllib.parse.urljoin(
@@ -121,6 +145,7 @@ def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] =
             bytes_data,
             overwrite=True,
             content_settings=ContentSettings(content_type=content_type + charset),
+            metadata={"source_url": file_name},
         )
         st.session_state["file_url"] = (
             blob_client.url
@@ -154,6 +179,7 @@ def upload_file(bytes_data: bytes, file_name: str, content_type: Optional[str] =
             bytes_data,
             overwrite=True,
             content_settings=ContentSettings(content_type=content_type + charset),
+            metadata={"source_url": file_name},
         )
         # Generate a SAS URL to the blob and return it
         st.session_state["file_url"] = (
diff --git a/infra/main.json b/infra/main.json
index d260227d7..108d7daca 100644
--- a/infra/main.json
+++ b/infra/main.json
@@ -10371,4 +10371,4 @@
       "value": "[parameters('logLevel')]"
     }
   }
-}
\ No newline at end of file
+}
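
Note: the hunks above show only the changed regions of 01_Ingest_Data.py; requests, traceback, and upload_file are already defined or imported in unchanged parts of that file. As a rough, self-contained sketch of the flow the new download_url_and_upload_to_blob helper follows (fetch a page, reduce it to plain text with BeautifulSoup, and upload that text to blob storage with the source URL kept as metadata), assuming a hypothetical AZURE_STORAGE_CONNECTION_STRING environment variable and a "documents" container, neither of which comes from this PR:

    import io
    import os
    import requests
    from bs4 import BeautifulSoup
    from azure.storage.blob import BlobServiceClient, ContentSettings

    def url_to_blob(url: str) -> str:
        """Fetch a URL, strip it to plain text, and upload the text as a blob."""
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        text = BeautifulSoup(response.content, "html.parser").get_text()

        # Placeholder connection string and container name, not taken from this PR.
        service = BlobServiceClient.from_connection_string(
            os.environ["AZURE_STORAGE_CONNECTION_STRING"]
        )
        blob_client = service.get_blob_client(container="documents", blob=url)
        with io.BytesIO(text.encode("utf-8")) as stream:
            blob_client.upload_blob(
                stream,
                overwrite=True,
                content_settings=ContentSettings(content_type="text/plain"),
                # Mirrors the source_url metadata added by this PR.
                metadata={"source_url": url},
            )
        return blob_client.url

Keeping the source URL as blob metadata is what later lets the indexer tie each ingested document back to the page it came from, which is the point of the upload_file changes in the two hunks above.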