diff --git a/dj_backend_server/CHANGELOG.MD b/dj_backend_server/CHANGELOG.MD new file mode 100644 index 00000000..15b70c6c --- /dev/null +++ b/dj_backend_server/CHANGELOG.MD @@ -0,0 +1,6 @@ +2.14.2024 +- Added example.env to streamline environment setup. +- Implemented translation fixes to enhance application localization. +- Updated docker-compose.yaml to prefix each container with oc_ for better namespace management. +- Performed fixes in requirements.txt for improved dependency resolution. +- Ensured existence of Vector Database (QDrant) prior to web crawling operations to address issues encountered with large websites, ensuring Vector Database creation and availability. \ No newline at end of file diff --git a/dj_backend_server/api/data_sources/pdf_handler.py b/dj_backend_server/api/data_sources/pdf_handler.py index 6890fa8d..9bf20312 100644 --- a/dj_backend_server/api/data_sources/pdf_handler.py +++ b/dj_backend_server/api/data_sources/pdf_handler.py @@ -222,7 +222,7 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool docs = text_splitter.split_documents(raw_docs) - print("external files docs -->", docs); + # print("external files docs -->", docs); if not docs: print("No documents were processed successfully.") @@ -230,7 +230,9 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool embeddings = get_embeddings() + print(f"Initializing vector store for namespace: {namespace} with {len(docs)} documents.") init_vector_store(docs, embeddings, StoreOptions(namespace=namespace)) + print(f"Vector store initialized successfully for namespace: {namespace}.") print(f'Folder need or not to delete. 
def ensure_vector_database_exists(namespace, vector_size=1536, max_attempts=3):
    """Make sure the vector collection for ``namespace`` exists (Qdrant only).

    The store type is read from the ``STORE`` environment variable. For any
    store other than Qdrant the function returns without doing anything.
    For Qdrant it checks whether a collection named ``namespace`` exists and
    creates it with cosine distance when it is missing, retrying up to
    ``max_attempts`` times.

    This is a best-effort helper: Qdrant-related failures are printed, not
    raised, so a caller is never interrupted by them.

    Args:
        namespace: Name of the collection to look up or create.
        vector_size: Dimensionality of the vectors for a newly created
            collection. Defaults to 1536.
        max_attempts: How many times to try the check-and-create sequence.
            Defaults to 3.

    Raises:
        KeyError: If ``STORE`` is not set or is not a valid ``StoreType`` name.
    """
    store_type = StoreType[os.environ['STORE']]
    if store_type != StoreType.QDRANT:
        return

    try:
        client = QdrantClient(url=os.environ['QDRANT_URL'])
    except Exception as e:
        print(f"Failed to ensure vector database exists for namespace {namespace}: {e}")
        return

    vectors_config = models.VectorParams(
        size=vector_size,
        distance=models.Distance.COSINE,
    )

    for attempt in range(max_attempts):
        # Each attempt has its own handler so that one failure (network
        # hiccup, or another worker creating the collection at the same
        # time) leads to a retry instead of abandoning the remaining attempts.
        try:
            # get_collections() yields description objects, not strings, so
            # membership has to be tested against their names.
            existing_names = {c.name for c in client.get_collections().collections}
            if namespace in existing_names:
                print(f"Namespace '{namespace}' exists.")
                return

            print(f"Namespace '{namespace}' does not exist. Attempting to create.")
            client.create_collection(collection_name=namespace, vectors_config=vectors_config)

            # Recheck that the collection is really there after creation.
            created_names = {c.name for c in client.get_collections().collections}
            if namespace in created_names:
                print(f"Namespace '{namespace}' successfully created.")
                return
            print(f"Failed to create namespace '{namespace}' on attempt {attempt + 1}.")
        except Exception as e:
            print(f"Failed to create namespace '{namespace}' on attempt {attempt + 1}: {e}")

    print(
        f"Failed to ensure vector database exists for namespace {namespace}: "
        f"Failed to ensure or create namespace '{namespace}' after {max_attempts} attempts."
    )
dockerfile: Dockerfile - container_name: celery + container_name: oc_celery volumes: - ./website_data_sources:/app/website_data_sources # - ./llama-2-7b-chat.ggmlv3.q4_K_M.bin:/app/llama-2-7b-chat.ggmlv3.q4_K_M.bin:ro @@ -80,7 +82,7 @@ services: redis: image: redis:latest - container_name: redis_cache + container_name: oc_redis_cache ports: - "6379:6379" volumes: diff --git a/dj_backend_server/example.env b/dj_backend_server/example.env new file mode 100644 index 00000000..2161dd7b --- /dev/null +++ b/dj_backend_server/example.env @@ -0,0 +1,75 @@ +########################################################## + +# Edit values for your site. +# your app secret key +SECRET_KEY='ADD-YOUR-CUSTOM-KEY-HERE' +# For openai +OPENAI_API_KEY=YOURKEY +# add IP what you allow like superadmin +ALLOWED_HOSTS=localhost,0.0.0.0 +# Use * only in dev environment +#ALLOWED_HOSTS=* +# Your SITE URL +APP_URL='https://YOUR-URL-HERE' + +########################################################## + +# "azure" | "openai" | llama2 +OPENAI_API_TYPE=openai +OPENAI_API_MODEL=gpt-4-1106-preview +OPENAI_API_TEMPERATURE=1 + +# If using azure +# AZURE_OPENAI_API_BASE= +# AZURE_OPENAI_API_KEY= +# AZURE_OPENAI_API_VERSION=2023-03-15-preview +# AZURE_OPENAI_EMBEDDING_MODEL_NAME= +# AZURE_OPENAI_DEPLOYMENT_NAME= +# AZURE_OPENAI_COMPLETION_MODEL=gpt-35-turbo + +# "azure" | "openai" | llama2 +EMBEDDING_PROVIDER=openai + +# Vector Store, PINECONE|QDRANT +STORE=QDRANT + + +# if using pinecone +# PINECONE_API_KEY= +# PINECONE_ENV= +# VECTOR_STORE_INDEX_NAME= + + +# if using qdrant +QDRANT_URL=http://qdrant:6333 + + +# optional, defaults to 15 +MAX_PAGES_CRAWL=150 + +# --- these will change if you decide to start testing the software +CELERY_BROKER_URL=redis://redis:6379/ +CELERY_RESULT_BACKEND=redis://redis:6379/ +DATABASE_NAME=openchat +DATABASE_USER=dbuser +DATABASE_PASSWORD=dbpass +DATABASE_HOST=mysql +DATABASE_PORT=3306 + +# use 'external' if you want to use below services. 
+PDF_LIBRARY = 'external' + +#PDF API - OCRWebService.com (REST API). https://www.ocrwebservice.com/api/restguide +#Extract text from scanned images and PDF documents and convert into editable formats. +#Please create new account with ocrwebservice.com via http://www.ocrwebservice.com/account/signup and get license code +OCR_LICCODE = 'LICENSE-CODE' +OCR_USERNAME = 'USERNAME' +OCR_LANGUAGE = 'english' +# Advantage to clean up the OCR text which can be messy and full with garbage, but will generate a cost with LLM if is paid. Use carefully. +# Use 1 to enable, 0 to disable. +OCR_LLM = '1' + +# retrieval_qa | conversation_retrieval, retrieval_qa works better with azure openai +# if you want to use the conversation_retrieval | retrieval_qa chain +CHAIN_TYPE=conversation_retrieval + diff --git a/dj_backend_server/requirements.txt b/dj_backend_server/requirements.txt index 4a6e2cef..9c686faa 100644 --- a/dj_backend_server/requirements.txt +++ b/dj_backend_server/requirements.txt @@ -19,8 +19,7 @@ click-repl==0.3.0 cryptography==41.0.3 dataclasses-json==0.5.14 Django==4.2.3 -django-rest-swagger -djangorestframework +django-rest-swagger==2.2.0 dnspython==2.4.1 drf-spectacular==0.27.1 drf_spectacular.extensions==0.0.2 @@ -31,8 +30,8 @@ grpcio-tools==1.56.2 h11==0.14.0 h2==4.1.0 hpack==4.0.0 -httpcore1.0.2 -httpx=0.25.2 +httpcore==1.0.2 +httpx==0.25.2 hyperframe==6.0.1 idna==3.6 kombu==5.3.1 diff --git a/dj_backend_server/web/templates/onboarding/other-data-sources-website.html b/dj_backend_server/web/templates/onboarding/other-data-sources-website.html index 4a6c4875..f2d01891 100644 --- a/dj_backend_server/web/templates/onboarding/other-data-sources-website.html +++ b/dj_backend_server/web/templates/onboarding/other-data-sources-website.html @@ -79,20 +79,16 @@

{% trans 'Website information
-
{% trans "Just to make sure we are on - the same page" %}
+
{% trans "Just to make sure we are on the same page" %}
- {% trans "Sometimes, we might face challenges when trying to crawl certain websites, - especially the ones built using JavaScript (Single-Page Applications). However, we're - currently working on adding headless browsing to our system so that we can support all - kinds of websites." %} + {% trans "Sometimes, we might face challenges when trying to crawl certain websites, especially the ones built using JavaScript (Single-Page Applications). However, we're currently working on adding headless browsing to our system so that we can support all kinds of websites." %}
<- {% trans "Back" %} - +
diff --git a/dj_backend_server/web/templates/onboarding/step-0.html b/dj_backend_server/web/templates/onboarding/step-0.html index b3775875..db34b838 100644 --- a/dj_backend_server/web/templates/onboarding/step-0.html +++ b/dj_backend_server/web/templates/onboarding/step-0.html @@ -56,8 +56,7 @@

{% trans "Let's set up your f

{% trans "You provide the system with data" %}

-
{% trans "It could be a website, pdf files, and soon you will have the option to - integrate with many more" %}
+
{% trans "It could be a website, pdf files, and soon you will have the option to integrate with many more" %}
  • diff --git a/dj_backend_server/web/templates/onboarding/step-2.html b/dj_backend_server/web/templates/onboarding/step-2.html index 580998c7..2cf4804e 100644 --- a/dj_backend_server/web/templates/onboarding/step-2.html +++ b/dj_backend_server/web/templates/onboarding/step-2.html @@ -78,7 +78,7 @@

    {% trans "Website information
    -
    {% trans "Just to make sure we are on the same page" %} 🫶 +
    {% trans "Just to make sure we are on the same page" %}
    {% trans "We might not be able to crawl some websites, especially websites that are built using JS (SPA), we are working on adding headless browsing to support all sorts of websites." %} diff --git a/dj_backend_server/web/workers/crawler.py b/dj_backend_server/web/workers/crawler.py index 35779d91..18e4437b 100644 --- a/dj_backend_server/web/workers/crawler.py +++ b/dj_backend_server/web/workers/crawler.py @@ -5,6 +5,7 @@ from web.signals.website_data_source_crawling_was_completed import website_data_source_crawling_completed from web.models.crawled_pages import CrawledPages from web.models.website_data_sources import WebsiteDataSource +from api.utils.init_vector_store import ensure_vector_database_exists from django.core.files.storage import default_storage from django.core.files.base import ContentFile from django.utils.text import slugify @@ -35,6 +36,10 @@ def start_recursive_crawler(data_source_id, chatbot_id): Exception: If any error occurs during the crawling process, the function will catch the exception, set the crawling status to "failed", and re-raise the exception. """ + # Ensure vector database exists before starting the crawl + + ensure_vector_database_exists(str(chatbot_id)) + # print("Starting recursive crawler") data_source = WebsiteDataSource.objects.get(pk=data_source_id) root_url = data_source.root_url @@ -323,4 +328,4 @@ def crawl(data_source_id, url, crawled_urls, max_pages, chatbot_id): except Exception as e: # Handle other exceptions (e.g., invalid HTML, network issues) and continue crawling logging.exception(f"An unexpected error occurred while crawling URL: {url}") - pass \ No newline at end of file + pass