diff --git a/dj_backend_server/CHANGELOG.MD b/dj_backend_server/CHANGELOG.MD new file mode 100644 index 00000000..15b70c6c --- /dev/null +++ b/dj_backend_server/CHANGELOG.MD @@ -0,0 +1,6 @@ +2.14.2024 +- Added example.env to streamline environment setup. +- Implemented translation fixes to enhance application localization. +- Updated docker-compose.yaml to prefix each container with oc_ for better namespace management. +- Performed fixes in requirements.txt for improved dependency resolution. +- Ensured existence of Vector Database (QDrant) prior to web crawling operations to address issues encountered with large websites, ensuring Vector Database creation and availability. \ No newline at end of file diff --git a/dj_backend_server/api/data_sources/pdf_handler.py b/dj_backend_server/api/data_sources/pdf_handler.py index 6890fa8d..9bf20312 100644 --- a/dj_backend_server/api/data_sources/pdf_handler.py +++ b/dj_backend_server/api/data_sources/pdf_handler.py @@ -222,7 +222,7 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool docs = text_splitter.split_documents(raw_docs) - print("external files docs -->", docs); + # print("external files docs -->", docs); if not docs: print("No documents were processed successfully.") @@ -230,7 +230,9 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool embeddings = get_embeddings() + print(f"Initializing vector store for namespace: {namespace} with {len(docs)} documents.") init_vector_store(docs, embeddings, StoreOptions(namespace=namespace)) + print(f"Vector store initialized successfully for namespace: {namespace}.") print(f'Folder need or not to delete. 
def ensure_vector_database_exists(
    namespace: str,
    *,
    vector_size: int = 1536,
    max_attempts: int = 3,
) -> None:
    """Make sure the vector-store collection for ``namespace`` exists.

    Only the Qdrant backend is handled; for any other ``STORE`` value the
    function returns without doing anything. The collection is created
    with cosine distance when it is missing, and its presence is verified
    afterwards, retrying up to ``max_attempts`` times.

    This is a best-effort helper: any failure while talking to Qdrant is
    printed and NOT propagated, so the function always returns ``None``.

    Args:
        namespace: Name of the collection to look up / create.
        vector_size: Dimensionality used when the collection has to be
            created. Defaults to the previously hard-coded 1536; it must
            match the embedding model in use.
        max_attempts: How many check/create rounds to try before giving up.

    Raises:
        KeyError: If the ``STORE`` environment variable is unset or is not
            a ``StoreType`` member name (the lookup happens before the
            best-effort ``try`` block).
    """
    store_type = StoreType[os.environ['STORE']]
    # NOTE(review): the failure `raise` below is assumed to belong to the
    # Qdrant branch, so other store types are a silent no-op - confirm.
    if store_type != StoreType.QDRANT:
        return

    def _collection_names(qdrant_client) -> set:
        # get_collections() yields description objects, not strings, so the
        # names must be extracted before comparing against `namespace`.
        return {
            collection.name
            for collection in qdrant_client.get_collections().collections
        }

    try:
        client = QdrantClient(url=os.environ['QDRANT_URL'])
        for attempt in range(max_attempts):
            if namespace in _collection_names(client):
                print(f"Namespace '{namespace}' exists.")
                return

            print(f"Namespace '{namespace}' does not exist. Attempting to create.")
            vectors_config = models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE,
            )
            client.create_collection(collection_name=namespace, vectors_config=vectors_config)

            # Recheck that the collection is really visible before
            # reporting success.
            if namespace in _collection_names(client):
                print(f"Namespace '{namespace}' successfully created.")
                return
            print(f"Failed to create namespace '{namespace}' on attempt {attempt + 1}.")

        # Deliberately routed through the handler below so the reported
        # message stays the same as before; it never leaves this function.
        raise Exception(
            f"Failed to ensure or create namespace '{namespace}' after {max_attempts} attempts."
        )
    except Exception as e:
        print(f"Failed to ensure vector database exists for namespace {namespace}: {e}")
dockerfile: Dockerfile - container_name: celery + container_name: oc_celery volumes: - ./website_data_sources:/app/website_data_sources # - ./llama-2-7b-chat.ggmlv3.q4_K_M.bin:/app/llama-2-7b-chat.ggmlv3.q4_K_M.bin:ro @@ -80,7 +82,7 @@ services: redis: image: redis:latest - container_name: redis_cache + container_name: oc_redis_cache ports: - "6379:6379" volumes: diff --git a/dj_backend_server/example.env b/dj_backend_server/example.env new file mode 100644 index 00000000..2161dd7b --- /dev/null +++ b/dj_backend_server/example.env @@ -0,0 +1,75 @@ +########################################################## + +# Edit values for your site. +# your app secret key +SECRET_KEY='ADD-YOUR-CUSTOM-KEY-HERE' +# For openai +OPENAI_API_KEY=YOURKEY +# add IP what you allow like superadmin +ALLOWED_HOSTS=localhost,0.0.0.0 +# Use * only in dev environment +#ALLOWED_HOSTS=* +# Your SITE URL +APP_URL='https://YOUR-URL-HERE' + +########################################################## + +# "azure" | "openai" | llama2 +OPENAI_API_TYPE=openai +OPENAI_API_MODEL=gpt-4-1106-preview +OPENAI_API_TEMPERATURE=1 + +# If using azure +# AZURE_OPENAI_API_BASE= +# AZURE_OPENAI_API_KEY= +# AZURE_OPENAI_API_VERSION=2023-03-15-preview +# AZURE_OPENAI_EMBEDDING_MODEL_NAME= +# AZURE_OPENAI_DEPLOYMENT_NAME= +# AZURE_OPENAI_COMPLETION_MODEL=gpt-35-turbo + +# "azure" | "openai" | llama2 +EMBEDDING_PROVIDER=openai + +# Vector Store, PINECONE|QDRANT +STORE=QDRANT + + +# if using pinecone +# PINECONE_API_KEY= +# PINECONE_ENV= +# VECTOR_STORE_INDEX_NAME= + + +# if using qdrant +QDRANT_URL=http://qdrant:6333 + + +# optional, defaults to 15 +MAX_PAGES_CRAWL=150 + +# --- these will change if you decide to start testing the software +CELERY_BROKER_URL=redis://redis:6379/ +CELERY_RESULT_BACKEND=redis://redis:6379/ +DATABASE_NAME=openchat +DATABASE_USER=dbuser +DATABASE_PASSWORD=dbpass +DATABASE_HOST=mysql +DATABASE_PORT=3306 + +# use 'external' if you want to use below services. 
+PDF_LIBRARY = 'external' + +#PDF API - OCRWebService.com (REST API). https://www.ocrwebservice.com/api/restguide +#Extract text from scanned images and PDF documents and convert into editable formats. +#Please create new account with ocrwebservice.com via http://www.ocrwebservice.com/account/signup and get license code +OCR_LICCODE = 'LICENSE-CODE' +OCR_USERNAME = 'USERNAME' +OCR_LANGUAGE = 'english' +# Advantage to clean up the OCR text which can be messy and full with garbage, but will generate a cost with LLM if is paid. Use carefully. +# Use 1 to enable, 0 to disable. +OCR_LLM = '1' + +# retrieval_qa | conversation_retrieval, retrieval_qa works better with azure openai +# if you want to use the conversation_retrieval | retrieval_qa chain +CHAIN_TYPE=conversation_retrieval + diff --git a/dj_backend_server/requirements.txt b/dj_backend_server/requirements.txt index 4a6e2cef..9c686faa 100644 --- a/dj_backend_server/requirements.txt +++ b/dj_backend_server/requirements.txt @@ -19,8 +19,7 @@ click-repl==0.3.0 cryptography==41.0.3 dataclasses-json==0.5.14 Django==4.2.3 -django-rest-swagger -djangorestframework +django-rest-swagger==2.2.0 dnspython==2.4.1 drf-spectacular==0.27.1 drf_spectacular.extensions==0.0.2 @@ -31,8 +30,8 @@ grpcio-tools==1.56.2 h11==0.14.0 h2==4.1.0 hpack==4.0.0 -httpcore1.0.2 -httpx=0.25.2 +httpcore==1.0.2 +httpx==0.25.2 hyperframe==6.0.1 idna==3.6 kombu==5.3.1 diff --git a/dj_backend_server/web/templates/onboarding/other-data-sources-website.html b/dj_backend_server/web/templates/onboarding/other-data-sources-website.html index 4a6c4875..f2d01891 100644 --- a/dj_backend_server/web/templates/onboarding/other-data-sources-website.html +++ b/dj_backend_server/web/templates/onboarding/other-data-sources-website.html @@ -79,20 +79,16 @@