diff --git a/.gitignore b/.gitignore index 2a1061ec..a7513790 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ dj_backend_server.code-workspace .aiderignore dj_backend_server/.vscode/settings.json +dj_backend_server/a.py +dj_backend_server/1.pdf diff --git a/dj_backend_server/.vscode/settings.json b/dj_backend_server/.vscode/settings.json index d99f2f30..a0038c4a 100644 --- a/dj_backend_server/.vscode/settings.json +++ b/dj_backend_server/.vscode/settings.json @@ -2,5 +2,6 @@ "[python]": { "editor.defaultFormatter": "ms-python.black-formatter" }, - "python.formatting.provider": "none" + "python.formatting.provider": "black", + "editor.formatOnSave": false } \ No newline at end of file diff --git a/dj_backend_server/CHANGELOG.MD b/dj_backend_server/CHANGELOG.MD index 035f04b9..06ae6754 100644 --- a/dj_backend_server/CHANGELOG.MD +++ b/dj_backend_server/CHANGELOG.MD @@ -1,3 +1,6 @@ +2.22.2024 +- We've recently enhanced our chat interface to display metadata values, such as the data source and web links, from our vector database when available. However, it's important to note that, as of now, there is no option to toggle this feature on or off. This means that whenever this metadata is available for newer database entries, it will be automatically displayed. Please be aware that older database records might not include this information due to the feature's recent implementation. + 2.20.2024 - Implemented functionality to delete a chatbot namespace from the vector database, along with all records associated with that chatbot, upon chatbot deletion. - The Directory Data Loader must be updated to include filename metadata to enable filtering. PR#138 diff --git a/dj_backend_server/api/data_sources/pdf_handler.py b/dj_backend_server/api/data_sources/pdf_handler.py index 119e740b..4bb5ef3f 100644 --- a/dj_backend_server/api/data_sources/pdf_handler.py +++ b/dj_backend_server/api/data_sources/pdf_handler.py @@ -126,26 +126,31 @@ def pdf_handler( @csrf_exempt def process_pdf_with_pypdfium(file_path, directory_path): - pdf = PdfDocument(file_path) - text_pages = [] + pdf_document = PdfDocument(file_path) + text_pages_with_numbers = [] - for page_index in range(len(pdf)): - page = pdf.get_page(page_index) + for page_index in range(len(pdf_document)): + page = pdf_document.get_page(page_index) text_page = page.get_textpage() # get a text page handle for this page text = text_page.get_text_range() # extract text from the text page - text_pages.append(text) + text_pages_with_numbers.append( + (page_index + 1, text) + ) # Store page number and text text_page.close() # close the text page handle - text = "".join(text_pages) + # Combine texts from all pages, prepending each with its page number + combined_text = "\n".join( + [f"Page {num}: {text}" for num, text in text_pages_with_numbers] + ) txt_file_path = os.path.splitext(file_path)[0] + ".txt" logging.debug( - f"Debug: Writing text to {txt_file_path}, directory_path: {directory_path}, text: {text}" + f"Debug: Writing text with page numbers to {txt_file_path}, directory_path: {directory_path}" ) with open(txt_file_path, "w") as f: - f.write(text) + f.write(combined_text) - pdf.close() + pdf_document.close() @csrf_exempt @@ -159,6 +164,7 @@ def process_pdf(FilePath, directory_path): resturl = "http://www.ocrwebservice.com/restservices/processDocument" RequestUrl = f"{resturl}?pagerange={pagerange}&language={language}&outputformat={outputformat}&gettext={gettext}" + logging.debug(f"Debug: RequestUrl: {RequestUrl}") try: with open(FilePath, "rb") as image_file: @@ -216,7 +222,7 @@ def process_pdf(FilePath, directory_path): f"\nThe text: {{text}}. " ) - # print (f"Debug: initial_prompt: {initial_prompt}") + logging.debug(f"Debug: initial_prompt: {initial_prompt}") # Call LLM and write the result into a new text file process_text_with_llm(txt_file, mode, initial_prompt) @@ -291,13 +297,8 @@ def txt_to_vectordb( ) docs = text_splitter.split_documents(raw_docs) - logging.debug("external files docs -->", docs) - if not docs: - print("No documents were processed successfully.") - return - embeddings = get_embeddings() logging.debug( @@ -311,6 +312,11 @@ def txt_to_vectordb( "bot_id": str(pdf_data_source.chatbot.id), "last_update": pdf_data_source.updated_at.strftime("%Y-%m-%d %H:%M:%S"), "type": "document", + "doc_type": ( + pdf_data_source.files_info[0]["original_name"].split(".")[-1] + if pdf_data_source.files_info + else "unknown" + ), "page": "1", # @TODO to extract the page number. "folder": pdf_data_source.folder_name, "original_filename": ( @@ -321,7 +327,7 @@ def txt_to_vectordb( }, ) logging.debug( - f"Vector store initialized successfully for namespace: {namespace}." + f"Vector store initialized successfully for metadata: {metadata}." ) logging.debug(f"Folder need or not to delete. {delete_folder_flag}") diff --git a/dj_backend_server/api/utils/make_chain.py b/dj_backend_server/api/utils/make_chain.py index 34fd940b..17838592 100644 --- a/dj_backend_server/api/utils/make_chain.py +++ b/dj_backend_server/api/utils/make_chain.py @@ -145,14 +145,11 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str): # Send the formatted prompt to LLM and get the result llm = get_llm() - result = llm(prompt=initial_prompt.format(text=text), temperature=0) - - # Check if result is a string - if isinstance(result, str): - response = result - elif isinstance(result, dict): - # Extract only the response from the result - response = result["choices"][0]["message"]["content"] + result = llm.invoke(input=initial_prompt.format(text=text), temperature=0) + + # Extract the response from the result + if hasattr(result, "content"): + response = result.content else: print( f"Error: LLM result is not a dictionary or a string. It is a {type(result)} with value {result}" @@ -166,6 +163,7 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str): print(f"Write with value {txt_file_path}") else: # Write the response into a new text file + result_file_path = txt_file_path.replace(".txt", "_processed.txt") result_file_path = txt_file_path.replace(".txt", ".txt") with open(result_file_path, "w") as result_file: result_file.write(response) diff --git a/dj_backend_server/api/views/views_chat.py b/dj_backend_server/api/views/views_chat.py index 00a87214..579cc2ac 100644 --- a/dj_backend_server/api/views/views_chat.py +++ b/dj_backend_server/api/views/views_chat.py @@ -43,7 +43,6 @@ def chat(request): """ try: - logger.debug("Received chat request from view_messages.py - /api/chat/") body = json.loads(request.body.decode("utf-8")) question = body.get("question") namespace = body.get("namespace") @@ -51,25 +50,24 @@ def chat(request): initial_prompt = body.get("initial_prompt") token = body.get("token") session_id = body.get("session_id") + metadata = body.get("metadata", {}) logger.debug(f"Request body parsed: {body}") - logger.debug(f"Question: {question}") bot = get_object_or_404(Chatbot, token=token) - logger.debug(f"Chatbot found: {bot.name}") if not question: return JsonResponse({"error": "No question in the request"}, status=400) sanitized_question = question.strip().replace("\n", " ") - logger.debug(f"Sanitized question: {sanitized_question}") vector_store = get_vector_store(StoreOptions(namespace=namespace)) - logger.debug(f"Vector store obtained") - response_text = get_completion_response( + + response_text, metadata = get_completion_response( vector_store=vector_store, initial_prompt=initial_prompt, mode=mode, sanitized_question=sanitized_question, session_id=session_id, + metadata=metadata, ) - logger.debug(f"Response text: {response_text}") + if isinstance(response_text, dict) and "text" in response_text: ChatHistory.objects.bulk_create( [ @@ -90,9 +88,9 @@ def chat(request): ] ) logger.debug( - f"Response after creating ChatHistory: {json.dumps(response_text, indent=2)}" + f"Response after creating ChatHistory: {json.dumps(response_text, indent=2)}, metadata: {metadata}" ) - return JsonResponse({"text": response_text}) + return JsonResponse({"text": response_text, "metadata": metadata}) elif isinstance(response_text, str): ChatHistory.objects.bulk_create( @@ -114,9 +112,9 @@ def chat(request): ] ) logger.debug( - f"Response after creating ChatHistory 2: {json.dumps(response_text, indent=2)}" + f"Response after creating ChatHistory 2: {json.dumps(response_text, indent=2)}, metadata: {metadata}" ) - return JsonResponse({"text": response_text}) + return JsonResponse({"text": response_text, "metadata": metadata}) else: return JsonResponse({"error": "Unexpected response from API"}, status=500) @@ -132,7 +130,7 @@ def chat(request): def get_completion_response( - vector_store, mode, initial_prompt, sanitized_question, session_id + vector_store, mode, initial_prompt, sanitized_question, session_id, metadata ): """ This function generates a response based on a given question. It uses either the 'retrieval_qa' or 'conversation_retrieval' @@ -151,15 +149,18 @@ def get_completion_response( is a string, it is returned after removing markdown code block formatting. """ - logger.debug(f"Entering get_completion_response function") - logger.debug( - f"Mode: {mode}, Initial Prompt: {initial_prompt}, Sanitized Question: {sanitized_question}, Session ID: {session_id}" - ) + # logger.debug(f"Entering get_completion_response function") + # logger.debug( + # f"Mode: {mode}, Initial Prompt: {initial_prompt}, Sanitized Question: {sanitized_question}, Session ID: {session_id}" + # ) chain_type = os.getenv("CHAIN_TYPE", "conversation_retrieval") chain: QAWithSourcesChain if chain_type == "retrieval_qa": chain = getRetrievalQAWithSourcesChain(vector_store, mode, initial_prompt) - response = chain({"question": sanitized_question}, return_only_outputs=True) + response = chain.invoke( + {"question": sanitized_question, "metadata": metadata}, + return_only_outputs=True, + ) response_text = response["answer"] logger.debug(f"RetrievalQA response: {response_text}") elif chain_type == "conversation_retrieval": @@ -171,13 +172,33 @@ def get_completion_response( logger.debug(f"Formatted Chat_history {chat_history}") response = chain.invoke( - {"question": sanitized_question, "chat_history": chat_history}, + { + "question": sanitized_question, + "chat_history": chat_history, + "metadata": metadata, + }, ) - response_text = response.get("answer") + # Assuming 'response' is the JSON object you've provided + source_documents = response["source_documents"] + + # Initialize an empty list to hold metadata from all documents + all_metadata = [] + + # Iterate through each document in the source documents + for document in source_documents: + # Correctly access the metadata attribute or method of the Document object + # Assuming the Document object has a 'metadata' attribute + metadata = document.metadata + + # Add the metadata dictionary to the list + all_metadata.append(metadata) + + response_text = response.get("answer", "") + try: # Attempt to parse the response_text as JSON response_text = json.loads(response_text) - logger.debug(f"Response text after JSON parsing: {response_text}") + except json.JSONDecodeError: # If response_text is not a JSON string, leave it as is pass @@ -194,4 +215,5 @@ def get_completion_response( response_text.replace("```", "").replace("markdown\n", "").strip() ) logger.debug(f"Response text after markdown removal: {response_text}") - return response_text + # print(f"metadata {metadata}") + return response_text, all_metadata diff --git a/dj_backend_server/api/views/views_message.py b/dj_backend_server/api/views/views_message.py index 06e74b43..d1c497f1 100644 --- a/dj_backend_server/api/views/views_message.py +++ b/dj_backend_server/api/views/views_message.py @@ -182,13 +182,9 @@ def send_chat(request): ) # {'from': 'user', 'type': 'text', 'content': 'input text from chat'} # Validate the request data content = data.get("content") - history = data.get("history") - logger.debug(f"Content: {content}") - logger.debug( - f"History: {history}" - ) # history is a list of chat history - None???? - content_type = data.get("type") - metadata = data.get("metadata") or {} + # history = data.get("history") + # logger.debug(f"Content: {content}") + # logger.debug(f"History: {history}") session_id = get_session_id(request=request, bot_id=bot.id) history = ChatHistory.objects.filter(session_id=session_id) @@ -219,11 +215,9 @@ def send_chat(request): "history": history_entries, "token": bot_token, "session_id": session_id, - "metadata": metadata, }, timeout=200, ) - logger.debug(f"External API response: {response.text} and {response}") """ This block will first check if the response content is not empty. If it is empty, @@ -242,7 +236,7 @@ def send_chat(request): else: try: response_json = response.json() - logger.debug(f"Response JSON: {response_json}") + logger.debug(f"External API response 2") except json.JSONDecodeError: logger.error("JSONDecodeError occurred") return JsonResponse( @@ -255,18 +249,21 @@ def send_chat(request): ) bot_response = ChatbotResponse(response.json()) - # context = {'APP_URL': settings.APP_URL, session_id: session_id} + feedback_form_html = render_to_string( "widgets/feedback.html", {"APP_URL": settings.APP_URL, "session_id": session_id}, ) - print(f"Response in JSON {session_id}") + + html_compose = ( + metadata_html_append(response_json, session_id) + feedback_form_html + ) return JsonResponse( { "type": "text", "response": { "text": bot_response.get_bot_reply(), - "html": feedback_form_html, + "html": html_compose, "session_id": session_id, }, } @@ -313,3 +310,44 @@ def handle_feedback(request): return JsonResponse({"error": "Chat history not found"}, status=404) except Exception as e: return JsonResponse({"error": "An error occurred"}, status=500) + + +def metadata_html_append(response_json, session_id): + # Example logic to determine type based on response_json + # This is a placeholder. Adjust according to your actual logic. + type = "document" # or "website", determined dynamically + seen_filenames = set() + metadata_items = [] + + for metadata_entry in response_json.get("metadata", []): + type = metadata_entry.get("type") + if type == "document": + # if the original_filename is the same in for, then show it only one time. + for entry in response_json.get("metadata", []): + original_filename = entry.get("original_filename") + if original_filename not in seen_filenames: + metadata_items.append( + { + "source": entry.get("source"), + "original_filename": original_filename, + } + ) + seen_filenames.add(original_filename) + + if type == "website": + # if the link is the same in for, then show it only one time. + for entry in response_json.get("metadata", []): + link = entry.get("link") + if link not in seen_filenames: + metadata_items.append({"source": entry.get("source"), "link": link}) + seen_filenames.add(link) + + return render_to_string( + "widgets/metadata.html", + { + "APP_URL": settings.APP_URL, + "session_id": session_id, + "metadata_items": metadata_items, + "type": type, + }, + ) diff --git a/dj_backend_server/web/services/chat_history_service.py b/dj_backend_server/web/services/chat_history_service.py index 6e1e011c..6e0643e4 100644 --- a/dj_backend_server/web/services/chat_history_service.py +++ b/dj_backend_server/web/services/chat_history_service.py @@ -54,6 +54,6 @@ def get_chat_history_for_retrieval_chain( memory.save_context({"input": user_query}, {"output": entry.message}) user_query = None - logger.debug(f"Memory PRINT: {memory}") + # logger.debug(f"Memory PRINT: {memory}") # chat_history = memory.load_memory_variables({}) return chat_history diff --git a/dj_backend_server/web/templates/widgets/metadata.html b/dj_backend_server/web/templates/widgets/metadata.html new file mode 100644 index 00000000..3473cb38 --- /dev/null +++ b/dj_backend_server/web/templates/widgets/metadata.html @@ -0,0 +1,21 @@ +{% load i18n %} +{% load static %} + +