From 21171c89456a6554560c8025c92f169a302775fc Mon Sep 17 00:00:00 2001 From: YoshiRyo2 <128435034+YoshiRyo2@users.noreply.github.com> Date: Fri, 6 Feb 2026 17:50:52 -0500 Subject: [PATCH 1/3] Vector Database/Similarity Search --- backend/LLM/chatbot.ipynb | 248 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 backend/LLM/chatbot.ipynb diff --git a/backend/LLM/chatbot.ipynb b/backend/LLM/chatbot.ipynb new file mode 100644 index 0000000..31ef9f0 --- /dev/null +++ b/backend/LLM/chatbot.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"LANGSMITH_TRACING\"] = \"true\"\n", + "os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass()\n", + "\n", + "examsFolder = \"../data/cloudpractitioner\"\n", + "exam1 = \".backend/data/cloudpractitioner/exam_1.json\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 991 questions from the folder.\n" + ] + } + ], + "source": [ + "from langchain_community.document_loaders import DirectoryLoader, JSONLoader\n", + "\n", + "\n", + "loader_kwargs = {\n", + " \"jq_schema\": \".[]\", #iterate over question objects\n", + " \"text_content\": False\n", + "}\n", + "\n", + "\n", + "loader = DirectoryLoader(\n", + " path=\"../data/cloudpractitioner/\", \n", + " glob=\"**/*.json\", #ensures json\n", + " loader_cls=JSONLoader,\n", + " loader_kwargs=loader_kwargs\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "print(f\"Loaded {len(docs)} questions from the folder.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "sanitized_docs = []\n", + "\n", + "for doc in docs:\n", + " # 1. 
Parse the string inside the 'text' field back into a dictionary\n", + " raw_data = json.loads(doc.page_content)\n", + " \n", + " # 2. Map the actual values to the metadata keys\n", + " doc.metadata = {\n", + " \"choices\": raw_data.get(\"choices\"),\n", + " \"category\": raw_data.get(\"category\"),\n", + " \"answer\": raw_data.get(\"answer\"),\n", + " \"difficulty\": raw_data.get(\"difficulty\"),\n", + " }\n", + " \n", + " # 3. Use the actual question text as the page_content for the vector search\n", + " doc.page_content = raw_data.get(\"question\")\n", + " \n", + " sanitized_docs.append(doc)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What time-savings advantage is offered with the use of Amazon Rekognition?\n", + "{'choices': ['A. Amazon Rekognition provides automatic watermarking of images.', 'B. Amazon Rekognition provides automatic detection of objects appearing in pictures.', 'C. Amazon Rekognition provides the ability to resize millions of images automatically.', 'D. 
Amazon Rekognition uses Amazon Mechanical Turk to allow humans to bid on object detection jobs.'], 'category': 'Machine Learning', 'answer': 'B', 'difficulty': 2}\n", + "variable docs length 991\n" + ] + } + ], + "source": [ + "#text split\n", + "print(docs[0].page_content)\n", + "print(docs[0].metadata)\n", + "print(\"variable docs length \" + str(len(docs))) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "if not os.environ.get(\"OPENAI_API_KEY\"):\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter API key for OpenAI: \")\n", + "\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated vector of length 1536\n", + "\n", + "[0.0374404639005661, 0.0011903299018740654, 0.05431877076625824, 0.004493457730859518, 0.006717613898217678, 0.017432088032364845, -0.01557812187820673, 0.04613243043422699, -0.010257001966238022, 0.01922585815191269]\n" + ] + } + ], + "source": [ + "#embedding test\n", + "vector_1 = embeddings.embed_query(docs[0].page_content)\n", + "\n", + "print(f\"Generated vector of length {len(vector_1)}\\n\")\n", + "print(vector_1[:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores.upstash import UpstashVectorStore\n", + "import os\n", + "\n", + "os.environ[\"UPSTASH_VECTOR_REST_URL\"] = \"https://loving-kingfish-56853-us1-vector.upstash.io\"\n", + "os.environ[\"UPSTASH_VECTOR_REST_TOKEN\"] = \"\"\n", + "\n", + "store = UpstashVectorStore(\n", + " embedding=embeddings\n", + ")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding documents Here" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "#ids = store.add_documents(documents=sanitized_docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Similarity search" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4\n", + "page_content='AWS allows users to manage their resources using a web based user interface. What is the name of this interface?' metadata={'choices': ['A. AWS CLI.', 'B. AWS API.', 'C. AWS SDK.', 'D. AWS Management Console.'], 'category': 'Cloud Concepts', 'answer': 'D', 'difficulty': 2}\n", + "page_content='What is the AWS tool that enables you to use scripts to manage all AWS services and resources?' metadata={'choices': ['A. AWS Console.', 'B. AWS Service Catalog.', 'C. AWS OpsWorks.', 'D. AWS CLI.'], 'category': 'Cloud Concepts', 'answer': 'D', 'difficulty': 2}\n", + "page_content='AWS CloudFormation is designed to help the user:' metadata={'choices': ['A. model and provision resources.', 'B. update application code.', 'C. set up data lakes.', 'D. create reports for billing.'], 'category': 'Management & Governance', 'answer': 'A', 'difficulty': 2}\n", + "page_content='Which of the following services allows customers to manage their agreements with AWS?' metadata={'choices': ['A. AWS Artifact.', 'B. AWS Certificate Manager.', 'C. AWS Systems Manager.', 'D. 
AWS Organizations.'], 'category': 'Cloud Concepts', 'answer': 'A', 'difficulty': 2}\n" + ] + } + ], + "source": [ + "#find similar questions \n", + "results = store.similarity_search(\n", + " \"AWS allows users to manage their resources using a web based user interface.\"\n", + ")\n", + "\n", + "print(len(results))\n", + "for result in results:\n", + " print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 78e295e3bc32071b58b2e5b6d02b92b72eed3010 Mon Sep 17 00:00:00 2001 From: YoshiRyo2 <128435034+YoshiRyo2@users.noreply.github.com> Date: Tue, 10 Feb 2026 17:40:45 -0500 Subject: [PATCH 2/3] update example_controller.py to add function update_Database --- src/server/controllers/example_controller.py | 53 ++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/server/controllers/example_controller.py b/src/server/controllers/example_controller.py index ab485b4..66474bb 100644 --- a/src/server/controllers/example_controller.py +++ b/src/server/controllers/example_controller.py @@ -1,6 +1,7 @@ from jwt_auth.db import safe_query #syntax: result = safe_query(query,(param1,param2...),fetch="one/all",insert=True/False) import boto3 as aws from dotenv import load_dotenv +import os class AppError(Exception): def __init__(self, message: str, status_code: int = 400): @@ -17,5 +18,57 @@ def sample_function(userid:str,param:type)->type: return ... 
import getpass
import hashlib
import json

from langchain_community.document_loaders import DirectoryLoader, JSONLoader
from langchain_community.vectorstores.upstash import UpstashVectorStore
from langchain_openai import OpenAIEmbeddings

# Fallback Upstash index URL, used only when UPSTASH_VECTOR_REST_URL is not
# already configured in the environment (e.g. through a .env file).
_DEFAULT_UPSTASH_URL = "https://loving-kingfish-56853-us1-vector.upstash.io"

# JSONLoader settings shared by the single-file and directory code paths:
# ".[]" yields one document per element of the top-level JSON array, and
# text_content=False lets each element be a JSON object instead of plain text.
_QUESTION_LOADER_KWARGS = {
    "jq_schema": ".[]",
    "text_content": False,
}


def _ensure_secret(env_name: str, prompt: str) -> None:
    """Prompt for *env_name* and store it in os.environ when it is unset or empty."""
    # NOTE(review): getpass blocks on stdin; when this runs inside a server
    # process the variable should already be set so the prompt is never reached.
    if not os.environ.get(env_name):
        os.environ[env_name] = getpass.getpass(prompt)


def _load_question_docs(path: str) -> list:
    """Load one document per question from a single JSON file or a folder of them."""
    if os.path.isfile(path):
        # DirectoryLoader only accepts directories, so a lone file is handed
        # straight to JSONLoader.
        loader = JSONLoader(file_path=path, **_QUESTION_LOADER_KWARGS)
    else:
        loader = DirectoryLoader(
            path=path,
            glob="**/*.json",  # recurse, JSON files only
            loader_cls=JSONLoader,
            loader_kwargs=dict(_QUESTION_LOADER_KWARGS),
        )
    return loader.load()


# Input may be a folder containing JSON files or a single JSON file.
def update_Database(path: str) -> str:
    """Embed exam questions found at *path* and upsert them into Upstash.

    Every JSON file is expected to hold an array of question objects. The
    "question" value becomes the embedded text, while "choices", "category",
    "answer" and "difficulty" are stored as metadata. The vector id is the MD5
    hex digest of the question text, so re-running the import overwrites
    existing entries instead of duplicating them.

    Args:
        path: A directory that is searched recursively for ``*.json`` files,
            or the path of a single JSON file.

    Returns:
        A human-readable status message containing the number of distinct
        questions that were written.

    Raises:
        AppError: If a record is not a JSON object or has no non-empty
            "question" string.
    """
    # Respect an already-configured index URL; only fall back to the default.
    if not os.environ.get("UPSTASH_VECTOR_REST_URL"):
        os.environ["UPSTASH_VECTOR_REST_URL"] = _DEFAULT_UPSTASH_URL
    _ensure_secret("UPSTASH_VECTOR_REST_TOKEN", "Enter API key for Upstash: ")
    _ensure_secret("OPENAI_API_KEY", "Enter API key for OpenAI: ")

    # Keyed by vector id: identical questions appearing in several exams are
    # embedded and upserted once (the last occurrence wins, as with an upsert).
    docs_by_id = {}
    for doc in _load_question_docs(path):
        raw_data = json.loads(doc.page_content)
        question_text = raw_data.get("question") if isinstance(raw_data, dict) else None
        if not isinstance(question_text, str) or not question_text.strip():
            origin = doc.metadata.get("source", path)
            raise AppError(f"Record without a usable 'question' field in {origin}")

        # MD5 is used purely as a stable content fingerprint, not for security.
        unique_id = hashlib.md5(
            question_text.encode("utf-8"), usedforsecurity=False
        ).hexdigest()

        doc.metadata = {
            "choices": raw_data.get("choices"),
            "category": raw_data.get("category"),
            "answer": raw_data.get("answer"),
            "difficulty": raw_data.get("difficulty"),
        }
        doc.page_content = question_text
        docs_by_id[unique_id] = doc

    if docs_by_id:
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        # UpstashVectorStore picks up the URL and token from the environment
        # variables set above.
        store = UpstashVectorStore(embedding=embeddings)
        store.add_documents(
            documents=list(docs_by_id.values()),
            ids=list(docs_by_id.keys()),
        )
    return f"Successfully processed {len(docs_by_id)} documents."
+ From 4c2cd0557c918de9e9cddcbd033cb4b766061c61 Mon Sep 17 00:00:00 2001 From: agabrielcorujo Date: Tue, 10 Feb 2026 17:47:34 -0500 Subject: [PATCH 3/3] corrected the folder structure --- {backend => src/server}/LLM/chatbot.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {backend => src/server}/LLM/chatbot.ipynb (100%) diff --git a/backend/LLM/chatbot.ipynb b/src/server/LLM/chatbot.ipynb similarity index 100% rename from backend/LLM/chatbot.ipynb rename to src/server/LLM/chatbot.ipynb