diff --git a/src/server/LLM/chatbot.ipynb b/src/server/LLM/chatbot.ipynb
new file mode 100644
index 0000000..31ef9f0
--- /dev/null
+++ b/src/server/LLM/chatbot.ipynb
@@ -0,0 +1,248 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"LANGSMITH_TRACING\"] = \"true\"\n",
+    "os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass()\n",
+    "\n",
+    "examsFolder = \"../data/cloudpractitioner\"\n",
+    "exam1 = \".backend/data/cloudpractitioner/exam_1.json\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 991 questions from the folder.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_community.document_loaders import DirectoryLoader, JSONLoader\n",
+    "\n",
+    "\n",
+    "loader_kwargs = {\n",
+    "    \"jq_schema\": \".[]\", #iterate over question objects\n",
+    "    \"text_content\": False\n",
+    "}\n",
+    "\n",
+    "\n",
+    "loader = DirectoryLoader(\n",
+    "    path=\"../data/cloudpractitioner/\", \n",
+    "    glob=\"**/*.json\", #ensures json\n",
+    "    loader_cls=JSONLoader,\n",
+    "    loader_kwargs=loader_kwargs\n",
+    ")\n",
+    "\n",
+    "docs = loader.load()\n",
+    "\n",
+    "print(f\"Loaded {len(docs)} questions from the folder.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "sanitized_docs = []\n",
+    "\n",
+    "for doc in docs:\n",
+    "    # 1. Parse the string inside the 'text' field back into a dictionary\n",
+    "    raw_data = json.loads(doc.page_content)\n",
+    "    \n",
+    "    # 2. Map the actual values to the metadata keys\n",
+    "    doc.metadata = {\n",
+    "        \"choices\": raw_data.get(\"choices\"),\n",
+    "        \"category\": raw_data.get(\"category\"),\n",
+    "        \"answer\": raw_data.get(\"answer\"),\n",
+    "        \"difficulty\": raw_data.get(\"difficulty\"),\n",
+    "    }\n",
+    "    \n",
+    "    # 3. Use the actual question text as the page_content for the vector search\n",
+    "    doc.page_content = raw_data.get(\"question\")\n",
+    "    \n",
+    "    sanitized_docs.append(doc)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "What time-savings advantage is offered with the use of Amazon Rekognition?\n",
+      "{'choices': ['A. Amazon Rekognition provides automatic watermarking of images.', 'B. Amazon Rekognition provides automatic detection of objects appearing in pictures.', 'C. Amazon Rekognition provides the ability to resize millions of images automatically.', 'D. Amazon Rekognition uses Amazon Mechanical Turk to allow humans to bid on object detection jobs.'], 'category': 'Machine Learning', 'answer': 'B', 'difficulty': 2}\n",
+      "variable docs length 991\n"
+     ]
+    }
+   ],
+   "source": [
+    "#inspect the first loaded question and its metadata\n",
+    "print(docs[0].page_content)\n",
+    "print(docs[0].metadata)\n",
+    "print(\"variable docs length \" + str(len(docs))) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter API key for OpenAI: \")\n",
+    "\n",
+    "from langchain_openai import OpenAIEmbeddings\n",
+    "\n",
+    "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated vector of length 1536\n",
+      "\n",
+      "[0.0374404639005661, 0.0011903299018740654, 0.05431877076625824, 0.004493457730859518, 0.006717613898217678, 0.017432088032364845, -0.01557812187820673, 0.04613243043422699, -0.010257001966238022, 0.01922585815191269]\n"
+     ]
+    }
+   ],
+   "source": [
+    "#embedding test\n",
+    "vector_1 = embeddings.embed_query(docs[0].page_content)\n",
+    "\n",
+    "print(f\"Generated vector of length {len(vector_1)}\\n\")\n",
+    "print(vector_1[:10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.vectorstores.upstash import UpstashVectorStore\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"UPSTASH_VECTOR_REST_URL\"] = \"https://loving-kingfish-56853-us1-vector.upstash.io\"\n",
+    "os.environ[\"UPSTASH_VECTOR_REST_TOKEN\"] = \"\"\n",
+    "\n",
+    "store = UpstashVectorStore(\n",
+    "    embedding=embeddings\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Adding documents here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#ids = store.add_documents(documents=sanitized_docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Similarity search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4\n",
+      "page_content='AWS allows users to manage their resources using a web based user interface. What is the name of this interface?' metadata={'choices': ['A. AWS CLI.', 'B. AWS API.', 'C. AWS SDK.', 'D. AWS Management Console.'], 'category': 'Cloud Concepts', 'answer': 'D', 'difficulty': 2}\n",
+      "page_content='What is the AWS tool that enables you to use scripts to manage all AWS services and resources?' metadata={'choices': ['A. AWS Console.', 'B. AWS Service Catalog.', 'C. AWS OpsWorks.', 'D. AWS CLI.'], 'category': 'Cloud Concepts', 'answer': 'D', 'difficulty': 2}\n",
+      "page_content='AWS CloudFormation is designed to help the user:' metadata={'choices': ['A. model and provision resources.', 'B. update application code.', 'C. set up data lakes.', 'D. create reports for billing.'], 'category': 'Management & Governance', 'answer': 'A', 'difficulty': 2}\n",
+      "page_content='Which of the following services allows customers to manage their agreements with AWS?' metadata={'choices': ['A. AWS Artifact.', 'B. AWS Certificate Manager.', 'C. AWS Systems Manager.', 'D. AWS Organizations.'], 'category': 'Cloud Concepts', 'answer': 'A', 'difficulty': 2}\n"
+     ]
+    }
+   ],
+   "source": [
+    "#find similar questions \n",
+    "results = store.similarity_search(\n",
+    "    \"AWS allows users to manage their resources using a web based user interface.\"\n",
+    ")\n",
+    "\n",
+    "print(len(results))\n",
+    "for result in results:\n",
+    "    print(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/server/controllers/example_controller.py b/src/server/controllers/example_controller.py
index ab485b4..66474bb 100644
--- a/src/server/controllers/example_controller.py
+++ b/src/server/controllers/example_controller.py
@@ -1,6 +1,7 @@
 from jwt_auth.db import safe_query #syntax: result = safe_query(query,(param1,param2...),fetch="one/all",insert=True/False)
 import boto3 as aws
 from dotenv import load_dotenv
+import os
 
 class AppError(Exception):
     def __init__(self, message: str, status_code: int = 400):
@@ -17,5 +18,57 @@
 def sample_function(userid:str,param:type)->type:
     return ...
 
 
+
+import hashlib
+from langchain_community.document_loaders import DirectoryLoader, JSONLoader
+import json
+from langchain_community.vectorstores.upstash import UpstashVectorStore
+import getpass
+from langchain_openai import OpenAIEmbeddings
+
+#input: path to a folder containing json exam files (DirectoryLoader globs **/*.json inside it)
+def update_Database(path:str)->str: #returns a status message with the number of processed docs
+
+    os.environ["UPSTASH_VECTOR_REST_URL"] = "https://loving-kingfish-56853-us1-vector.upstash.io"
+    if not os.environ.get("UPSTASH_VECTOR_REST_TOKEN"):
+        os.environ["UPSTASH_VECTOR_REST_TOKEN"] = getpass.getpass("Enter API key for Upstash: ")
+
+    if not os.environ.get("OPENAI_API_KEY"):
+        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")
+    loader_kwargs = {
+        "jq_schema": ".[]", #iterate over question objects.
+        "text_content": False
+    }
+    loader = DirectoryLoader(
+        path=path,
+        glob="**/*.json", #ensures json
+        loader_cls=JSONLoader,
+        loader_kwargs=loader_kwargs
+    )
+    docs = loader.load()
+    sanitized_docs = []
+    doc_ids = [] # deterministic ids derived from each question text
+
+    for doc in docs:
+        raw_data = json.loads(doc.page_content)
+        question_text = raw_data.get("question")
+        unique_id = hashlib.md5(question_text.encode('utf-8')).hexdigest()
+
+        doc.metadata = {
+            "choices": raw_data.get("choices"),
+            "category": raw_data.get("category"),
+            "answer": raw_data.get("answer"),
+            "difficulty": raw_data.get("difficulty"),
+        }
+        doc.page_content = question_text
+        doc_ids.append(unique_id)
+        sanitized_docs.append(doc)
+
+    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+    store = UpstashVectorStore(
+        embedding=embeddings
+    )
+    store.add_documents(documents=sanitized_docs, ids=doc_ids)
+    return f"Successfully processed {len(sanitized_docs)} documents."
+
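+# Example usage (illustrative sketch only; the folder path below is an
+# assumption borrowed from the notebook above, and the Upstash/OpenAI keys
+# are prompted for interactively by update_Database):
+if __name__ == "__main__":
+    print(update_Database("../data/cloudpractitioner/"))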