248 changes: 248 additions & 0 deletions src/server/LLM/chatbot.ipynb
@@ -0,0 +1,248 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"LANGSMITH_TRACING\"] = \"true\"\n",
"os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass()\n",
"\n",
"examsFolder = \"../data/cloudpractitioner\"\n",
"exam1 = \".backend/data/cloudpractitioner/exam_1.json\"\n"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 991 questions from the folder.\n"
]
}
],
"source": [
"from langchain_community.document_loaders import DirectoryLoader, JSONLoader\n",
"\n",
"\n",
"loader_kwargs = {\n",
" \"jq_schema\": \".[]\", #iterate over question objects\n",
" \"text_content\": False\n",
"}\n",
"\n",
"\n",
"loader = DirectoryLoader(\n",
" path=\"../data/cloudpractitioner/\", \n",
" glob=\"**/*.json\", #ensures json\n",
" loader_cls=JSONLoader,\n",
" loader_kwargs=loader_kwargs\n",
")\n",
"\n",
"docs = loader.load()\n",
"\n",
"print(f\"Loaded {len(docs)} questions from the folder.\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"sanitized_docs = []\n",
"\n",
"for doc in docs:\n",
" # 1. Parse the string inside the 'text' field back into a dictionary\n",
" raw_data = json.loads(doc.page_content)\n",
" \n",
" # 2. Map the actual values to the metadata keys\n",
" doc.metadata = {\n",
" \"choices\": raw_data.get(\"choices\"),\n",
" \"category\": raw_data.get(\"category\"),\n",
" \"answer\": raw_data.get(\"answer\"),\n",
" \"difficulty\": raw_data.get(\"difficulty\"),\n",
" }\n",
" \n",
" # 3. Use the actual question text as the page_content for the vector search\n",
" doc.page_content = raw_data.get(\"question\")\n",
" \n",
" sanitized_docs.append(doc)\n"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"What time-savings advantage is offered with the use of Amazon Rekognition?\n",
"{'choices': ['A. Amazon Rekognition provides automatic watermarking of images.', 'B. Amazon Rekognition provides automatic detection of objects appearing in pictures.', 'C. Amazon Rekognition provides the ability to resize millions of images automatically.', 'D. Amazon Rekognition uses Amazon Mechanical Turk to allow humans to bid on object detection jobs.'], 'category': 'Machine Learning', 'answer': 'B', 'difficulty': 2}\n",
"variable docs length 991\n"
]
}
],
"source": [
"#text split\n",
"print(docs[0].page_content)\n",
"print(docs[0].metadata)\n",
"print(\"variable docs length \" + str(len(docs))) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embedding"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"if not os.environ.get(\"OPENAI_API_KEY\"):\n",
" os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter API key for OpenAI: \")\n",
"\n",
"from langchain_openai import OpenAIEmbeddings\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generated vector of length 1536\n",
"\n",
"[0.0374404639005661, 0.0011903299018740654, 0.05431877076625824, 0.004493457730859518, 0.006717613898217678, 0.017432088032364845, -0.01557812187820673, 0.04613243043422699, -0.010257001966238022, 0.01922585815191269]\n"
]
}
],
"source": [
"#embedding test\n",
"vector_1 = embeddings.embed_query(docs[0].page_content)\n",
"\n",
"print(f\"Generated vector of length {len(vector_1)}\\n\")\n",
"print(vector_1[:10])"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.vectorstores.upstash import UpstashVectorStore\n",
"import os\n",
"\n",
"os.environ[\"UPSTASH_VECTOR_REST_URL\"] = \"https://loving-kingfish-56853-us1-vector.upstash.io\"\n",
"os.environ[\"UPSTASH_VECTOR_REST_TOKEN\"] = \"\"\n",
"\n",
"store = UpstashVectorStore(\n",
" embedding=embeddings\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Adding documents Here"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"#ids = store.add_documents(documents=sanitized_docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Similarity search"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4\n",
"page_content='AWS allows users to manage their resources using a web based user interface. What is the name of this interface?' metadata={'choices': ['A. AWS CLI.', 'B. AWS API.', 'C. AWS SDK.', 'D. AWS Management Console.'], 'category': 'Cloud Concepts', 'answer': 'D', 'difficulty': 2}\n",
"page_content='What is the AWS tool that enables you to use scripts to manage all AWS services and resources?' metadata={'choices': ['A. AWS Console.', 'B. AWS Service Catalog.', 'C. AWS OpsWorks.', 'D. AWS CLI.'], 'category': 'Cloud Concepts', 'answer': 'D', 'difficulty': 2}\n",
"page_content='AWS CloudFormation is designed to help the user:' metadata={'choices': ['A. model and provision resources.', 'B. update application code.', 'C. set up data lakes.', 'D. create reports for billing.'], 'category': 'Management & Governance', 'answer': 'A', 'difficulty': 2}\n",
"page_content='Which of the following services allows customers to manage their agreements with AWS?' metadata={'choices': ['A. AWS Artifact.', 'B. AWS Certificate Manager.', 'C. AWS Systems Manager.', 'D. AWS Organizations.'], 'category': 'Cloud Concepts', 'answer': 'A', 'difficulty': 2}\n"
]
}
],
"source": [
"#find similar questions \n",
"results = store.similarity_search(\n",
" \"AWS allows users to manage their resources using a web based user interface.\"\n",
")\n",
"\n",
"print(len(results))\n",
"for result in results:\n",
" print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
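The sanitized metadata makes each hit self-contained (question, choices, answer, difficulty), so the same store could also screen incoming questions for semantic near-duplicates before indexing them, complementing the md5-based ids used in the controller below. A minimal sketch, assuming UpstashVectorStore's similarity_search_with_score returns (document, score) pairs with higher scores meaning more similar; the 0.95 cutoff is an arbitrary placeholder:

#sketch only: flag questions already semantically present in the index
def is_near_duplicate(store, question: str, threshold: float = 0.95) -> bool:
    #k=1: only the closest existing question matters
    matches = store.similarity_search_with_score(question, k=1)
    if not matches:
        return False
    _doc, score = matches[0]
    return score >= threshold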
53 changes: 53 additions & 0 deletions src/server/controllers/example_controller.py
@@ -1,6 +1,7 @@
from jwt_auth.db import safe_query #syntax: result = safe_query(query,(param1,param2...),fetch="one/all",insert=True/False)
import boto3 as aws
from dotenv import load_dotenv
import os

class AppError(Exception):
def __init__(self, message: str, status_code: int = 400):
@@ -17,5 +18,57 @@ def sample_function(userid:str,param:type)->type:
return ...


import hashlib
from langchain_community.document_loaders import DirectoryLoader, JSONLoader
import json
from langchain_community.vectorstores.upstash import UpstashVectorStore
import getpass
from langchain_openai import OpenAIEmbeddings

#input should be a folder of json files or a single json file
def update_Database(path:str)->str: #returns a summary message with the number of processed docs

    os.environ["UPSTASH_VECTOR_REST_URL"] = "https://loving-kingfish-56853-us1-vector.upstash.io"
    if not os.environ.get("UPSTASH_VECTOR_REST_TOKEN"):
        os.environ["UPSTASH_VECTOR_REST_TOKEN"] = getpass.getpass("Enter API key for Upstash: ")

    if not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

    loader_kwargs = {
        "jq_schema": ".[]", #iterate over question objects
        "text_content": False
    }
    loader = DirectoryLoader(
        path=path,
        glob="**/*.json", #only load json files
        loader_cls=JSONLoader,
        loader_kwargs=loader_kwargs
    )
    docs = loader.load()
    sanitized_docs = []
    doc_ids = [] #deterministic ids: re-running on the same data upserts instead of duplicating

    for doc in docs:
        raw_data = json.loads(doc.page_content)
        question_text = raw_data.get("question")
        #hash of the question text doubles as a stable, deduplicating id
        unique_id = hashlib.md5(question_text.encode('utf-8')).hexdigest()

        doc.metadata = {
            "choices": raw_data.get("choices"),
            "category": raw_data.get("category"),
            "answer": raw_data.get("answer"),
            "difficulty": raw_data.get("difficulty"),
        }
        doc.page_content = question_text #vector search runs over the question text only
        doc_ids.append(unique_id)
        sanitized_docs.append(doc)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    store = UpstashVectorStore(
        embedding=embeddings
    )
    store.add_documents(documents=sanitized_docs, ids=doc_ids)
    return f"Successfully processed {len(sanitized_docs)} documents."