-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
27d9afd
commit aa10177
Showing
1 changed file
with
311 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,311 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Core imports for the RAG pipeline.\n", | ||
"# The LangChain vector-store class is aliased so it does not shadow the\n", | ||
"# `Pinecone` client class imported from the pinecone SDK on the next line\n", | ||
"# (the original imported both under the same name, leaving the first\n", | ||
"# unusable). The duplicate `from langchain import PromptTemplate` was\n", | ||
"# dropped in favour of the `langchain.prompts` import kept below.\n", | ||
"from langchain.chains import RetrievalQA\n", | ||
"from langchain.embeddings import HuggingFaceEmbeddings\n", | ||
"from langchain.vectorstores import Pinecone as LangChainPinecone\n", | ||
"from pinecone import Pinecone, ServerlessSpec\n", | ||
"from langchain.document_loaders import PyPDFLoader, DirectoryLoader\n", | ||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n", | ||
"from langchain.prompts import PromptTemplate\n", | ||
"from langchain.llms import CTransformers" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Never hardcode a credential in a notebook: read the key from the\n", | ||
"# environment instead (falls back to an empty string when unset,\n", | ||
"# matching the original cell's value).\n", | ||
"import os\n", | ||
"\n", | ||
"PINECONE_API_KEY = os.environ.get(\"PINECONE_API_KEY\", \"\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"## Data Loading from the PDF\n", | ||
"\n", | ||
"def load_pdf(data):\n", | ||
" \"\"\"Load every *.pdf file found in the `data` directory.\n", | ||
"\n", | ||
" Uses DirectoryLoader with glob '*.pdf' and PyPDFLoader per file;\n", | ||
" returns the list of loaded LangChain documents.\n", | ||
" \"\"\"\n", | ||
" loader= DirectoryLoader(data,\n", | ||
" glob=\"*.pdf\",\n", | ||
" loader_cls=PyPDFLoader)\n", | ||
" \n", | ||
" documents= loader.load()\n", | ||
" return documents" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Use a path relative to the notebook so this runs on any machine\n", | ||
"# (the original hardcoded an absolute Windows user path).\n", | ||
"extracted_data= load_pdf(\"data\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#Creating text chunks\n", | ||
"def text_split(extracted_data):\n", | ||
" \"\"\"Split documents into chunks of up to 500 characters,\n", | ||
" with a 20-character overlap between consecutive chunks.\"\"\"\n", | ||
" text_splitter= RecursiveCharacterTextSplitter(chunk_size= 500, chunk_overlap= 20)\n", | ||
" text_chunks= text_splitter.split_documents(extracted_data)\n", | ||
"\n", | ||
" return text_chunks" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Chunk the loaded documents and sanity-check what came back.\n", | ||
"text_chunks = text_split(extracted_data)\n", | ||
"chunk_count = len(text_chunks)\n", | ||
"print(\"Length of the chunks\", chunk_count)\n", | ||
"print(type(text_chunks))\n", | ||
"print(type(text_chunks[0]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Preview the first two loaded documents (first 1000 characters each)\n", | ||
"# and confirm the container/element types.\n", | ||
"preview_docs = extracted_data[:2]\n", | ||
"for doc in preview_docs:\n", | ||
"    print(doc.page_content[:1000])\n", | ||
"print(type(extracted_data))\n", | ||
"print(type(extracted_data[0]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#download embedding model\n", | ||
"def download_hugging_face_embeddings():\n", | ||
"    \"\"\"Return the all-MiniLM-L6-v2 sentence-transformers embedding model.\"\"\"\n", | ||
"    model_name = \"sentence-transformers/all-MiniLM-L6-v2\"\n", | ||
"    return HuggingFaceEmbeddings(model_name=model_name)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Instantiate the embedding model once; reused below for both\n", | ||
"# indexing (create_embeddings) and query-time search (docsearch).\n", | ||
"embeddings= download_hugging_face_embeddings()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Rich display of the embeddings object as a sanity check.\n", | ||
"embeddings" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"index_name= \"chatbot\"\n", | ||
"\n", | ||
"pc=Pinecone(api_key=PINECONE_API_KEY)\n", | ||
"\n", | ||
"#creating index\n", | ||
"\n", | ||
"# Create the serverless index only if it does not already exist, so the\n", | ||
"# cell is safe to re-run. NOTE(review): dimension=384 must match the\n", | ||
"# embedding model's output size -- confirm for the chosen model.\n", | ||
"if index_name not in pc.list_indexes().names():\n", | ||
" pc.create_index(\n", | ||
" name= index_name,\n", | ||
" dimension= 384,\n", | ||
" metric= 'cosine',\n", | ||
" spec= ServerlessSpec(cloud= 'aws', region=\"us-east-1\")\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Get a handle to the index and display it for inspection.\n", | ||
"index= pc.Index(index_name)\n", | ||
"index" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def create_embeddings(text_chunks, embeddings):\n", | ||
" \"\"\"Embed each chunk's page_content; returns one vector per chunk,\n", | ||
" in the same order as `text_chunks`.\"\"\"\n", | ||
" texts= [chunk.page_content for chunk in text_chunks]\n", | ||
" return embeddings.embed_documents(texts)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def store_embeddings_in_pinecone(index, text_chunks, embeddings, batch_size= 1000):\n", | ||
"    \"\"\"Upsert chunk embeddings into Pinecone in batches.\n", | ||
"\n", | ||
"    Each vector now carries the chunk text as 'page_content' metadata so\n", | ||
"    similarity search can recover the original text. (The original body\n", | ||
"    ignored `text_chunks` entirely and upserted bare vectors with no\n", | ||
"    metadata; it also miscounted total batches when len is divisible by\n", | ||
"    batch_size, and was missing a space in the progress message.)\n", | ||
"    \"\"\"\n", | ||
"    vectors = [\n", | ||
"        (str(i), emb, {\"page_content\": chunk.page_content})\n", | ||
"        for i, (emb, chunk) in enumerate(zip(embeddings, text_chunks))\n", | ||
"    ]\n", | ||
"    total_batches = (len(vectors) + batch_size - 1) // batch_size  # ceiling division\n", | ||
"    for i in range(0, len(vectors), batch_size):\n", | ||
"        batch = vectors[i:i+batch_size]\n", | ||
"        index.upsert(vectors= batch)\n", | ||
"        print(f\"upserted batch {i//batch_size+1} of {total_batches}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Compute one embedding vector per text chunk.\n", | ||
"embeddings_list= create_embeddings(text_chunks, embeddings)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Push the chunk embeddings into the Pinecone index in batches.\n", | ||
"store_embeddings_in_pinecone(index,text_chunks, embeddings_list)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.vectorstores import Pinecone as LangChainPinecone\n", | ||
"\n", | ||
"# Wrap the raw Pinecone index for LangChain retrieval.\n", | ||
"# NOTE(review): text_key must name a metadata field actually stored on\n", | ||
"# the upserted vectors -- verify 'page_content' metadata is present in\n", | ||
"# the index, otherwise searches cannot return the chunk text.\n", | ||
"text_key= \"page_content\"\n", | ||
"docsearch= LangChainPinecone(index= index, embedding= embeddings, text_key= text_key)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Smoke-test similarity search with a sample query (top-3 matches).\n", | ||
"query = \"Django javascript integration\"\n", | ||
"\n", | ||
"docs = docsearch.similarity_search(query, k=3)\n", | ||
"print(\"Result:\", docs)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Prompt that grounds the LLM in retrieved context and instructs it not\n", | ||
"# to fabricate answers. (Fixed typo in the template: 'peices' -> 'pieces'.)\n", | ||
"prompt_template=\"\"\"\n", | ||
"Use the following pieces of information to answer the user's question.\n", | ||
"If you don't know the answer, just say that you don't know, don't try to make up an answer.\n", | ||
"\n", | ||
"Context: {context}\n", | ||
"Question: {question}\n", | ||
"\n", | ||
"Only return the helpful answer below and nothing else\n", | ||
"Helpful answer:\n", | ||
"\"\"\"\n", | ||
"\n", | ||
"PROMPT= PromptTemplate(template=prompt_template, input_variables=[\"context\", \"question\"] )\n", | ||
"chain_type_kwargs= {\"prompt\": PROMPT}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Local quantized Llama-2 7B chat model (GGML q4_0) via CTransformers.\n", | ||
"# NOTE(review): the model file must exist at model/... relative to the\n", | ||
"# notebook's working directory -- confirm before running.\n", | ||
"llm= CTransformers(model=\"model/llama-2-7b-chat.ggmlv3.q4_0.bin\",\n", | ||
"                   model_type=\"llama\",\n", | ||
"                   config={'max_new_tokens': 512,\n", | ||
"                           'temperature': 0.8})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Build the RetrievalQA chain: retrieve the top-2 chunks, 'stuff' them\n", | ||
"# into the custom prompt, and also return the source documents so\n", | ||
"# answers can be traced back to the PDF text.\n", | ||
"qa= RetrievalQA.from_chain_type(\n", | ||
"    llm= llm,\n", | ||
"    chain_type= \"stuff\",\n", | ||
"    retriever=docsearch.as_retriever(search_kwargs={'k':2}),\n", | ||
"    return_source_documents= True,\n", | ||
"    chain_type_kwargs= chain_type_kwargs\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Interactive QA loop; type 'exit' or 'quit' to stop. (The original\n", | ||
"# looped forever with no way out short of interrupting the kernel, and\n", | ||
"# used a pointless f-prefix on a constant prompt string.)\n", | ||
"while True:\n", | ||
"    user_input = input(\"Input Prompt\")\n", | ||
"    if user_input.strip().lower() in {\"exit\", \"quit\"}:\n", | ||
"        break\n", | ||
"    result = qa({\"query\": user_input})\n", | ||
"    print(\"Response: \", result[\"result\"])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "chatbot", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.19" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |