-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
27d9afd
commit aa10177
Showing
1 changed file
with
311 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,311 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Core imports for the RAG pipeline.\n", | ||
"# The LangChain vector-store class is aliased so it does not shadow the\n", | ||
"# `Pinecone` client class imported from the pinecone SDK on the next line\n", | ||
"# (the original imported both under the same name, leaving the first\n", | ||
"# unusable). The duplicate `from langchain import PromptTemplate` was\n", | ||
"# dropped in favour of the `langchain.prompts` import kept below.\n", | ||
"from langchain.chains import RetrievalQA\n", | ||
"from langchain.embeddings import HuggingFaceEmbeddings\n", | ||
"from langchain.vectorstores import Pinecone as LangChainPinecone\n", | ||
"from pinecone import Pinecone, ServerlessSpec\n", | ||
"from langchain.document_loaders import PyPDFLoader, DirectoryLoader\n", | ||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n", | ||
"from langchain.prompts import PromptTemplate\n", | ||
"from langchain.llms import CTransformers" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Never hardcode a credential in a notebook: read the key from the\n", | ||
"# environment instead (falls back to an empty string when unset,\n", | ||
"# matching the original cell's value).\n", | ||
"import os\n", | ||
"\n", | ||
"PINECONE_API_KEY = os.environ.get(\"PINECONE_API_KEY\", \"\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"## Data Loading from the PDF\n", | ||
"\n", | ||
"def load_pdf(data):\n", | ||
" \"\"\"Load every *.pdf file found in the `data` directory.\n", | ||
"\n", | ||
" Uses DirectoryLoader with glob '*.pdf' and PyPDFLoader per file;\n", | ||
" returns the list of loaded LangChain documents.\n", | ||
" \"\"\"\n", | ||
" loader= DirectoryLoader(data,\n", | ||
" glob=\"*.pdf\",\n", | ||
" loader_cls=PyPDFLoader)\n", | ||
" \n", | ||
" documents= loader.load()\n", | ||
" return documents" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Use a path relative to the notebook so this runs on any machine\n", | ||
"# (the original hardcoded an absolute Windows user path).\n", | ||
"extracted_data= load_pdf(\"data\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#Creating text chunks\n", | ||
"def text_split(extracted_data):\n", | ||
" \"\"\"Split documents into chunks of up to 500 characters,\n", | ||
" with a 20-character overlap between consecutive chunks.\"\"\"\n", | ||
" text_splitter= RecursiveCharacterTextSplitter(chunk_size= 500, chunk_overlap= 20)\n", | ||
" text_chunks= text_splitter.split_documents(extracted_data)\n", | ||
"\n", | ||
" return text_chunks" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Chunk the loaded documents and sanity-check what came back.\n", | ||
"text_chunks = text_split(extracted_data)\n", | ||
"chunk_count = len(text_chunks)\n", | ||
"print(\"Length of the chunks\", chunk_count)\n", | ||
"print(type(text_chunks))\n", | ||
"print(type(text_chunks[0]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Preview the first two loaded documents (first 1000 characters each)\n", | ||
"# and confirm the container/element types.\n", | ||
"preview_docs = extracted_data[:2]\n", | ||
"for doc in preview_docs:\n", | ||
"    print(doc.page_content[:1000])\n", | ||
"print(type(extracted_data))\n", | ||
"print(type(extracted_data[0]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#download embedding model\n", | ||
"def download_hugging_face_embeddings():\n", | ||
"    \"\"\"Return the all-MiniLM-L6-v2 sentence-transformers embedding model.\"\"\"\n", | ||
"    model_name = \"sentence-transformers/all-MiniLM-L6-v2\"\n", | ||
"    return HuggingFaceEmbeddings(model_name=model_name)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Instantiate the embedding model once; reused below for both\n", | ||
"# indexing (create_embeddings) and query-time search (docsearch).\n", | ||
"embeddings= download_hugging_face_embeddings()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Rich display of the embeddings object as a sanity check.\n", | ||
"embeddings" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"index_name= \"chatbot\"\n", | ||
"\n", | ||
"pc=Pinecone(api_key=PINECONE_API_KEY)\n", | ||
"\n", | ||
"#creating index\n", | ||
"\n", | ||
"# Create the serverless index only if it does not already exist, so the\n", | ||
"# cell is safe to re-run. NOTE(review): dimension=384 must match the\n", | ||
"# embedding model's output size -- confirm for the chosen model.\n", | ||
"if index_name not in pc.list_indexes().names():\n", | ||
" pc.create_index(\n", | ||
" name= index_name,\n", | ||
" dimension= 384,\n", | ||
" metric= 'cosine',\n", | ||
" spec= ServerlessSpec(cloud= 'aws', region=\"us-east-1\")\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Get a handle to the index and display it for inspection.\n", | ||
"index= pc.Index(index_name)\n", | ||
"index" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def create_embeddings(text_chunks, embeddings):\n", | ||
" \"\"\"Embed each chunk's page_content; returns one vector per chunk,\n", | ||
" in the same order as `text_chunks`.\"\"\"\n", | ||
" texts= [chunk.page_content for chunk in text_chunks]\n", | ||
" return embeddings.embed_documents(texts)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def store_embeddings_in_pinecone(index, text_chunks, embeddings, batch_size= 1000):\n", | ||
"    \"\"\"Upsert chunk embeddings into Pinecone in batches.\n", | ||
"\n", | ||
"    Each vector now carries the chunk text as 'page_content' metadata so\n", | ||
"    similarity search can recover the original text. (The original body\n", | ||
"    ignored `text_chunks` entirely and upserted bare vectors with no\n", | ||
"    metadata; it also miscounted total batches when len is divisible by\n", | ||
"    batch_size, and was missing a space in the progress message.)\n", | ||
"    \"\"\"\n", | ||
"    vectors = [\n", | ||
"        (str(i), emb, {\"page_content\": chunk.page_content})\n", | ||
"        for i, (emb, chunk) in enumerate(zip(embeddings, text_chunks))\n", | ||
"    ]\n", | ||
"    total_batches = (len(vectors) + batch_size - 1) // batch_size  # ceiling division\n", | ||
"    for i in range(0, len(vectors), batch_size):\n", | ||
"        batch = vectors[i:i+batch_size]\n", | ||
"        index.upsert(vectors= batch)\n", | ||
"        print(f\"upserted batch {i//batch_size+1} of {total_batches}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Compute one embedding vector per text chunk.\n", | ||
"embeddings_list= create_embeddings(text_chunks, embeddings)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Push the chunk embeddings into the Pinecone index in batches.\n", | ||
"store_embeddings_in_pinecone(index,text_chunks, embeddings_list)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.vectorstores import Pinecone as LangChainPinecone\n", | ||
"\n", | ||
"# Wrap the raw Pinecone index for LangChain retrieval.\n", | ||
"# NOTE(review): text_key must name a metadata field actually stored on\n", | ||
"# the upserted vectors -- verify 'page_content' metadata is present in\n", | ||
"# the index, otherwise searches cannot return the chunk text.\n", | ||
"text_key= \"page_content\"\n", | ||
"docsearch= LangChainPinecone(index= index, embedding= embeddings, text_key= text_key)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Smoke-test similarity search with a sample query (top-3 matches).\n", | ||
"query = \"Django javascript integration\"\n", | ||
"\n", | ||
"docs = docsearch.similarity_search(query, k=3)\n", | ||
"print(\"Result:\", docs)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Prompt that grounds the LLM in retrieved context and instructs it not\n", | ||
"# to fabricate answers. (Fixed typo in the template: 'peices' -> 'pieces'.)\n", | ||
"prompt_template=\"\"\"\n", | ||
"Use the following pieces of information to answer the user's question.\n", | ||
"If you don't know the answer, just say that you don't know, don't try to make up an answer.\n", | ||
"\n", | ||
"Context: {context}\n", | ||
"Question: {question}\n", | ||
"\n", | ||
"Only return the helpful answer below and nothing else\n", | ||
"Helpful answer:\n", | ||
"\"\"\"\n", | ||
"\n", | ||
"PROMPT= PromptTemplate(template=prompt_template, input_variables=[\"context\", \"question\"] )\n", | ||
"chain_type_kwargs= {\"prompt\": PROMPT}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Local quantized Llama-2 7B chat model (GGML q4_0) via CTransformers.\n", | ||
"# NOTE(review): the model file must exist at model/... relative to the\n", | ||
"# notebook's working directory -- confirm before running.\n", | ||
"llm= CTransformers(model=\"model/llama-2-7b-chat.ggmlv3.q4_0.bin\",\n", | ||
"                   model_type=\"llama\",\n", | ||
"                   config={'max_new_tokens': 512,\n", | ||
"                           'temperature': 0.8})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Build the RetrievalQA chain: retrieve the top-2 chunks, 'stuff' them\n", | ||
"# into the custom prompt, and also return the source documents so\n", | ||
"# answers can be traced back to the PDF text.\n", | ||
"qa= RetrievalQA.from_chain_type(\n", | ||
"    llm= llm,\n", | ||
"    chain_type= \"stuff\",\n", | ||
"    retriever=docsearch.as_retriever(search_kwargs={'k':2}),\n", | ||
"    return_source_documents= True,\n", | ||
"    chain_type_kwargs= chain_type_kwargs\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Interactive QA loop; type 'exit' or 'quit' to stop. (The original\n", | ||
"# looped forever with no way out short of interrupting the kernel, and\n", | ||
"# used a pointless f-prefix on a constant prompt string.)\n", | ||
"while True:\n", | ||
"    user_input = input(\"Input Prompt\")\n", | ||
"    if user_input.strip().lower() in {\"exit\", \"quit\"}:\n", | ||
"        break\n", | ||
"    result = qa({\"query\": user_input})\n", | ||
"    print(\"Response: \", result[\"result\"])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "chatbot", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.19" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |