Add files via upload
Shashank1202 authored Jul 23, 2024
1 parent 27d9afd commit aa10177
Showing 1 changed file with 311 additions and 0 deletions.
311 changes: 311 additions & 0 deletions research/trails.ipynb
@@ -0,0 +1,311 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain import PromptTemplate\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from langchain.vectorstores import Pinecone\n",
"from pinecone import Pinecone, ServerlessSpec\n",
"from langchain.document_loaders import PyPDFLoader, DirectoryLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.llms import CTransformers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PINECONE_API_KEY=\"\""
]
},
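{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: load the Pinecone key from the environment instead of hard-coding it.\n",
"# Assumes python-dotenv is installed and a .env file defines PINECONE_API_KEY.\n",
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"PINECONE_API_KEY = os.environ.get(\"PINECONE_API_KEY\", PINECONE_API_KEY)"
]
},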
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Data Loading from the PDF\n",
"\n",
"def load_pdf(data):\n",
" loader= DirectoryLoader(data,\n",
" glob=\"*.pdf\",\n",
" loader_cls=PyPDFLoader)\n",
" \n",
" documents= loader.load()\n",
" return documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"extracted_data= load_pdf(r\"C:\\Users\\shash\\Deep_Learning\\AI-Driven-Customer-Chatbot\\data\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Creating text chunks\n",
"def text_split(extracted_data):\n",
" text_splitter= RecursiveCharacterTextSplitter(chunk_size= 500, chunk_overlap= 20)\n",
" text_chunks= text_splitter.split_documents(extracted_data)\n",
"\n",
" return text_chunks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text_chunks= text_split(extracted_data)\n",
"print(\"Length of the chunks\", len(text_chunks))\n",
"print(type(text_chunks))\n",
"print(type(text_chunks[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for doc in extracted_data[:2]:\n",
" print(doc.page_content[:1000])\n",
"print(type(extracted_data))\n",
"print(type(extracted_data[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#download embedding model\n",
"def download_hugging_face_embeddings():\n",
" embeddings= HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
" return embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeddings= download_hugging_face_embeddings()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeddings"
]
},
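{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: all-MiniLM-L6-v2 produces 384-dimensional vectors,\n",
"# which must match the dimension used when creating the Pinecone index below.\n",
"sample_vector = embeddings.embed_query(\"dimension check\")\n",
"print(\"Embedding dimension:\", len(sample_vector))"
]
},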
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index_name= \"chatbot\"\n",
"\n",
"pc=Pinecone(api_key=PINECONE_API_KEY)\n",
"\n",
"#creating index\n",
"\n",
"if index_name not in pc.list_indexes().names():\n",
" pc.create_index(\n",
" name= index_name,\n",
" dimension= 384,\n",
" metric= 'cosine',\n",
" spec= ServerlessSpec(cloud= 'aws', region=\"us-east-1\")\n",
" )"
]
},
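{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: wait until the serverless index reports ready before upserting.\n",
"# Assumes the v3 pinecone client, where describe_index() exposes status[\"ready\"].\n",
"import time\n",
"\n",
"while not pc.describe_index(index_name).status[\"ready\"]:\n",
"    time.sleep(1)\n",
"print(\"Index is ready\")"
]
},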
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index= pc.Index(index_name)\n",
"index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def create_embeddings(text_chunks, embeddings):\n",
" texts= [chunk.page_content for chunk in text_chunks]\n",
" return embeddings.embed_documents(texts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def store_embeddings_in_pinecone(index, text_chunks, embeddings, batch_size= 1000):\n",
" vectors= [(str(i), emb) for i, emb in enumerate(embeddings)]\n",
" \n",
" for i in range(0, len(vectors), batch_size):\n",
" batch= vectors[i:i+batch_size]\n",
" index.upsert(vectors= batch)\n",
" print(f\"upserted batch {i//batch_size+1}of {len(vectors)//batch_size+1}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeddings_list= create_embeddings(text_chunks, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"store_embeddings_in_pinecone(index,text_chunks, embeddings_list)"
]
},
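{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: confirm the upserted vectors are visible in the index\n",
"# (counts can lag briefly on serverless indexes).\n",
"index.describe_index_stats()"
]
},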
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.vectorstores import Pinecone as LangChainPinecone\n",
"\n",
"text_key= \"page_content\"\n",
"docsearch= LangChainPinecone(index= index, embedding= embeddings, text_key= text_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"Django javascript integration\"\n",
"\n",
"docs = docsearch.similarity_search(query, k=3)\n",
"print(\"Result:\", docs)"
]
},
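{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Readability helper: print only the text of each retrieved chunk. This assumes the\n",
"# chunk text was stored under the \"page_content\" metadata key during the upsert above.\n",
"for i, doc in enumerate(docs, start=1):\n",
"    print(f\"--- Result {i} ---\")\n",
"    print(doc.page_content[:500])\n",
"    print()"
]
},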
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompt_template=\"\"\"\n",
"Use the following peices of information to answer the user's question.\n",
"If you don't know the answer, just say that you don't know, don't try to make up an answer.\n",
"\n",
"Context: {context}\n",
"Question: {question}\n",
"\n",
"Only return the helpful answer below and nothing else\n",
"Helpful answer:\n",
"\"\"\"\n",
"\n",
"PROMPT= PromptTemplate(template=prompt_template, input_variables=[\"context\", \"question\"] )\n",
"chain_type_kwargs= {\"prompt\": PROMPT}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm= CTransformers(model=\"model/llama-2-7b-chat.ggmlv3.q4_0.bin\",\n",
" model_type=\"llama\",\n",
" config={'max_new_tokens': 512,\n",
" 'temperature': 0.8})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"qa= RetrievalQA.from_chain_type(\n",
" llm= llm,\n",
" chain_type= \"stuff\",\n",
" retriever=docsearch.as_retriever(search_kwargs={'k':2}),\n",
" return_source_documents= True,\n",
" chain_type_kwargs= chain_type_kwargs\n",
")"
]
},
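{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Smoke test with a fixed question before starting the interactive loop.\n",
"# The example query is only illustrative; use one relevant to the loaded PDFs.\n",
"test_result = qa({\"query\": \"Django javascript integration\"})\n",
"print(\"Answer:\", test_result[\"result\"])\n",
"print(\"Retrieved\", len(test_result[\"source_documents\"]), \"source chunks\")"
]
},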
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"while True:\n",
" user_input=input(f\"Input Prompt\")\n",
" result=qa({\"query\": user_input})\n",
" print(\"Response: \", result[\"result\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "chatbot",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
