Skip to content

Commit

Permalink
Preprocessed and vector stored
Browse files Browse the repository at this point in the history
Preprocessed and added the data to the vector store
  • Loading branch information
mahikshith committed Oct 6, 2024
1 parent 799fcd1 commit 3c44d50
Show file tree
Hide file tree
Showing 7 changed files with 2,740 additions and 0 deletions.
Empty file added mainproject/__init__.py
Empty file.
55 changes: 55 additions & 0 deletions mainproject/embedstore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Take the documents produced by pre_process.py and embed them into the Astra DB (Cassandra) vector store

import os
import pandas as pd
from dotenv import load_dotenv

from langchain_astradb import AstraDBVectorStore

from langchain_google_genai import GoogleGenerativeAIEmbeddings
# getting data
from pre_process import convert_to_doc

# Module-level configuration: everything below runs at import time.
# Presumably pulls values from a local .env file into the process
# environment before the os.getenv() reads below — confirm a .env is expected.
load_dotenv()

# Read here but not passed explicitly to the embeddings client below.
# NOTE(review): presumably the client picks the key up from the environment
# on its own — confirm.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Astra DB connection settings. Note the environment variable names
# (ASTRA_END_POINT / ASTRA_TOKEN / ASTRA_KEYSPACE) differ from the Python
# constant names. Each value is None when its variable is not set.
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_END_POINT")

ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_TOKEN")

ASTRA_DB_NAMESPACE = os.getenv("ASTRA_KEYSPACE")

# Shared embedding client used by embed_documents(); the model name is given
# explicitly rather than relying on a library default.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

def embed_documents(run_state):
    """Connect to the Astra DB vector store and optionally populate it.

    Args:
        run_state: Pass ``None`` to ingest the documents returned by
            ``convert_to_doc()`` into the collection. Pass any other value
            to skip ingestion and only obtain a handle to the store.

    Returns:
        When ``run_state`` is ``None``: a ``(vector_store, inserted_ids)``
        tuple, where ``inserted_ids`` is whatever ``add_documents`` returned.
        Otherwise: the ``vector_store`` alone. The two shapes differ, so
        callers must unpack according to the argument they passed.
    """
    # vector store creation
    vector_store = AstraDBVectorStore(
        collection_name="mykee_powerful_collection",
        embedding=embeddings,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        namespace=ASTRA_DB_NAMESPACE,
    )

    # Identity check: only a literal None requests ingestion. Using `is`
    # avoids being fooled by objects that define a custom __eq__.
    if run_state is not None:
        return vector_store

    docs = convert_to_doc()
    inserted_ids = vector_store.add_documents(docs)
    return vector_store, inserted_ids

# testing

if __name__ == "__main__":
    # Manual smoke test: ingest the documents, report how many were stored,
    # then run a sample similarity query and print the top matches.
    store, ids = embed_documents(None)
    print(len(ids))

    query = '''tell me what are all the earphones available in flipkart and what is the best
quality earphones among them'''
    hits = store.similarity_search(query, 3)
    for hit in hits:
        print(f"{hit.page_content} --- {hit.metadata}")

# Ran this as a test and got results. Always make sure to specify the embedding model when the Google API is used.

17 changes: 17 additions & 0 deletions mainproject/mykee_retrive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

import os

import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI

# embedstore calls load_dotenv() when imported, so it must be imported before
# the os.getenv() read below for values from a .env file to be visible.
from embedstore import embed_documents

# Configure the SDK first, then build the model once.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel('gemini-1.5-flash')

# Any non-None run_state makes embed_documents() skip ingestion and return
# only the vector store handle, so existing data is reused, not re-inserted.
vector_store = embed_documents("retrieve")

# Return at most one document, and only if its similarity score reaches 0.5.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.5},
)
# NOTE(review): sample query whose result is discarded; the query text and
# the {"source": "news"} filter look like placeholders — confirm against the
# metadata actually stored in the collection.
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})
Loading

0 comments on commit 3c44d50

Please sign in to comment.