-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Preprocessed the data and added it to the vector store
- Loading branch information
1 parent
799fcd1
commit 3c44d50
Showing
7 changed files
with
2,740 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Take the data from the documents in pre_process.py and embed it into cassandra database | ||
|
||
import os | ||
import pandas as pd | ||
from dotenv import load_dotenv | ||
|
||
from langchain_astradb import AstraDBVectorStore | ||
|
||
from langchain_google_genai import GoogleGenerativeAIEmbeddings | ||
# getting data | ||
from pre_process import convert_to_doc | ||
|
||
# Load variables from a .env file (if present) before reading the settings below.
load_dotenv()

# Credentials and connection settings; each is None when its variable is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
ASTRA_DB_API_ENDPOINT = os.environ.get("ASTRA_END_POINT")
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_TOKEN")
ASTRA_DB_NAMESPACE = os.environ.get("ASTRA_KEYSPACE")

# Embedding model handed to the vector store in embed_documents().
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
||
def embed_documents(run_state):
    """Open the Astra DB collection and, on request, load the documents into it.

    Args:
        run_state: Pass ``None`` to ingest the documents produced by
            ``convert_to_doc()``. Any other value skips ingestion and only
            connects to the collection.

    Returns:
        ``(vector_store, inserted_ids)`` when ``run_state`` is ``None``, where
        ``inserted_ids`` is the value returned by
        ``vector_store.add_documents``; otherwise just ``vector_store``.
        NOTE(review): the two return shapes differ, so callers must unpack
        only when they passed ``None``.
    """
    vector_store = AstraDBVectorStore(
        collection_name="mykee_powerful_collection",
        embedding=embeddings,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        namespace=ASTRA_DB_NAMESPACE,
    )

    # Identity check: only the None singleton triggers ingestion.
    if run_state is not None:
        return vector_store

    docs = convert_to_doc()
    inserted_ids = vector_store.add_documents(docs)
    return vector_store, inserted_ids
|
||
# testing | ||
|
||
if __name__ == "__main__":
    # Manual smoke test: ingest the documents, then run one similarity query.
    store, ids = embed_documents(None)
    print(len(ids))

    # The query text spans two lines, so it contains an embedded newline.
    query = '''tell me what are all the earphones available in flipkart and what is the best
quality earphones among them'''
    # Fetch the 3 closest matches and show their text and metadata.
    for hit in store.similarity_search(query, 3):
        print(f"{hit.page_content} --- {hit.metadata}")
|
||
# Ran it for testing -- got results. Always make sure to specify the embedding model when the Google API is used. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
|
||
import os

import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI

# Configure the SDK with the API key before creating any model object.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel('gemini-1.5-flash')

# NOTE(review): `vector_store` is not defined or imported anywhere in this
# file; it must be supplied (e.g. by the embedding module's
# embed_documents()) before this script can run -- confirm the intended source.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.5},
)
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})
Oops, something went wrong.