SQLite Support for Llama (#50)
* initial test for SQL embedding store

* feat: Adding SQLite database support for local model

* minor correction

---------

Co-authored-by: Charles Xu <charlesxuty@berkeley.edu>
Char15Xu and Charles Xu authored Jul 15, 2024
1 parent 5090f4c commit ea2b52e
Showing 5 changed files with 69 additions and 28 deletions.
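As context for the diffs below, here is a minimal sketch of the lookup this commit introduces, assuming the sqlite-vss extension is loaded on the connection and the 1024-dim embeddings virtual table from llama_seletor.py already exists. search_top_k and query_dense are illustrative names, not code from the commit.

import json
import sqlite3

def search_top_k(db: sqlite3.Connection, query_dense: list, k: int = 3):
    # Nearest-neighbour search against the vss0 virtual table created in
    # llama_seletor.py (Flat,IDMap2 factory, inner-product metric).
    # sqlite-vss takes the query vector as a JSON array; LIMIT sets the
    # number of neighbours returned.
    cur = db.cursor()
    cur.execute(
        f"""
        SELECT rowid, distance
        FROM embeddings
        WHERE vss_search(embedding, ?)
        LIMIT {int(k)};
        """,
        (json.dumps(query_dense),),
    )
    return cur.fetchall()  # [(rowid, distance), ...], best match first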
91 changes: 66 additions & 25 deletions ai_course_bot/ai-chatbot-backend/app/core/actions/llama_seletor.py
@@ -11,8 +11,12 @@
from pydantic import BaseModel
import threading
import urllib.parse
import sqlite3
import json
from app.embedding.table_create import execute_all, connect, insert


# Flag that toggles the SQLite (sqlite-vss) retrieval path; False keeps the in-memory BGE scoring path
SQLDB = False

class Message(BaseModel):
role: str
@@ -120,43 +124,80 @@ def local_selector(messages:List[Message],stream=True,rag=True,course=None):
with open(path_to_pickle, 'rb') as f:
data_loaded = pickle.load(f)
doc_list = data_loaded['doc_list']
embedding_list = data_loaded['embedding_list']
id_list = data_loaded['id_list']
url_list = data_loaded['url_list']
# time_list = data_loaded['time_list']

query_embed = embedding_model.encode(user_message, return_dense=True, return_sparse=True,
return_colbert_vecs=True)
# model
# cosine_similarities = np.dot(embedding_list, query_embed)
cosine_similarities = np.array(bge_compute_score(query_embed, embedding_list, [1, 1, 1], None, None)['colbert+sparse+dense'])
indices = np.argsort(cosine_similarities)[::-1]
id = id_list[indices]
docs = doc_list[indices]
url = url_list[indices]
# time = time_list[indices]
top_docs=docs[:3]

distances = np.sort(cosine_similarities)[-3:][::-1]
top_id = id[:3]
top_url = url[:3]
# top_url= [f"https://www.youtube.com/watch?v={i}" for i in range(1,4)]
# top_time = time[:3]
query_embed = embedding_model.encode(user_message, return_dense=True, return_sparse=True,
return_colbert_vecs=True)
if SQLDB:
db = connect('embeddings.db')
cur = db.cursor()
cur.execute('DROP TABLE IF EXISTS embeddings;')
cur.execute('CREATE VIRTUAL TABLE embeddings USING vss0(embedding(1024) factory="Flat,IDMap2" metric_type=INNER_PRODUCT);')
embedding_list = data_loaded['embedding_list']
denses = [embedding['dense_vecs'].tolist() for embedding in embedding_list]
insert(cur, denses)
db.commit()
query_vector = query_embed['dense_vecs'].tolist()
query_vector_json = json.dumps(query_vector)
cur.execute("""
SELECT
rowid,
distance
FROM embeddings
WHERE vss_search(
embedding,
?
)
LIMIT 3;
""", (query_vector_json,))
results = cur.fetchall()
top_indices = [result[0] for result in results]
top_ids = id_list[top_indices]
distances = [result[1] for result in results]
print("top_ids:", top_ids)
print("distances:", distances)
id_doc_url_dic = {id_: (doc, url) for id_, doc, url in zip(id_list, doc_list, url_list)}
top_docs_urls = [id_doc_url_dic[top_id] for top_id in top_ids]
top_docs, top_urls = zip(*top_docs_urls)
print("top_docs:", top_docs, "top_urls:", top_urls)
else:
embedding_list = data_loaded['embedding_list']
# model
# cosine_similarities = np.dot(embedding_list, query_embed)
cosine_similarities = np.array(bge_compute_score(query_embed, embedding_list, [1, 1, 1], None, None)['colbert+sparse+dense'])
indices = np.argsort(cosine_similarities)[::-1]
id = id_list[indices]
docs = doc_list[indices]
url = url_list[indices]
print("indices:", indices)
print("id:", id)
print("docs:", docs)
# time = time_list[indices]
top_docs=docs[:3]
distances = np.sort(cosine_similarities)[-3:][::-1]
top_ids = id[:3]
top_urls = url[:3]
print("top_ids:", top_ids)
print("distances:", distances)
# top_url= [f"https://www.youtube.com/watch?v={i}" for i in range(1,4)]
# top_time = time[:3]

insert_document = ""
reference = []
n=0
none=0
for i in range(len(top_docs)):
if top_url[i]:
reference.append(f"{top_url[i]}")
if top_urls[i]:
reference.append(f"{top_urls[i]}")
else:
reference.append("")
if distances[i] > 0.45:
n+=1
if top_url[i]:
insert_document += f"\"\"\"Reference Number: {n}\nReference Info Path: {top_id[i]}\nReference_Url: {top_url[i]}\nDocument: {top_docs[i]}\"\"\"\n\n"
if top_urls[i]:
insert_document += f"\"\"\"Reference Number: {n}\nReference Info Path: {top_ids[i]}\nReference_Url: {top_urls[i]}\nDocument: {top_docs[i]}\"\"\"\n\n"
else:
cleaned_path = clean_path(top_id[i])
cleaned_path = clean_path(top_ids[i])
insert_document += f"\"\"\"Reference Number: {n}\nReference Info Path: {cleaned_path}\nReference_Url: NONE\nDocument: {top_docs[i]}\"\"\"\n\n"
# print("CLEANED PATH",cleaned_path)
else:
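The SQLDB branch above rebuilds the index on every request: drop the virtual table, re-insert the pickled dense vectors, then run the nearest-neighbour query. A hedged end-to-end sketch of that flow follows; the insert helper from table_create is not shown in this diff, so the explicit-rowid loop is an assumption, based on the selector using the returned rowids directly as positions into id_list.

import json

def rebuild_and_search(db, embedding_list, query_embed, k=3):
    cur = db.cursor()
    cur.execute("DROP TABLE IF EXISTS embeddings;")
    cur.execute(
        'CREATE VIRTUAL TABLE embeddings USING vss0('
        'embedding(1024) factory="Flat,IDMap2" metric_type=INNER_PRODUCT);'
    )
    # Only the dense vectors are indexed; the non-SQL fallback instead scores
    # colbert+sparse+dense with equal weights via bge_compute_score.
    for i, emb in enumerate(embedding_list):
        cur.execute(
            "INSERT INTO embeddings(rowid, embedding) VALUES (?, ?);",
            (i, json.dumps(emb["dense_vecs"].tolist())),  # 0-based rowids assumed
        )
    db.commit()
    cur.execute(
        f"""
        SELECT rowid, distance
        FROM embeddings
        WHERE vss_search(embedding, ?)
        LIMIT {int(k)};
        """,
        (json.dumps(query_embed["dense_vecs"].tolist()),),
    )
    return cur.fetchall()

Rebuilding per query keeps the index in sync with the pickle at the cost of a full insert pass per request; persisting the populated table, presumably what the committed embeddings.db binaries are for, would amortise that.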
Binary file modified ai_course_bot/ai-chatbot-backend/app/embedding/embeddings.db
6 changes: 3 additions & 3 deletions ai_course_bot/ai-chatbot-backend/app/embedding/table_create.py
@@ -6,8 +6,8 @@


# Paths to your vector and vss extensions
EXT_VECTOR_PATH = "./dist/debug/vector0"
EXT_VSS_PATH = "./dist/debug/vss0"
EXT_VECTOR_PATH = "ai_course_bot/ai-chatbot-backend/app/embedding/dist/debug/vector0.dylib"
EXT_VSS_PATH = "ai_course_bot/ai-chatbot-backend/app/embedding/dist/debug/vss0.dylib"
BGE = True
# Connect to the SQLite database and load extensions
def connect(path=":memory:"):
@@ -34,7 +34,7 @@ def execute_all(cursor, sql, args=None):
return list(map(lambda x: dict(x), results))

# Load data from the pickle file
with open('recursive_seperate_none_BGE_embedding_400_106_full.pkl', 'rb') as f:
with open('ai_course_bot/ai-chatbot-backend/app/embedding/recursive_seperate_none_BGE_embedding_400_106_full.pkl', 'rb') as f:
data_to_store = pickle.load(f)


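The diff only shows connect()'s signature plus the repointed extension paths, so the body below is a sketch of the standard sqlite-vss loading pattern such a connect() would use, not the committed implementation; the dict(x) call in execute_all suggests sqlite3.Row as the row factory. Note the new paths are relative to the repository root, so they still depend on the process working directory, and the explicit .dylib suffix pins the build to macOS artifacts (SQLite would otherwise try the platform suffix itself).

import sqlite3

EXT_VECTOR_PATH = "ai_course_bot/ai-chatbot-backend/app/embedding/dist/debug/vector0.dylib"
EXT_VSS_PATH = "ai_course_bot/ai-chatbot-backend/app/embedding/dist/debug/vss0.dylib"

def connect(path: str = ":memory:") -> sqlite3.Connection:
    db = sqlite3.connect(path)
    db.enable_load_extension(True)
    # vector0 must load before vss0, which depends on it.
    db.load_extension(EXT_VECTOR_PATH)
    db.load_extension(EXT_VSS_PATH)
    db.enable_load_extension(False)
    db.row_factory = sqlite3.Row  # lets execute_all() build dicts from rows
    return db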
Binary file added ai_course_bot/ai-chatbot-backend/embeddings.db
Empty file.
