-
Notifications
You must be signed in to change notification settings - Fork 0
/
try_rag.py
75 lines (63 loc) · 2.37 KB
/
try_rag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# RAG pipeline trial
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from unstructured.cleaners.core import clean_extra_whitespace
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_qdrant import Qdrant
import os
from dotenv import load_dotenv
# Load environment variables from .env (POPPLER_PATH, TESSERACT_PATH,
# OLLAMA_EMBEDDING_MODEL).
_ = load_dotenv(dotenv_path=".env")
# Append the PDF/OCR tool locations to PATH so unstructured's "hi_res"
# strategy can find the poppler and tesseract binaries.
# os.pathsep is the platform's PATH separator (";" on Windows, ":" on
# POSIX) — the original hard-coded ";" and only worked on Windows.
os.environ["PATH"] = os.pathsep.join(
    [
        os.environ["PATH"],
        os.environ["POPPLER_PATH"],  # KeyError here means .env is missing the key
        os.environ["TESSERACT_PATH"],
    ]
)
# Character-count splitter: ~1000-char chunks with 200-char overlap so a
# sentence cut at a boundary still appears whole in the neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # max characters per chunk
    chunk_overlap=200,  # characters shared between adjacent chunks
    is_separator_regex=False,  # treat separators as literal strings
    length_function=len,  # measure size in raw characters, not tokens
)
# Cache the extracted PDF text in temp.txt so the expensive "hi_res"
# extraction (layout analysis + OCR) only runs once.
if not os.path.isfile("temp.txt"):
    loader = UnstructuredFileLoader(
        "./Docs/input/Designing Creative AI Partners with COFI A Framework for Modeling Interaction in Human-AI Co-Creative Systems.pdf",
        # mode="elements",
        strategy="hi_res",  # needs poppler + tesseract on PATH
        post_process=[clean_extra_whitespace],
    )
    part = loader.load()  # list of Document
    text = part[0].page_content
    # Write the cache only AFTER a successful load: the original opened
    # temp.txt for writing first, so a failed extraction left an empty
    # file that made every later run chunk an empty string.
    with open("temp.txt", "w", encoding="utf-8") as f:
        f.write(text)
else:
    with open("temp.txt", "r", encoding="utf-8") as f:
        text = f.read()  # str
chunks = text_splitter.create_documents([text])  # Chunks ready for RAG | List of Documents
# Embeddings served by a local Ollama instance.
ollama_emb = OllamaEmbeddings(
    model=os.environ["OLLAMA_EMBEDDING_MODEL"],
    # NOTE(review): confirm OllamaEmbeddings accepts model_kwargs — it is
    # not an obvious field on the langchain_community class.
    model_kwargs={"keep_alive": 1},
)
# embeddings = ollama_emb.embed_documents(chunks[:3])
# Persistent byte store for cached embeddings. Kept in its OWN directory:
# the original pointed this at ./embeddings/Qdrant, the same path Qdrant
# uses for its on-disk database, so the two layouts would clobber each other.
store = LocalFileStore("./embeddings/cache")
cache_backed_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=ollama_emb,
    document_embedding_cache=store,
    namespace=ollama_emb.model,  # per-model namespace so models never share cache entries
)
# CacheBackedEmbeddings.embed_documents expects list[str] — that is why
# calling it on Document objects failed (original L134 comment). But
# Qdrant.from_documents extracts page_content strings itself before
# embedding, so the cache-backed embedder works here; the original built
# the cache and then never used it (passed raw ollama_emb instead).
print("Making cache")
qdrant = Qdrant.from_documents(
    documents=chunks,
    embedding=cache_backed_embeddings,  # reuse cached vectors on repeated runs
    path="./embeddings/Qdrant",  # local on-disk Qdrant storage
    collection_name="CoFi",
)
print("Cache made")

# Sanity-check retrieval with a plain similarity search.
query = "Author of the COFI paper"
found_docs = qdrant.similarity_search(query)
for doc in found_docs:
    print(doc.page_content)