-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
212 additions
and
0 deletions.
There are no files selected for viewing
40 changes: 40 additions & 0 deletions
40
...nized-with-indexes/0-exploring-the-role-of-langchain-indexers-and-retrievers/0-1-query.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import os | ||
from langchain_community.vectorstores import DeepLake | ||
from langchain_openai import OpenAIEmbeddings | ||
|
||
|
||
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002") | ||
|
||
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"] | ||
|
||
my_activeloop_dataset_name = "indexers-retrievers" | ||
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}" | ||
# db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings) | ||
db = DeepLake(dataset_path=dataset_path, embedding=embeddings) | ||
|
||
|
||
# create retriever from db | ||
retriever = db.as_retriever() | ||
|
||
# Once we have the retriever, we can start with question-answering. | ||
|
||
from langchain.chains import RetrievalQA | ||
# from langchain.llms import OpenAI | ||
from langchain_openai import OpenAI | ||
|
||
# create a retrieval chain | ||
qa_chain = RetrievalQA.from_chain_type( | ||
llm=OpenAI(model="gpt-3.5-turbo-instruct"), | ||
chain_type="stuff", | ||
retriever=retriever | ||
) | ||
|
||
|
||
# We can now query our documents about a specific topic that they cover. | ||
|
||
query = "How Google plans to challenge OpenAI?" | ||
# response = qa_chain.run(query) | ||
response = qa_chain.invoke(query) | ||
|
||
|
||
print(response) |
119 changes: 119 additions & 0 deletions
119
...loring-the-role-of-langchain-indexers-and-retrievers/0-load-text-file-create-and-query.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# from langchain.document_loaders import TextLoader | ||
from langchain_community.document_loaders import TextLoader | ||
|
||
# Here we use the TextLoader class to load a text file. | ||
# Remember to install the required packages with the following command: | ||
# pip install langchain langchain-community langchain-openai deeplake tiktoken | ||
|
||
|
||
# text to write to a local file | ||
# taken from https://www.theverge.com/2023/3/14/23639313/google-ai-language-model-palm-api-challenge-openai | ||
text = """Google opens up its AI language model PaLM to challenge OpenAI and GPT-3 | ||
Google is offering developers access to one of its most advanced AI language models: PaLM. | ||
The search giant is launching an API for PaLM alongside a number of AI enterprise tools | ||
it says will help businesses “generate text, images, code, videos, audio, and more from | ||
simple natural language prompts.” | ||
PaLM is a large language model, or LLM, similar to the GPT series created by OpenAI or | ||
Meta’s LLaMA family of models. Google first announced PaLM in April 2022. Like other LLMs, | ||
PaLM is a flexible system that can potentially carry out all sorts of text generation and | ||
editing tasks. You could train PaLM to be a conversational chatbot like ChatGPT, for | ||
example, or you could use it for tasks like summarizing text or even writing code. | ||
(It’s similar to features Google also announced today for its Workspace apps like Google | ||
Docs and Gmail.) | ||
""" | ||
|
||
# write text to local file | ||
with open("my_file.txt", "w") as file: | ||
file.write(text) | ||
|
||
# use TextLoader to load text from local file | ||
loader = TextLoader("my_file.txt") | ||
docs_from_file = loader.load() | ||
|
||
print(len(docs_from_file)) | ||
# 1 | ||
|
||
# | ||
# Then, we use CharacterTextSplitter to split the docs into texts. | ||
|
||
from langchain.text_splitter import CharacterTextSplitter | ||
|
||
# create a text splitter | ||
#text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20) | ||
|
||
text_splitter = CharacterTextSplitter(chunk_size=373, chunk_overlap=20) | ||
|
||
# split documents into chunks | ||
docs = text_splitter.split_documents(docs_from_file) | ||
|
||
print(len(docs)) | ||
# 2 | ||
|
||
# These embeddings allow us to effectively search for documents or portions of documents | ||
# that relate to our query by examining their semantic similarities. | ||
|
||
# from langchain.embeddings import OpenAIEmbeddings | ||
# from langchain_community.embeddings import OpenAIEmbeddings | ||
from langchain_openai import OpenAIEmbeddings | ||
|
||
# Before executing the following code, make sure to have | ||
# your OpenAI key saved in the “OPENAI_API_KEY” environment variable. | ||
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002") | ||
|
||
# Let’s create an instance of a Deep Lake dataset. | ||
|
||
#from langchain.vectorstores import DeepLake | ||
from langchain_community.vectorstores import DeepLake | ||
|
||
# Before executing the following code, make sure to have your | ||
# Activeloop key saved in the “ACTIVELOOP_TOKEN” environment variable. | ||
|
||
# create Deep Lake dataset | ||
# TODO: use your organization id here. (by default, org id is your username) | ||
import os | ||
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"] | ||
|
||
my_activeloop_dataset_name = "indexers-retrievers" | ||
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}" | ||
# db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings) | ||
db = DeepLake(dataset_path=dataset_path, embedding=embeddings) | ||
# add documents to our Deep Lake dataset | ||
db.add_documents(docs) | ||
|
||
|
||
# In this example, we are adding text documents to the dataset. | ||
# However, since Deep Lake is multimodal, we could also have added images to it, | ||
# specifying an image embedder model. This could be useful for searching images | ||
# according to a text query or using an image as a query (and thus looking for similar images). | ||
|
||
# As datasets become bigger, storing them in local memory becomes less manageable. | ||
# In this example, we could have also used a local vector store, as we are uploading only two documents. | ||
# However, in a typical production scenario, thousands or millions of documents | ||
# could be used and accessed from different programs, | ||
# thus having the need for a centralized cloud dataset. | ||
|
||
# Back to the code example of this lesson. Next, we create a retriever. | ||
|
||
# create retriever from db | ||
retriever = db.as_retriever() | ||
|
||
# Once we have the retriever, we can start with question-answering. | ||
|
||
from langchain.chains import RetrievalQA | ||
# from langchain.llms import OpenAI | ||
from langchain_openai import OpenAI | ||
|
||
# create a retrieval chain | ||
qa_chain = RetrievalQA.from_chain_type( | ||
llm=OpenAI(model="gpt-3.5-turbo-instruct"), | ||
chain_type="stuff", | ||
retriever=retriever | ||
) | ||
|
||
|
||
# We can now query our documents about a specific topic that they cover. | ||
|
||
query = "How Google plans to challenge OpenAI?" | ||
response = qa_chain.run(query) | ||
print(response) |
40 changes: 40 additions & 0 deletions
40
...ndexes/0-exploring-the-role-of-langchain-indexers-and-retrievers/1-document-compressor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from langchain.retrievers import ContextualCompressionRetriever | ||
from langchain.retrievers.document_compressors import LLMChainExtractor | ||
|
||
import os | ||
from langchain_community.vectorstores import DeepLake | ||
from langchain_openai import OpenAIEmbeddings | ||
from langchain_openai import OpenAI | ||
|
||
|
||
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002") | ||
|
||
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"] | ||
|
||
my_activeloop_dataset_name = "indexers-retrievers" | ||
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}" | ||
# db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings) | ||
db = DeepLake(dataset_path=dataset_path, embedding=embeddings) | ||
|
||
|
||
# create retriever from db | ||
retriever = db.as_retriever() | ||
|
||
# create GPT3 wrapper | ||
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0) | ||
|
||
# create compressor for the retriever | ||
compressor = LLMChainExtractor.from_llm(llm) | ||
compression_retriever = ContextualCompressionRetriever( | ||
base_compressor=compressor, | ||
base_retriever=retriever | ||
) | ||
|
||
|
||
# retrieving compressed documents | ||
retrieved_docs = compression_retriever.get_relevant_documents( | ||
"How Google plans to challenge OpenAI?" | ||
) | ||
print(retrieved_docs[0].page_content) | ||
|
||
|
13 changes: 13 additions & 0 deletions
13
...anized-with-indexes/0-exploring-the-role-of-langchain-indexers-and-retrievers/my_file.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Google opens up its AI language model PaLM to challenge OpenAI and GPT-3 | ||
Google is offering developers access to one of its most advanced AI language models: PaLM. | ||
The search giant is launching an API for PaLM alongside a number of AI enterprise tools | ||
it says will help businesses “generate text, images, code, videos, audio, and more from | ||
simple natural language prompts.” | ||
|
||
PaLM is a large language model, or LLM, similar to the GPT series created by OpenAI or | ||
Meta’s LLaMA family of models. Google first announced PaLM in April 2022. Like other LLMs, | ||
PaLM is a flexible system that can potentially carry out all sorts of text generation and | ||
editing tasks. You could train PaLM to be a conversational chatbot like ChatGPT, for | ||
example, or you could use it for tasks like summarizing text or even writing code. | ||
(It’s similar to features Google also announced today for its Workspace apps like Google | ||
Docs and Gmail.) |