Feat: update code examples (#13)
* Update examples, use credentials from environ
jirispilka authored Aug 4, 2024
1 parent 12fb3fb commit 522e3f1
Showing 6 changed files with 74 additions and 85 deletions.
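
For context, the thrust of the change is that the example scripts stop loading credentials with python-dotenv and instead expect them as environment variables. The following minimal sketch is assembled from the diffs below and shows the pattern the updated examples follow; the placeholder token value and the one-page crawl limit are illustrative, not part of the commit.

import os

from haystack import Document

from apify_haystack import ApifyDatasetFromActorCall

# Supply the token via the environment (placeholder value for illustration);
# the OpenAI components in the other examples pick up OPENAI_API_KEY the same way.
os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"

# With the token in the environment, the loader no longer needs an explicit
# apify_api_token argument, which is why that parameter disappears from the diffs.
loader = ApifyDatasetFromActorCall(
    actor_id="apify/website-content-crawler",
    run_input={
        "maxCrawlPages": 1,  # illustrative limit; the examples below use 3
        "startUrls": [{"url": "https://haystack.deepset.ai/"}],
    },
    dataset_mapping_function=lambda item: Document(content=item.get("text"), meta={"url": item.get("url")}),
)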
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# Changelog

## [0.1.3](https://github.com/apify/apify-haystack/releases/tag/0.1.3) (2024-08-04)

🚀 Features
- Update code examples in the documentation

## [0.1.2](https://github.com/apify/apify-haystack/releases/tag/0.1.2) (2024-08-02)

🐛 Bug Fixes
50 changes: 25 additions & 25 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "apify_haystack"
version = "0.1.2"
version = "0.1.3"
description = "Apify-haystack integration"
authors = ["Apify Technologies s.r.o. <support@apify.com>"]
homepage = "https://apify.com"
8 changes: 4 additions & 4 deletions src/apify_haystack/examples/apify_actor_call.py
@@ -14,13 +14,14 @@
.....
"""

from dotenv import load_dotenv
import os

from haystack import Document

from apify_haystack import ApifyDatasetFromActorCall

# Set APIFY_API_TOKEN here or load it from .env file
apify_api_token = "" or load_dotenv()
# Set API keys here
os.environ["APIFY_API_TOKEN"] = ""

actor_id = "apify/website-content-crawler"
run_input = {
@@ -37,7 +38,6 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
actor_id=actor_id,
run_input=run_input,
dataset_mapping_function=dataset_mapping_function,
apify_api_token=str(apify_api_token),
)
print(f"Calling the Apify actor {actor_id} ... crawling will take some time ...")
print("You can monitor the progress at: https://console.apify.com/actors/runs")
80 changes: 34 additions & 46 deletions src/apify_haystack/examples/crawl_and_process_data.py
@@ -2,7 +2,7 @@
Crawl websites, scrape text content, and store it in the InMemoryDocumentStore.
This script demonstrates how to extract content from a website using Apify's Website Content Crawler.
The content is then cleaned, split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
The content is split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
After the pipeline is executed, the documents are retrieved from the document store
using BM25 retrieval and vector similarity.
@@ -16,80 +16,68 @@
'split_id': 0, 'split_idx_start': 0, '_split_overlap': ......
.....
"""

import os

from dotenv import load_dotenv
from haystack import Document, Pipeline
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.utils.auth import Secret

from apify_haystack import ApifyDatasetFromActorCall

load_dotenv()

# Set APIFY_API_TOKEN here or use it from .env file
apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
openai_api_key = "" or os.getenv("OPENAI_API_KEY")

actor_id = "apify/website-content-crawler"
run_input = {
"maxCrawlPages": 3, # limit the number of pages to crawl
"startUrls": [{"url": "https://haystack.deepset.ai/"}],
}


def dataset_mapping_function(dataset_item: dict) -> Document:
return Document(content=dataset_item.get("text"), meta={"url": dataset_item.get("url")})

os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"

document_loader = ApifyDatasetFromActorCall(
actor_id=actor_id,
run_input=run_input,
dataset_mapping_function=dataset_mapping_function,
apify_api_token=apify_api_token,
actor_id="apify/website-content-crawler",
run_input={
"maxCrawlPages": 3, # limit the number of pages to crawl
"startUrls": [{"url": "https://haystack.deepset.ai/"}],
},
dataset_mapping_function=lambda item: Document(content=item["text"] or "", meta={"url": item["url"]}),
)

document_store = InMemoryDocumentStore()
print(f"Initialized InMemoryDocumentStore with {document_store.count_documents()} documents")

document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50)
document_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
document_embedder = OpenAIDocumentEmbedder()
document_writer = DocumentWriter(document_store)

pipe = Pipeline()
pipe.add_component("document_loader", document_loader)
pipe.add_component("document_cleaner", document_cleaner)
pipe.add_component("document_splitter", document_splitter)
pipe.add_component("document_embedder", document_embedder)
pipe.add_component("document_writer", document_writer)

pipe.connect("document_loader", "document_cleaner")
pipe.connect("document_cleaner", "document_splitter")
pipe.connect("document_loader", "document_splitter")
pipe.connect("document_splitter", "document_embedder")
pipe.connect("document_embedder", "document_writer")

print(
"Running pipeline the Apify document_loader -> document_cleaner -> document_splitter "
"-> document_embedder -> document_writer"
)
print("Crawling will take some time ...")
print("You can visit https://console.apify.com/actors/runs to monitor the progress")

pipe.run({"document_loader": {}})
print("\nCrawling will take some time ...")
print("You can visit https://console.apify.com/actors/runs to monitor the progress\n")

pipe.run({})
print(f"Added {document_store.count_documents()} to vector from Website Content Crawler")

print("Retrieving documents from the document store using BM25")
print("query='Haystack'")
for doc in document_store.bm25_retrieval("Haystack", top_k=1):
print(doc)
print("\n ### Retrieving documents from the document store using BM25 ###\n")
print("query='Haystack'\n")

bm25_retriever = InMemoryBM25Retriever(document_store)

for doc in bm25_retriever.run("Haystack", top_k=1)["documents"]:
print(doc.content)

print("\n ### Retrieving documents from the document store using vector similarity ###\n")
retrieval_pipe = Pipeline()
retrieval_pipe.add_component("embedder", OpenAITextEmbedder())
retrieval_pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=1))

retrieval_pipe.connect("embedder.embedding", "retriever.query_embedding")

results = retrieval_pipe.run({"embedder": {"text": "What is Haystack?"}})

print("Retrieving documents from the document store using vector similarity")
print("query='What is Haystack'")
for doc in document_store.embedding_retrieval(OpenAITextEmbedder().run("Haystack")["embedding"], top_k=1):
print(doc)
for doc in results["retriever"]["documents"]:
print(doc.content)
14 changes: 5 additions & 9 deletions src/apify_haystack/examples/rag_with_crawled_website.py
@@ -20,21 +20,18 @@

import os

from dotenv import load_dotenv
from haystack import Document, Pipeline
from haystack.components.builders import PromptBuilder
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.utils.auth import Secret

from apify_haystack import ApifyDatasetFromActorCall

# Set APIFY_API_TOKEN here or use it from .env file
load_dotenv()
apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
openai_api_key = "" or os.getenv("OPENAI_API_KEY")
# Set API keys here
os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"

actor_id = "apify/website-content-crawler"
run_input = {
@@ -51,15 +48,14 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
actor_id=actor_id,
run_input=run_input,
dataset_mapping_function=dataset_mapping_function,
apify_api_token=apify_api_token,
)

# Components
print("Initializing components...")
document_store = InMemoryDocumentStore()

docs_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
text_embedder = OpenAITextEmbedder(api_key=Secret.from_token(openai_api_key))
docs_embedder = OpenAIDocumentEmbedder()
text_embedder = OpenAITextEmbedder()
retriever = InMemoryEmbeddingRetriever(document_store)
generator = OpenAIGenerator(model="gpt-3.5-turbo")

