From 522e3f11e4ba8daeb166ffe60735cbc346f3e6aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Spilka?=
Date: Sun, 4 Aug 2024 21:03:05 +0200
Subject: [PATCH] Feat: update code examples (#13)

* Update examples, use credentials from environ
---
 CHANGELOG.md                             |  5 ++
 poetry.lock                              | 50 ++++++------
 pyproject.toml                           |  2 +-
 .../examples/apify_actor_call.py         |  8 +-
 .../examples/crawl_and_process_data.py   | 80 ++++++++-----------
 .../examples/rag_with_crawled_website.py | 14 ++--
 6 files changed, 74 insertions(+), 85 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 615254d..9ba4f60 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## [0.1.3](https://github.com/apify/apify-haystack/releases/tag/0.1.3) (2024-08-04)
+
+🚀 Features
+- Update code examples in the documentation
+
 ## [0.1.2](https://github.com/apify/apify-haystack/releases/tag/0.1.2) (2024-08-02)
 
 🐛 Bug Fixes
diff --git a/poetry.lock b/poetry.lock
index 9371db9..7fca981 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -763,13 +763,13 @@ files = [
 
 [[package]]
 name = "openai"
-version = "1.37.2"
+version = "1.38.0"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-1.37.2-py3-none-any.whl", hash = "sha256:22dfcf48cd9ff1a6b1d8de09121261bc72315515a9faadad541be93a8bb1e961"},
-    {file = "openai-1.37.2.tar.gz", hash = "sha256:fd97cb235cfce0f7c902c64e58a5cd9fea19e7c180d9a4e2a2b8e533dbd9f424"},
+    {file = "openai-1.38.0-py3-none-any.whl", hash = "sha256:a19ef052f1676320f52183ae6f9775da6d888fbe3aec57886117163c095d9f7c"},
+    {file = "openai-1.38.0.tar.gz", hash = "sha256:30fb324bf452ecb1194ca7dbc64566a4d7aa054c6a5da857937ede7d517a220b"},
 ]
 
 [package.dependencies]
@@ -1236,29 +1236,29 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
 [[package]]
 name = "ruff"
-version = "0.5.5"
+version = "0.5.6"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.5.5-py3-none-linux_armv6l.whl", hash = "sha256:605d589ec35d1da9213a9d4d7e7a9c761d90bba78fc8790d1c5e65026c1b9eaf"},
-    {file = "ruff-0.5.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00817603822a3e42b80f7c3298c8269e09f889ee94640cd1fc7f9329788d7bf8"},
-    {file = "ruff-0.5.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:187a60f555e9f865a2ff2c6984b9afeffa7158ba6e1eab56cb830404c942b0f3"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe26fc46fa8c6e0ae3f47ddccfbb136253c831c3289bba044befe68f467bfb16"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4ad25dd9c5faac95c8e9efb13e15803cd8bbf7f4600645a60ffe17c73f60779b"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f70737c157d7edf749bcb952d13854e8f745cec695a01bdc6e29c29c288fc36e"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:cfd7de17cef6ab559e9f5ab859f0d3296393bc78f69030967ca4d87a541b97a0"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a09b43e02f76ac0145f86a08e045e2ea452066f7ba064fd6b0cdccb486f7c3e7"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0b856cb19c60cd40198be5d8d4b556228e3dcd545b4f423d1ad812bfdca5884"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3687d002f911e8a5faf977e619a034d159a8373514a587249cc00f211c67a091"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ac9dc814e510436e30d0ba535f435a7f3dc97f895f844f5b3f347ec8c228a523"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:af9bdf6c389b5add40d89b201425b531e0a5cceb3cfdcc69f04d3d531c6be74f"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d40a8533ed545390ef8315b8e25c4bb85739b90bd0f3fe1280a29ae364cc55d8"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:cab904683bf9e2ecbbe9ff235bfe056f0eba754d0168ad5407832928d579e7ab"},
-    {file = "ruff-0.5.5-py3-none-win32.whl", hash = "sha256:696f18463b47a94575db635ebb4c178188645636f05e934fdf361b74edf1bb2d"},
-    {file = "ruff-0.5.5-py3-none-win_amd64.whl", hash = "sha256:50f36d77f52d4c9c2f1361ccbfbd09099a1b2ea5d2b2222c586ab08885cf3445"},
-    {file = "ruff-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3191317d967af701f1b73a31ed5788795936e423b7acce82a2b63e26eb3e89d6"},
-    {file = "ruff-0.5.5.tar.gz", hash = "sha256:cc5516bdb4858d972fbc31d246bdb390eab8df1a26e2353be2dbc0c2d7f5421a"},
+    {file = "ruff-0.5.6-py3-none-linux_armv6l.whl", hash = "sha256:a0ef5930799a05522985b9cec8290b185952f3fcd86c1772c3bdbd732667fdcd"},
+    {file = "ruff-0.5.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b652dc14f6ef5d1552821e006f747802cc32d98d5509349e168f6bf0ee9f8f42"},
+    {file = "ruff-0.5.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80521b88d26a45e871f31e4b88938fd87db7011bb961d8afd2664982dfc3641a"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9bc8f328a9f1309ae80e4d392836e7dbc77303b38ed4a7112699e63d3b066ab"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d394940f61f7720ad371ddedf14722ee1d6250fd8d020f5ea5a86e7be217daf"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:111a99cdb02f69ddb2571e2756e017a1496c2c3a2aeefe7b988ddab38b416d36"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e395daba77a79f6dc0d07311f94cc0560375ca20c06f354c7c99af3bf4560c5d"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c476acb43c3c51e3c614a2e878ee1589655fa02dab19fe2db0423a06d6a5b1b6"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2ff8003f5252fd68425fd53d27c1f08b201d7ed714bb31a55c9ac1d4c13e2eb"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c94e084ba3eaa80c2172918c2ca2eb2230c3f15925f4ed8b6297260c6ef179ad"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1f77c1c3aa0669fb230b06fb24ffa3e879391a3ba3f15e3d633a752da5a3e670"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f908148c93c02873210a52cad75a6eda856b2cbb72250370ce3afef6fb99b1ed"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:563a7ae61ad284187d3071d9041c08019975693ff655438d8d4be26e492760bd"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:94fe60869bfbf0521e04fd62b74cbca21cbc5beb67cbb75ab33fe8c174f54414"},
+    {file = "ruff-0.5.6-py3-none-win32.whl", hash = "sha256:e6a584c1de6f8591c2570e171cc7ce482bb983d49c70ddf014393cd39e9dfaed"},
+    {file = "ruff-0.5.6-py3-none-win_amd64.whl", hash = "sha256:d7fe7dccb1a89dc66785d7aa0ac283b2269712d8ed19c63af908fdccca5ccc1a"},
+    {file = "ruff-0.5.6-py3-none-win_arm64.whl", hash = "sha256:57c6c0dd997b31b536bff49b9eee5ed3194d60605a4427f735eeb1f9c1b8d264"},
+    {file = "ruff-0.5.6.tar.gz", hash = "sha256:07c9e3c2a8e1fe377dd460371c3462671a728c981c3205a5217291422209f642"},
 ]
 
 [[package]]
@@ -1326,13 +1326,13 @@ files = [
 
 [[package]]
 name = "tqdm"
-version = "4.66.4"
+version = "4.66.5"
 description = "Fast, Extensible Progress Meter"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "tqdm-4.66.4-py3-none-any.whl", hash = "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644"},
-    {file = "tqdm-4.66.4.tar.gz", hash = "sha256:e4d936c9de8727928f3be6079590e97d9abfe8d39a590be678eb5919ffc186bb"},
+    {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"},
+    {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"},
 ]
 
 [package.dependencies]
diff --git a/pyproject.toml b/pyproject.toml
index 3f1652c..9e9b062 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "apify_haystack"
-version = "0.1.2"
+version = "0.1.3"
 description = "Apify-haystack integration"
 authors = ["Apify Technologies s.r.o. "]
 homepage = "https://apify.com"
diff --git a/src/apify_haystack/examples/apify_actor_call.py b/src/apify_haystack/examples/apify_actor_call.py
index 6a8475d..9661506 100644
--- a/src/apify_haystack/examples/apify_actor_call.py
+++ b/src/apify_haystack/examples/apify_actor_call.py
@@ -14,13 +14,14 @@
 .....
 """
 
-from dotenv import load_dotenv
+import os
+
 from haystack import Document
 
 from apify_haystack import ApifyDatasetFromActorCall
 
-# Set APIFY_API_TOKEN here or load it from .env file
-apify_api_token = "" or load_dotenv()
+# Set API keys here
+os.environ["APIFY_API_TOKEN"] = ""
 
 actor_id = "apify/website-content-crawler"
 run_input = {
@@ -37,7 +38,6 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
     actor_id=actor_id,
     run_input=run_input,
     dataset_mapping_function=dataset_mapping_function,
-    apify_api_token=str(apify_api_token),
 )
 print(f"Calling the Apify actor {actor_id} ... crawling will take some time ...")
 print("You can monitor the progress at: https://console.apify.com/actors/runs")
diff --git a/src/apify_haystack/examples/crawl_and_process_data.py b/src/apify_haystack/examples/crawl_and_process_data.py
index 8012d67..fc81327 100644
--- a/src/apify_haystack/examples/crawl_and_process_data.py
+++ b/src/apify_haystack/examples/crawl_and_process_data.py
@@ -2,7 +2,7 @@
 Crawl websites, scrape text content, and store it in the InMemoryDocumentStore.
 
 This script demonstrates how to extract content from a website using Apify's Website Content Crawler.
-The content is then cleaned, split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
+The content is split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
 
 After the pipeline is executed, the documents are retrieved from the document store using BM25 retrieval
 and vector similarity.
@@ -16,80 +16,68 @@
 'split_id': 0, 'split_idx_start': 0, '_split_overlap': ......
 .....
 """
-
 import os
 
-from dotenv import load_dotenv
 from haystack import Document, Pipeline
 from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
-from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
 from haystack.components.writers import DocumentWriter
 from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.utils.auth import Secret
 
 from apify_haystack import ApifyDatasetFromActorCall
 
-load_dotenv()
-
-# Set APIFY_API_TOKEN here or use it from .env file
-apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
-openai_api_key = "" or os.getenv("OPENAI_API_KEY")
-
-actor_id = "apify/website-content-crawler"
-run_input = {
-    "maxCrawlPages": 3,  # limit the number of pages to crawl
-    "startUrls": [{"url": "https://haystack.deepset.ai/"}],
-}
-
-
-def dataset_mapping_function(dataset_item: dict) -> Document:
-    return Document(content=dataset_item.get("text"), meta={"url": dataset_item.get("url")})
-
+os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
+os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"
 
 document_loader = ApifyDatasetFromActorCall(
-    actor_id=actor_id,
-    run_input=run_input,
-    dataset_mapping_function=dataset_mapping_function,
-    apify_api_token=apify_api_token,
+    actor_id="apify/website-content-crawler",
+    run_input={
+        "maxCrawlPages": 3,  # limit the number of pages to crawl
+        "startUrls": [{"url": "https://haystack.deepset.ai/"}],
+    },
+    dataset_mapping_function=lambda item: Document(content=item["text"] or "", meta={"url": item["url"]}),
 )
 
 document_store = InMemoryDocumentStore()
 print(f"Initialized InMemoryDocumentStore with {document_store.count_documents()} documents")
 
-document_cleaner = DocumentCleaner()
 document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50)
-document_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
+document_embedder = OpenAIDocumentEmbedder()
 document_writer = DocumentWriter(document_store)
 
 pipe = Pipeline()
 pipe.add_component("document_loader", document_loader)
-pipe.add_component("document_cleaner", document_cleaner)
 pipe.add_component("document_splitter", document_splitter)
 pipe.add_component("document_embedder", document_embedder)
 pipe.add_component("document_writer", document_writer)
 
-pipe.connect("document_loader", "document_cleaner")
-pipe.connect("document_cleaner", "document_splitter")
+pipe.connect("document_loader", "document_splitter")
 pipe.connect("document_splitter", "document_embedder")
 pipe.connect("document_embedder", "document_writer")
 
-print(
-    "Running pipeline the Apify document_loader -> document_cleaner -> document_splitter "
-    "-> document_embedder -> document_writer"
-)
-print("Crawling will take some time ...")
-print("You can visit https://console.apify.com/actors/runs to monitor the progress")
-
-pipe.run({"document_loader": {}})
+print("\nCrawling will take some time ...")
+print("You can visit https://console.apify.com/actors/runs to monitor the progress\n")
+pipe.run({})
 
 print(f"Added {document_store.count_documents()} to vector from Website Content Crawler")
 
-print("Retrieving documents from the document store using BM25")
-print("query='Haystack'")
-for doc in document_store.bm25_retrieval("Haystack", top_k=1):
-    print(doc)
+print("\n ### Retrieving documents from the document store using BM25 ###\n")
+print("query='Haystack'\n")
+
+bm25_retriever = InMemoryBM25Retriever(document_store)
+
+for doc in bm25_retriever.run("Haystack", top_k=1)["documents"]:
+    print(doc.content)
+
+print("\n ### Retrieving documents from the document store using vector similarity ###\n")
+retrieval_pipe = Pipeline()
+retrieval_pipe.add_component("embedder", OpenAITextEmbedder())
+retrieval_pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=1))
+
+retrieval_pipe.connect("embedder.embedding", "retriever.query_embedding")
+
+results = retrieval_pipe.run({"embedder": {"text": "What is Haystack?"}})
 
-print("Retrieving documents from the document store using vector similarity")
-print("query='What is Haystack'")
-for doc in document_store.embedding_retrieval(OpenAITextEmbedder().run("Haystack")["embedding"], top_k=1):
-    print(doc)
+for doc in results["retriever"]["documents"]:
+    print(doc.content)
diff --git a/src/apify_haystack/examples/rag_with_crawled_website.py b/src/apify_haystack/examples/rag_with_crawled_website.py
index c277b0e..a64820f 100644
--- a/src/apify_haystack/examples/rag_with_crawled_website.py
+++ b/src/apify_haystack/examples/rag_with_crawled_website.py
@@ -20,21 +20,18 @@
 
 import os
 
-from dotenv import load_dotenv
 from haystack import Document, Pipeline
 from haystack.components.builders import PromptBuilder
 from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
 from haystack.components.generators import OpenAIGenerator
 from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
 from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.utils.auth import Secret
 
 from apify_haystack import ApifyDatasetFromActorCall
 
-# Set APIFY_API_TOKEN here or use it from .env file
-load_dotenv()
-apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
-openai_api_key = "" or os.getenv("OPENAI_API_KEY")
+# Set API keys here
+os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
+os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"
 
 actor_id = "apify/website-content-crawler"
 run_input = {
@@ -51,15 +48,14 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
     actor_id=actor_id,
     run_input=run_input,
     dataset_mapping_function=dataset_mapping_function,
-    apify_api_token=apify_api_token,
 )
 
 # Components
 print("Initializing components...")
 document_store = InMemoryDocumentStore()
-docs_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
-text_embedder = OpenAITextEmbedder(api_key=Secret.from_token(openai_api_key))
+docs_embedder = OpenAIDocumentEmbedder()
+text_embedder = OpenAITextEmbedder()
 retriever = InMemoryEmbeddingRetriever(document_store)
 generator = OpenAIGenerator(model="gpt-3.5-turbo")