feat: Add ability to specify actor inputs in the run() method (#14)

* Add ability to specify actor inputs in the run() method * Update examples.
apify · Sep 5, 2024 · 2ac9183 · 2ac9183
1 parent 522e3f1
commit 2ac9183
Show file tree

Hide file tree

Showing 8 changed files with 501 additions and 206 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## [0.1.4](https://github.com/apify/apify-haystack/releases/tag/0.1.4)  (2024-09-04)
+
+🚀 Features
+- Allow passing Actor inputs (`run_input`) to the `ApifyDatasetFromActorCall.run()` method
+
 ## [0.1.3](https://github.com/apify/apify-haystack/releases/tag/0.1.3)  (2024-08-04)
 
 🚀 Features

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -70,7 +70,7 @@ Important notes:
 
 - Ensure the version number in `pyproject.toml` is updated before creating a new release. If a stable version with the same version number already exists on PyPI, the publish process will fail.
 - The release process also fails if the version is not documented in `CHANGELOG.md`. Make sure to describe the changes in the new version there.
-- After a stable release, ensure to increment the version number in both `pyproject.toml` and `apify_haystack/__init__.py`.
+- After a stable release, make sure to increment the version number in `pyproject.toml`.
 
 ### Beta release checklist
 

diff --git a/README.md b/README.md
@@ -68,7 +68,8 @@ See other examples in the [examples directory](https://github.com/apify/apify-ha
 - Load a dataset from Apify and convert it to a Haystack Document
 - Call [Website Content Crawler](https://apify.com/apify/website-content-crawler) and convert the data into the Haystack Documents
 - Crawl websites, retrieve text content, and store it in the `InMemoryDocumentStore`
-- Retrieval-Augmented Generation (RAG): Extracting text from a website & question answering
+- Retrieval-Augmented Generation (RAG): Extracting text from a website & question answering <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/apify_haystack_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+- Analyze Your Instagram Comments’ Vibe with Apify and Haystack
 
 ## Support
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "apify_haystack"
-version = "0.1.3"
+version = "0.1.4"
 description = "Apify-haystack integration"
 authors = ["Apify Technologies s.r.o. <support@apify.com>"]
 homepage = "https://apify.com"
@@ -55,6 +55,7 @@ pytest = "~8.3.0"
 pytest-cov = "~5.0.0"
 ruff = "~0.5.0"
 setuptools = "~70.3.0"  # setuptools are used by pytest, but not explicitly required
+black = "^24.8.0"
 
 [tool.ruff]
 line-length = 120

diff --git a/src/apify_haystack/apify_dataset.py b/src/apify_haystack/apify_dataset.py
@@ -78,7 +78,7 @@ def __init__(
         self,
         actor_id: str,
         dataset_mapping_function: Callable[[dict], Document],
-        run_input: dict,
+        run_input: dict | None = None,
         *,
         build: str | None = None,
         memory_mbytes: int | None = None,
@@ -89,10 +89,12 @@ def __init__(
 
         Args:
             actor_id (str): The ID or name of the Actor on the Apify platform.
-            run_input (Dict): The input object of the Actor that you're trying to run.
             dataset_mapping_function (Callable): A function that takes a single
                 dictionary (an Apify dataset item) and converts it to an
                 instance of the Document class.
+            run_input (dict, optional): The input parameters for the Actor you want to run. This can be provided
+                either in the `run` method or during the class instantiation. If specified in both places,
+                the inputs will be merged.
             build (str, optional): Optionally specifies the actor build to run.
                 It can be either a build tag or build number.
             memory_mbytes (int, optional): Optional memory limit for the run, in megabytes.
@@ -118,16 +120,33 @@ def __init__(
         if httpx_client := getattr(self.client.http_client, "httpx_client", None):
             httpx_client.headers["user-agent"] += f"; {HAYSTACK_ATTRIBUTE_USER_AGENT}"
 
-    @component.output_types(documents=list[Document])
-    def run(self):  # type: ignore[no-untyped-def] # noqa: ANN201
+    @component.output_types(documents=list[Document])  # type: ignore[misc]
+    def run(self, run_input: dict | None = None):  # type: ignore[no-untyped-def] # noqa: ANN201
         """Run an Actor on the Apify platform and wait for results to be ready.
 
+        Args:
+            run_input (dict, optional): The input parameters for the Actor you want to run. This can be provided
+                either in the `run` method or during the class instantiation. If specified in both places,
+                the inputs will be merged.
+
+        Example:
+            .. code-block:: python
+
+                actor = ApifyDatasetFromActorCall(
+                    actor_id="apify/website-content-crawler",
+                    run_input={ "maxCrawlPages": 5 }
+                )
+                dataset = actor.run(run_input={ "startUrls": [{"url": "https://haystack.deepset.ai/"}] })
+
         Type-hint note: the output is not type-hinted, otherwise linting complains
         that `run` method is not haystack a component
         """
+        if not (run_input := _merge_inputs(run_input, self.run_input)):
+            raise ValueError("Please provide the run_input either in the constructor or in the run method.")
+
         if not (
             actor_call := self.client.actor(self.actor_id).call(
-                run_input=self.run_input,
+                run_input=run_input,
                 build=self.build,
                 memory_mbytes=self.memory_mbytes,
                 timeout_secs=self.timeout_secs,
@@ -156,7 +175,7 @@ def __init__(
         self,
         task_id: str,
         dataset_mapping_function: Callable[[dict], Document],
-        task_input: dict,
+        task_input: dict | None = None,
         *,
         build: str | None = None,
         memory_mbytes: int | None = None,
@@ -167,10 +186,12 @@ def __init__(
 
         Args:
             task_id (str): The ID or name of the Actor on the Apify platform.
-            task_input (Dict): The input object of the Actor that you're trying to run.
             dataset_mapping_function (Callable): A function that takes a single
                 dictionary (an Apify dataset item) and converts it to an
                 instance of the Document class.
+            task_input (dict, optional): The input parameters for the Actor you want to run. This can be provided
+                either in the `run` method or during the class instantiation. If specified in both places,
+                the inputs will be merged.
             build (str, optional): Optionally specifies the Actor build to run.
                 It can be either a build tag or build number.
             memory_mbytes (int, optional): Optional memory limit for the run, in megabytes.
@@ -196,13 +217,22 @@ def __init__(
         if httpx_client := getattr(self.client.http_client, "httpx_client", None):
             httpx_client.headers["user-agent"] += f"; {HAYSTACK_ATTRIBUTE_USER_AGENT}"
 
-    @component.output_types(documents=list[Document])
-    def run(self):  # type: ignore[no-untyped-def] # noqa: ANN201
+    @component.output_types(documents=list[Document])  # type: ignore[misc]
+    def run(self, task_input: dict | None = None):  # type: ignore[no-untyped-def] # noqa: ANN201
         """Run an Actor on the Apify platform and wait for results to be ready.
 
+        Args:
+            task_input (dict, optional): The input parameters for the Actor you want to run. This can be provided
+                either in the `run` method or during the class instantiation. If specified in both places,
+                the inputs will be merged.
+
         Type-hint note: the output is not type-hinted, otherwise linting complains
         that `run` method is not haystack a component
         """
+        task_input = task_input or self.task_input
+        if not task_input:
+            raise ValueError("Please provide the task_input either in the constructor or in the run method.")
+
         if not (
             actor_call := self.client.task(self.task_id).call(
                 task_input=self.task_input,
@@ -221,3 +251,9 @@ def run(self):  # type: ignore[no-untyped-def] # noqa: ANN201
             dataset_mapping_function=self.dataset_mapping_function,
         )
         return loader.run()
+
+
+def _merge_inputs(input1: dict | None, input2: dict | None) -> dict | None:
+    if input1 and input2:
+        return input1 | input2
+    return input1 or input2
diff --git a/src/apify_haystack/examples/crawl_and_process_data.py b/src/apify_haystack/examples/crawl_and_process_data.py
@@ -16,6 +16,7 @@
  'split_id': 0, 'split_idx_start': 0, '_split_overlap': ......
 .....
 """
+
 import os
 
 from haystack import Document, Pipeline

diff --git a/src/apify_haystack/examples/instagram_comments_analysis.py b/src/apify_haystack/examples/instagram_comments_analysis.py
@@ -0,0 +1,85 @@
+"""
+Analyze Your Instagram Comments' Vibe with Apify and Haystack
+
+This script demonstrates how to extract comments from an Instagram post using Apify's Instagram Scraper.
+The content is processed using the Haystack pipeline and the vibe of the comments is analyzed using the LLM model.
+
+URL = "https://www.instagram.com/p/C_a9jcRuJZZ/"  # @tiffintech on How to easily keep up with tech?
+
+Expected output:
+......
+Overall, the Instagram post seems to be generating positive energy and high engagement. The comments are filled with
+emojis like 😍, 😂, ❤️, 🙏🏿, and 🙌 which show excitement and enthusiasm .....
+.... Overall, the engagement patterns suggest that the post is vibrating with high energy and positivity.
+"""
+
+import os
+
+from haystack import Document, Pipeline
+from haystack.components.builders import PromptBuilder
+from haystack.components.generators import OpenAIGenerator
+from haystack.components.preprocessors import DocumentCleaner
+
+from apify_haystack import ApifyDatasetFromActorCall
+
+os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"  # placeholder; overwrites any value already in the environment
+os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"  # placeholder; overwrites any value already in the environment
+
+
+def dataset_mapping_function(dataset_item: dict) -> Document:
+    return Document(
+        content=dataset_item.get("text"),
+        meta={"ownerUsername": dataset_item.get("ownerUsername")},
+    )
+
+
+document_loader = ApifyDatasetFromActorCall(
+    actor_id="apify/instagram-comment-scraper",
+    run_input={"resultsLimit": 50},
+    dataset_mapping_function=dataset_mapping_function,
+)
+
+prompt = """
+Analyze these Instagram comments to determine if the post is generating positive energy, excitement,
+or high engagement. Focus on sentiment, emotional tone, and engagement patterns to conclude if
+the post is 'vibrating' with high energy. Be concise."
+
+Context:
+{% for document in documents %}
+    {{ document.content }}
+{% endfor %}
+
+Analysis:
+"""
+
+cleaner = DocumentCleaner(
+    remove_empty_lines=True,
+    remove_extra_whitespaces=True,
+    remove_repeated_substrings=True,
+)
+prompt_builder = PromptBuilder(template=prompt)
+generator = OpenAIGenerator(model="gpt-3.5-turbo")
+
+
+pipe = Pipeline()
+pipe.add_component("loader", document_loader)
+pipe.add_component("cleaner", cleaner)
+pipe.add_component("prompt_builder", prompt_builder)
+pipe.add_component("llm", generator)
+pipe.connect("loader", "cleaner")
+pipe.connect("cleaner", "prompt_builder")
+pipe.connect("prompt_builder", "llm")
+
+print("Vibe analysis:\n")
+
+url1 = "https://www.instagram.com/p/C_a9jcRuJZZ/"  # @tiffintech on How to easily keep up with tech?
+print(f"\nScraping Instagram comments for: {url1} and running analysis (it might take around 30-60 secs) ...\n")
+
+res = pipe.run({"loader": {"run_input": {"directUrls": [url1]}}})
+print("Analysis:", res.get("llm", {}).get("replies", ["No response"])[0])
+
+url2 = "https://www.instagram.com/p/C_RgBzogufK/"  # @maharishis on Affordable Care Act
+print(f"\nScraping Instagram comments for: {url2} and running analysis (it might take around 30-60 secs)... \n")
+
+res = pipe.run({"loader": {"run_input": {"directUrls": [url2]}}})
+print("Analysis:", res.get("llm", {}).get("replies", ["No response"])[0])