From 522e3f11e4ba8daeb166ffe60735cbc346f3e6aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Spilka?=
Date: Sun, 4 Aug 2024 21:03:05 +0200
Subject: [PATCH] Feat: update code examples (#13)

* Update examples, use credentials from environ
---
 CHANGELOG.md                             |  5 ++
 poetry.lock                              | 50 ++++++------
 pyproject.toml                           |  2 +-
 .../examples/apify_actor_call.py         |  8 +-
 .../examples/crawl_and_process_data.py   | 80 ++++++++-----------
 .../examples/rag_with_crawled_website.py | 14 ++--
 6 files changed, 74 insertions(+), 85 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 615254d..9ba4f60 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## [0.1.3](https://github.com/apify/apify-haystack/releases/tag/0.1.3) (2024-08-04)
+
+🚀 Features
+- Update code examples in the documentation
+
 ## [0.1.2](https://github.com/apify/apify-haystack/releases/tag/0.1.2) (2024-08-02)
 
 🐛 Bug Fixes
diff --git a/poetry.lock b/poetry.lock
index 9371db9..7fca981 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -763,13 +763,13 @@ files = [
 
 [[package]]
 name = "openai"
-version = "1.37.2"
+version = "1.38.0"
 description = "The official Python library for the openai API"
 optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-1.37.2-py3-none-any.whl", hash = "sha256:22dfcf48cd9ff1a6b1d8de09121261bc72315515a9faadad541be93a8bb1e961"},
-    {file = "openai-1.37.2.tar.gz", hash = "sha256:fd97cb235cfce0f7c902c64e58a5cd9fea19e7c180d9a4e2a2b8e533dbd9f424"},
+    {file = "openai-1.38.0-py3-none-any.whl", hash = "sha256:a19ef052f1676320f52183ae6f9775da6d888fbe3aec57886117163c095d9f7c"},
+    {file = "openai-1.38.0.tar.gz", hash = "sha256:30fb324bf452ecb1194ca7dbc64566a4d7aa054c6a5da857937ede7d517a220b"},
 ]
 
 [package.dependencies]
@@ -1236,29 +1236,29 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
 [[package]]
 name = "ruff"
-version = "0.5.5"
+version = "0.5.6"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.5.5-py3-none-linux_armv6l.whl", hash = "sha256:605d589ec35d1da9213a9d4d7e7a9c761d90bba78fc8790d1c5e65026c1b9eaf"},
-    {file = "ruff-0.5.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00817603822a3e42b80f7c3298c8269e09f889ee94640cd1fc7f9329788d7bf8"},
-    {file = "ruff-0.5.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:187a60f555e9f865a2ff2c6984b9afeffa7158ba6e1eab56cb830404c942b0f3"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe26fc46fa8c6e0ae3f47ddccfbb136253c831c3289bba044befe68f467bfb16"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4ad25dd9c5faac95c8e9efb13e15803cd8bbf7f4600645a60ffe17c73f60779b"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f70737c157d7edf749bcb952d13854e8f745cec695a01bdc6e29c29c288fc36e"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:cfd7de17cef6ab559e9f5ab859f0d3296393bc78f69030967ca4d87a541b97a0"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a09b43e02f76ac0145f86a08e045e2ea452066f7ba064fd6b0cdccb486f7c3e7"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0b856cb19c60cd40198be5d8d4b556228e3dcd545b4f423d1ad812bfdca5884"},
-    {file = "ruff-0.5.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3687d002f911e8a5faf977e619a034d159a8373514a587249cc00f211c67a091"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ac9dc814e510436e30d0ba535f435a7f3dc97f895f844f5b3f347ec8c228a523"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:af9bdf6c389b5add40d89b201425b531e0a5cceb3cfdcc69f04d3d531c6be74f"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d40a8533ed545390ef8315b8e25c4bb85739b90bd0f3fe1280a29ae364cc55d8"},
-    {file = "ruff-0.5.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:cab904683bf9e2ecbbe9ff235bfe056f0eba754d0168ad5407832928d579e7ab"},
-    {file = "ruff-0.5.5-py3-none-win32.whl", hash = "sha256:696f18463b47a94575db635ebb4c178188645636f05e934fdf361b74edf1bb2d"},
-    {file = "ruff-0.5.5-py3-none-win_amd64.whl", hash = "sha256:50f36d77f52d4c9c2f1361ccbfbd09099a1b2ea5d2b2222c586ab08885cf3445"},
-    {file = "ruff-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3191317d967af701f1b73a31ed5788795936e423b7acce82a2b63e26eb3e89d6"},
-    {file = "ruff-0.5.5.tar.gz", hash = "sha256:cc5516bdb4858d972fbc31d246bdb390eab8df1a26e2353be2dbc0c2d7f5421a"},
+    {file = "ruff-0.5.6-py3-none-linux_armv6l.whl", hash = "sha256:a0ef5930799a05522985b9cec8290b185952f3fcd86c1772c3bdbd732667fdcd"},
+    {file = "ruff-0.5.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b652dc14f6ef5d1552821e006f747802cc32d98d5509349e168f6bf0ee9f8f42"},
+    {file = "ruff-0.5.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80521b88d26a45e871f31e4b88938fd87db7011bb961d8afd2664982dfc3641a"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9bc8f328a9f1309ae80e4d392836e7dbc77303b38ed4a7112699e63d3b066ab"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d394940f61f7720ad371ddedf14722ee1d6250fd8d020f5ea5a86e7be217daf"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:111a99cdb02f69ddb2571e2756e017a1496c2c3a2aeefe7b988ddab38b416d36"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e395daba77a79f6dc0d07311f94cc0560375ca20c06f354c7c99af3bf4560c5d"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c476acb43c3c51e3c614a2e878ee1589655fa02dab19fe2db0423a06d6a5b1b6"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2ff8003f5252fd68425fd53d27c1f08b201d7ed714bb31a55c9ac1d4c13e2eb"},
+    {file = "ruff-0.5.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c94e084ba3eaa80c2172918c2ca2eb2230c3f15925f4ed8b6297260c6ef179ad"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1f77c1c3aa0669fb230b06fb24ffa3e879391a3ba3f15e3d633a752da5a3e670"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f908148c93c02873210a52cad75a6eda856b2cbb72250370ce3afef6fb99b1ed"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:563a7ae61ad284187d3071d9041c08019975693ff655438d8d4be26e492760bd"},
+    {file = "ruff-0.5.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:94fe60869bfbf0521e04fd62b74cbca21cbc5beb67cbb75ab33fe8c174f54414"},
+    {file = "ruff-0.5.6-py3-none-win32.whl", hash = "sha256:e6a584c1de6f8591c2570e171cc7ce482bb983d49c70ddf014393cd39e9dfaed"},
+    {file = "ruff-0.5.6-py3-none-win_amd64.whl", hash = "sha256:d7fe7dccb1a89dc66785d7aa0ac283b2269712d8ed19c63af908fdccca5ccc1a"},
+    {file = "ruff-0.5.6-py3-none-win_arm64.whl", hash = "sha256:57c6c0dd997b31b536bff49b9eee5ed3194d60605a4427f735eeb1f9c1b8d264"},
+    {file = "ruff-0.5.6.tar.gz", hash = "sha256:07c9e3c2a8e1fe377dd460371c3462671a728c981c3205a5217291422209f642"},
 ]
 
 [[package]]
@@ -1326,13 +1326,13 @@ files = [
 
 [[package]]
 name = "tqdm"
-version = "4.66.4"
+version = "4.66.5"
 description = "Fast, Extensible Progress Meter"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "tqdm-4.66.4-py3-none-any.whl", hash = "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644"},
-    {file = "tqdm-4.66.4.tar.gz", hash = "sha256:e4d936c9de8727928f3be6079590e97d9abfe8d39a590be678eb5919ffc186bb"},
+    {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"},
+    {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"},
 ]
 
 [package.dependencies]
diff --git a/pyproject.toml b/pyproject.toml
index 3f1652c..9e9b062 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "apify_haystack"
-version = "0.1.2"
+version = "0.1.3"
 description = "Apify-haystack integration"
 authors = ["Apify Technologies s.r.o. "]
 homepage = "https://apify.com"
diff --git a/src/apify_haystack/examples/apify_actor_call.py b/src/apify_haystack/examples/apify_actor_call.py
index 6a8475d..9661506 100644
--- a/src/apify_haystack/examples/apify_actor_call.py
+++ b/src/apify_haystack/examples/apify_actor_call.py
@@ -14,13 +14,14 @@
 .....
 """
 
-from dotenv import load_dotenv
+import os
+
 from haystack import Document
 
 from apify_haystack import ApifyDatasetFromActorCall
 
-# Set APIFY_API_TOKEN here or load it from .env file
-apify_api_token = "" or load_dotenv()
+# Set API keys here
+os.environ["APIFY_API_TOKEN"] = ""
 
 actor_id = "apify/website-content-crawler"
 run_input = {
@@ -37,7 +38,6 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
     actor_id=actor_id,
     run_input=run_input,
     dataset_mapping_function=dataset_mapping_function,
-    apify_api_token=str(apify_api_token),
 )
 print(f"Calling the Apify actor {actor_id} ... crawling will take some time ...")
 print("You can monitor the progress at: https://console.apify.com/actors/runs")
diff --git a/src/apify_haystack/examples/crawl_and_process_data.py b/src/apify_haystack/examples/crawl_and_process_data.py
index 8012d67..fc81327 100644
--- a/src/apify_haystack/examples/crawl_and_process_data.py
+++ b/src/apify_haystack/examples/crawl_and_process_data.py
@@ -2,7 +2,7 @@
 Crawl websites, scrape text content, and store it in the InMemoryDocumentStore.
 
 This script demonstrates how to extract content from a website using Apify's Website Content Crawler.
-The content is then cleaned, split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
+The content is split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
 
 After the pipeline is executed, the documents are retrieved from the document store using BM25 retrieval
 and vector similarity.
@@ -16,80 +16,68 @@
 'split_id': 0, 'split_idx_start': 0, '_split_overlap': ......
 .....
 """
-
 import os
 
-from dotenv import load_dotenv
 from haystack import Document, Pipeline
 from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
-from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
 from haystack.components.writers import DocumentWriter
 from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.utils.auth import Secret
 
 from apify_haystack import ApifyDatasetFromActorCall
 
-load_dotenv()
-
-# Set APIFY_API_TOKEN here or use it from .env file
-apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
-openai_api_key = "" or os.getenv("OPENAI_API_KEY")
-
-actor_id = "apify/website-content-crawler"
-run_input = {
-    "maxCrawlPages": 3,  # limit the number of pages to crawl
-    "startUrls": [{"url": "https://haystack.deepset.ai/"}],
-}
-
-
-def dataset_mapping_function(dataset_item: dict) -> Document:
-    return Document(content=dataset_item.get("text"), meta={"url": dataset_item.get("url")})
-
+os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
+os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"
 
 document_loader = ApifyDatasetFromActorCall(
-    actor_id=actor_id,
-    run_input=run_input,
-    dataset_mapping_function=dataset_mapping_function,
-    apify_api_token=apify_api_token,
+    actor_id="apify/website-content-crawler",
+    run_input={
+        "maxCrawlPages": 3,  # limit the number of pages to crawl
+        "startUrls": [{"url": "https://haystack.deepset.ai/"}],
+    },
+    dataset_mapping_function=lambda item: Document(content=item["text"] or "", meta={"url": item["url"]}),
 )
 
 document_store = InMemoryDocumentStore()
 print(f"Initialized InMemoryDocumentStore with {document_store.count_documents()} documents")
 
-document_cleaner = DocumentCleaner()
 document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50)
-document_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
+document_embedder = OpenAIDocumentEmbedder()
 document_writer = DocumentWriter(document_store)
 
 pipe = Pipeline()
 pipe.add_component("document_loader", document_loader)
-pipe.add_component("document_cleaner", document_cleaner)
 pipe.add_component("document_splitter", document_splitter)
 pipe.add_component("document_embedder", document_embedder)
 pipe.add_component("document_writer", document_writer)
 
-pipe.connect("document_loader", "document_cleaner")
-pipe.connect("document_cleaner", "document_splitter")
+pipe.connect("document_loader", "document_splitter")
 pipe.connect("document_splitter", "document_embedder")
 pipe.connect("document_embedder", "document_writer")
 
-print(
-    "Running pipeline the Apify document_loader -> document_cleaner -> document_splitter "
-    "-> document_embedder -> document_writer"
-)
-print("Crawling will take some time ...")
-print("You can visit https://console.apify.com/actors/runs to monitor the progress")
-
-pipe.run({"document_loader": {}})
+print("\nCrawling will take some time ...")
+print("You can visit https://console.apify.com/actors/runs to monitor the progress\n")
+pipe.run({})
 
 print(f"Added {document_store.count_documents()} to vector from Website Content Crawler")
 
-print("Retrieving documents from the document store using BM25")
-print("query='Haystack'")
-for doc in document_store.bm25_retrieval("Haystack", top_k=1):
-    print(doc)
+print("\n ### Retrieving documents from the document store using BM25 ###\n")
+print("query='Haystack'\n")
+
+bm25_retriever = InMemoryBM25Retriever(document_store)
+
+for doc in bm25_retriever.run("Haystack", top_k=1)["documents"]:
+    print(doc.content)
+
+print("\n ### Retrieving documents from the document store using vector similarity ###\n")
+retrieval_pipe = Pipeline()
+retrieval_pipe.add_component("embedder", OpenAITextEmbedder())
+retrieval_pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=1))
+
+retrieval_pipe.connect("embedder.embedding", "retriever.query_embedding")
+
+results = retrieval_pipe.run({"embedder": {"text": "What is Haystack?"}})
 
-print("Retrieving documents from the document store using vector similarity")
-print("query='What is Haystack'")
-for doc in document_store.embedding_retrieval(OpenAITextEmbedder().run("Haystack")["embedding"], top_k=1):
-    print(doc)
+for doc in results["retriever"]["documents"]:
+    print(doc.content)
diff --git a/src/apify_haystack/examples/rag_with_crawled_website.py b/src/apify_haystack/examples/rag_with_crawled_website.py
index c277b0e..a64820f 100644
--- a/src/apify_haystack/examples/rag_with_crawled_website.py
+++ b/src/apify_haystack/examples/rag_with_crawled_website.py
@@ -20,21 +20,18 @@
 
 import os
 
-from dotenv import load_dotenv
 from haystack import Document, Pipeline
 from haystack.components.builders import PromptBuilder
 from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
 from haystack.components.generators import OpenAIGenerator
 from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
 from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.utils.auth import Secret
 
 from apify_haystack import ApifyDatasetFromActorCall
 
-# Set APIFY_API_TOKEN here or use it from .env file
-load_dotenv()
-apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
-openai_api_key = "" or os.getenv("OPENAI_API_KEY")
+# Set API keys here
+os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
+os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"
 
 actor_id = "apify/website-content-crawler"
 run_input = {
@@ -51,15 +48,14 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
     actor_id=actor_id,
     run_input=run_input,
     dataset_mapping_function=dataset_mapping_function,
-    apify_api_token=apify_api_token,
 )
 
 # Components
 print("Initializing components...")
 document_store = InMemoryDocumentStore()
-docs_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
-text_embedder = OpenAITextEmbedder(api_key=Secret.from_token(openai_api_key))
+docs_embedder = OpenAIDocumentEmbedder()
+text_embedder = OpenAITextEmbedder()
 retriever = InMemoryEmbeddingRetriever(document_store)
 generator = OpenAIGenerator(model="gpt-3.5-turbo")