Feat: update code examples (#13)
* Update examples, use credentials from environ
jirispilka authored Aug 4, 2024
1 parent 12fb3fb commit 522e3f1
Showing 6 changed files with 74 additions and 85 deletions.
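
For context, the thrust of the change is that the example scripts stop loading credentials with python-dotenv and instead expect them as environment variables. The following minimal sketch is assembled from the diffs below and shows the pattern the updated examples follow; the placeholder token value and the one-page crawl limit are illustrative, not part of the commit.

import os

from haystack import Document

from apify_haystack import ApifyDatasetFromActorCall

# Supply the token via the environment (placeholder value for illustration);
# the OpenAI components in the other examples pick up OPENAI_API_KEY the same way.
os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"

# With the token in the environment, the loader no longer needs an explicit
# apify_api_token argument, which is why that parameter disappears from the diffs.
loader = ApifyDatasetFromActorCall(
    actor_id="apify/website-content-crawler",
    run_input={
        "maxCrawlPages": 1,  # illustrative limit; the examples below use 3
        "startUrls": [{"url": "https://haystack.deepset.ai/"}],
    },
    dataset_mapping_function=lambda item: Document(content=item.get("text"), meta={"url": item.get("url")}),
)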
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
# Changelog

## [0.1.3](https://github.com/apify/apify-haystack/releases/tag/0.1.3) (2024-08-04)

🚀 Features
- Update code examples in the documentation

## [0.1.2](https://github.com/apify/apify-haystack/releases/tag/0.1.2) (2024-08-02)

🐛 Bug Fixes
50 changes: 25 additions & 25 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "apify_haystack"
version = "0.1.2"
version = "0.1.3"
description = "Apify-haystack integration"
authors = ["Apify Technologies s.r.o. <support@apify.com>"]
homepage = "https://apify.com"
8 changes: 4 additions & 4 deletions src/apify_haystack/examples/apify_actor_call.py
@@ -14,13 +14,14 @@
.....
"""

from dotenv import load_dotenv
import os

from haystack import Document

from apify_haystack import ApifyDatasetFromActorCall

# Set APIFY_API_TOKEN here or load it from .env file
apify_api_token = "" or load_dotenv()
# Set API keys here
os.environ["APIFY_API_TOKEN"] = ""

actor_id = "apify/website-content-crawler"
run_input = {
@@ -37,7 +38,6 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
actor_id=actor_id,
run_input=run_input,
dataset_mapping_function=dataset_mapping_function,
apify_api_token=str(apify_api_token),
)
print(f"Calling the Apify actor {actor_id} ... crawling will take some time ...")
print("You can monitor the progress at: https://console.apify.com/actors/runs")
80 changes: 34 additions & 46 deletions src/apify_haystack/examples/crawl_and_process_data.py
@@ -2,7 +2,7 @@
Crawl websites, scrape text content, and store it in the InMemoryDocumentStore.
This script demonstrates how to extract content from a website using Apify's Website Content Crawler.
The content is then cleaned, split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
The content is split into smaller chunks, embedded, and stored in the InMemoryDocumentStore.
After the pipeline is executed, the documents are retrieved from the document store
using BM25 retrieval and vector similarity.
@@ -16,80 +16,68 @@
'split_id': 0, 'split_idx_start': 0, '_split_overlap': ......
.....
"""

import os

from dotenv import load_dotenv
from haystack import Document, Pipeline
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.utils.auth import Secret

from apify_haystack import ApifyDatasetFromActorCall

load_dotenv()

# Set APIFY_API_TOKEN here or use it from .env file
apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
openai_api_key = "" or os.getenv("OPENAI_API_KEY")

actor_id = "apify/website-content-crawler"
run_input = {
"maxCrawlPages": 3, # limit the number of pages to crawl
"startUrls": [{"url": "https://haystack.deepset.ai/"}],
}


def dataset_mapping_function(dataset_item: dict) -> Document:
return Document(content=dataset_item.get("text"), meta={"url": dataset_item.get("url")})

os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"

document_loader = ApifyDatasetFromActorCall(
actor_id=actor_id,
run_input=run_input,
dataset_mapping_function=dataset_mapping_function,
apify_api_token=apify_api_token,
actor_id="apify/website-content-crawler",
run_input={
"maxCrawlPages": 3, # limit the number of pages to crawl
"startUrls": [{"url": "https://haystack.deepset.ai/"}],
},
dataset_mapping_function=lambda item: Document(content=item["text"] or "", meta={"url": item["url"]}),
)

document_store = InMemoryDocumentStore()
print(f"Initialized InMemoryDocumentStore with {document_store.count_documents()} documents")

document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50)
document_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
document_embedder = OpenAIDocumentEmbedder()
document_writer = DocumentWriter(document_store)

pipe = Pipeline()
pipe.add_component("document_loader", document_loader)
pipe.add_component("document_cleaner", document_cleaner)
pipe.add_component("document_splitter", document_splitter)
pipe.add_component("document_embedder", document_embedder)
pipe.add_component("document_writer", document_writer)

pipe.connect("document_loader", "document_cleaner")
pipe.connect("document_cleaner", "document_splitter")
pipe.connect("document_loader", "document_splitter")
pipe.connect("document_splitter", "document_embedder")
pipe.connect("document_embedder", "document_writer")

print(
"Running pipeline the Apify document_loader -> document_cleaner -> document_splitter "
"-> document_embedder -> document_writer"
)
print("Crawling will take some time ...")
print("You can visit https://console.apify.com/actors/runs to monitor the progress")

pipe.run({"document_loader": {}})
print("\nCrawling will take some time ...")
print("You can visit https://console.apify.com/actors/runs to monitor the progress\n")

pipe.run({})
print(f"Added {document_store.count_documents()} to vector from Website Content Crawler")

print("Retrieving documents from the document store using BM25")
print("query='Haystack'")
for doc in document_store.bm25_retrieval("Haystack", top_k=1):
print(doc)
print("\n ### Retrieving documents from the document store using BM25 ###\n")
print("query='Haystack'\n")

bm25_retriever = InMemoryBM25Retriever(document_store)

for doc in bm25_retriever.run("Haystack", top_k=1)["documents"]:
print(doc.content)

print("\n ### Retrieving documents from the document store using vector similarity ###\n")
retrieval_pipe = Pipeline()
retrieval_pipe.add_component("embedder", OpenAITextEmbedder())
retrieval_pipe.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=1))

retrieval_pipe.connect("embedder.embedding", "retriever.query_embedding")

results = retrieval_pipe.run({"embedder": {"text": "What is Haystack?"}})

print("Retrieving documents from the document store using vector similarity")
print("query='What is Haystack'")
for doc in document_store.embedding_retrieval(OpenAITextEmbedder().run("Haystack")["embedding"], top_k=1):
print(doc)
for doc in results["retriever"]["documents"]:
print(doc.content)
14 changes: 5 additions & 9 deletions src/apify_haystack/examples/rag_with_crawled_website.py
@@ -20,21 +20,18 @@

import os

from dotenv import load_dotenv
from haystack import Document, Pipeline
from haystack.components.builders import PromptBuilder
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.utils.auth import Secret

from apify_haystack import ApifyDatasetFromActorCall

# Set APIFY_API_TOKEN here or use it from .env file
load_dotenv()
apify_api_token = "" or os.getenv("APIFY_API_TOKEN")
openai_api_key = "" or os.getenv("OPENAI_API_KEY")
# Set API keys here
os.environ["APIFY_API_TOKEN"] = "YOUR-APIFY-API-TOKEN"
os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"

actor_id = "apify/website-content-crawler"
run_input = {
@@ -51,15 +48,14 @@ def dataset_mapping_function(dataset_item: dict) -> Document:
actor_id=actor_id,
run_input=run_input,
dataset_mapping_function=dataset_mapping_function,
apify_api_token=apify_api_token,
)

# Components
print("Initializing components...")
document_store = InMemoryDocumentStore()

docs_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(openai_api_key))
text_embedder = OpenAITextEmbedder(api_key=Secret.from_token(openai_api_key))
docs_embedder = OpenAIDocumentEmbedder()
text_embedder = OpenAITextEmbedder()
retriever = InMemoryEmbeddingRetriever(document_store)
generator = OpenAIGenerator(model="gpt-3.5-turbo")

