From 9cef1a4d208252f355caa9412b882a54e68f4a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Trunk=C3=A1t?= Date: Fri, 28 Jul 2023 11:20:11 +0200 Subject: [PATCH] Documentating and adding a manifest.json --- templates/js-langchain/README.md | 25 +++++++++++++++++-------- templates/js-langchain/src/main.js | 7 +++---- templates/manifest.json | 21 +++++++++++++++++++++ 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/templates/js-langchain/README.md b/templates/js-langchain/README.md index e4fd96b5..8ec82f88 100644 --- a/templates/js-langchain/README.md +++ b/templates/js-langchain/README.md @@ -2,21 +2,30 @@ > LangChain is a framework for developing applications powered by language models. -This example template illustrates how to use LangChain.js to crawl the web data, vectorize them, and prompt the OpenAI model. All of this within a single Apify Actor. +This example template illustrates how to use LangChain.js with Apify to crawl the web data, vectorize them, and prompt the OpenAI model. All of this within a single Apify Actor and slightly over a hundred lines of code. + +## Included features + +- **[Apify SDK](https://docs.apify.com/sdk/js/)** - a toolkit for building actors +- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your actor's input +- **[Langchain.js](https://github.com/hwchase17/langchainjs)** - a framework for developing applications powered by language models +- **[OpenAI](https://openai.com/)** - a powerful language model + +## How it works The code contains following steps: -- Crawls given website using [Website Content Crawler](https://apify.com/mtrunkat/website-content-crawler) Actor. -- Vectorizes the data using the [OpenAI](https://openai.com/) API. -- Caches the vector index in the [key-value store](https://docs.apify.com/platform/storage/key-value-store) so that when you run Actor for the same website again, the cached data are used. 
-- Data are fed to the OpenAI model using the [Langchain.js](https://github.com/hwchase17/langchainjs), and a given query is asked. +1. Crawls given website using [Website Content Crawler](https://apify.com/mtrunkat/website-content-crawler) Actor. +2. Vectorizes the data using the [OpenAI](https://openai.com/) API. +3. Caches the vector index in the [key-value store](https://docs.apify.com/platform/storage/key-value-store) so that when you run Actor for the same website again, the cached data are used to speed it up. +4. Data are fed to the OpenAI model using the [Langchain.js](https://github.com/hwchase17/langchainjs), and a given query is asked. -## Prerequisites +## Before you start To be able to run this template both locally and at the Apify Platform, you need to: -- Have an [Apify account](https://console.apify.com/) and sign into it using `apify login` command. This is needed for running the [Website Content Crawler](https://apify.com/mtrunkat/website-content-crawler) Actor to gather the data. +- Have an [Apify account](https://console.apify.com/) and sign into it using `apify login` command in your terminal. Without this, you won't be able to run the required [Website Content Crawler](https://apify.com/mtrunkat/website-content-crawler) Actor to gather the data. - Have an [OpenAI](https://openai.com/) account and an API key. This is needed for vectorizing the data and also to be able to prompt the OpenAI model. - When running locally store this as OPENAI_API_KEY environment variable (https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console). - - When running on Apify platform, you can simply paste this into the input field in the UI. + - When running on Apify platform, you can simply paste this into the input field in the input UI. 
## Production use diff --git a/templates/js-langchain/src/main.js b/templates/js-langchain/src/main.js index e54d5296..03e1cbc5 100644 --- a/templates/js-langchain/src/main.js +++ b/templates/js-langchain/src/main.js @@ -5,7 +5,7 @@ import { HNSWLib } from 'langchain/vectorstores/hnswlib'; import { OpenAIEmbeddings } from 'langchain/embeddings/openai'; import { RetrievalQAChain } from 'langchain/chains'; import { OpenAI } from 'langchain/llms/openai'; -import { rmdir } from 'node:fs/promises'; +import { rm } from 'node:fs/promises'; import { retrieveVectorIndex, cacheVectorIndex } from './vector_index_cache.js'; @@ -58,10 +58,9 @@ if (reinitializeIndex) { } ); - const docs = await loader.load(); - // Initialize the vector index from the crawled documents. console.log('Feeding vector index with crawling results...'); + const docs = await loader.load(); vectorStore = await HNSWLib.fromDocuments( docs, new OpenAIEmbeddings({ openAIApiKey }) @@ -93,7 +92,7 @@ const res = await chain.call({ query }); console.log(`\n${res.text}\n`); // Remove the vector index directory as we have it cached in the key-value store for the next time. 
-await rmdir(VECTOR_INDEX_PATH, { recursive: true }); +await rm(VECTOR_INDEX_PATH, { recursive: true }); await Actor.setValue('OUTPUT', res); await Actor.exit(); diff --git a/templates/manifest.json b/templates/manifest.json index 53286818..db3baea8 100644 --- a/templates/manifest.json +++ b/templates/manifest.json @@ -360,6 +360,27 @@ "cypress/e2e/second-spec.cy.js", "cypress/support/e2e.js" ] + }, + { + "id": "js-langchain", + "name": "project_langchain_js", + "label": "Langchain", + "category": "javascript", + "technologies": [ + "nodejs", + "langchain" + ], + "description": "Example of how to use LangChain.js with Apify to crawl the web data, vectorize them, and prompt the OpenAI model.", + "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/js-langchain.zip?raw=true", + "defaultRunOptions": { + "build": "latest", + "memoryMbytes": 4096, + "timeoutSecs": 3600 + }, + "showcaseFiles": [ + "src/main.js", + "src/vector_index_cache.js" + ] } ] }