feat: Adding a js-langchain template draft (#144)
This is a first draft, could you take a look if this makes sense to you this way?

Co-authored-by: Ondra Urban <23726914+mnmkng@users.noreply.github.com>
Co-authored-by: Jakub Drobník <drobnik.j@gmail.com>
Co-authored-by: František Nesveda <fnesveda@users.noreply.github.com>
1 parent 7ec39e4 · commit 435a4b0

Showing 10 changed files with 340 additions and 0 deletions.
Dockerfile
@@ -0,0 +1,32 @@
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Install the build toolchain (C++ compiler, make, Python) needed to compile
# native dependencies such as hnswlib-node during `npm install`.
RUN apk add g++ make py3-pip

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
actor.json
@@ -0,0 +1,14 @@
{
    "actorSpecification": 1,
    "name": "project-langchain",
    "title": "Project LangChain",
    "description": "LangChain project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-langchain"
    },
    "dockerfile": "./Dockerfile",
    "environmentVariables": {
        "OPENAI_API_KEY": "@OPENAI_API_KEY"
    }
}
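A note on the `environmentVariables` block: the `@OPENAI_API_KEY` value references a secret stored on the Apify platform rather than a literal value, so the key never appears in the repository. The `@`-prefixed name points to a secret previously created with the Apify CLI (for example `apify secrets add OPENAI_API_KEY <your key>`; the exact subcommand syntax may vary between CLI versions).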
INPUT_SCHEMA.json
@@ -0,0 +1,47 @@
{
    "title": "Input schema for LangChainJS example",
    "description": "Enter the start URL(s) of the website(s) to crawl, configure other optional settings, and decide whether you want to use an existing cached vector database.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "One or more URLs of pages where the crawler will start. Note that the Actor will additionally only crawl sub-pages of these URLs. For example, for the start URL `https://www.example.com/blog`, it will crawl pages like `https://example.com/blog/article-1`, but will skip `https://example.com/docs/something-else`.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://wikipedia.com"
                }
            ]
        },
        "maxCrawlPages": {
            "title": "Max pages",
            "type": "integer",
            "description": "The maximum number of pages to crawl. It includes the start URLs, pagination pages, pages with no content, etc. The crawler will automatically finish after reaching this number. This setting is useful to prevent accidental crawler runaway.",
            "minimum": 0,
            "default": 9999999,
            "prefill": 3
        },
        "openAIApiKey": {
            "title": "OpenAI API key",
            "type": "string",
            "description": "Enter the API key of your [OpenAI](https://openai.com/) account. This is needed for vectorizing the data and also to be able to prompt the OpenAI model.",
            "editor": "textfield",
            "isSecret": true
        },
        "query": {
            "title": "Query",
            "type": "string",
            "description": "The query you want to ask the model about the crawled data.",
            "editor": "textfield",
            "prefill": "What is Wikipedia?"
        },
        "forceRecrawl": {
            "title": "Re-crawl the data",
            "type": "boolean",
            "description": "If enabled, the data will be re-crawled even if a cached vector index is available.",
            "default": false
        }
    }
}
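For illustration, an input conforming to this schema, as it would be placed in `storage/key_value_stores/default/INPUT.json` for a local run, might look like the following. The API key value is a placeholder, and `openAIApiKey` can be omitted entirely when the `OPENAI_API_KEY` environment variable is set:

```json
{
    "startUrls": [{ "url": "https://wikipedia.com" }],
    "maxCrawlPages": 3,
    "openAIApiKey": "sk-placeholder",
    "query": "What is Wikipedia?",
    "forceRecrawl": false
}
```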
.dockerignore
@@ -0,0 +1,13 @@
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.gitignore
@@ -0,0 +1,9 @@
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
node_modules
storage

# This location is used to locally cache the vector index
vector_index
README.md
@@ -0,0 +1,36 @@
# LangChain.js example

> LangChain is a framework for developing applications powered by language models.

This example template illustrates how to use LangChain.js with Apify to crawl web data, vectorize it, and prompt the OpenAI model, all within a single Apify Actor and slightly over a hundred lines of code.

## Included features

- **[Apify SDK](https://docs.apify.com/sdk/js/)** - a toolkit for building Actors
- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input
- **[LangChain.js](https://github.com/hwchase17/langchainjs)** - a framework for developing applications powered by language models
- **[OpenAI](https://openai.com/)** - a powerful language model

## How it works

The code performs the following steps:

1. Crawls the given website using the [Website Content Crawler](https://apify.com/mtrunkat/website-content-crawler) Actor.
2. Vectorizes the data using the [OpenAI](https://openai.com/) API.
3. Caches the vector index in the [key-value store](https://docs.apify.com/platform/storage/key-value-store), so that when you run the Actor for the same website again, the cached data is used to speed it up.
4. Feeds the data to the OpenAI model using [LangChain.js](https://github.com/hwchase17/langchainjs) and asks it the given query.

## Before you start

To be able to run this template both locally and on the Apify platform, you need to:

- Have an [Apify account](https://console.apify.com/) and sign into it using the `apify login` command in your terminal. Without this, you won't be able to run the required [Website Content Crawler](https://apify.com/mtrunkat/website-content-crawler) Actor to gather the data.
- Have an [OpenAI](https://openai.com/) account and an API key. This is needed for vectorizing the data and also to be able to prompt the OpenAI model.
    - When running locally, store this as the `OPENAI_API_KEY` environment variable (https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console).
    - When running on the Apify platform, you can simply paste this into the input field in the input UI.

## Production use

> This template serves purely as an example of the whole pipeline.

For production use, we recommend you:

- Separate crawling, data vectorization, and prompting into separate Actors. This way, you can run them independently and scale them separately.
- Replace the local vector store with [Pinecone](https://www.pinecone.io/) or a similar database. See the [LangChain.js](https://js.langchain.com/docs/) documentation for more information, and the sketch below.
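To make the second recommendation concrete, here is a minimal, hedged sketch of a Pinecone variant. It assumes the `@pinecone-database/pinecone` client and the `PINECONE_API_KEY`, `PINECONE_ENVIRONMENT`, and `PINECONE_INDEX` environment variables, none of which are part of this template; in the real pipeline, `docs` would come from the `ApifyDatasetLoader` as in `src/main.js`:

```js
import { Document } from 'langchain/document';
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { PineconeStore } from 'langchain/vectorstores/pinecone';
import { PineconeClient } from '@pinecone-database/pinecone';

// Hypothetical configuration - these env vars are assumptions, not part of the template.
const client = new PineconeClient();
await client.init({
    apiKey: process.env.PINECONE_API_KEY,
    environment: process.env.PINECONE_ENVIRONMENT,
});
const pineconeIndex = client.Index(process.env.PINECONE_INDEX);

// Stand-in for the documents produced by ApifyDatasetLoader in src/main.js.
const docs = [new Document({ pageContent: 'Example page text.', metadata: { source: 'https://example.com' } })];

// Instead of HNSWLib.fromDocuments(...), write the embeddings to Pinecone...
await PineconeStore.fromDocuments(docs, new OpenAIEmbeddings(), { pineconeIndex });

// ...and on subsequent runs, connect to the existing index instead of re-crawling.
const vectorStore = await PineconeStore.fromExistingIndex(new OpenAIEmbeddings(), { pineconeIndex });
```

With a managed index, the tar-packing and key-value store caching in `src/vector_index_cache.js` are no longer needed, since the vectors persist between runs on Pinecone's side.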
package.json
@@ -0,0 +1,21 @@
{
    "name": "project-langchain",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.0.0",
        "hnswlib-node": "^1.4.2",
        "langchain": "^0.0.82",
        "tar": "^6.1.14"
    },
    "scripts": {
        "start": "node src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
src/main.js
@@ -0,0 +1,98 @@
import { Actor } from 'apify';
import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset';
import { Document } from 'langchain/document';
import { HNSWLib } from 'langchain/vectorstores/hnswlib';
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { RetrievalQAChain } from 'langchain/chains';
import { OpenAI } from 'langchain/llms/openai';
import { rm } from 'node:fs/promises';

import { retrieveVectorIndex, cacheVectorIndex } from './vector_index_cache.js';

await Actor.init();

// There are two steps you need to complete before you can run this template:
// 1. If you are running the template locally, authenticate to the Apify platform by calling `apify login` in your terminal. Without this, you won't be able to run the required Website Content Crawler Actor to gather the data.
// 2. Configure the OPENAI_API_KEY environment variable (https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console) with the OpenAI API key you obtain at https://platform.openai.com/account/api-keys.
const { OPENAI_API_KEY, APIFY_TOKEN } = process.env;

// You can configure the input for the Actor in the Apify UI when running on the Apify platform, or by editing storage/key_value_stores/default/INPUT.json when running locally.
const {
    startUrls = [{ url: 'https://wikipedia.com' }],
    maxCrawlPages = 3,
    forceRecrawl = false, // Enforce a re-crawl of the website content and re-creation of the vector index.
    query = 'What is Wikipedia?',
    openAIApiKey = OPENAI_API_KEY, // This is a fallback to the OPENAI_API_KEY environment variable when the value is not present in the input.
} = await Actor.getInput() || {};

// Local directory where the vector index will be stored.
const VECTOR_INDEX_PATH = './vector_index';

if (!openAIApiKey) throw new Error('Please configure OPENAI_API_KEY as an environment variable or enter it into the input!');
if (!APIFY_TOKEN) throw new Error('Please configure the APIFY_TOKEN environment variable! Call `apify login` in your terminal to authenticate.');

// Now we want to create a vector index from the crawled documents.
// The following object represents the input for the https://apify.com/apify/website-content-crawler Actor that crawls the website to gather the data.
const websiteContentCrawlerInput = { startUrls, maxCrawlPages };

// This variable will contain a vector index that we will use to retrieve the most relevant documents for a given query.
let vectorStore;

// First, we check if the vector index is already cached. If not, we run the Website Content Crawler to get the documents.
// By setting forceRecrawl=true you can enforce a re-scrape of the website content and re-creation of the vector index.
console.log('Fetching cached vector index from key-value store...');
const reinitializeIndex = forceRecrawl || !(await retrieveVectorIndex(websiteContentCrawlerInput));
if (reinitializeIndex) {
    // Run the Actor, wait for it to finish, and fetch its results from the Apify dataset into a LangChain document loader.
    console.log('Vector index was not found.');
    console.log('Running apify/website-content-crawler to gather the data...');
    const loader = await ApifyDatasetLoader.fromActorCall(
        'apify/website-content-crawler',
        websiteContentCrawlerInput,
        {
            datasetMappingFunction: (item) => new Document({
                pageContent: (item.text || ''),
                metadata: { source: item.url },
            }),
            clientOptions: { token: APIFY_TOKEN },
        },
    );

    // Initialize the vector index from the crawled documents.
    console.log('Feeding vector index with crawling results...');
    const docs = await loader.load();
    vectorStore = await HNSWLib.fromDocuments(
        docs,
        new OpenAIEmbeddings({ openAIApiKey }),
    );

    // Save the vector index to the key-value store so that we can skip this phase in the next run.
    console.log('Saving vector index to the disk...');
    await vectorStore.save(VECTOR_INDEX_PATH);
    await cacheVectorIndex(websiteContentCrawlerInput, VECTOR_INDEX_PATH);
}

// Load the vector index from the disk if not already initialized above.
if (!vectorStore) {
    console.log('Initializing the vector store...');
    vectorStore = await HNSWLib.load(
        VECTOR_INDEX_PATH,
        new OpenAIEmbeddings({ openAIApiKey }),
    );
}

// Next, create the retrieval chain and enter a query:
console.log('Asking model a question...');
const model = new OpenAI({ openAIApiKey });
const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever(), {
    returnSourceDocuments: true,
});
const res = await chain.call({ query });

console.log(`\n${res.text}\n`);

// Remove the vector index directory, as we have it cached in the key-value store for the next time.
await rm(VECTOR_INDEX_PATH, { recursive: true });

await Actor.setValue('OUTPUT', res);
await Actor.exit();
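Once deployed, the Actor's answer can also be fetched programmatically from the `OUTPUT` record. A minimal sketch using the `apify-client` package; the Actor name `your-username/project-langchain` is a hypothetical placeholder:

```js
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Call the deployed Actor and wait for the run to finish.
const run = await client.actor('your-username/project-langchain').call({
    startUrls: [{ url: 'https://wikipedia.com' }],
    maxCrawlPages: 3,
    query: 'What is Wikipedia?',
});

// The answer was stored via Actor.setValue('OUTPUT', res) in the run's default key-value store.
const record = await client.keyValueStore(run.defaultKeyValueStoreId).getRecord('OUTPUT');
console.log(record.value.text);
```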
src/vector_index_cache.js
@@ -0,0 +1,49 @@
import { Actor } from 'apify';
import { createHash } from 'node:crypto';
import { finished } from 'node:stream/promises';
import { Readable } from 'node:stream';
import tar from 'tar';

const VECTOR_INDEX_CACHE_STORE_NAME = 'vector-index-cache';

/**
 * Generates the vector index cache key as a hash of the provided configuration object.
 *
 * @param {Object} config
 * @returns {String}
 */
function getIndexCacheKey(config) {
    const hash = createHash('md5').update(JSON.stringify(config)).digest('hex');

    return `${hash}.tar`;
}

/**
 * Caches the vector index from `indexPath` in the VECTOR_INDEX_CACHE_STORE_NAME key-value store under a key generated from `config`.
 *
 * @param {Object} config
 * @param {String} indexPath
 */
export async function cacheVectorIndex(config, indexPath) {
    const vectorIndexCacheStore = await Actor.openKeyValueStore(VECTOR_INDEX_CACHE_STORE_NAME);
    const packedVectorIndexStream = tar.c({}, [indexPath]);

    await vectorIndexCacheStore.setValue(getIndexCacheKey(config), packedVectorIndexStream, { contentType: 'application/tar' });
}

/**
 * Fetches a cached vector index from the VECTOR_INDEX_CACHE_STORE_NAME key-value store and extracts it to the current directory.
 *
 * @param {Object} config
 * @returns {Boolean} indicating whether the vector index was found in the cache
 */
export async function retrieveVectorIndex(config) {
    const vectorIndexCacheStore = await Actor.openKeyValueStore(VECTOR_INDEX_CACHE_STORE_NAME);

    const vectorIndexRecord = await vectorIndexCacheStore.getValue(getIndexCacheKey(config));
    if (!vectorIndexRecord) return false;

    await finished(Readable.from(vectorIndexRecord).pipe(tar.x({ strip: 1, C: '.' })));

    return true;
}
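One subtlety worth noting: the cache key is an MD5 hash of `JSON.stringify(config)`, and `JSON.stringify` is sensitive to property order. A small self-contained sketch of the implication (the `md5` helper here is just for illustration):

```js
import { createHash } from 'node:crypto';

const md5 = (obj) => createHash('md5').update(JSON.stringify(obj)).digest('hex');

// The same logical configuration serializes differently when key order differs,
// so it maps to a different cache entry and would trigger a fresh crawl.
console.log(md5({ startUrls: [{ url: 'https://wikipedia.com' }], maxCrawlPages: 3 }));
console.log(md5({ maxCrawlPages: 3, startUrls: [{ url: 'https://wikipedia.com' }] })); // different hash
```

In this template that is harmless, because `websiteContentCrawlerInput` is always built with the same literal shape in `src/main.js`, so the generated keys stay stable between runs.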