diff --git a/README.md b/README.md
index 852cf464..e4731a7d 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 # Apify Actor templates
 
 > This repository stores boilerplate templates and code examples for [Apify Actor](https://apify.com/actors).
-> The purpose of these templates is to help devlopers get started with actor development on the Apify platform.
+> The purpose of these templates is to help developers get started with actor development on the Apify platform.
 
 ## How to use the templates
diff --git a/templates/js-langchain/.eslintrc b/templates/js-langchain/.eslintrc
new file mode 100644
index 00000000..6c22ab8b
--- /dev/null
+++ b/templates/js-langchain/.eslintrc
@@ -0,0 +1,4 @@
+{
+    "extends": "@apify",
+    "root": true
+}
diff --git a/templates/js-langchain/README.md b/templates/js-langchain/README.md
index 030a3248..6a5c2bdc 100644
--- a/templates/js-langchain/README.md
+++ b/templates/js-langchain/README.md
@@ -1,7 +1,6 @@
 ## LangChain.js template
 
 > LangChain is a framework for developing applications powered by language models.
->
 This example template illustrates how to use LangChain.js with Apify to crawl the web data, vectorize them, and prompt the OpenAI model. All of this is within a single Apify Actor and slightly over a hundred lines of code.
 
@@ -33,7 +32,6 @@ To be able to run this template both locally and on the Apify platform, you need
 
 ## Production use
 
 > This serves purely as an example of the whole pipeline.
->
 For production use, we recommend you to:
 
@@ -49,6 +47,5 @@ For production use, we recommend you to:
 
 - [Video guide on getting data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
 - A short guide on [how to create web scrapers using code templates](https://www.youtube.com/watch?v=u-i-Korzf8w)
-
-[langchain content crawler](https://www.youtube.com/watch?v=8uvHH-ocSes)
+[Web Scraping Data for Generative AI](https://www.youtube.com/watch?v=8uvHH-ocSes)
diff --git a/templates/js-langchain/package.json b/templates/js-langchain/package.json
index ec2abbe5..ee75a053 100644
--- a/templates/js-langchain/package.json
+++ b/templates/js-langchain/package.json
@@ -1,21 +1,30 @@
 {
     "name": "project-langchain",
-    "version": "0.0.1",
+    "version": "0.0.2",
     "type": "module",
     "description": "This is a boilerplate of an Apify actor.",
     "engines": {
         "node": ">=18.0.0"
     },
     "dependencies": {
+        "@langchain/community": "^0.2.32",
+        "@langchain/core": "^0.2.31",
+        "@langchain/openai": "^0.2.10",
         "apify": "^3.1.10",
-        "hnswlib-node": "^1.4.2",
-        "langchain": "^0.1.11",
+        "hnswlib-node": "^3.0.0",
+        "langchain": "^0.2.18",
         "tar": "^6.1.14"
     },
     "scripts": {
         "start": "node src/main.js",
+        "lint": "eslint ./src --ext .js,.jsx",
+        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
         "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
     },
     "author": "It's not you it's me",
-    "license": "ISC"
+    "license": "ISC",
+    "devDependencies": {
+        "@apify/eslint-config": "^0.4.0",
+        "eslint": "^8.57.0"
+    }
 }
diff --git a/templates/js-langchain/src/main.js b/templates/js-langchain/src/main.js
index 91e4ec72..797cc6bb 100644
--- a/templates/js-langchain/src/main.js
+++ b/templates/js-langchain/src/main.js
@@ -1,24 +1,31 @@
-import { Actor } from 'apify';
-import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset';
-import { Document } from 'langchain/document';
-import { HNSWLib } from 'langchain/vectorstores/hnswlib';
-import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
-import { RetrievalQAChain } from 'langchain/chains';
-import { OpenAI } from 'langchain/llms/openai';
 import { rm } from 'node:fs/promises';
-// this is ESM project, and as such, it requires you to specify extensions in your relative imports
-// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
+import { ApifyDatasetLoader } from '@langchain/community/document_loaders/web/apify_dataset';
+import { HNSWLib } from '@langchain/community/vectorstores/hnswlib';
+import { ChatPromptTemplate } from '@langchain/core/prompts';
+import { OpenAI, OpenAIEmbeddings } from '@langchain/openai';
+import { Actor, log } from 'apify';
+import { createStuffDocumentsChain } from 'langchain/chains/combine_documents';
+import { createRetrievalChain } from 'langchain/chains/retrieval';
+import { Document } from 'langchain/document';
+
+// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
+// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
 import { retrieveVectorIndex, cacheVectorIndex } from './vector_index_cache.js';
 
 await Actor.init();
 
-// There are 2 steps you need to proceed first in order to be able to run this template:
-// 1. If you are running template locally then you need to authenticate to Apify platform by calling `apify login` in your terminal. Without this, you won't be able to run the required Website Content Crawler Actor to gather the data.
-// 2. Configure the OPENAI_API_KEY environment variable (https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console) with your OpenAI API key you obtain at https://platform.openai.com/account/api-keys.
+// Follow these steps to run this template:
+// 1. If running locally, authenticate to the Apify platform by executing `apify login` in your terminal.
+// This is necessary to run the Website Content Crawler Actor for data gathering.
+// 2. Set the `OPENAI_API_KEY` environment variable with your OpenAI API key, which can be obtained from
+// https://platform.openai.com/account/api-keys. Refer to
+// https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console for guidance
+// on setting environment variables.
 const { OPENAI_API_KEY, APIFY_TOKEN } = process.env;
 
-// You can configure the input for the Actor in the Apify UI when running on the Apify platform or editing storage/key_value_stores/default/INPUT.json when running locally.
+// You can configure the input for the Actor in the Apify UI when running on the Apify platform or editing
+// storage/key_value_stores/default/INPUT.json when running locally.
 const {
     startUrls = [{ url: 'https://wikipedia.com' }],
     maxCrawlPages = 3,
@@ -30,10 +37,14 @@ const {
 
 // Local directory where the vector index will be stored.
 const VECTOR_INDEX_PATH = './vector_index';
 
+const prompt = ChatPromptTemplate.fromTemplate(
+    `Answer the user's question: {input} based on the following context {context}`,
+);
+
 if (!openAIApiKey) throw new Error('Please configure the OPENAI_API_KEY as environment variable or enter it into the input!');
 if (!APIFY_TOKEN) throw new Error('Please configure the APIFY_TOKEN environment variable! Call `apify login` in your terminal to authenticate.');
 
-// Now we want to creare a vector index from the crawled documents.
+// Now we want to create a vector index from the crawled documents.
 // Following object represents an input for the https://apify.com/apify/website-content-crawler actor that crawls the website to gather the data.
 const websiteContentCrawlerInput = { startUrls, maxCrawlPages };
 
@@ -42,12 +53,12 @@ let vectorStore;
 
 // First, we check if the vector index is already cached. If not, we run the website content crawler to get the documents.
 // By setting up forceRecrawl=true you can enforce a re-scrape of the website content and re-creation of the vector index.
-console.log('Fetching cached vector index from key-value store...');
+log.info('Fetching cached vector index from key-value store...');
 const reinitializeIndex = forceRecrawl || !(await retrieveVectorIndex(websiteContentCrawlerInput));
 if (reinitializeIndex) {
     // Run the Actor, wait for it to finish, and fetch its results from the Apify dataset into a LangChain document loader.
-    console.log('Vector index was not found.')
-    console.log('Running apify/website-content-crawler to gather the data...');
+    log.info('Vector index was not found.');
+    log.info('Running apify/website-content-crawler to gather the data...');
     const loader = await ApifyDatasetLoader.fromActorCall(
         'apify/website-content-crawler',
         websiteContentCrawlerInput,
@@ -57,41 +68,47 @@ if (reinitializeIndex) {
                 metadata: { source: item.url },
             }),
             clientOptions: { token: APIFY_TOKEN },
-        }
+        },
     );
 
     // Initialize the vector index from the crawled documents.
-    console.log('Feeding vector index with crawling results...');
+    log.info('Feeding vector index with crawling results...');
     const docs = await loader.load();
     vectorStore = await HNSWLib.fromDocuments(
         docs,
-        new OpenAIEmbeddings({ openAIApiKey })
+        new OpenAIEmbeddings({ openAIApiKey }),
     );
 
     // Save the vector index to the key-value store so that we can skip this phase in the next run.
-    console.log('Saving vector index to the disk...')
+    log.info('Saving vector index to the disk...');
     await vectorStore.save(VECTOR_INDEX_PATH);
     await cacheVectorIndex(websiteContentCrawlerInput, VECTOR_INDEX_PATH);
 }
 
 // Load the vector index from the disk if not already initialized above.
 if (!vectorStore) {
-    console.log('Initializing the vector store...');
-    vectorStore = await HNSWLib.load(
-        VECTOR_INDEX_PATH,
-        new OpenAIEmbeddings({ openAIApiKey })
-    );
+    log.info('Initializing the vector store...');
+    vectorStore = await HNSWLib.load(VECTOR_INDEX_PATH, new OpenAIEmbeddings({ openAIApiKey }));
 }
 
 // Next, create the retrieval chain and enter a query:
-console.log('Asking model a question...');
-const model = new OpenAI({ openAIApiKey });
-const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever(), {
+const llm = new OpenAI({ openAIApiKey });
+const combineDocsChain = await createStuffDocumentsChain({
+    llm,
+    prompt,
+});
+
+const chain = await createRetrievalChain({
+    combineDocsChain,
+    retriever: vectorStore.asRetriever(),
     returnSourceDocuments: true,
 });
-const res = await chain.call({ query });
-console.log(`\n${res.text}\n`);
+log.info('Asking model a question...');
+const res = await chain.invoke({ input: query });
+
+log.info(`Question: ${query}`);
+log.info(`Model response: ${res.answer}`);
 
 // Remove the vector index directory as we have it cached in the key-value store for the next time.
 await rm(VECTOR_INDEX_PATH, { recursive: true });
diff --git a/templates/js-langchain/src/vector_index_cache.js b/templates/js-langchain/src/vector_index_cache.js
index 865e6096..6ef72eda 100644
--- a/templates/js-langchain/src/vector_index_cache.js
+++ b/templates/js-langchain/src/vector_index_cache.js
@@ -1,7 +1,8 @@
-import { Actor } from 'apify';
 import { createHash } from 'crypto';
-import { finished } from 'node:stream/promises';
 import { Readable } from 'node:stream';
+import { finished } from 'node:stream/promises';
+
+import { Actor } from 'apify';
 import tar from 'tar';
 
 const VECTOR_INDEX_CACHE_STORE_NAME = 'vector-index-cache';
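
Reviewer note: the main.js changes above migrate from the legacy `RetrievalQAChain` to the LangChain 0.2 composition of `createStuffDocumentsChain` and `createRetrievalChain`. For anyone porting similar code, here is a minimal, self-contained sketch of the same pattern. It is illustrative only: it substitutes an in-memory vector store for the template's cached HNSWLib index, assumes `OPENAI_API_KEY` is set in the environment, and uses a made-up sample document and question.

```js
import { ChatPromptTemplate } from '@langchain/core/prompts';
import { OpenAI, OpenAIEmbeddings } from '@langchain/openai';
import { createStuffDocumentsChain } from 'langchain/chains/combine_documents';
import { createRetrievalChain } from 'langchain/chains/retrieval';
import { Document } from 'langchain/document';
import { MemoryVectorStore } from 'langchain/vectorstores/memory';

// A throwaway in-memory store stands in for the template's cached HNSWLib index.
const vectorStore = await MemoryVectorStore.fromDocuments(
    [new Document({ pageContent: 'LangChain is a framework for developing applications powered by language models.' })],
    new OpenAIEmbeddings(), // reads OPENAI_API_KEY from the environment
);

// The prompt must expose {context} (filled with the retrieved documents) and {input} (the question).
const prompt = ChatPromptTemplate.fromTemplate(
    `Answer the user's question: {input} based on the following context {context}`,
);

// createStuffDocumentsChain "stuffs" every retrieved document into {context};
// createRetrievalChain wires the vector store's retriever in front of it.
const combineDocsChain = await createStuffDocumentsChain({ llm: new OpenAI(), prompt });
const chain = await createRetrievalChain({
    retriever: vectorStore.asRetriever(),
    combineDocsChain,
});

// A made-up question for illustration.
const res = await chain.invoke({ input: 'What is LangChain?' });
console.log(res.answer);
```

The behavioral changes the diff relies on are visible here: `chain.call({ query })` becomes `chain.invoke({ input })`, the answer moves from `res.text` to `res.answer`, and the retrieved documents are returned under `res.context`.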