From 5afb64dd027b2fedf32ba9bfa82f77b546672af4 Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Wed, 26 Jul 2023 15:09:08 +0200 Subject: [PATCH 01/14] feat(app): cheerio js readme + code --- .../.actor/input_schema.json | 10 ++++++-- templates/js-crawlee-cheerio/README.md | 23 ++++++++++++------- templates/js-crawlee-cheerio/src/main.js | 16 +++++++++---- templates/js-crawlee-cheerio/src/routes.js | 21 ----------------- 4 files changed, 35 insertions(+), 35 deletions(-) delete mode 100644 templates/js-crawlee-cheerio/src/routes.js diff --git a/templates/js-crawlee-cheerio/.actor/input_schema.json b/templates/js-crawlee-cheerio/.actor/input_schema.json index 46485c70..fbdacef7 100644 --- a/templates/js-crawlee-cheerio/.actor/input_schema.json +++ b/templates/js-crawlee-cheerio/.actor/input_schema.json @@ -10,9 +10,15 @@ "editor": "requestListSources", "prefill": [ { - "url": "https://apify.com" + "url": "https://crawlee.com" } ] - } + }, + "maxRequestsPerCrawl": { + "title": "Max Requests per Crawl", + "type": "integer", + "description": "Maximum number of requests that can be made by this crawler.", + "default": 100 + }, } } diff --git a/templates/js-crawlee-cheerio/README.md b/templates/js-crawlee-cheerio/README.md index b3a9ff90..37cc5c2b 100644 --- a/templates/js-crawlee-cheerio/README.md +++ b/templates/js-crawlee-cheerio/README.md @@ -1,12 +1,19 @@ -# CheerioCrawler Actor template +# Cheerio Actor template -This template is a production ready boilerplate for developing with `CheerioCrawler`. Use this to bootstrap your projects using the most up-to-date code. +A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/). -> We decided to split Apify SDK into two libraries, [Crawlee](https://crawlee.dev) and [Apify SDK v3](https://docs.apify.com/sdk/js). Crawlee will retain all the crawling and scraping-related tools and will always strive to be the best web scraping library for its community. At the same time, Apify SDK will continue to exist, but keep only the Apify-specific features related to building actors on the Apify platform. Read the [upgrading guide](https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3) to learn about the changes. +## Included features -If you're looking for examples or want to learn more visit: +- **[Crawlee](https://docs.apify.com/sdk/python/)** - toolkit for building Apify Actors +- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input +- **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes +- **[Cheerio](https://cheerio.js.org/)** - a fast, flexible & elegant library for parsing and manipulating HTML and XML -- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform) -- [Cheerio Tutorial](https://crawlee.dev/docs/guides/cheerio-crawler-guide) -- [Documentation](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) -- [Examples](https://crawlee.dev/docs/examples/cheerio-crawler) +## How it works + +This code is a JavaScript script that uses Cheerio to scrape data from a website. It then stores the website titles in a dataset. + +- The script loads the HTML of the provided URLs from `startUrls` field in input schema. +- Uses Cheerio `requestHandler` function to scrape the website titles. 
+- Number of crawls is limited by `maxPagesPerCrawl` field from input schema. +- Then the results are saved to a dataset. diff --git a/templates/js-crawlee-cheerio/src/main.js b/templates/js-crawlee-cheerio/src/main.js index 24ffea04..0d32819f 100644 --- a/templates/js-crawlee-cheerio/src/main.js +++ b/templates/js-crawlee-cheerio/src/main.js @@ -7,19 +7,27 @@ // For more information, see https://docs.apify.com/sdk/js import { Actor } from 'apify'; // For more information, see https://crawlee.dev -import { CheerioCrawler } from 'crawlee'; -import { router } from './routes.js'; +import { CheerioCrawler, Dataset } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const startUrls = ['https://apify.com']; +const { startUrls, maxRequestsPerCrawl } = await Actor.getInput() || { startUrls: ["https://crawlee.com"], maxRequestsPerCrawl: 100 }; const proxyConfiguration = await Actor.createProxyConfiguration(); const crawler = new CheerioCrawler({ proxyConfiguration, - requestHandler: router, + maxRequestsPerCrawl, + requestHandler: async ({ enqueueLinks, request, $, log }) => { + log.info('enqueueing new URLs'); + await enqueueLinks( { globs: ['https://crawlee.com/*'], label: 'detail' }); + + const title = $('title').text(); + log.info(`${title}`, { url: request.loadedUrl }); + + await Dataset.pushData({ url: request.loadedUrl, title }); + }, }); await crawler.run(startUrls); diff --git a/templates/js-crawlee-cheerio/src/routes.js b/templates/js-crawlee-cheerio/src/routes.js deleted file mode 100644 index ad218430..00000000 --- a/templates/js-crawlee-cheerio/src/routes.js +++ /dev/null @@ -1,21 +0,0 @@ -import { Dataset, createCheerioRouter } from 'crawlee'; - -export const router = createCheerioRouter(); - -router.addDefaultHandler(async ({ enqueueLinks, log }) => { - log.info(`enqueueing new URLs`); - await enqueueLinks({ - globs: ['https://apify.com/*'], - label: 'detail', - }); -}); - -router.addHandler('detail', async ({ request, $, log }) => { - const title = $('title').text(); - log.info(`${title}`, { url: request.loadedUrl }); - - await Dataset.pushData({ - url: request.loadedUrl, - title, - }); -}); From 7588f57841a487a691a5b51c3155cba46758b37d Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Wed, 26 Jul 2023 15:26:49 +0200 Subject: [PATCH 02/14] feat(app): cheerio ts readme + code --- templates/js-crawlee-cheerio/README.md | 2 +- templates/js-crawlee-cheerio/src/main.js | 2 +- templates/manifest.json | 2 +- .../.actor/input_schema.json | 10 ++++++-- templates/ts-crawlee-cheerio/README.md | 23 ++++++++++++------- templates/ts-crawlee-cheerio/src/main.ts | 16 +++++++++---- templates/ts-crawlee-cheerio/src/routes.ts | 21 ----------------- 7 files changed, 38 insertions(+), 38 deletions(-) delete mode 100644 templates/ts-crawlee-cheerio/src/routes.ts diff --git a/templates/js-crawlee-cheerio/README.md b/templates/js-crawlee-cheerio/README.md index 37cc5c2b..6fa4eaa2 100644 --- a/templates/js-crawlee-cheerio/README.md +++ b/templates/js-crawlee-cheerio/README.md @@ -1,4 +1,4 @@ -# Cheerio Actor template +# JavaScript Cheerio Actor template A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/). 
diff --git a/templates/js-crawlee-cheerio/src/main.js b/templates/js-crawlee-cheerio/src/main.js index 0d32819f..f265801c 100644 --- a/templates/js-crawlee-cheerio/src/main.js +++ b/templates/js-crawlee-cheerio/src/main.js @@ -12,7 +12,7 @@ import { CheerioCrawler, Dataset } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const { startUrls, maxRequestsPerCrawl } = await Actor.getInput() || { startUrls: ["https://crawlee.com"], maxRequestsPerCrawl: 100 }; +const { startUrls = ["https://crawlee.com"], maxRequestsPerCrawl = 100 } = await Actor.getInput(); const proxyConfiguration = await Actor.createProxyConfiguration(); diff --git a/templates/manifest.json b/templates/manifest.json index a53f4f2f..0f4aa60b 100644 --- a/templates/manifest.json +++ b/templates/manifest.json @@ -222,7 +222,7 @@ "crawlee", "cheerio" ], - "description": "A scraper example that uses HTTP requests and Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.", + "description": "A scraper example that uses Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.", "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/js-crawlee-cheerio.zip?raw=true", "defaultRunOptions": { "build": "latest", diff --git a/templates/ts-crawlee-cheerio/.actor/input_schema.json b/templates/ts-crawlee-cheerio/.actor/input_schema.json index 46485c70..fbdacef7 100644 --- a/templates/ts-crawlee-cheerio/.actor/input_schema.json +++ b/templates/ts-crawlee-cheerio/.actor/input_schema.json @@ -10,9 +10,15 @@ "editor": "requestListSources", "prefill": [ { - "url": "https://apify.com" + "url": "https://crawlee.com" } ] - } + }, + "maxRequestsPerCrawl": { + "title": "Max Requests per Crawl", + "type": "integer", + "description": "Maximum number of requests that can be made by this crawler.", + "default": 100 + }, } } diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index b3a9ff90..d7fbd962 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -1,12 +1,19 @@ -# CheerioCrawler Actor template +# TypeScript Cheerio Actor template -This template is a production ready boilerplate for developing with `CheerioCrawler`. Use this to bootstrap your projects using the most up-to-date code. +A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/). -> We decided to split Apify SDK into two libraries, [Crawlee](https://crawlee.dev) and [Apify SDK v3](https://docs.apify.com/sdk/js). Crawlee will retain all the crawling and scraping-related tools and will always strive to be the best web scraping library for its community. At the same time, Apify SDK will continue to exist, but keep only the Apify-specific features related to building actors on the Apify platform. Read the [upgrading guide](https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3) to learn about the changes. 
+## Included features -If you're looking for examples or want to learn more visit: +- **[Crawlee](https://docs.apify.com/sdk/python/)** - toolkit for building Apify Actors +- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input +- **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes +- **[Cheerio](https://cheerio.js.org/)** - a fast, flexible & elegant library for parsing and manipulating HTML and XML -- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform) -- [Cheerio Tutorial](https://crawlee.dev/docs/guides/cheerio-crawler-guide) -- [Documentation](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) -- [Examples](https://crawlee.dev/docs/examples/cheerio-crawler) +## How it works + +This code is a TypeScript script that uses Cheerio to scrape data from a website. It then stores the website titles in a dataset. + +- The script loads the HTML of the provided URLs from `startUrls` field in input schema. +- Uses Cheerio `requestHandler` function to scrape the website titles. +- Number of crawls is limited by `maxPagesPerCrawl` field from input schema. +- Then the results are saved to a dataset. diff --git a/templates/ts-crawlee-cheerio/src/main.ts b/templates/ts-crawlee-cheerio/src/main.ts index 24ffea04..da8cadf9 100644 --- a/templates/ts-crawlee-cheerio/src/main.ts +++ b/templates/ts-crawlee-cheerio/src/main.ts @@ -7,19 +7,27 @@ // For more information, see https://docs.apify.com/sdk/js import { Actor } from 'apify'; // For more information, see https://crawlee.dev -import { CheerioCrawler } from 'crawlee'; -import { router } from './routes.js'; +import { CheerioCrawler, Dataset, Dictionary } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const startUrls = ['https://apify.com']; +const { startUrls = ['https://crawlee.com'], maxRequestsPerCrawl = 100 } = (await Actor.getInput() as Dictionary); const proxyConfiguration = await Actor.createProxyConfiguration(); const crawler = new CheerioCrawler({ proxyConfiguration, - requestHandler: router, + maxRequestsPerCrawl, + requestHandler: async ({ enqueueLinks, request, $, log }) => { + log.info('enqueueing new URLs'); + await enqueueLinks({ globs: ['https://crawlee.com/*'], label: 'detail' }); + + const title = $('title').text(); + log.info(`${title}`, { url: request.loadedUrl }); + + await Dataset.pushData({ url: request.loadedUrl, title }); + }, }); await crawler.run(startUrls); diff --git a/templates/ts-crawlee-cheerio/src/routes.ts b/templates/ts-crawlee-cheerio/src/routes.ts deleted file mode 100644 index ad218430..00000000 --- a/templates/ts-crawlee-cheerio/src/routes.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { Dataset, createCheerioRouter } from 'crawlee'; - -export const router = createCheerioRouter(); - -router.addDefaultHandler(async ({ enqueueLinks, log }) => { - log.info(`enqueueing new URLs`); - await enqueueLinks({ - globs: ['https://apify.com/*'], - label: 'detail', - }); -}); - -router.addHandler('detail', async ({ request, $, log }) => { - const title = $('title').text(); - log.info(`${title}`, { url: request.loadedUrl }); - - await Dataset.pushData({ - url: request.loadedUrl, - title, - }); -}); From a0ad7a2cb966f23a9f3c7380544f486b38925a4b Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Wed, 26 Jul 2023 15:31:22 +0200 Subject: [PATCH 03/14] feat(app): fix wrong manifest 
--- templates/manifest.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/manifest.json b/templates/manifest.json index 0f4aa60b..ea046d2e 100644 --- a/templates/manifest.json +++ b/templates/manifest.json @@ -301,7 +301,7 @@ "cheerio" ], "description": "Skeleton project that helps you quickly bootstrap `CheerioCrawler` in JavaScript. It's best for developers who already know Apify SDK and Crawlee.", - "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/ts-crawlee-cheerio.zip?raw=true", + "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/js-bootstrap-cheerio-crawler.zip?raw=true", "defaultRunOptions": { "build": "latest", "memoryMbytes": 2048, @@ -326,7 +326,7 @@ "crawlee", "cheerio" ], - "description": "A scraper example that uses HTTP requests and Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.", + "description": "A scraper example that uses Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.", "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/ts-crawlee-cheerio.zip?raw=true", "defaultRunOptions": { "build": "latest", From 51411fa03b51c675b0986e7535dc46f875de8244 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Turo=C5=88?= Date: Fri, 28 Jul 2023 09:15:50 +0100 Subject: [PATCH 04/14] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jan Bárta <45016873+jbartadev@users.noreply.github.com> --- templates/js-crawlee-cheerio/.actor/input_schema.json | 2 +- templates/ts-crawlee-cheerio/.actor/input_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/js-crawlee-cheerio/.actor/input_schema.json b/templates/js-crawlee-cheerio/.actor/input_schema.json index fbdacef7..c8a2bfb2 100644 --- a/templates/js-crawlee-cheerio/.actor/input_schema.json +++ b/templates/js-crawlee-cheerio/.actor/input_schema.json @@ -10,7 +10,7 @@ "editor": "requestListSources", "prefill": [ { - "url": "https://crawlee.com" + "url": "https://crawlee.dev" } ] }, diff --git a/templates/ts-crawlee-cheerio/.actor/input_schema.json b/templates/ts-crawlee-cheerio/.actor/input_schema.json index fbdacef7..c8a2bfb2 100644 --- a/templates/ts-crawlee-cheerio/.actor/input_schema.json +++ b/templates/ts-crawlee-cheerio/.actor/input_schema.json @@ -10,7 +10,7 @@ "editor": "requestListSources", "prefill": [ { - "url": "https://crawlee.com" + "url": "https://crawlee.dev" } ] }, From d70071714b84330c9902af27d773f8fd8e38f4d8 Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Fri, 28 Jul 2023 10:27:44 +0200 Subject: [PATCH 05/14] feat(app): cr objections --- templates/js-crawlee-cheerio/README.md | 2 +- templates/js-crawlee-cheerio/src/main.js | 4 ++-- templates/ts-crawlee-cheerio/README.md | 4 ++-- templates/ts-crawlee-cheerio/src/main.ts | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/templates/js-crawlee-cheerio/README.md b/templates/js-crawlee-cheerio/README.md index 6fa4eaa2..1df637d0 100644 --- a/templates/js-crawlee-cheerio/README.md +++ b/templates/js-crawlee-cheerio/README.md @@ -1,4 +1,4 @@ -# JavaScript Cheerio Actor template +# JavaScript CheerioCrawler Actor template A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/). 
diff --git a/templates/js-crawlee-cheerio/src/main.js b/templates/js-crawlee-cheerio/src/main.js index f265801c..3dcc761a 100644 --- a/templates/js-crawlee-cheerio/src/main.js +++ b/templates/js-crawlee-cheerio/src/main.js @@ -12,7 +12,7 @@ import { CheerioCrawler, Dataset } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const { startUrls = ["https://crawlee.com"], maxRequestsPerCrawl = 100 } = await Actor.getInput(); +const { startUrls, maxRequestsPerCrawl } = await Actor.getInput(); const proxyConfiguration = await Actor.createProxyConfiguration(); @@ -21,7 +21,7 @@ const crawler = new CheerioCrawler({ maxRequestsPerCrawl, requestHandler: async ({ enqueueLinks, request, $, log }) => { log.info('enqueueing new URLs'); - await enqueueLinks( { globs: ['https://crawlee.com/*'], label: 'detail' }); + await enqueueLinks(); const title = $('title').text(); log.info(`${title}`, { url: request.loadedUrl }); diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index d7fbd962..087c9933 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -1,6 +1,6 @@ -# TypeScript Cheerio Actor template +# TypeScript CheerioCrawler Actor template -A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/). +A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/) wrapped into [CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler). ## Included features diff --git a/templates/ts-crawlee-cheerio/src/main.ts b/templates/ts-crawlee-cheerio/src/main.ts index da8cadf9..4649fd2e 100644 --- a/templates/ts-crawlee-cheerio/src/main.ts +++ b/templates/ts-crawlee-cheerio/src/main.ts @@ -12,7 +12,7 @@ import { CheerioCrawler, Dataset, Dictionary } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const { startUrls = ['https://crawlee.com'], maxRequestsPerCrawl = 100 } = (await Actor.getInput() as Dictionary); +const { startUrls, maxRequestsPerCrawl } = (await Actor.getInput() as Dictionary); const proxyConfiguration = await Actor.createProxyConfiguration(); @@ -21,7 +21,7 @@ const crawler = new CheerioCrawler({ maxRequestsPerCrawl, requestHandler: async ({ enqueueLinks, request, $, log }) => { log.info('enqueueing new URLs'); - await enqueueLinks({ globs: ['https://crawlee.com/*'], label: 'detail' }); + await enqueueLinks(); const title = $('title').text(); log.info(`${title}`, { url: request.loadedUrl }); From aa9826b7e49f318bc75d87cbca0da100706cccf7 Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Fri, 28 Jul 2023 10:28:54 +0200 Subject: [PATCH 06/14] feat(app): cr objections --- templates/js-crawlee-cheerio/README.md | 2 +- templates/ts-crawlee-cheerio/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/js-crawlee-cheerio/README.md b/templates/js-crawlee-cheerio/README.md index 1df637d0..7609f31e 100644 --- a/templates/js-crawlee-cheerio/README.md +++ b/templates/js-crawlee-cheerio/README.md @@ -1,4 +1,4 @@ -# JavaScript CheerioCrawler Actor template +# JavaScript Crawlee & CheerioCrawler Actor template A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/). 
diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index 087c9933..0776ca44 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -1,4 +1,4 @@ -# TypeScript CheerioCrawler Actor template +# TypeScript Crawlee & CheerioCrawler Actor template A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/) wrapped into [CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler). From 017ac4fd59ca691330ae349e2727faaa8ac80f8d Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Fri, 28 Jul 2023 14:33:08 +0200 Subject: [PATCH 07/14] feat(app): fix tests --- templates/js-crawlee-cheerio/src/main.js | 2 +- templates/ts-crawlee-cheerio/src/main.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/js-crawlee-cheerio/src/main.js b/templates/js-crawlee-cheerio/src/main.js index 3dcc761a..1856b23b 100644 --- a/templates/js-crawlee-cheerio/src/main.js +++ b/templates/js-crawlee-cheerio/src/main.js @@ -12,7 +12,7 @@ import { CheerioCrawler, Dataset } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const { startUrls, maxRequestsPerCrawl } = await Actor.getInput(); +const { startUrls, maxRequestsPerCrawl } = await Actor.getInput() || {}; const proxyConfiguration = await Actor.createProxyConfiguration(); diff --git a/templates/ts-crawlee-cheerio/src/main.ts b/templates/ts-crawlee-cheerio/src/main.ts index 4649fd2e..b11e957c 100644 --- a/templates/ts-crawlee-cheerio/src/main.ts +++ b/templates/ts-crawlee-cheerio/src/main.ts @@ -12,7 +12,7 @@ import { CheerioCrawler, Dataset, Dictionary } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const { startUrls, maxRequestsPerCrawl } = (await Actor.getInput() as Dictionary); +const { startUrls, maxRequestsPerCrawl } = (await Actor.getInput() as Dictionary) || {}; const proxyConfiguration = await Actor.createProxyConfiguration(); From 18c2b7d3f6e1e0ace35d0121feac8efcd6c527fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Turo=C5=88?= Date: Tue, 1 Aug 2023 12:34:42 +0100 Subject: [PATCH 08/14] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jan Bárta <45016873+jbartadev@users.noreply.github.com> --- templates/ts-crawlee-cheerio/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index 0776ca44..dbc5a42e 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -11,9 +11,9 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from ## How it works -This code is a TypeScript script that uses Cheerio to scrape data from a website. It then stores the website titles in a dataset. +This code is a TypeScript script that uses [Crawlee CheerioCralwer](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) framework to crawl a website and extract the data from the crawled URLs with Cheerio. It then stores the website titles in a dataset. -- The script loads the HTML of the provided URLs from `startUrls` field in input schema. +- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. - Uses Cheerio `requestHandler` function to scrape the website titles. - Number of crawls is limited by `maxPagesPerCrawl` field from input schema. 
- Then the results are saved to a dataset. From 1aa0fd6e1812d65222e65b01aa8eb3591e5caca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Turo=C5=88?= Date: Wed, 2 Aug 2023 08:07:47 +0100 Subject: [PATCH 09/14] Update templates/ts-crawlee-cheerio/README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jan Bárta <45016873+jbartadev@users.noreply.github.com> --- templates/ts-crawlee-cheerio/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index dbc5a42e..28e96b2d 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -16,4 +16,3 @@ This code is a TypeScript script that uses [Crawlee CheerioCralwer](https://craw - The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. - Uses Cheerio `requestHandler` function to scrape the website titles. - Number of crawls is limited by `maxPagesPerCrawl` field from input schema. -- Then the results are saved to a dataset. From 6f74a68518fbf7cf720a0f80b16ca56135ec2024 Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Wed, 2 Aug 2023 09:10:45 +0200 Subject: [PATCH 10/14] fix cr objections --- templates/js-crawlee-cheerio/README.md | 6 ++---- templates/ts-crawlee-cheerio/README.md | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/templates/js-crawlee-cheerio/README.md b/templates/js-crawlee-cheerio/README.md index 7609f31e..fde77b04 100644 --- a/templates/js-crawlee-cheerio/README.md +++ b/templates/js-crawlee-cheerio/README.md @@ -13,7 +13,5 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from This code is a JavaScript script that uses Cheerio to scrape data from a website. It then stores the website titles in a dataset. -- The script loads the HTML of the provided URLs from `startUrls` field in input schema. -- Uses Cheerio `requestHandler` function to scrape the website titles. -- Number of crawls is limited by `maxPagesPerCrawl` field from input schema. -- Then the results are saved to a dataset. +- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawls is limited by `maxPagesPerCrawl` field from input schema. +- The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs out each result that is being saved. diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index 28e96b2d..1b82a7c1 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -13,6 +13,5 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from This code is a TypeScript script that uses [Crawlee CheerioCralwer](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) framework to crawl a website and extract the data from the crawled URLs with Cheerio. It then stores the website titles in a dataset. -- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. -- Uses Cheerio `requestHandler` function to scrape the website titles. -- Number of crawls is limited by `maxPagesPerCrawl` field from input schema. +- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawls is limited by `maxPagesPerCrawl` field from input schema. 
+- The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs out each result that is being saved. From 58438bda4659120dca2c4071b68d88d70f74ee71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Turo=C5=88?= Date: Wed, 9 Aug 2023 10:50:11 +0100 Subject: [PATCH 11/14] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Martin Adámek --- templates/js-crawlee-cheerio/src/main.js | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/templates/js-crawlee-cheerio/src/main.js b/templates/js-crawlee-cheerio/src/main.js index 1856b23b..9a25b08c 100644 --- a/templates/js-crawlee-cheerio/src/main.js +++ b/templates/js-crawlee-cheerio/src/main.js @@ -12,14 +12,17 @@ import { CheerioCrawler, Dataset } from 'crawlee'; // Initialize the Apify SDK await Actor.init(); -const { startUrls, maxRequestsPerCrawl } = await Actor.getInput() || {}; +const { + startUrls = ['https://crawlee.dev'], + maxRequestsPerCrawl = 100, +} = await Actor.getInput() ?? {}; const proxyConfiguration = await Actor.createProxyConfiguration(); const crawler = new CheerioCrawler({ proxyConfiguration, maxRequestsPerCrawl, - requestHandler: async ({ enqueueLinks, request, $, log }) => { + async requestHandler({ enqueueLinks, request, $, log }) { log.info('enqueueing new URLs'); await enqueueLinks(); From 07de88ec42309ea51ec9c056c1cd1eb7fdb8b73a Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Wed, 9 Aug 2023 11:57:46 +0200 Subject: [PATCH 12/14] Fix PR objections --- templates/js-crawlee-cheerio/README.md | 7 ++++--- templates/ts-crawlee-cheerio/README.md | 5 +++-- templates/ts-crawlee-cheerio/package.json | 4 ++-- templates/ts-crawlee-cheerio/src/main.ts | 9 +++++++-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/templates/js-crawlee-cheerio/README.md b/templates/js-crawlee-cheerio/README.md index fde77b04..3aa110cb 100644 --- a/templates/js-crawlee-cheerio/README.md +++ b/templates/js-crawlee-cheerio/README.md @@ -1,10 +1,11 @@ # JavaScript Crawlee & CheerioCrawler Actor template -A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/). +A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/) wrapped into [CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler). ## Included features -- **[Crawlee](https://docs.apify.com/sdk/python/)** - toolkit for building Apify Actors +- **[Apify SDK](https://docs.apify.com/sdk/js)** - toolkit for building Actors +- **[Crawlee](https://crawlee.dev)** - web scraping and browser automation library - **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input - **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes - **[Cheerio](https://cheerio.js.org/)** - a fast, flexible & elegant library for parsing and manipulating HTML and XML @@ -13,5 +14,5 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from This code is a JavaScript script that uses Cheerio to scrape data from a website. It then stores the website titles in a dataset. 
-- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawls is limited by `maxPagesPerCrawl` field from input schema. +- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawler pages is limited by `maxPagesPerCrawl` field from input schema. - The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs out each result that is being saved. diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index 1b82a7c1..e50268b9 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -4,7 +4,8 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from ## Included features -- **[Crawlee](https://docs.apify.com/sdk/python/)** - toolkit for building Apify Actors +- **[Apify SDK](https://docs.apify.com/sdk/js)** - toolkit for building Actors +- **[Crawlee](https://crawlee.dev)** - web scraping and browser automation library - **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input - **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes - **[Cheerio](https://cheerio.js.org/)** - a fast, flexible & elegant library for parsing and manipulating HTML and XML @@ -13,5 +14,5 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from This code is a TypeScript script that uses [Crawlee CheerioCralwer](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) framework to crawl a website and extract the data from the crawled URLs with Cheerio. It then stores the website titles in a dataset. -- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawls is limited by `maxPagesPerCrawl` field from input schema. +- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawler pages is limited by `maxPagesPerCrawl` field from input schema. - The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs out each result that is being saved. 
diff --git a/templates/ts-crawlee-cheerio/package.json b/templates/ts-crawlee-cheerio/package.json index 9fd5fc6e..4a52bf69 100644 --- a/templates/ts-crawlee-cheerio/package.json +++ b/templates/ts-crawlee-cheerio/package.json @@ -7,8 +7,8 @@ "node": ">=16.0.0" }, "dependencies": { - "apify": "^3.0.0", - "crawlee": "^3.0.0" + "apify": "^3.1.8", + "crawlee": "^3.5.0" }, "devDependencies": { "@apify/eslint-config-ts": "^0.2.3", diff --git a/templates/ts-crawlee-cheerio/src/main.ts b/templates/ts-crawlee-cheerio/src/main.ts index b11e957c..c0b34376 100644 --- a/templates/ts-crawlee-cheerio/src/main.ts +++ b/templates/ts-crawlee-cheerio/src/main.ts @@ -7,12 +7,17 @@ // For more information, see https://docs.apify.com/sdk/js import { Actor } from 'apify'; // For more information, see https://crawlee.dev -import { CheerioCrawler, Dataset, Dictionary } from 'crawlee'; +import { CheerioCrawler, Dataset } from 'crawlee'; + +interface Input { + startUrls: string[]; + maxRequestsPerCrawl: number; +} // Initialize the Apify SDK await Actor.init(); -const { startUrls, maxRequestsPerCrawl } = (await Actor.getInput() as Dictionary) || {}; +const { startUrls, maxRequestsPerCrawl } = await Actor.getInputOrThrow(); const proxyConfiguration = await Actor.createProxyConfiguration(); From 3f9fb879854c4b1050fefeb9969bead4814cc91c Mon Sep 17 00:00:00 2001 From: HonzaTuron Date: Wed, 9 Aug 2023 13:05:18 +0200 Subject: [PATCH 13/14] fix copy --- templates/js-crawlee-cheerio/README.md | 2 +- templates/ts-crawlee-cheerio/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/js-crawlee-cheerio/README.md b/templates/js-crawlee-cheerio/README.md index 3aa110cb..51f0d5a9 100644 --- a/templates/js-crawlee-cheerio/README.md +++ b/templates/js-crawlee-cheerio/README.md @@ -14,5 +14,5 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from This code is a JavaScript script that uses Cheerio to scrape data from a website. It then stores the website titles in a dataset. -- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawler pages is limited by `maxPagesPerCrawl` field from input schema. +- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of scraped pages is limited by `maxPagesPerCrawl` field from input schema. - The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs out each result that is being saved. diff --git a/templates/ts-crawlee-cheerio/README.md b/templates/ts-crawlee-cheerio/README.md index e50268b9..60d8e846 100644 --- a/templates/ts-crawlee-cheerio/README.md +++ b/templates/ts-crawlee-cheerio/README.md @@ -14,5 +14,5 @@ A template example built with [Crawlee](https://crawlee.dev) to scrape data from This code is a TypeScript script that uses [Crawlee CheerioCralwer](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) framework to crawl a website and extract the data from the crawled URLs with Cheerio. It then stores the website titles in a dataset. -- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. Number of crawler pages is limited by `maxPagesPerCrawl` field from input schema. +- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. 
Number of scraped pages is limited by `maxPagesPerCrawl` field from input schema. - The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs out each result that is being saved. From a165ab9eb4907448ffa8f547b7a33baef4fe55b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Turo=C5=88?= Date: Mon, 14 Aug 2023 15:11:18 +0100 Subject: [PATCH 14/14] Update templates/ts-crawlee-cheerio/src/main.ts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Martin Adámek --- templates/ts-crawlee-cheerio/src/main.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/templates/ts-crawlee-cheerio/src/main.ts b/templates/ts-crawlee-cheerio/src/main.ts index c0b34376..4af922fa 100644 --- a/templates/ts-crawlee-cheerio/src/main.ts +++ b/templates/ts-crawlee-cheerio/src/main.ts @@ -17,7 +17,10 @@ interface Input { // Initialize the Apify SDK await Actor.init(); -const { startUrls, maxRequestsPerCrawl } = await Actor.getInputOrThrow(); +const { + startUrls = ['https://crawlee.dev'], + maxRequestsPerCrawl = 100, +} = await Actor.getInput() ?? {} as Input; const proxyConfiguration = await Actor.createProxyConfiguration();
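For reference, stitching together the `src/main.js` hunks from patches 01, 05, 07 and 11 above gives roughly the following end state for the JavaScript template. This is a sketch assembled only from the hunks shown in this series — lines of the file that never appear in any hunk context (for example the opening comment block and anything after `crawler.run`) are omitted, and the bare `enqueueLinks()` call relies on Crawlee's default of enqueueing same-hostname links.

```js
// For more information, see https://docs.apify.com/sdk/js
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
import { CheerioCrawler, Dataset } from 'crawlee';

// Initialize the Apify SDK
await Actor.init();

// Input defaults mirror the prefill in .actor/input_schema.json (patch 04 switched it to https://crawlee.dev)
const {
    startUrls = ['https://crawlee.dev'],
    maxRequestsPerCrawl = 100,
} = await Actor.getInput() ?? {};

const proxyConfiguration = await Actor.createProxyConfiguration();

const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxRequestsPerCrawl,
    async requestHandler({ enqueueLinks, request, $, log }) {
        log.info('enqueueing new URLs');
        // No options given: enqueue links discovered on the page (same hostname by default)
        await enqueueLinks();

        // Extract the page title with Cheerio and store it together with the URL
        const title = $('title').text();
        log.info(`${title}`, { url: request.loadedUrl });

        await Dataset.pushData({ url: request.loadedUrl, title });
    },
});

await crawler.run(startUrls);
```

The TypeScript template's `src/main.ts` lands in almost the same place after patches 12 and 14: it additionally declares the `Input` interface, reads input as `await Actor.getInput() ?? {} as Input`, and keeps `requestHandler` as an arrow-function property, since patch 11 rewrote only the JavaScript file.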