
Commit

feat: Crawlee + Cheerio templates code refactoring + Readme update (#172)

Closes apify/apify-web#2669.

Removing routes.js, updating the readme, refactoring the code, and correcting typing.

---------

Co-authored-by: Jan Bárta <45016873+jbartadev@users.noreply.github.com>
Co-authored-by: Martin Adámek <banan23@gmail.com>
3 people authored Aug 14, 2023
1 parent 5d7b242 commit 334dc03
Showing 10 changed files with 84 additions and 75 deletions.
10 changes: 8 additions & 2 deletions templates/js-crawlee-cheerio/.actor/input_schema.json
@@ -10,9 +10,15 @@
             "editor": "requestListSources",
             "prefill": [
                 {
-                    "url": "https://apify.com"
+                    "url": "https://crawlee.dev"
                 }
             ]
-        }
+        },
+        "maxRequestsPerCrawl": {
+            "title": "Max Requests per Crawl",
+            "type": "integer",
+            "description": "Maximum number of requests that can be made by this crawler.",
+            "default": 100
+        }
     }
 }
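For illustration, a run input that satisfies the updated schema could resolve to an object like the one below when read with `Actor.getInput()` (the values are made up; the `{ url }` object form follows the `requestListSources` editor used for `startUrls`):

```js
// Illustrative only: an input object matching the updated schema, not taken from the commit.
const input = {
    startUrls: [{ url: 'https://crawlee.dev' }],
    maxRequestsPerCrawl: 50,
};
```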
22 changes: 14 additions & 8 deletions templates/js-crawlee-cheerio/README.md
@@ -1,12 +1,18 @@
-# CheerioCrawler Actor template
+# JavaScript Crawlee & CheerioCrawler Actor template
 
-This template is a production ready boilerplate for developing with `CheerioCrawler`. Use this to bootstrap your projects using the most up-to-date code.
+A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/) wrapped into [CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler).
 
-> We decided to split Apify SDK into two libraries, [Crawlee](https://crawlee.dev) and [Apify SDK v3](https://docs.apify.com/sdk/js). Crawlee will retain all the crawling and scraping-related tools and will always strive to be the best web scraping library for its community. At the same time, Apify SDK will continue to exist, but keep only the Apify-specific features related to building actors on the Apify platform. Read the [upgrading guide](https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3) to learn about the changes.
+## Included features
 
-If you're looking for examples or want to learn more visit:
+- **[Apify SDK](https://docs.apify.com/sdk/js)** - toolkit for building Actors
+- **[Crawlee](https://crawlee.dev)** - web scraping and browser automation library
+- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input
+- **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes
+- **[Cheerio](https://cheerio.js.org/)** - a fast, flexible & elegant library for parsing and manipulating HTML and XML
 
-- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform)
-- [Cheerio Tutorial](https://crawlee.dev/docs/guides/cheerio-crawler-guide)
-- [Documentation](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler)
-- [Examples](https://crawlee.dev/docs/examples/cheerio-crawler)
+## How it works
+
+This code is a JavaScript script that uses Cheerio to scrape data from a website. It then stores the website titles in a dataset.
+
+- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. The number of scraped pages is limited by the `maxRequestsPerCrawl` field from the input schema.
+- The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs each result that is being saved.
19 changes: 15 additions & 4 deletions templates/js-crawlee-cheerio/src/main.js
@@ -7,19 +7,30 @@
 // For more information, see https://docs.apify.com/sdk/js
 import { Actor } from 'apify';
 // For more information, see https://crawlee.dev
-import { CheerioCrawler } from 'crawlee';
-import { router } from './routes.js';
+import { CheerioCrawler, Dataset } from 'crawlee';
 
 // Initialize the Apify SDK
 await Actor.init();
 
-const startUrls = ['https://apify.com'];
+const {
+    startUrls = ['https://crawlee.dev'],
+    maxRequestsPerCrawl = 100,
+} = await Actor.getInput() ?? {};
 
 const proxyConfiguration = await Actor.createProxyConfiguration();
 
 const crawler = new CheerioCrawler({
     proxyConfiguration,
-    requestHandler: router,
+    maxRequestsPerCrawl,
+    async requestHandler({ enqueueLinks, request, $, log }) {
+        log.info('enqueueing new URLs');
+        await enqueueLinks();
+
+        const title = $('title').text();
+        log.info(`${title}`, { url: request.loadedUrl });
+
+        await Dataset.pushData({ url: request.loadedUrl, title });
+    },
 });
 
 await crawler.run(startUrls);
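In the refactored `main.js` above, the inlined `requestHandler` calls `enqueueLinks()` with no options, so it enqueues discovered links using Crawlee's defaults. As a hedged sketch (not part of this commit), the same handler could narrow what gets enqueued with standard `enqueueLinks` options such as `globs`:

```js
// Sketch only: a variant of the template's handler that restricts which links are enqueued.
// The glob pattern below is illustrative; `globs` is a standard Crawlee enqueueLinks option.
import { CheerioCrawler, Dataset } from 'crawlee';

const crawler = new CheerioCrawler({
    async requestHandler({ enqueueLinks, request, $, log }) {
        // Only follow documentation pages instead of every discovered link.
        await enqueueLinks({ globs: ['https://crawlee.dev/docs/**'] });

        const title = $('title').text();
        log.info(title, { url: request.loadedUrl });
        await Dataset.pushData({ url: request.loadedUrl, title });
    },
});

await crawler.run(['https://crawlee.dev']);
```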
21 changes: 0 additions & 21 deletions templates/js-crawlee-cheerio/src/routes.js

This file was deleted.
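The contents of the deleted `routes.js` are not shown in this view. For context, a router module in a Crawlee + Cheerio template typically follows the pattern below; this is a hedged reconstruction using Crawlee's `createCheerioRouter`, not the exact code that was removed:

```js
// Hedged reconstruction of a typical src/routes.js, not the deleted file verbatim.
import { createCheerioRouter, Dataset } from 'crawlee';

export const router = createCheerioRouter();

// The default handler runs for every request without an explicit label.
router.addDefaultHandler(async ({ enqueueLinks, request, $, log }) => {
    log.info('enqueueing new URLs');
    await enqueueLinks();

    const title = $('title').text();
    await Dataset.pushData({ url: request.loadedUrl, title });
});
```

The commit replaces this indirection with the handler defined inline in `main.js`, which is why the `requestHandler: router` wiring disappears from the crawler options.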

6 changes: 3 additions & 3 deletions templates/manifest.json
@@ -222,7 +222,7 @@
         "crawlee",
         "cheerio"
     ],
-    "description": "A scraper example that uses HTTP requests and Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.",
+    "description": "A scraper example that uses Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.",
     "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/js-crawlee-cheerio.zip?raw=true",
     "defaultRunOptions": {
         "build": "latest",
@@ -301,7 +301,7 @@
         "cheerio"
     ],
     "description": "Skeleton project that helps you quickly bootstrap `CheerioCrawler` in JavaScript. It's best for developers who already know Apify SDK and Crawlee.",
-    "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/ts-crawlee-cheerio.zip?raw=true",
+    "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/js-bootstrap-cheerio-crawler.zip?raw=true",
     "defaultRunOptions": {
         "build": "latest",
         "memoryMbytes": 2048,
@@ -326,7 +326,7 @@
         "crawlee",
         "cheerio"
     ],
-    "description": "A scraper example that uses HTTP requests and Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.",
+    "description": "A scraper example that uses Cheerio to parse HTML. It's fast, but it can't run the website's JavaScript or pass JS anti-scraping challenges.",
     "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/ts-crawlee-cheerio.zip?raw=true",
     "defaultRunOptions": {
         "build": "latest",
10 changes: 8 additions & 2 deletions templates/ts-crawlee-cheerio/.actor/input_schema.json
@@ -10,9 +10,15 @@
             "editor": "requestListSources",
             "prefill": [
                 {
-                    "url": "https://apify.com"
+                    "url": "https://crawlee.dev"
                 }
             ]
-        }
+        },
+        "maxRequestsPerCrawl": {
+            "title": "Max Requests per Crawl",
+            "type": "integer",
+            "description": "Maximum number of requests that can be made by this crawler.",
+            "default": 100
+        }
     }
 }
22 changes: 14 additions & 8 deletions templates/ts-crawlee-cheerio/README.md
@@ -1,12 +1,18 @@
-# CheerioCrawler Actor template
+# TypeScript Crawlee & CheerioCrawler Actor template
 
-This template is a production ready boilerplate for developing with `CheerioCrawler`. Use this to bootstrap your projects using the most up-to-date code.
+A template example built with [Crawlee](https://crawlee.dev) to scrape data from a website using [Cheerio](https://cheerio.js.org/) wrapped into [CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler).
 
-> We decided to split Apify SDK into two libraries, [Crawlee](https://crawlee.dev) and [Apify SDK v3](https://docs.apify.com/sdk/js). Crawlee will retain all the crawling and scraping-related tools and will always strive to be the best web scraping library for its community. At the same time, Apify SDK will continue to exist, but keep only the Apify-specific features related to building actors on the Apify platform. Read the [upgrading guide](https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3) to learn about the changes.
+## Included features
 
-If you're looking for examples or want to learn more visit:
+- **[Apify SDK](https://docs.apify.com/sdk/js)** - toolkit for building Actors
+- **[Crawlee](https://crawlee.dev)** - web scraping and browser automation library
+- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and easily validate a schema for your Actor's input
+- **[Dataset](https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-datasets)** - store structured data where each object stored has the same attributes
+- **[Cheerio](https://cheerio.js.org/)** - a fast, flexible & elegant library for parsing and manipulating HTML and XML
 
-- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform)
-- [Cheerio Tutorial](https://crawlee.dev/docs/guides/cheerio-crawler-guide)
-- [Documentation](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler)
-- [Examples](https://crawlee.dev/docs/examples/cheerio-crawler)
+## How it works
+
+This code is a TypeScript script that uses the [Crawlee CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) framework to crawl a website and extract the data from the crawled URLs with Cheerio. It then stores the website titles in a dataset.
+
+- The crawler starts with URLs provided from the input `startUrls` field defined by the input schema. The number of scraped pages is limited by the `maxRequestsPerCrawl` field from the input schema.
+- The crawler uses `requestHandler` for each URL to extract the data from the page with the Cheerio library and to save the title and URL of each page to the dataset. It also logs each result that is being saved.
4 changes: 2 additions & 2 deletions templates/ts-crawlee-cheerio/package.json
@@ -7,8 +7,8 @@
         "node": ">=16.0.0"
     },
     "dependencies": {
-        "apify": "^3.0.0",
-        "crawlee": "^3.0.0"
+        "apify": "^3.1.8",
+        "crawlee": "^3.5.0"
     },
     "devDependencies": {
         "@apify/eslint-config-ts": "^0.2.3",
24 changes: 20 additions & 4 deletions templates/ts-crawlee-cheerio/src/main.ts
@@ -7,19 +7,35 @@
 // For more information, see https://docs.apify.com/sdk/js
 import { Actor } from 'apify';
 // For more information, see https://crawlee.dev
-import { CheerioCrawler } from 'crawlee';
-import { router } from './routes.js';
+import { CheerioCrawler, Dataset } from 'crawlee';
+
+interface Input {
+    startUrls: string[];
+    maxRequestsPerCrawl: number;
+}
 
 // Initialize the Apify SDK
 await Actor.init();
 
-const startUrls = ['https://apify.com'];
+const {
+    startUrls = ['https://crawlee.dev'],
+    maxRequestsPerCrawl = 100,
+} = await Actor.getInput<Input>() ?? {} as Input;
 
 const proxyConfiguration = await Actor.createProxyConfiguration();
 
 const crawler = new CheerioCrawler({
     proxyConfiguration,
-    requestHandler: router,
+    maxRequestsPerCrawl,
+    requestHandler: async ({ enqueueLinks, request, $, log }) => {
+        log.info('enqueueing new URLs');
+        await enqueueLinks();
+
+        const title = $('title').text();
+        log.info(`${title}`, { url: request.loadedUrl });
+
+        await Dataset.pushData({ url: request.loadedUrl, title });
+    },
 });
 
 await crawler.run(startUrls);
21 changes: 0 additions & 21 deletions templates/ts-crawlee-cheerio/src/routes.ts

This file was deleted.
