Skip to content

Commit

Permalink
feat: Fixes for cheerio + crawlee templates and improved comments (#177)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbartadev authored Aug 17, 2023
1 parent e21ade7 commit 70e21e2
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 25 deletions.
2 changes: 1 addition & 1 deletion templates/js-crawlee-cheerio/.actor/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@
"type": "integer",
"description": "Maximum number of requests that can be made by this crawler.",
"default": 100
},
}
}
}
16 changes: 7 additions & 9 deletions templates/js-crawlee-cheerio/src/main.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
/**
* This template is a production-ready boilerplate for developing with `CheerioCrawler`.
* Use this to bootstrap your projects using the most up-to-date code.
* If you're looking for examples or want to learn more, see README.
*/

// For more information, see https://docs.apify.com/sdk/js
// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
import { CheerioCrawler, Dataset } from 'crawlee';

// Initialize the Apify SDK
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init()
await Actor.init();

// Structure of input is defined in input_schema.json
const {
startUrls = ['https://crawlee.dev'],
maxRequestsPerCrawl = 100,
Expand All @@ -26,14 +22,16 @@ const crawler = new CheerioCrawler({
log.info('enqueueing new URLs');
await enqueueLinks();

// Extract title from the page.
const title = $('title').text();
log.info(`${title}`, { url: request.loadedUrl });

// Save url and title to Dataset - a table-like storage.
await Dataset.pushData({ url: request.loadedUrl, title });
},
});

await crawler.run(startUrls);

// Exit successfully
// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();
6 changes: 2 additions & 4 deletions templates/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,7 @@
},
"skipOptionalDeps": true,
"showcaseFiles": [
"src/main.js",
"src/routes.js"
"src/main.js"
],
"useCases": [
"WEB_SCRAPING"
Expand Down Expand Up @@ -335,8 +334,7 @@
},
"skipOptionalDeps": true,
"showcaseFiles": [
"src/main.ts",
"src/routes.ts"
"src/main.ts"
],
"useCases": [
"WEB_SCRAPING"
Expand Down
2 changes: 1 addition & 1 deletion templates/ts-crawlee-cheerio/.actor/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@
"type": "integer",
"description": "Maximum number of requests that can be made by this crawler.",
"default": 100
},
}
}
}
17 changes: 7 additions & 10 deletions templates/ts-crawlee-cheerio/src/main.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
/**
* This template is a production-ready boilerplate for developing with `CheerioCrawler`.
* Use this to bootstrap your projects using the most up-to-date code.
* If you're looking for examples or want to learn more, see README.
*/

// For more information, see https://docs.apify.com/sdk/js
// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
import { CheerioCrawler, Dataset } from 'crawlee';

interface Input {
startUrls: string[];
maxRequestsPerCrawl: number;
}

// Initialize the Apify SDK
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init()
await Actor.init();

// Structure of input is defined in input_schema.json
const {
startUrls = ['https://crawlee.dev'],
maxRequestsPerCrawl = 100,
Expand All @@ -31,14 +26,16 @@ const crawler = new CheerioCrawler({
log.info('enqueueing new URLs');
await enqueueLinks();

// Extract title from the page.
const title = $('title').text();
log.info(`${title}`, { url: request.loadedUrl });

// Save url and title to Dataset - a table-like storage.
await Dataset.pushData({ url: request.loadedUrl, title });
},
});

await crawler.run(startUrls);

// Exit successfully
// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();

0 comments on commit 70e21e2

Please sign in to comment.