Skip to content

Commit

Permalink
feat: Fixes for cheerio + crawlee templates and improved comments (#177)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbartadev authored Aug 17, 2023
1 parent e21ade7 commit 70e21e2
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 25 deletions.
2 changes: 1 addition & 1 deletion templates/js-crawlee-cheerio/.actor/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@
"type": "integer",
"description": "Maximum number of requests that can be made by this crawler.",
"default": 100
},
}
}
}
16 changes: 7 additions & 9 deletions templates/js-crawlee-cheerio/src/main.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
/**
* This template is a production-ready boilerplate for developing with `CheerioCrawler`.
* Use this to bootstrap your projects using the most up-to-date code.
* If you're looking for examples or want to learn more, see README.
*/

// For more information, see https://docs.apify.com/sdk/js
// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
import { CheerioCrawler, Dataset } from 'crawlee';

// Initialize the Apify SDK
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init()
await Actor.init();

// Structure of input is defined in input_schema.json
const {
startUrls = ['https://crawlee.dev'],
maxRequestsPerCrawl = 100,
Expand All @@ -26,14 +22,16 @@ const crawler = new CheerioCrawler({
log.info('enqueueing new URLs');
await enqueueLinks();

// Extract title from the page.
const title = $('title').text();
log.info(`${title}`, { url: request.loadedUrl });

// Save url and title to Dataset - a table-like storage.
await Dataset.pushData({ url: request.loadedUrl, title });
},
});

await crawler.run(startUrls);

// Exit successfully
// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();
6 changes: 2 additions & 4 deletions templates/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,7 @@
},
"skipOptionalDeps": true,
"showcaseFiles": [
"src/main.js",
"src/routes.js"
"src/main.js"
],
"useCases": [
"WEB_SCRAPING"
Expand Down Expand Up @@ -335,8 +334,7 @@
},
"skipOptionalDeps": true,
"showcaseFiles": [
"src/main.ts",
"src/routes.ts"
"src/main.ts"
],
"useCases": [
"WEB_SCRAPING"
Expand Down
2 changes: 1 addition & 1 deletion templates/ts-crawlee-cheerio/.actor/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@
"type": "integer",
"description": "Maximum number of requests that can be made by this crawler.",
"default": 100
},
}
}
}
17 changes: 7 additions & 10 deletions templates/ts-crawlee-cheerio/src/main.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
/**
* This template is a production-ready boilerplate for developing with `CheerioCrawler`.
* Use this to bootstrap your projects using the most up-to-date code.
* If you're looking for examples or want to learn more, see README.
*/

// For more information, see https://docs.apify.com/sdk/js
// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
import { CheerioCrawler, Dataset } from 'crawlee';

interface Input {
startUrls: string[];
maxRequestsPerCrawl: number;
}

// Initialize the Apify SDK
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init()
await Actor.init();

// Structure of input is defined in input_schema.json
const {
startUrls = ['https://crawlee.dev'],
maxRequestsPerCrawl = 100,
Expand All @@ -31,14 +26,16 @@ const crawler = new CheerioCrawler({
log.info('enqueueing new URLs');
await enqueueLinks();

// Extract title from the page.
const title = $('title').text();
log.info(`${title}`, { url: request.loadedUrl });

// Save url and title to Dataset - a table-like storage.
await Dataset.pushData({ url: request.loadedUrl, title });
},
});

await crawler.run(startUrls);

// Exit successfully
// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();

0 comments on commit 70e21e2

Please sign in to comment.