-
Notifications
You must be signed in to change notification settings - Fork 0
/
ok.js
100 lines (83 loc) · 2.94 KB
/
ok.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
// Fetch a single gallery page and extract the project-card links.
// @param {string} url - Gallery page URL to fetch.
// @returns {Promise<string[]>} hrefs of all <a class="block-wrapper-link">
//   elements, or [] when the request or parse fails (best-effort scraping:
//   one bad page must not abort the whole crawl).
async function scrapePage(url) {
try {
const { data: html } = await axios.get(url);
const $ = cheerio.load(html);
// Each project card on Devpost is an <a class="block-wrapper-link"> tile.
return $('a.block-wrapper-link')
.map((_, element) => $(element).attr('href'))
.get();
} catch (error) {
// Report to stderr (not stdout) and continue with an empty result.
console.error(`Failed to scrape ${url}:`, error.message);
return [];
}
}
// Determine how many gallery pages to crawl by reading the `page` query
// parameter from the known last-page URL.
// @param {string} [lastPageUrl] - URL whose `page` query parameter holds the
//   total page count; defaults to the hackathon gallery's last page.
// @returns {Promise<number>} total pages, or 0 when the URL is invalid or
//   carries no numeric `page` parameter.
async function getTotalPages(lastPageUrl = 'https://awspartyrockhackathon.devpost.com/project-gallery?page=61') {
try {
// Use the URL API rather than split('=') so parsing survives extra
// query parameters or fragments in the URL.
const pageParam = new URL(lastPageUrl).searchParams.get('page');
const totalPages = Number.parseInt(pageParam, 10);
// Guard against a missing/non-numeric param leaking NaN to the caller.
return Number.isNaN(totalPages) ? 0 : totalPages;
} catch (error) {
console.error('Could not determine total pages:', error.message);
return 0;
}
}
// Crawl every gallery page and gather all project links into one array.
// @returns {Promise<string[]>} project links from all pages, in page order.
async function scrapeAllPages() {
const totalPages = await getTotalPages();
const baseUrl = 'https://awspartyrockhackathon.devpost.com/project-gallery?page=';
const allLinks = [];
// Fetch pages one at a time, in order, accumulating links as we go.
for (let pageNumber = 1; pageNumber <= totalPages; pageNumber += 1) {
const pageLinks = await scrapePage(`${baseUrl}${pageNumber}`);
allLinks.push(...pageLinks);
}
return allLinks;
}
// Extract the external app links from a single project page's HTML.
// @param {string} html - Raw HTML of a project page.
// @returns {string[]} hrefs found in the software-urls navigation list.
function scrapeLinks(html) {
const $ = cheerio.load(html);
// Devpost lists a project's external URLs under
// nav.app-links > ul[data-role="software-urls"].
return $('nav.app-links ul[data-role="software-urls"] li a')
.map((_, anchor) => $(anchor).attr('href'))
.get();
}
// Orchestrate the crawl: collect all project links, persist them, then visit
// each project page and persist the external app links found there.
async function main() {
try {
// Scrape project links from every gallery page.
const allLinks = await scrapeAllPages();
// Await the write so failures surface in this try/catch instead of being
// thrown from a fire-and-forget callback (where `throw err` would become
// an unhandled exception).
await fs.promises.writeFile('all_links.json', JSON.stringify(allLinks, null, 2));
console.log('All links saved to all_links.json');
// Visit each project page and collect its external app links.
const allFormattedLinks = [];
for (const link of allLinks) {
console.log('Scraping links from:', link);
try {
const { data: html } = await axios.get(link);
allFormattedLinks.push(...scrapeLinks(html));
} catch (error) {
// One unreachable project page shouldn't abort the whole run.
console.error(`Failed to fetch ${link}:`, error.message);
}
}
// Persist the collected app links once the loop completes.
await fs.promises.writeFile('formatted_links.json', JSON.stringify(allFormattedLinks, null, 2));
console.log('Formatted links saved to formatted_links.json');
} catch (error) {
console.error(error);
}
}
main();