From cf0d695a067a3e4718fd6c85943abac7ef73bf34 Mon Sep 17 00:00:00 2001 From: Darren Dsouza Date: Mon, 6 May 2024 18:48:38 +0530 Subject: [PATCH 1/6] improved logic --- index.js | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/index.js b/index.js index 2dc37e9..4e07a06 100644 --- a/index.js +++ b/index.js @@ -6,7 +6,7 @@ const path = require('path'); // Function to get dropdown options async function getOptions(page, selector) { return await page.evaluate((selector) => { - let options = Array.from(document.querySelector(selector).options); + const options = Array.from(document.querySelector(selector).options); return options.map(option => ({ text: option.text, value: option.value @@ -14,13 +14,7 @@ async function getOptions(page, selector) { }, selector); } -async function getValue(page, selector) { - return await page.evaluate((selector) => { - const selectedOption = document.querySelector(selector).value; - return selectedOption; - }, selector); -} - +// Function to scrape data async function scrapeData() { try { const browser = await puppeteer.launch({ headless: true }); @@ -39,8 +33,8 @@ async function scrapeData() { // Fetch dropdown options for the year const years = await getOptions(page, '#CPHPage_ddFinyear'); - // Filter out the "-Select Year-" option - const validYears = years.filter(year => year.value !== "-1"); + // Filter out the "-Select Year-" option and the specific year "2024-2025" + const validYears = years.filter(year => year.value !== "-1" && year.text !== "2024-2025"); for (let year of validYears) { const yearFolder = path.join(dataDir, year.text.replace(/[\\/:*?"<>|]/g, '-')); @@ -51,7 +45,7 @@ async function scrapeData() { await page.select('#CPHPage_ddFinyear', year.value); await new Promise(resolve => setTimeout(resolve, 2000)); - // Fetch dropdown options + // Fetch dropdown options for states const states = await getOptions(page, '#CPHPage_ddState'); for (let state of states) { @@ -62,6 +56,7 @@ async function scrapeData() { await page.select('#CPHPage_ddState', state.value); await new Promise(resolve => setTimeout(resolve, 2000)); + let districts = await getOptions(page, '#CPHPage_ddDistrict'); for (let district of districts) { @@ -121,4 +116,3 @@ async function scrapeData() { } scrapeData(); - \ No newline at end of file From 89e502d1f70f36b0c2fc3d06005bd09b3693502d Mon Sep 17 00:00:00 2001 From: Darren Dsouza Date: Mon, 6 May 2024 20:00:20 +0530 Subject: [PATCH 2/6] if table not found proceed to the next table --- index.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index.js b/index.js index 4e07a06..f34822d 100644 --- a/index.js +++ b/index.js @@ -83,8 +83,8 @@ async function scrapeData() { const categoryNameForFile = category.text.replace(/[\\/:*?"<>|]/g, '-'); await page.select('#CPHPage_ddCategory', category.value); await page.click('#CPHPage_btnShow'); // Assume there's a Show button to refresh the data - await new Promise(resolve => setTimeout(resolve, 2000)); - await page.waitForSelector('#tableReportTable'); + await new Promise(resolve => setTimeout(resolve, 5000)); + await page.waitForSelector('#tableReportTable', { timeout: 5000 }).catch(() => console.log('Table not found, proceeding to next category')); const data = await page.evaluate(() => { const rows = Array.from(document.querySelectorAll('#tableReportTable tr')); From 82f1da7a55dfb8759310df23159db0afe8e38cb8 Mon Sep 17 00:00:00 2001 From: Darren Dsouza Date: Mon, 6 May 2024 20:48:17 +0530 Subject: [PATCH 3/6] reduced timeout --- index.js | 8 ++-- index1.js | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 index1.js diff --git a/index.js b/index.js index f34822d..e242b16 100644 --- a/index.js +++ b/index.js @@ -43,7 +43,7 @@ async function scrapeData() { } await page.select('#CPHPage_ddFinyear', year.value); - await new Promise(resolve => setTimeout(resolve, 2000)); + await new Promise(resolve => setTimeout(resolve, 500)); // Fetch dropdown options for states const states = await getOptions(page, '#CPHPage_ddState'); @@ -55,7 +55,7 @@ async function scrapeData() { } await page.select('#CPHPage_ddState', state.value); - await new Promise(resolve => setTimeout(resolve, 2000)); + await new Promise(resolve => setTimeout(resolve, 500)); let districts = await getOptions(page, '#CPHPage_ddDistrict'); @@ -66,7 +66,7 @@ async function scrapeData() { } await page.select('#CPHPage_ddDistrict', district.value); - await new Promise(resolve => setTimeout(resolve, 2000)); + await new Promise(resolve => setTimeout(resolve, 500)); let blocks = await getOptions(page, '#CPHPage_ddBlock'); for (let block of blocks) { @@ -76,7 +76,7 @@ async function scrapeData() { } await page.select('#CPHPage_ddBlock', block.value); - await new Promise(resolve => setTimeout(resolve, 2000)); + await new Promise(resolve => setTimeout(resolve, 500)); let categories = await getOptions(page, '#CPHPage_ddCategory'); for (let category of categories) { diff --git a/index1.js b/index1.js new file mode 100644 index 0000000..8bf4d88 --- /dev/null +++ b/index1.js @@ -0,0 +1,114 @@ +const puppeteer = require('puppeteer'); +const fs = require('fs'); +const { stringify } = require('csv-stringify'); +const path = require('path'); + +// Function to get dropdown options +async function getOptions(page, selector) { + return await page.evaluate((selector) => { + const options = Array.from(document.querySelector(selector).options); + return options.map(option => ({ + text: option.text, + value: option.value + })); + }, selector); +} + +// Function to scrape data +async function scrapeData() { + try { + const browser = await puppeteer.launch({ headless: true }); + const page = await browser.newPage(); + await page.goto('https://ejalshakti.gov.in/JJM/JJMReports/BasicInformation/JJMRep_AbstractData_D.aspx', { waitUntil: 'networkidle0' }); + await page.waitForSelector('#CPHPage_ddFinyear'); // Wait for the first dropdown to appear + + const dataDir = path.join(__dirname, 'data'); + if (!fs.existsSync(dataDir)) { + fs.mkdirSync(dataDir); + } + + const years = await getOptions(page, '#CPHPage_ddFinyear'); + const validYears = years.filter(year => year.value !== "-1" && year.text !== "2024-2025"); + + for (let year of validYears) { + const yearFolder = path.join(dataDir, year.text.replace(/[\\/:*?"<>|]/g, '-')); + if (!fs.existsSync(yearFolder)) { + fs.mkdirSync(yearFolder); + } + + await page.select('#CPHPage_ddFinyear', year.value); + await new Promise(resolve => setTimeout(resolve, 500)); + + const states = await getOptions(page, '#CPHPage_ddState'); + for (let state of states) { + console.log(`Processing State: ${state.text}`); + const stateFolder = path.join(yearFolder, state.text.replace(/[\\/:*?"<>|]/g, '-')); + if (!fs.existsSync(stateFolder)) { + fs.mkdirSync(stateFolder); + } + + await page.select('#CPHPage_ddState', state.value); + await new Promise(resolve => setTimeout(resolve, 500)); + + let districts = await getOptions(page, '#CPHPage_ddDistrict'); + for (let district of districts) { + console.log(` Processing District: ${district.text}`); + const districtFolder = path.join(stateFolder, district.text.replace(/[\\/:*?"<>|]/g, '-')); + if (!fs.existsSync(districtFolder)) { + fs.mkdirSync(districtFolder); + } + + await page.select('#CPHPage_ddDistrict', district.value); + await new Promise(resolve => setTimeout(resolve, 500)); + let blocks = await getOptions(page, '#CPHPage_ddBlock'); + + for (let block of blocks) { + const blockFolder = path.join(districtFolder, block.text.replace(/[\\/:*?"<>|]/g, '-')); + if (!fs.existsSync(blockFolder)) { + fs.mkdirSync(blockFolder); + } + + await page.select('#CPHPage_ddBlock', block.value); + await new Promise(resolve => setTimeout(resolve, 500)); + let categories = await getOptions(page, '#CPHPage_ddCategory'); + + for (let category of categories) { + const categoryNameForFile = category.text.replace(/[\\/:*?"<>|]/g, '-'); + await page.select('#CPHPage_ddCategory', category.value); + await page.click('#CPHPage_btnShow'); + await new Promise(resolve => setTimeout(resolve, 5000)); + + try { + await page.waitForSelector('#tableReportTable', { timeout: 5000 }); + const data = await page.evaluate(() => { + const rows = Array.from(document.querySelectorAll('#tableReportTable tr')); + return rows.map(row => { + const columns = row.querySelectorAll('th, td'); + return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' ')); + }); + }); + + const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`); + stringify(data, (err, output) => { + if (err) throw err; + fs.writeFile(csvFilePath, output, (err) => { + if (err) throw err; + console.log(`Data saved: ${csvFilePath}`); + }); + }); + } catch (error) { + console.log(`Table not found for ${categoryNameForFile}, skipped.`); + } + } + } + } + } + } + + await browser.close(); + } catch (err) { + console.error(`An error occurred: ${err}`); + } +} + +scrapeData(); From d23edc055637d418b696582782996ffb53f512a1 Mon Sep 17 00:00:00 2001 From: Darren Dsouza Date: Mon, 6 May 2024 20:53:33 +0530 Subject: [PATCH 4/6] fix: Reduced timeout --- index1.js | 114 ------------------------------------------------------ 1 file changed, 114 deletions(-) delete mode 100644 index1.js diff --git a/index1.js b/index1.js deleted file mode 100644 index 8bf4d88..0000000 --- a/index1.js +++ /dev/null @@ -1,114 +0,0 @@ -const puppeteer = require('puppeteer'); -const fs = require('fs'); -const { stringify } = require('csv-stringify'); -const path = require('path'); - -// Function to get dropdown options -async function getOptions(page, selector) { - return await page.evaluate((selector) => { - const options = Array.from(document.querySelector(selector).options); - return options.map(option => ({ - text: option.text, - value: option.value - })); - }, selector); -} - -// Function to scrape data -async function scrapeData() { - try { - const browser = await puppeteer.launch({ headless: true }); - const page = await browser.newPage(); - await page.goto('https://ejalshakti.gov.in/JJM/JJMReports/BasicInformation/JJMRep_AbstractData_D.aspx', { waitUntil: 'networkidle0' }); - await page.waitForSelector('#CPHPage_ddFinyear'); // Wait for the first dropdown to appear - - const dataDir = path.join(__dirname, 'data'); - if (!fs.existsSync(dataDir)) { - fs.mkdirSync(dataDir); - } - - const years = await getOptions(page, '#CPHPage_ddFinyear'); - const validYears = years.filter(year => year.value !== "-1" && year.text !== "2024-2025"); - - for (let year of validYears) { - const yearFolder = path.join(dataDir, year.text.replace(/[\\/:*?"<>|]/g, '-')); - if (!fs.existsSync(yearFolder)) { - fs.mkdirSync(yearFolder); - } - - await page.select('#CPHPage_ddFinyear', year.value); - await new Promise(resolve => setTimeout(resolve, 500)); - - const states = await getOptions(page, '#CPHPage_ddState'); - for (let state of states) { - console.log(`Processing State: ${state.text}`); - const stateFolder = path.join(yearFolder, state.text.replace(/[\\/:*?"<>|]/g, '-')); - if (!fs.existsSync(stateFolder)) { - fs.mkdirSync(stateFolder); - } - - await page.select('#CPHPage_ddState', state.value); - await new Promise(resolve => setTimeout(resolve, 500)); - - let districts = await getOptions(page, '#CPHPage_ddDistrict'); - for (let district of districts) { - console.log(` Processing District: ${district.text}`); - const districtFolder = path.join(stateFolder, district.text.replace(/[\\/:*?"<>|]/g, '-')); - if (!fs.existsSync(districtFolder)) { - fs.mkdirSync(districtFolder); - } - - await page.select('#CPHPage_ddDistrict', district.value); - await new Promise(resolve => setTimeout(resolve, 500)); - let blocks = await getOptions(page, '#CPHPage_ddBlock'); - - for (let block of blocks) { - const blockFolder = path.join(districtFolder, block.text.replace(/[\\/:*?"<>|]/g, '-')); - if (!fs.existsSync(blockFolder)) { - fs.mkdirSync(blockFolder); - } - - await page.select('#CPHPage_ddBlock', block.value); - await new Promise(resolve => setTimeout(resolve, 500)); - let categories = await getOptions(page, '#CPHPage_ddCategory'); - - for (let category of categories) { - const categoryNameForFile = category.text.replace(/[\\/:*?"<>|]/g, '-'); - await page.select('#CPHPage_ddCategory', category.value); - await page.click('#CPHPage_btnShow'); - await new Promise(resolve => setTimeout(resolve, 5000)); - - try { - await page.waitForSelector('#tableReportTable', { timeout: 5000 }); - const data = await page.evaluate(() => { - const rows = Array.from(document.querySelectorAll('#tableReportTable tr')); - return rows.map(row => { - const columns = row.querySelectorAll('th, td'); - return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' ')); - }); - }); - - const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`); - stringify(data, (err, output) => { - if (err) throw err; - fs.writeFile(csvFilePath, output, (err) => { - if (err) throw err; - console.log(`Data saved: ${csvFilePath}`); - }); - }); - } catch (error) { - console.log(`Table not found for ${categoryNameForFile}, skipped.`); - } - } - } - } - } - } - - await browser.close(); - } catch (err) { - console.error(`An error occurred: ${err}`); - } -} - -scrapeData(); From f00563bd03a5451fb6dda88cbf93faf2cc5359b1 Mon Sep 17 00:00:00 2001 From: Darren Dsouza Date: Mon, 6 May 2024 21:06:56 +0530 Subject: [PATCH 5/6] logs district when changed --- index.js | 54 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/index.js b/index.js index e242b16..a40b191 100644 --- a/index.js +++ b/index.js @@ -20,20 +20,14 @@ async function scrapeData() { const browser = await puppeteer.launch({ headless: true }); const page = await browser.newPage(); await page.goto('https://ejalshakti.gov.in/JJM/JJMReports/BasicInformation/JJMRep_AbstractData_D.aspx', { waitUntil: 'networkidle0' }); - await page.waitForSelector('#CPHPage_ddFinyear'); // Wait for the first dropdown to appear const dataDir = path.join(__dirname, 'data'); - - // Create the parent directory if it doesn't exist if (!fs.existsSync(dataDir)) { fs.mkdirSync(dataDir); } - // Fetch dropdown options for the year const years = await getOptions(page, '#CPHPage_ddFinyear'); - - // Filter out the "-Select Year-" option and the specific year "2024-2025" const validYears = years.filter(year => year.value !== "-1" && year.text !== "2024-2025"); for (let year of validYears) { @@ -43,30 +37,29 @@ async function scrapeData() { } await page.select('#CPHPage_ddFinyear', year.value); - await new Promise(resolve => setTimeout(resolve, 500)); + await new Promise(resolve => setTimeout(resolve, 2000)); - // Fetch dropdown options for states const states = await getOptions(page, '#CPHPage_ddState'); - for (let state of states) { + console.log(`Processing State: ${state.text}`); const stateFolder = path.join(yearFolder, state.text.replace(/[\\/:*?"<>|]/g, '-')); if (!fs.existsSync(stateFolder)) { fs.mkdirSync(stateFolder); } await page.select('#CPHPage_ddState', state.value); - await new Promise(resolve => setTimeout(resolve, 500)); + await new Promise(resolve => setTimeout(resolve, 2000)); let districts = await getOptions(page, '#CPHPage_ddDistrict'); - for (let district of districts) { + console.log(` Processing District: ${district.text}`); const districtFolder = path.join(stateFolder, district.text.replace(/[\\/:*?"<>|]/g, '-')); if (!fs.existsSync(districtFolder)) { fs.mkdirSync(districtFolder); } await page.select('#CPHPage_ddDistrict', district.value); - await new Promise(resolve => setTimeout(resolve, 500)); + await new Promise(resolve => setTimeout(resolve, 2000)); let blocks = await getOptions(page, '#CPHPage_ddBlock'); for (let block of blocks) { @@ -76,33 +69,36 @@ async function scrapeData() { } await page.select('#CPHPage_ddBlock', block.value); - await new Promise(resolve => setTimeout(resolve, 500)); + await new Promise(resolve => setTimeout(resolve, 2000)); let categories = await getOptions(page, '#CPHPage_ddCategory'); for (let category of categories) { const categoryNameForFile = category.text.replace(/[\\/:*?"<>|]/g, '-'); await page.select('#CPHPage_ddCategory', category.value); - await page.click('#CPHPage_btnShow'); // Assume there's a Show button to refresh the data + await page.click('#CPHPage_btnShow'); await new Promise(resolve => setTimeout(resolve, 5000)); - await page.waitForSelector('#tableReportTable', { timeout: 5000 }).catch(() => console.log('Table not found, proceeding to next category')); - const data = await page.evaluate(() => { - const rows = Array.from(document.querySelectorAll('#tableReportTable tr')); - return rows.map(row => { - const columns = row.querySelectorAll('th, td'); - return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' ')); + try { + await page.waitForSelector('#tableReportTable', { timeout: 5000 }); + const data = await page.evaluate(() => { + const rows = Array.from(document.querySelectorAll('#tableReportTable tr')); + return rows.map(row => { + const columns = row.querySelectorAll('th, td'); + return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' ')); + }); }); - }); - // Save to CSV file - const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`); - stringify(data, (err, output) => { - if (err) throw err; - fs.writeFile(csvFilePath, output, (err) => { + const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`); + stringify(data, (err, output) => { if (err) throw err; - console.log(`${categoryNameForFile}.csv saved in ${block.text} folder.`); + fs.writeFile(csvFilePath, output, (err) => { + if (err) throw err; + console.log(`Data saved: ${csvFilePath}`); + }); }); - }); + } catch (error) { + console.log(`Table not found for ${categoryNameForFile}, skipped.`); + } } } } @@ -111,7 +107,7 @@ async function scrapeData() { await browser.close(); } catch (err) { - console.error(err); + console.error(`An error occurred: ${err}`); } } From 9a97a2b830fdd2cef8dcbbbdc1b42d52d4db3cab Mon Sep 17 00:00:00 2001 From: Darren Dsouza Date: Mon, 6 May 2024 21:13:41 +0530 Subject: [PATCH 6/6] removed unneccesary iteration --- index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.js b/index.js index a40b191..3f053d9 100644 --- a/index.js +++ b/index.js @@ -28,7 +28,7 @@ async function scrapeData() { } const years = await getOptions(page, '#CPHPage_ddFinyear'); - const validYears = years.filter(year => year.value !== "-1" && year.text !== "2024-2025"); + const validYears = years.filter(year => year.value !== "-1"); for (let year of validYears) { const yearFolder = path.join(dataDir, year.text.replace(/[\\/:*?"<>|]/g, '-'));