Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduced Timeout #3

Merged
merged 8 commits into from
May 6, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 24 additions & 34 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,28 @@ const path = require('path');
/**
 * Collect the entries of a `<select>` element as plain `{ text, value }` objects.
 *
 * The callback passed to `page.evaluate` runs in the page context, so it may
 * only use what it receives as arguments (here: the selector string) and
 * browser globals such as `document`.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)` (a Puppeteer page in this script).
 * @param {string} selector - CSS selector of the element whose `options` are read.
 * @returns {Promise<Array<{text: string, value: string}>>} One entry per option, in document order.
 *   Rejects if no element matches `selector` (the in-page lookup dereferences `null`).
 */
async function getOptions(page, selector) {
  return await page.evaluate((selector) => {
    // Single declaration only: the previous text declared `options` twice
    // (`let` and `const`) in the same scope, which is a SyntaxError.
    const options = Array.from(document.querySelector(selector).options);
    return options.map((option) => ({
      text: option.text,
      value: option.value,
    }));
  }, selector);
}

/**
 * Read the current `value` property of the first element matching `selector`.
 *
 * The lookup runs through `page.evaluate`, i.e. in the page context, and
 * receives the selector as its only argument.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)` (a Puppeteer page in this script).
 * @param {string} selector - CSS selector of the element to inspect.
 * @returns {Promise<*>} Whatever the matched element's `value` property holds.
 *   Rejects if no element matches `selector`.
 */
async function getValue(page, selector) {
  // Named callback instead of an inline arrow; it only touches its argument
  // and the `document` global, so it is safe to run inside the page.
  const readElementValue = (css) => {
    const element = document.querySelector(css);
    return element.value;
  };

  const currentValue = await page.evaluate(readElementValue, selector);
  return currentValue;
}

// Function to scrape data
async function scrapeData() {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto('https://ejalshakti.gov.in/JJM/JJMReports/BasicInformation/JJMRep_AbstractData_D.aspx', { waitUntil: 'networkidle0' });

await page.waitForSelector('#CPHPage_ddFinyear'); // Wait for the first dropdown to appear

const dataDir = path.join(__dirname, 'data');

// Create the parent directory if it doesn't exist
if (!fs.existsSync(dataDir)) {
fs.mkdirSync(dataDir);
}

// Fetch dropdown options for the year
const years = await getOptions(page, '#CPHPage_ddFinyear');

// Filter out the "-Select Year-" option
const validYears = years.filter(year => year.value !== "-1");

for (let year of validYears) {
Expand All @@ -51,20 +39,20 @@ async function scrapeData() {
await page.select('#CPHPage_ddFinyear', year.value);
await new Promise(resolve => setTimeout(resolve, 2000));

// Fetch dropdown options
const states = await getOptions(page, '#CPHPage_ddState');

for (let state of states) {
console.log(`Processing State: ${state.text}`);
const stateFolder = path.join(yearFolder, state.text.replace(/[\\/:*?"<>|]/g, '-'));
if (!fs.existsSync(stateFolder)) {
fs.mkdirSync(stateFolder);
}

await page.select('#CPHPage_ddState', state.value);
await new Promise(resolve => setTimeout(resolve, 2000));

let districts = await getOptions(page, '#CPHPage_ddDistrict');

for (let district of districts) {
console.log(` Processing District: ${district.text}`);
const districtFolder = path.join(stateFolder, district.text.replace(/[\\/:*?"<>|]/g, '-'));
if (!fs.existsSync(districtFolder)) {
fs.mkdirSync(districtFolder);
Expand All @@ -87,27 +75,30 @@ async function scrapeData() {
for (let category of categories) {
const categoryNameForFile = category.text.replace(/[\\/:*?"<>|]/g, '-');
await page.select('#CPHPage_ddCategory', category.value);
await page.click('#CPHPage_btnShow'); // Assume there's a Show button to refresh the data
await page.click('#CPHPage_btnShow');
await new Promise(resolve => setTimeout(resolve, 5000));
await page.waitForSelector('#tableReportTable');

const data = await page.evaluate(() => {
const rows = Array.from(document.querySelectorAll('#tableReportTable tr'));
return rows.map(row => {
const columns = row.querySelectorAll('th, td');
return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' '));
try {
await page.waitForSelector('#tableReportTable', { timeout: 5000 });
const data = await page.evaluate(() => {
const rows = Array.from(document.querySelectorAll('#tableReportTable tr'));
return rows.map(row => {
const columns = row.querySelectorAll('th, td');
return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' '));
});
});
});

// Save to CSV file
const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`);
stringify(data, (err, output) => {
if (err) throw err;
fs.writeFile(csvFilePath, output, (err) => {
const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`);
stringify(data, (err, output) => {
if (err) throw err;
console.log(`${categoryNameForFile}.csv saved in ${block.text} folder.`);
fs.writeFile(csvFilePath, output, (err) => {
if (err) throw err;
console.log(`Data saved: ${csvFilePath}`);
});
});
});
} catch (error) {
console.log(`Table not found for ${categoryNameForFile}, skipped.`);
}
}
}
}
Expand All @@ -116,9 +107,8 @@ async function scrapeData() {

await browser.close();
} catch (err) {
console.error(err);
console.error(`An error occurred: ${err}`);
}
}

scrapeData();

Loading