Skip to content

Commit

Permalink
Update index1.js
Browse files Browse the repository at this point in the history
  • Loading branch information
Savio629 authored May 19, 2024
1 parent ba4a51a commit 0b66041
Showing 1 changed file with 99 additions and 177 deletions.
276 changes: 99 additions & 177 deletions index1.js
Original file line number Diff line number Diff line change
@@ -1,195 +1,117 @@
const puppeteer = require('puppeteer');
const fs = require('fs');
const { stringify } = require('csv-stringify');
const path = require('path');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;

// Read every <option> of the dropdown matched by `selector` on the given
// Puppeteer page and return them as plain { text, value } records.
// The callback runs inside the page, so it may only use its own argument.
async function getOptions(page, selector) {
  const collectOptions = (dropdownSelector) => {
    const dropdown = document.querySelector(dropdownSelector);
    return Array.from(dropdown.options, (entry) => ({
      text: entry.text,
      value: entry.value,
    }));
  };
  const records = await page.evaluate(collectOptions, selector);
  return records;
}
// Command-line arguments after the script path: <startIndex> <endIndex>.
// Both are parsed as base-10 integers; an explicit radix keeps inputs such as
// "0x10" from being read as hexadecimal. Missing or non-numeric values yield
// NaN.
const args = process.argv.slice(2);
const startIndex = Number.parseInt(args[0], 10);
const endIndex = Number.parseInt(args[1], 10);

async function getValue(page, selector) {
return await page.evaluate((selector) => {
const selectedOption = document.querySelector(selector).value;
return selectedOption;
}, selector);
// Reject unusable ranges before launching the browser.
// The district list is later cut with slice(startIndex - 1, endIndex), i.e.
// the indices are 1-based and inclusive, so the smallest valid start is 1:
// a start of 0 would turn into slice(-1, ...) and select the wrong districts.
if (Number.isNaN(startIndex) || Number.isNaN(endIndex) || startIndex < 1 || endIndex < startIndex) {
  console.error('Invalid arguments. Usage: node index.js <startIndex> <endIndex>');
  process.exit(1);
}

async function scrapeData(startDistrict, endDistrict) {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto('https://ejalshakti.gov.in/JJM/JJMReports/BasicInformation/JJMRep_AbstractData_D.aspx', { waitUntil: 'networkidle0' });

await page.waitForSelector('#CPHPage_ddFinyear'); // Wait for the first dropdown to appear
(async () => {
const browser = await puppeteer.launch({ headless: false }); // Set to true to run headless
const page = await browser.newPage();
await page.goto('https://ejalshakti.gov.in/jjm/JJMReports/profiles/rpt_VillageProfile.aspx', { waitUntil: 'networkidle2' });

// Selectors for the dropdowns and show button
// CSS id selectors for the controls this script drives, in the order the
// script selects them: state, district, block, panchayat, village, and then
// the button that is clicked to show the data.
// NOTE(review): `block` uses a lowercase "b" (#CPHPage_ddblock) while other
// code in this file selects '#CPHPage_ddBlock' — confirm the exact casing
// against the page markup.
const selectors = {
state: '#CPHPage_ddState',
district: '#CPHPage_ddDistrict',
block: '#CPHPage_ddblock',
panchayat: '#CPHPage_ddPanchayat',
village: '#CPHPage_ddVillage',
showButton: '#CPHPage_btnShow'
};

// Choose `value` in the <select> matched by `selector`, then pause for
// `waitTime` milliseconds (default 2000) so the page has time to react
// before the caller reads anything from it.
async function selectOption(selector, value, waitTime = 2000) {
  await page.select(selector, value);
  await new Promise((done) => {
    setTimeout(done, waitTime);
  });
}

const dataDir = path.join(__dirname, 'data');
// Function to get dropdown options excluding the default
// Returns the options of the <select> matched by `selector` as
// { value, text } records, leaving out the placeholder entry whose value is
// '-1'. When the dropdown is not present in the page, an empty list is
// returned instead of throwing, so the loops over the result simply have
// nothing to do.
async function getOptions(selector) {
  return page.evaluate((dropdownSelector) => {
    // This callback runs inside the page; it can only use its argument.
    const dropdown = document.querySelector(dropdownSelector);
    if (!dropdown || !dropdown.options) {
      return [];
    }
    return Array.from(dropdown.options)
      .filter((opt) => opt.value !== '-1')
      .map((opt) => ({ value: opt.value, text: opt.text }));
  }, selector);
}

// Create the parent directory if it doesn't exist
if (!fs.existsSync(dataDir)) {
fs.mkdirSync(dataDir);
// Make sure `dir` exists. Nothing happens when the path is already present;
// otherwise the directory is created together with any missing parents.
function createDirectory(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}

// Fetch dropdown options for the year
const years = await getOptions(page, '#CPHPage_ddFinyear');

// Filter out the "-Select Year-" option
const validYears = years.filter(year => year.value === "2024-2025" );

for (let year of validYears) {
const yearFolder = path.join(dataDir, year.text.replace(/[\\/:*?"<>|]/g, '-'));
if (!fs.existsSync(yearFolder)) {
fs.mkdirSync(yearFolder);
}
console.log(year.value);
await page.select('#CPHPage_ddFinyear', year.value);
await new Promise(resolve => setTimeout(resolve, 500));

// Select Odisha from the state dropdown
await page.select('#CPHPage_ddState', '24');
await new Promise(resolve => setTimeout(resolve, 500));

const stateFolder = path.join(yearFolder, 'Odisha');
if (!fs.existsSync(stateFolder)) {
fs.mkdirSync(stateFolder);
}
console.log('Odisha');
await new Promise(resolve => setTimeout(resolve, 500));

let districts = await getOptions(page, '#CPHPage_ddDistrict');
// Save data to CSV
// Writes `data` (an array of flat objects) to <dir>/<filename>, creating
// `dir` first if needed. The column set and order are taken from the keys of
// the first record. When there are no records there is nothing to derive a
// header from, so the call logs a warning and returns without touching disk.
async function saveToCsv(data, dir, filename) {
  const target = path.join(dir, filename);
  if (!Array.isArray(data) || data.length === 0) {
    console.warn(`No records to write for ${target}; skipping.`);
    return;
  }

  createDirectory(dir);
  const csvWriter = createCsvWriter({
    path: target,
    header: Object.keys(data[0]).map((key) => ({ id: key, title: key })),
  });
  await csvWriter.writeRecords(data);
}

// Limit districts based on input arguments
const start = startDistrict || 0;
const end = endDistrict || districts.length - 1;
// Scrape data and save to CSV
// Clicks the show button, waits, reads the population and connection labels
// from the page and writes them to
//   <__dirname>/data/<state>/<district>/<block>/<panchayat>/<village>/
// as population.csv and connection_information.csv.
//
// The five name arguments come from dropdown text, so each one is turned
// into a safe single path segment before being joined: characters that are
// illegal or meaningful in file paths are replaced with '-', and names that
// would be empty, '.' or '..' are replaced with '_' so they cannot collapse
// or escape the data directory.
//
// A label that is not present in the page is recorded as an empty string
// rather than aborting the whole run.
async function scrapeAndSaveData(state, district, block, panchayat, village) {
  await page.click(selectors.showButton);
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait for 5 seconds for data to load

  // Runs inside the page, so it must not reference anything from this scope.
  // Maps each field name to the trimmed text of its label ('' when missing),
  // keeping the key order of `fieldSelectors` so CSV columns stay stable.
  const readLabels = (fieldSelectors) => {
    const values = {};
    for (const field of Object.keys(fieldSelectors)) {
      const label = document.querySelector(fieldSelectors[field]);
      values[field] = label ? label.innerText.trim() : '';
    }
    return values;
  };

  // Scrape population data
  const populationData = await page.evaluate(readLabels, {
    totalPopulation: '#CPHPage_lblToptalPop',
    scPopulation: '#CPHPage_lblSCPop',
    stPopulation: '#CPHPage_lblSTPop',
    genPopulation: '#CPHPage_lblGENPop',
  });

  // Scrape connection information
  const connectionData = await page.evaluate(readLabels, {
    totalHouseholds: '#CPHPage_lblHouseHolds',
    tapConnections: '#CPHPage_lblHouseConnection',
    pwsAvailable: '#CPHPage_lblIsPWS',
    jjmStatus: '#CPHPage_lblvillagestatus',
  });

  const toPathSegment = (name) => {
    const cleaned = String(name).replace(/[\\/:*?"<>|]/g, '-').trim();
    return cleaned === '' || cleaned === '.' || cleaned === '..' ? '_' : cleaned;
  };
  const segments = [state, district, block, panchayat, village].map(toPathSegment);
  const baseDir = path.join(__dirname, 'data', ...segments);

  await saveToCsv([populationData], baseDir, 'population.csv');
  await saveToCsv([connectionData], baseDir, 'connection_information.csv');
}

for (let i = start; i <= end; i++) {
const district = districts[i];
const districtFolder = path.join(stateFolder, district.text.replace(/[\\/:*?"<>|]/g, '-'));
if (!fs.existsSync(districtFolder)) {
fs.mkdirSync(districtFolder);
}
console.log(district.text);
await page.select('#CPHPage_ddDistrict', district.value);
await new Promise(resolve => setTimeout(resolve, 500));
let blocks = await getOptions(page, '#CPHPage_ddBlock');

for (let block of blocks) {
const blockFolder = path.join(districtFolder, block.text.replace(/[\\/:*?"<>|]/g, '-'));
if (!fs.existsSync(blockFolder)) {
fs.mkdirSync(blockFolder);
}

await page.select('#CPHPage_ddBlock', block.value);
await new Promise(resolve => setTimeout(resolve, 500));
let categories = await getOptions(page, '#CPHPage_ddCategory');

for (let category of categories) {
const categoryNameForFile = category.text.replace(/[\\/:*?"<>|]/g, '-');
await page.select('#CPHPage_ddCategory', category.value);
await new Promise(resolve => setTimeout(resolve, 2000));
const radioButtonsPresent = await page.evaluate(() => {
const radioButtons = document.querySelectorAll('#CPHPage_rdbvillages input[type="radio"]');
return radioButtons.length > 0;
});
if (radioButtonsPresent) {
// Handle radio buttons
const radioButtons = await page.evaluate(() => {
const radios = Array.from(document.querySelectorAll('#CPHPage_rdbvillages input[type="radio"]'));
return radios.map(radio => ({
id: radio.id,
value: radio.value,
label: radio.nextElementSibling.textContent.trim() // Fetch text between <label> tags
}));
});

if (radioButtons.length > 0) {
// Handle radio buttons if available
for (let radioButton of radioButtons) {
try {
await page.evaluate((id) => {
const element = document.getElementById(id);
if (element) {
element.click();
} else {
throw new Error(`Element with id ${id} not found.`);
}
}, radioButton.id);

// Click on the "Show" button
await page.click('#CPHPage_btnShow');
await new Promise(resolve => setTimeout(resolve, 6500));

// Wait for the data to load
await page.waitForSelector('#tableReportTable', { timeout: 5000 }).catch(() => console.log('Table not found, proceeding to next radio button'));

const data = await page.evaluate(() => {
const rows = Array.from(document.querySelectorAll('#tableReportTable tr'));
return rows.map(row => {
const columns = row.querySelectorAll('th, td');
return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' '));
});
});

// Save to CSV file using radio button label instead of value
const csvFilePath = path.join(blockFolder, `${radioButton.label}.csv`);
stringify(data, (err, output) => {
if (err) throw err;
fs.writeFile(csvFilePath, output, (err) => {
if (err) throw err;
// console.log(`${radioButton.label}.csv saved in ${block.text} folder.`);
});
});
} catch (error) {
console.error(error);
continue;
}
}
}
} else {
// Handle categories if radio buttons are not present
await page.click('#CPHPage_btnShow');
await new Promise(resolve => setTimeout(resolve, 5000));

// Wait for the data to load
await page.waitForSelector('#tableReportTable', { timeout: 5000 }).catch(() => console.log('Table not found, proceeding to next category'));

const data = await page.evaluate(() => {
const rows = Array.from(document.querySelectorAll('#tableReportTable tr'));
return rows.map(row => {
const columns = row.querySelectorAll('th, td');
return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' '));
});
});

// Save to CSV file using category name instead of value
const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`);
stringify(data, (err, output) => {
if (err) throw err;
fs.writeFile(csvFilePath, output, (err) => {
if (err) throw err;
// console.log(`${categoryNameForFile}.csv saved in ${block.text} folder.`);
});
});
}
}
// Set the state to Odisha
const odishaValue = '24'; // Odisha value from the dropdown
await selectOption(selectors.state, odishaValue, 2000);

// Read the district list once, then keep only the requested range.
// slice(startIndex - 1, endIndex) treats both command-line indices as
// 1-based and inclusive.
const districts = await getOptions(selectors.district);
const districtsToProcess = districts.slice(startIndex - 1, endIndex);

// Walk the hierarchy district -> block -> panchayat -> village. After each
// selection the options of the next dropdown are re-read, and one scrape is
// performed per village.
for (const district of districtsToProcess) {
console.log(`Starting district: ${district.text}`);
await selectOption(selectors.district, district.value, 2000);
const blocks = await getOptions(selectors.block);
for (const block of blocks) {
await selectOption(selectors.block, block.value, 2000);
const panchayats = await getOptions(selectors.panchayat);
for (const panchayat of panchayats) {
await selectOption(selectors.panchayat, panchayat.value, 2000);
const villages = await getOptions(selectors.village);
for (const village of villages) {
await selectOption(selectors.village, village.value, 2000);
// The option texts (not values) are passed on and become the output folder names.
await scrapeAndSaveData('Odisha', district.text, block.text, panchayat.text, village.text);
}
}
}

await browser.close();
} catch (err) {
console.error(err);
}
}

// Extract start and end districts from command line arguments
const startDistrict = parseInt(process.argv[2]);
const endDistrict = parseInt(process.argv[3]);

scrapeData(startDistrict, endDistrict);
await browser.close();
})();

0 comments on commit 0b66041

Please sign in to comment.