-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
99 additions
and
177 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,195 +1,117 @@ | ||
const puppeteer = require('puppeteer'); | ||
const fs = require('fs'); | ||
const { stringify } = require('csv-stringify'); | ||
const path = require('path'); | ||
const createCsvWriter = require('csv-writer').createObjectCsvWriter; | ||
|
||
/**
 * Reads every option of the dropdown matched by `selector`.
 *
 * @param {object} page - Puppeteer page; the callback passed to
 *   `page.evaluate` runs with the page's `document` in scope.
 * @param {string} selector - CSS selector of the dropdown element.
 * @returns {Promise<Array<{text: string, value: string}>>} One entry per
 *   option in DOM order; an empty array when nothing matches `selector` or
 *   the matched element exposes no `options`.
 */
async function getOptions(page, selector) {
  return page.evaluate((sel) => {
    const dropdown = document.querySelector(sel);
    // A dropdown that is not (yet) on the page yields an empty list instead
    // of a TypeError raised from reading `.options` of null.
    if (!dropdown || !dropdown.options) {
      return [];
    }
    return Array.from(dropdown.options).map((option) => ({
      text: option.text,
      value: option.value,
    }));
  }, selector);
}
// Command-line arguments: <startIndex> <endIndex>, parsed as base-10 integers.
// Either value is NaN when its argument is missing or does not start with digits.
const args = process.argv.slice(2);
const startIndex = Number.parseInt(args[0], 10);
const endIndex = Number.parseInt(args[1], 10);
|
||
async function getValue(page, selector) { | ||
return await page.evaluate((selector) => { | ||
const selectedOption = document.querySelector(selector).value; | ||
return selectedOption; | ||
}, selector); | ||
// Continue only when 0 <= startIndex <= endIndex. Every comparison involving
// NaN is false, so a missing or non-numeric argument is rejected here as well.
if (!(startIndex >= 0 && endIndex >= startIndex)) {
  console.error('Invalid arguments. Usage: node index.js <startIndex> <endIndex>');
  process.exit(1);
}
|
||
async function scrapeData(startDistrict, endDistrict) { | ||
try { | ||
const browser = await puppeteer.launch({ headless: true }); | ||
const page = await browser.newPage(); | ||
await page.goto('https://ejalshakti.gov.in/JJM/JJMReports/BasicInformation/JJMRep_AbstractData_D.aspx', { waitUntil: 'networkidle0' }); | ||
|
||
await page.waitForSelector('#CPHPage_ddFinyear'); // Wait for the first dropdown to appear | ||
(async () => { | ||
const browser = await puppeteer.launch({ headless: false }); // Set to true to run headless | ||
const page = await browser.newPage(); | ||
await page.goto('https://ejalshakti.gov.in/jjm/JJMReports/profiles/rpt_VillageProfile.aspx', { waitUntil: 'networkidle2' }); | ||
|
||
// Selectors for the dropdowns and show button | ||
const selectors = { | ||
state: '#CPHPage_ddState', | ||
district: '#CPHPage_ddDistrict', | ||
block: '#CPHPage_ddblock', | ||
panchayat: '#CPHPage_ddPanchayat', | ||
village: '#CPHPage_ddVillage', | ||
showButton: '#CPHPage_btnShow' | ||
}; | ||
|
||
/**
 * Chooses `value` in the dropdown matched by `selector`, then pauses.
 *
 * @param {string} selector - CSS selector of the dropdown.
 * @param {string} value - Option value to select.
 * @param {number} [waitTime=2000] - Pause in milliseconds after selecting.
 * @returns {Promise<void>} Resolves once the pause has elapsed.
 */
async function selectOption(selector, value, waitTime = 2000) {
  await page.select(selector, value);
  // Fixed pause that gives the page time to react to the new selection.
  const pause = new Promise((done) => {
    setTimeout(done, waitTime);
  });
  await pause;
}
|
||
const dataDir = path.join(__dirname, 'data'); | ||
/**
 * Lists the entries of the dropdown matched by `selector`, leaving out any
 * option whose value is '-1' (the default entry).
 *
 * @param {string} selector - CSS selector of the dropdown.
 * @returns {Promise<Array<{value: string, text: string}>>} Remaining options
 *   in DOM order; an empty array when nothing matches `selector` or the
 *   matched element exposes no `options`.
 */
async function getOptions(selector) {
  return page.evaluate((sel) => {
    const dropdown = document.querySelector(sel);
    // A dropdown that is not (yet) on the page yields an empty list instead
    // of a TypeError raised from reading `.options` of null.
    if (!dropdown || !dropdown.options) {
      return [];
    }
    return Array.from(dropdown.options)
      .filter((opt) => opt.value !== '-1')
      .map((opt) => ({ value: opt.value, text: opt.text }));
  }, selector);
}
|
||
// Create the parent directory if it doesn't exist | ||
if (!fs.existsSync(dataDir)) { | ||
fs.mkdirSync(dataDir); | ||
/**
 * Ensures that `dir` exists, creating any missing parent directories too.
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 * @throws {Error} When the path cannot be created, e.g. because a path
 *   component already exists as a regular file.
 */
function createDirectory(dir) {
  // With `recursive: true`, mkdirSync succeeds when the directory is already
  // there, so a separate existsSync check is redundant and only opens a
  // window between the check and the creation.
  fs.mkdirSync(dir, { recursive: true });
}
|
||
// Fetch dropdown options for the year | ||
const years = await getOptions(page, '#CPHPage_ddFinyear'); | ||
|
||
// Filter out the "-Select Year-" option | ||
const validYears = years.filter(year => year.value === "2024-2025" ); | ||
|
||
for (let year of validYears) { | ||
const yearFolder = path.join(dataDir, year.text.replace(/[\\/:*?"<>|]/g, '-')); | ||
if (!fs.existsSync(yearFolder)) { | ||
fs.mkdirSync(yearFolder); | ||
} | ||
console.log(year.value); | ||
await page.select('#CPHPage_ddFinyear', year.value); | ||
await new Promise(resolve => setTimeout(resolve, 500)); | ||
|
||
// Select Odisha from the state dropdown | ||
await page.select('#CPHPage_ddState', '24'); | ||
await new Promise(resolve => setTimeout(resolve, 500)); | ||
|
||
const stateFolder = path.join(yearFolder, 'Odisha'); | ||
if (!fs.existsSync(stateFolder)) { | ||
fs.mkdirSync(stateFolder); | ||
} | ||
console.log('Odisha'); | ||
await new Promise(resolve => setTimeout(resolve, 500)); | ||
|
||
let districts = await getOptions(page, '#CPHPage_ddDistrict'); | ||
/**
 * Writes `data` to the CSV file `filename` inside `dir`, creating `dir` first.
 *
 * The header is the union of the keys of all records, in first-seen order;
 * each key serves as both column id and column title.
 *
 * @param {Array<object>} data - Records to write, one CSV row per record.
 * @param {string} dir - Target directory.
 * @param {string} filename - File name inside `dir`.
 * @returns {Promise<void>} Resolves when the records have been written, or
 *   immediately (nothing created) when `data` is empty or missing.
 */
async function saveToCsv(data, dir, filename) {
  // No records means there is no first row to derive a header from.
  if (!data || data.length === 0) {
    console.warn(`No records to write to ${filename}; skipping.`);
    return;
  }

  createDirectory(dir);

  // A Set keeps insertion order, so columns appear in first-seen order and
  // keys present only in later records still get a column.
  const columns = new Set();
  for (const record of data) {
    for (const key of Object.keys(record)) {
      columns.add(key);
    }
  }

  const csvWriter = createCsvWriter({
    path: path.join(dir, filename),
    header: Array.from(columns, (key) => ({ id: key, title: key })),
  });
  await csvWriter.writeRecords(data);
}
|
||
// Limit districts based on input arguments | ||
const start = startDistrict || 0; | ||
const end = endDistrict || districts.length - 1; | ||
/**
 * Clicks the "show" button, waits for the report, reads the population and
 * connection labels from the page and stores each group as a one-row CSV
 * (`population.csv`, `connection_information.csv`) under
 * `data/<state>/<district>/<block>/<panchayat>/<village>/` next to this script.
 *
 * @param {string} state - State name used as a directory name.
 * @param {string} district - District name used as a directory name.
 * @param {string} block - Block name used as a directory name.
 * @param {string} panchayat - Panchayat name used as a directory name.
 * @param {string} village - Village name used as a directory name.
 * @returns {Promise<void>} Resolves once both CSV files have been saved.
 */
async function scrapeAndSaveData(state, district, block, panchayat, village) {
  // CSV column name -> selector of the label holding its value.
  const populationLabels = {
    totalPopulation: '#CPHPage_lblToptalPop',
    scPopulation: '#CPHPage_lblSCPop',
    stPopulation: '#CPHPage_lblSTPop',
    genPopulation: '#CPHPage_lblGENPop',
  };
  const connectionLabels = {
    totalHouseholds: '#CPHPage_lblHouseHolds',
    tapConnections: '#CPHPage_lblHouseConnection',
    pwsAvailable: '#CPHPage_lblIsPWS',
    jjmStatus: '#CPHPage_lblvillagestatus',
  };

  // The names come from dropdown text. Characters that are path separators or
  // invalid in file names are replaced so each name stays one path segment;
  // empty, "." and ".." names get a placeholder so they cannot alter the path.
  const toPathSegment = (name) => {
    const cleaned = String(name).replace(/[\\/:*?"<>|]/g, '-');
    const trimmed = cleaned.trim();
    return trimmed === '' || trimmed === '.' || trimmed === '..' ? 'unnamed' : cleaned;
  };

  // Reads the trimmed text of every label in one round trip. The callback is
  // executed by page.evaluate, so it only uses its own argument and `document`.
  const readLabels = async (labelSelectors) => {
    const values = await page.evaluate((selectorsByField) => {
      const result = {};
      Object.keys(selectorsByField).forEach((field) => {
        const element = document.querySelector(selectorsByField[field]);
        result[field] = element ? element.innerText.trim() : null;
      });
      return result;
    }, labelSelectors);

    // A label missing from the page is recorded as an empty cell and reported,
    // rather than aborting the whole run with a TypeError.
    const missing = Object.keys(values).filter((field) => values[field] === null);
    if (missing.length > 0) {
      console.warn(`Missing labels for ${village}: ${missing.join(', ')}`);
      missing.forEach((field) => {
        values[field] = '';
      });
    }
    return values;
  };

  await page.click(selectors.showButton);
  await new Promise((resolve) => setTimeout(resolve, 5000)); // Wait for 5 seconds for data to load

  const populationData = await readLabels(populationLabels);
  const connectionData = await readLabels(connectionLabels);

  const segments = [state, district, block, panchayat, village].map(toPathSegment);
  const baseDir = path.join(__dirname, 'data', ...segments);
  await saveToCsv([populationData], baseDir, 'population.csv');
  await saveToCsv([connectionData], baseDir, 'connection_information.csv');
}
|
||
for (let i = start; i <= end; i++) { | ||
const district = districts[i]; | ||
const districtFolder = path.join(stateFolder, district.text.replace(/[\\/:*?"<>|]/g, '-')); | ||
if (!fs.existsSync(districtFolder)) { | ||
fs.mkdirSync(districtFolder); | ||
} | ||
console.log(district.text); | ||
await page.select('#CPHPage_ddDistrict', district.value); | ||
await new Promise(resolve => setTimeout(resolve, 500)); | ||
let blocks = await getOptions(page, '#CPHPage_ddBlock'); | ||
|
||
for (let block of blocks) { | ||
const blockFolder = path.join(districtFolder, block.text.replace(/[\\/:*?"<>|]/g, '-')); | ||
if (!fs.existsSync(blockFolder)) { | ||
fs.mkdirSync(blockFolder); | ||
} | ||
|
||
await page.select('#CPHPage_ddBlock', block.value); | ||
await new Promise(resolve => setTimeout(resolve, 500)); | ||
let categories = await getOptions(page, '#CPHPage_ddCategory'); | ||
|
||
for (let category of categories) { | ||
const categoryNameForFile = category.text.replace(/[\\/:*?"<>|]/g, '-'); | ||
await page.select('#CPHPage_ddCategory', category.value); | ||
await new Promise(resolve => setTimeout(resolve, 2000)); | ||
const radioButtonsPresent = await page.evaluate(() => { | ||
const radioButtons = document.querySelectorAll('#CPHPage_rdbvillages input[type="radio"]'); | ||
return radioButtons.length > 0; | ||
}); | ||
if (radioButtonsPresent) { | ||
// Handle radio buttons | ||
const radioButtons = await page.evaluate(() => { | ||
const radios = Array.from(document.querySelectorAll('#CPHPage_rdbvillages input[type="radio"]')); | ||
return radios.map(radio => ({ | ||
id: radio.id, | ||
value: radio.value, | ||
label: radio.nextElementSibling.textContent.trim() // Fetch text between <label> tags | ||
})); | ||
}); | ||
|
||
if (radioButtons.length > 0) { | ||
// Handle radio buttons if available | ||
for (let radioButton of radioButtons) { | ||
try { | ||
await page.evaluate((id) => { | ||
const element = document.getElementById(id); | ||
if (element) { | ||
element.click(); | ||
} else { | ||
throw new Error(`Element with id ${id} not found.`); | ||
} | ||
}, radioButton.id); | ||
|
||
// Click on the "Show" button | ||
await page.click('#CPHPage_btnShow'); | ||
await new Promise(resolve => setTimeout(resolve, 6500)); | ||
|
||
// Wait for the data to load | ||
await page.waitForSelector('#tableReportTable', { timeout: 5000 }).catch(() => console.log('Table not found, proceeding to next radio button')); | ||
|
||
const data = await page.evaluate(() => { | ||
const rows = Array.from(document.querySelectorAll('#tableReportTable tr')); | ||
return rows.map(row => { | ||
const columns = row.querySelectorAll('th, td'); | ||
return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' ')); | ||
}); | ||
}); | ||
|
||
// Save to CSV file using radio button label instead of value | ||
const csvFilePath = path.join(blockFolder, `${radioButton.label}.csv`); | ||
stringify(data, (err, output) => { | ||
if (err) throw err; | ||
fs.writeFile(csvFilePath, output, (err) => { | ||
if (err) throw err; | ||
// console.log(`${radioButton.label}.csv saved in ${block.text} folder.`); | ||
}); | ||
}); | ||
} catch (error) { | ||
console.error(error); | ||
continue; | ||
} | ||
} | ||
} | ||
} else { | ||
// Handle categories if radio buttons are not present | ||
await page.click('#CPHPage_btnShow'); | ||
await new Promise(resolve => setTimeout(resolve, 5000)); | ||
|
||
// Wait for the data to load | ||
await page.waitForSelector('#tableReportTable', { timeout: 5000 }).catch(() => console.log('Table not found, proceeding to next category')); | ||
|
||
const data = await page.evaluate(() => { | ||
const rows = Array.from(document.querySelectorAll('#tableReportTable tr')); | ||
return rows.map(row => { | ||
const columns = row.querySelectorAll('th, td'); | ||
return Array.from(columns, column => column.innerText.trim().replace(/\n/g, ' ')); | ||
}); | ||
}); | ||
|
||
// Save to CSV file using category name instead of value | ||
const csvFilePath = path.join(blockFolder, `${categoryNameForFile}.csv`); | ||
stringify(data, (err, output) => { | ||
if (err) throw err; | ||
fs.writeFile(csvFilePath, output, (err) => { | ||
if (err) throw err; | ||
// console.log(`${categoryNameForFile}.csv saved in ${block.text} folder.`); | ||
}); | ||
}); | ||
} | ||
} | ||
// Set the state to Odisha
const odishaValue = '24'; // Odisha value from the dropdown
await selectOption(selectors.state, odishaValue, 2000);

// startIndex/endIndex are used as 1-based, inclusive positions in the district
// list. The lower bound is clamped because a startIndex of 0 would otherwise
// become slice(-1, endIndex), which counts from the END of the list.
const districts = await getOptions(selectors.district);
const districtsToProcess = districts.slice(Math.max(startIndex - 1, 0), endIndex);
|
||
for (const district of districtsToProcess) { | ||
console.log(`Starting district: ${district.text}`); | ||
await selectOption(selectors.district, district.value, 2000); | ||
const blocks = await getOptions(selectors.block); | ||
for (const block of blocks) { | ||
await selectOption(selectors.block, block.value, 2000); | ||
const panchayats = await getOptions(selectors.panchayat); | ||
for (const panchayat of panchayats) { | ||
await selectOption(selectors.panchayat, panchayat.value, 2000); | ||
const villages = await getOptions(selectors.village); | ||
for (const village of villages) { | ||
await selectOption(selectors.village, village.value, 2000); | ||
await scrapeAndSaveData('Odisha', district.text, block.text, panchayat.text, village.text); | ||
} | ||
} | ||
} | ||
|
||
await browser.close(); | ||
} catch (err) { | ||
console.error(err); | ||
} | ||
} | ||
|
||
// Extract start and end districts from command line arguments | ||
const startDistrict = parseInt(process.argv[2]); | ||
const endDistrict = parseInt(process.argv[3]); | ||
|
||
scrapeData(startDistrict, endDistrict); | ||
await browser.close(); | ||
})(); |