diff --git a/.gitignore b/.gitignore index c88585f..9d91fbf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ node_modules -*.json *test.js \ No newline at end of file diff --git a/scripts/scrape.js b/scripts/scrape.js index 02e7a60..e3bfceb 100644 --- a/scripts/scrape.js +++ b/scripts/scrape.js @@ -1,9 +1,9 @@ - var request = require("request") +var request = require("request") ,cheerio = require("cheerio") ,fs = require("fs") ,async = require("async") ,_ = require("underscore") - var config ={ +var config ={ nb_row : 794 ,base_url : '\ http://www.creationdentreprise.sn/rechercher-une-societe?\ @@ -20,56 +20,50 @@ var rx = /]*>([^<]+)<\/a>/ ; //scrapper main function scraper_main(){ - var rows = config.nb_row + var rows = config.nb_row ,urls = []; - while(rows--){ + while(rows--){ urls.push(config.base_url + rows) - } - //End building urls - async.mapSeries(urls, - function(url, cb){ - request(url, - function(err,res, body){ - if(err){ - cb(null,err) - } else { - cb(null,body) - } - }) + } + //End building urls + async.mapSeries(urls,function(url, cb){ + request(url,function(err,res, body){ + if(err){ + cb(null,err) + } else { + cb(null,body) + } + }) }, function(err, results){ data =[] - _.each( - results, - function(result){ - data.push(parseBody(result)) - }) - console.log(data) - store_main(data) - }) + _.each(results,function(result){ + data.push(parseBody(result)) + }) + console.log(data) + store_main(data) + } + ) } //store into file function store_main(parsed_data){ - fs.appendFile('datas/link.json',parsed_data.join("\n"), - function (err) { - if(err){throw err; return;}; - console.log("data was wrote into tmp file") - } - ); + fs.appendFile('datas/link.json',parsed_data.join("\n"), function(err){ + if(err){ + throw err; return; + }; + console.log("data was wrote into tmp file") + }); } // parse body function parseBody(body){ $ = cheerio.load(body) var rows= [] console.log($("div").html()) - $("div .views-table tr").each( - function(idx, html){ + $("div .views-table tr").each(function(idx, html){ var row =[] - $(this).find("td").each( - function(idx , html){ + $(this).find("td").each(function(idx , html){ if(rx3.test($(this).html())){ - raw = config.base_site - + $(this).find("a").attr("href") + raw = config.base_site + $(this).find("a").attr("href") }else if(rx.test($(this).html())){ raw = $(this).html().match(rx)[1] }else if(rx2.test($(this).html())){ @@ -77,15 +71,10 @@ function parseBody(body){ }else{ raw = $(this).html() } - row.push(_.unescape(new String(raw).trim() - .replace(/\n/g, "") - .replace(/,/g," ") - )) - } - ) - rows.push(row.join(',')) - } -) -return rows.join('\n') + row.push(_.unescape(new String(raw).trim().replace(/\n/g, "").replace(/,/g," "))) + }) + rows.push(row.join(',')) + }) + return rows.join('\n') } scraper_main() diff --git a/scripts/scrape_detail.js b/scripts/scrape_detail.js index dee30f6..695b590 100644 --- a/scripts/scrape_detail.js +++ b/scripts/scrape_detail.js @@ -9,31 +9,28 @@ function scraper_details(){ console.log("scraper_details") fs.readFile('datas/link.json', function (err, data) { if (err){throw err} - lines = data.toString().split('\n').filter( - function(line){ return line.length >0 } - ) + lines = data.toString().split('\n').filter(function(line){ + return line.length >0 + }) async.map(lines, function(line, cb){ - request(line.split(",").pop(), - function(err, res, body){ + request(line.split(",").pop(),function(err, res, body){ if(err){ cb(null, [line ,err]) } else { cb(null, [line, body]) } }) - }, - function(err, results){ - data =[] - _.each( - results, - function(result){ + }, + function(err, results){ + data =[] + _.each(results,function(result){ data.push(result[0] + "," + parseBody(result[1]).join(",")) - }) - console.log("++++++++++++++++++++" + data) - store_details(data) + }) + console.log("++++++++++++++++++++" + data) + store_details(data) }) - }); + }); } // parse body function parseBody(body){ @@ -42,27 +39,14 @@ function parseBody(body){ $("div .content ").each( function(idx, html){ //var row =[] - $(this).find(".field").each( - function(idx, html){ - /*$(this).find("div").each( - function(idx , html){ - if ($(this).text().indexOf(":")==-1){ - row.push($(this).text() - .replace(/\n/g, "") - .replace(/,/g," ") - ) - } - } - )*/ - if( $(this).text().indexOf("de commerce:") !=-1 - ||$(this).text().indexOf("Capital:") !=-1){ - console.log($(this).text()) - row.push($(this).text().split(":").pop()) - } - } - ) + $(this).find(".field").each(function(idx, html){ + if( $(this).text().indexOf("de commerce:") !=-1 ||$(this).text().indexOf("Capital:") !=-1){ + console.log($(this).text()) + row.push($(this).text().split(":").pop()) + } + }) }) - return _.unique(row) + return _.unique(row) } //store into file function store_details(data){ @@ -77,11 +61,9 @@ Secteur d activite,\ url,\ Registre de commerce,\ Capital\n') - fs.appendFile('datas/data.csv',data.join("\n"), - function (err) { + fs.appendFile('datas/data.csv',data.join("\n"), function(err){ if(err){throw err; return;}; console.log("data was wrote into tmp file") - } - ); + }); } scraper_details()