Skip to content

Commit

Permalink
indent
Browse files Browse the repository at this point in the history
  • Loading branch information
aliounedia committed Feb 7, 2014
1 parent cf8b65f commit a142fe0
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 86 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
node_modules
*.json
*test.js
81 changes: 35 additions & 46 deletions scripts/scrape.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
var request = require("request")
var request = require("request")
,cheerio = require("cheerio")
,fs = require("fs")
,async = require("async")
,_ = require("underscore")
var config ={
var config ={
nb_row : 794
,base_url : '\
http://www.creationdentreprise.sn/rechercher-une-societe?\
Expand All @@ -20,72 +20,61 @@ var rx = /<a[^>]*>([^<]+)<\/a>/
;
/**
 * Scraper entry point.
 *
 * Builds one listing URL per page index (config.base_url + index, for
 * indexes config.nb_row - 1 down to 0), downloads the pages one at a time
 * with async.mapSeries, runs parseBody on every downloaded page and passes
 * the collected results to store_main.
 */
function scraper_main(){
  var rows = config.nb_row
    , urls = [];

  // Build the page URLs; `rows` counts down so the last page comes first.
  while (rows--) {
    urls.push(config.base_url + rows)
  }

  async.mapSeries(urls, function(url, cb){
    request(url, function(err, res, body){
      if (err) {
        // Best effort: a failed page must not abort the whole run, but the
        // error object must not be handed to parseBody as if it were HTML.
        console.error("request failed for " + url + ": " + err)
        cb(null, null)
      } else {
        cb(null, body)
      }
    })
  },
  function(err, results){
    if (err) { throw err }

    // Local on purpose: the original assigned to an implicit global.
    var data = []
    _.each(results, function(result){
      // Skip pages whose download failed (marked with null above).
      if (result == null) { return }
      data.push(parseBody(result))
    })
    console.log(data)
    store_main(data)
  })
}
/**
 * Appends the parsed listing data to datas/link.json.
 *
 * Entries are joined with "\n" and a trailing newline is added so that a
 * later append starts on a fresh line instead of being glued to the last
 * record of this one.
 *
 * @param {Array} parsed_data  Entries to write, one per array element.
 * @throws Rethrows, from inside the callback, any error reported by
 *         fs.appendFile.
 */
function store_main(parsed_data){
  var text = parsed_data.length ? parsed_data.join("\n") + "\n" : ""

  fs.appendFile('datas/link.json', text, function(err){
    if (err) {
      throw err
    }
    console.log("data was wrote into tmp file")
  })
}
/**
 * Converts the result table of one listing page into CSV-like text.
 *
 * Every <tr> matched by "div .views-table tr" becomes one line whose <td>
 * cells are joined with ",". For each cell the value is chosen as follows:
 *   - cell HTML matches rx3: config.base_site + the href of the cell's link
 *   - cell HTML matches rx:  first capture group of rx
 *   - cell HTML matches rx2: first capture group of rx2
 *   - otherwise:             the cell's raw HTML
 * The value is then trimmed, stripped of newlines, has its commas replaced
 * by spaces (so they cannot break the column layout) and is HTML-unescaped.
 *
 * NOTE(review): rx2, rx3 and config.base_site are defined outside the part
 * of the file visible here; confirm what they match before changing the
 * branch order.
 *
 * @param {string} body  HTML of the page.
 * @returns {string} One line per table row, joined with "\n".
 */
function parseBody(body){
  // Local on purpose: the original leaked `$` and `raw` as implicit globals.
  var $ = cheerio.load(body)
    , rows = []

  // Debug output kept from the original.
  console.log($("div").html())

  $("div .views-table tr").each(function(){
    var row = []

    $(this).find("td").each(function(){
      var cell = $(this)
        , html = cell.html()   // read once instead of once per test
        , raw

      if (rx3.test(html)) {
        raw = config.base_site + cell.find("a").attr("href")
      } else if (rx.test(html)) {
        raw = html.match(rx)[1]
      } else if (rx2.test(html)) {
        raw = html.match(rx2)[1]
      } else {
        raw = html
      }

      // String(raw) coerces without creating a String wrapper object.
      row.push(_.unescape(String(raw).trim().replace(/\n/g, "").replace(/,/g, " ")))
    })

    rows.push(row.join(','))
  })

  return rows.join('\n')
}
// Entry point: start the listing scraper as soon as this script is loaded.
scraper_main()
60 changes: 21 additions & 39 deletions scripts/scrape_detail.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,28 @@ function scraper_details(){
console.log("scraper_details")
fs.readFile('datas/link.json', function (err, data) {
if (err){throw err}
lines = data.toString().split('\n').filter(
function(line){ return line.length >0 }
)
lines = data.toString().split('\n').filter(function(line){
return line.length >0
})
async.map(lines,
function(line, cb){
request(line.split(",").pop(),
function(err, res, body){
request(line.split(",").pop(),function(err, res, body){
if(err){
cb(null, [line ,err])
} else {
cb(null, [line, body])
}
})
},
function(err, results){
data =[]
_.each(
results,
function(result){
},
function(err, results){
data =[]
_.each(results,function(result){
data.push(result[0] + "," + parseBody(result[1]).join(","))
})
console.log("++++++++++++++++++++" + data)
store_details(data)
})
console.log("++++++++++++++++++++" + data)
store_details(data)
})
});
});
}
// parse body
function parseBody(body){
Expand All @@ -42,27 +39,14 @@ function parseBody(body){
$("div .content ").each(
function(idx, html){
//var row =[]
$(this).find(".field").each(
function(idx, html){
/*$(this).find("div").each(
function(idx , html){
if ($(this).text().indexOf(":")==-1){
row.push($(this).text()
.replace(/\n/g, "")
.replace(/,/g," ")
)
}
}
)*/
if( $(this).text().indexOf("de commerce:") !=-1
||$(this).text().indexOf("Capital:") !=-1){
console.log($(this).text())
row.push($(this).text().split(":").pop())
}
}
)
$(this).find(".field").each(function(idx, html){
if( $(this).text().indexOf("de commerce:") !=-1 ||$(this).text().indexOf("Capital:") !=-1){
console.log($(this).text())
row.push($(this).text().split(":").pop())
}
})
})
return _.unique(row)
return _.unique(row)
}
//store into file
function store_details(data){
Expand All @@ -77,11 +61,9 @@ Secteur d activite,\
url,\
Registre de commerce,\
Capital\n')
fs.appendFile('datas/data.csv',data.join("\n"),
function (err) {
fs.appendFile('datas/data.csv',data.join("\n"), function(err){
if(err){throw err; return;};
console.log("data was wrote into tmp file")
}
);
});
}
// Entry point: start the detail scraper as soon as this script is loaded.
scraper_details()

0 comments on commit a142fe0

Please sign in to comment.