I am writing a web crawler in Node.js with the request module and Redis as the URL cache.
What I am trying to accomplish is an endless crawler loop that gets a URL from Redis and makes an HTTPS request.
I have tried to add a limit on parallel requests, for example 10 parallel requests.
I do not know why, but no matter whether I choose 10 or 1000, the outcome is always the same: within one minute only between 100 and 200 requests get processed.
Please give me a hint about where I am going wrong. I would like to process 60,000 requests (URLs) per minute, but even if I set the limit to 10,000 parallel requests, it still processes only between 100 and 200 requests per minute.
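For scale: 60,000 requests per minute is 1,000 per second, so with a limit of 10 in flight each request would have to finish in about 10 ms on average, and with 1,000 in flight in about one second. The 100 to 200 requests per minute I am actually seeing work out to only 2 to 3 requests per second.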
My code is below.

the setup and crawler loop:
var request = require('request');
var redis = require('redis');

var client = redis.createClient();
client.select(2); // the URL cache lives in Redis database 2

var limit = 1000;  // maximum number of requests in flight
var running = 0;   // requests currently in flight

// top up to `limit` in-flight requests; every finished request triggers a refill
function loop() {
    while (running < limit) {
        running++;
        req(function () {
            running--;
            loop();
        });
    }
}

loop();
the request function:

// pick a random URL from Redis, delete it so it is not crawled twice,
// then fetch it
function req(callback) {
    client.RANDOMKEY(function (err, url) {
        if (err || url == null || url === "") {
            return callback();
        }
        client.del(url);
        request(url, function (error, response, body) {
            return callback();
        });
    });
}
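For completeness, this is roughly how I measure the throughput (a simplified sketch of my measurement code; `processed` is just a counter I increment in the request callback):

var processed = 0; // incremented with processed++ inside the request callback in req()

// log and reset the counter once per minute
setInterval(function () {
    console.log(processed + ' requests processed in the last minute');
    processed = 0;
}, 60 * 1000);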
What I have tried:
I tried to play with setTimeout to fire more concurrent requests, but no matter what I tried I never got more than 200 requests processed in a minute. I cannot believe that this is the limit Node.js is capable of.
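One of the setTimeout variants looked roughly like this (reconstructed from memory; the batch size of 100 and the 100 ms delay are just example values I played with):

// fire a batch of requests, then schedule the next batch with setTimeout
function burst() {
    for (var i = 0; i < 100 && running < limit; i++) {
        running++;
        req(function () {
            running--;
        });
    }
    setTimeout(burst, 100);
}
burst();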