I want to write a Node.js script that helps me crawl all kinds of websites. This is what I've tried so far (I found this script):
const Crawler = require("crawler");
// URLs that have already been picked up for crawling; consulted before a
// link is followed so the same address is not queued twice.
const obselete = [];

// Single crawler instance through which every request is queued.
const c = new Crawler();
/**
 * Fetches `url` and schedules a crawl of every link found on that page whose
 * resolved address starts with `url` and has not been seen before.
 *
 * Depends on two names declared outside this function: `c`, whose `queue()`
 * method receives the request, and `obselete`, the array of URLs that were
 * already seen.
 *
 * @param {string} url - Absolute URL of the page to crawl; it is also the
 *   prefix a discovered link must share in order to be followed.
 * @returns {void}
 */
function crawlAllUrls(url) {
  // Delay before a discovered link is handed to the crawler. Every link found
  // on one page gets the same delay, so they are all queued together.
  const CRAWL_DELAY_MS = 5000;

  // Record the page itself so links pointing back to it are not crawled again.
  if (!obselete.includes(url)) {
    obselete.push(url);
  }

  c.queue({
    uri: url,
    callback(err, res, done) {
      try {
        if (err) {
          // Report the failure instead of throwing out of the callback, which
          // would skip done() and surface as an uncaught exception.
          console.error(`Failed to fetch ${url}:`, err);
          return;
        }

        console.log(res);

        const $ = res.$;
        if (typeof $ !== 'function') {
          // No selector function means there are no anchors to extract
          // (presumably the response was not parsed as HTML - TODO confirm
          // against the crawler documentation).
          return;
        }

        // Array.from() walks only the indexed entries of the selection, not
        // its other own properties.
        for (const anchor of Array.from($('a'))) {
          if (!anchor || anchor.type !== 'tag' || !anchor.attribs) {
            continue;
          }
          const rawHref = anchor.attribs.href;
          if (!rawHref) {
            continue;
          }

          let target;
          try {
            // Resolve relative links against the current page; plain string
            // concatenation would produce invalid addresses.
            const resolved = new URL(rawHref.trim(), url);
            // A fragment addresses a position inside the same document.
            resolved.hash = '';
            target = resolved.href;
          } catch (parseError) {
            // Malformed href: nothing to follow.
            continue;
          }

          if (!target.startsWith(url) || obselete.includes(target)) {
            continue;
          }
          obselete.push(target);
          setTimeout(() => crawlAllUrls(target), CRAWL_DELAY_MS);
        }
      } catch (e) {
        console.error(`Encountered an error crawling ${url}. Aborting crawl.`, e);
      } finally {
        // Exactly one done() per callback, whatever path was taken.
        done();
      }
    },
  });
}
// Entry point: start the crawl from a single seed page.
const seedUrl = 'https://www.amazon.com/Roku-Streaming-Device-Vision-Controls/dp/B09BKCDXZC/ref=lp_16225007011_1_2';
crawlAllUrls(seedUrl);
Note: This script currently works perfectly. My issue arises when I want to get web content from other kinds of sites (at a different URL, obviously): sites that use AngularJS's ng-app directive (see this screenshot from Firefox's inspector: https://www.dropbox.com/s/pylmv0ge11u00ws/img5.PNG?dl=0 ). You can see in the image that this website uses AngularJS to handle requests that return data in JSON format; that raw JSON data is what I need. I've censored the site name because it is a governmental public-auctions website. I want to know whether it is possible to crawl that site and, if not, why not: does the site use some crawl-blocking method, or is it because of website security (an encrypted TLS/SSL connection)?
Thanks!
Aucun commentaire:
Enregistrer un commentaire