vendredi 26 novembre 2021

How can I do web crawling successfully with NodeJS?

I want to write a Node.js script that helps me crawl all kinds of sites. This is what I've tried so far (I found this script):

const Crawler = require("crawler");

// Hrefs that crawlAllUrls has already scheduled, used to avoid queueing the same link twice.
const obselete = [];

// Single Crawler instance (default options) onto which crawlAllUrls queues every request.
const c = new Crawler();

/**
 * Queues `url` on the shared crawler `c` and, once the page has been fetched,
 * schedules a crawl of every link on it that has not been seen before and
 * whose href starts with `url`.
 *
 * Scheduled hrefs are recorded in the shared `obselete` array. Relative hrefs
 * never pass the `startsWith(url)` filter, so they are not followed.
 *
 * @param {string} url - URL of the page to crawl; also the prefix that every
 *   followed link must start with.
 */
function crawlAllUrls(url) {
  // Delay, in milliseconds, before each discovered link is queued.
  const followDelayMs = 5000;

  c.queue({
    uri: url,
    callback: function (err, res, done) {
      try {
        if (err) {
          // Report the failure instead of throwing: a throw here would skip
          // done() and take the whole crawl down with one bad request.
          console.error(`Request for ${url} failed.`, err);
          return;
        }

        // `res.$` is missing when the response could not be parsed as HTML;
        // calling it then throws and is reported by the catch below.
        const $ = res.$;
        const anchors = $("a");
        console.log(res);

        // Walk only the indexed entries of the selection, not its other properties.
        for (const node of Array.from(anchors)) {
          if (!node || node.type !== 'tag') {
            continue;
          }

          const rawHref = node.attribs && node.attribs.href;
          if (typeof rawHref !== 'string') {
            continue;
          }

          // Trim before the checks so "…/a " and "…/a" count as the same link.
          const href = rawHref.trim();
          if (!href || obselete.includes(href) || !href.startsWith(url)) {
            continue;
          }

          obselete.push(href);
          // `href` starts with `url`, so it is already a full address and can
          // be queued as-is after the delay.
          setTimeout(function () {
            crawlAllUrls(href);
          }, followDelayMs);
        }
      } catch (e) {
        console.error(`Encountered an error crawling ${url}. Aborting crawl.`);
      } finally {
        // Signal completion exactly once, whether the page was processed,
        // the request failed, or parsing threw.
        done();
      }
    }
  });
}

// Start the crawl from a single seed URL; only links whose href starts with this URL are followed.
crawlAllUrls('https://www.amazon.com/Roku-Streaming-Device-Vision-Controls/dp/B09BKCDXZC/ref=lp_16225007011_1_2');

Note: This script, as it is now, works perfectly! My issue arises when I want to get web content from another kind of site (obviously with a different URL): sites that use the AngularJS `ng-app` directive (see this screenshot from Mozilla Firefox's inspect mode: https://www.dropbox.com/s/pylmv0ge11u00ws/img5.PNG?dl=0 ). You can see in the image that this website uses AngularJS to handle requests, which return data in JSON format; that raw JSON data is what I need. I've censored the site name because it is a governmental public-auctions website. I want to know whether it is possible to crawl that site and, if it isn't, why not. Does the site use some crawl-blocking method, or is it because of the website's security (an encrypted TLS/SSL connection)?

Thanks!




Aucun commentaire:

Enregistrer un commentaire