Friday, November 19, 2021

Web scraping in Node.js recursively with Promise.all

I have created a recursive function for web scraping in Node.js, and for the scraping part I call Promise.all recursively.

The problem is that after all the data has been processed by getScrappingData I can see the final result in the terminal with console.log, but I am not able to return that result from getScrappingData; the function just keeps looping.

I don't know why, but when I try to return the result, control never reaches the calling function.

Can anyone help me with this? What is wrong in my code?
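
For context, this is roughly the shape of the recursion I am describing. It is a simplified sketch only: fetchHtml and extractLinks are placeholder names, not the real functions in my code below.

// Simplified sketch of the recursive Promise.all crawl described above.
// fetchHtml and extractLinks are placeholders for my real getBody and getAllURls.
const visited = new Set();

const crawl = async (url, depth = 0) => {
  if (visited.has(url) || depth > 1) return [];
  visited.add(url);
  const html = await fetchHtml(url); // download the page body
  const links = extractLinks(html); // collect nested URLs from the page
  // Recurse into every nested URL in parallel and wait for all of them
  const nested = await Promise.all(links.map((u) => crawl(u, depth + 1)));
  return [url, ...nested.flat()];
};

My real code below does the same kind of fan-out, but it accumulates results in module-level state (maper, scrappedData, getData) instead of returning them from each call.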

import axios from 'axios';
import jsdom from 'jsdom';
import CommonUtils from '../utils/common';
import {
  MESSAGE, REQUEST_TYPE, NUMBER,
} from '../utils/constant';

const { JSDOM } = jsdom;
// Module-level state shared and mutated across the recursive calls below
const maper = {};
const docs = [];
let scrappingCount = 0;
const scrappedData = [];
const landGrabberService = {};
let nestedUrls = [];
const newAddedUrl = [];
let getData = false;
const preDocs = [];
let preHead;
let preImg;
// Fetch the raw HTML of a URL; resolves with response.data when the request returns 200
const getBody = async (url) => {
  const options = {
    url,
    method: 'GET',
  };

  try {
    axios.defaults.headers.post['Content-Type'] = 'application/x-www-form-urlencoded';
    axios.defaults.transformResponse = [];
    const promise = axios(options);
    const pro = new Promise(((resolve, reject) => {
      promise.then((response) => {
        if (response.status === 200) {
          resolve(response.data);
        }
      }).catch((err) => 0);
    }));
    return pro;
  } catch (error) {
    console.log('42');
    return 0;
  }
};

// Parse the HTML with JSDOM and collect anchor hrefs, image URLs and the page title
const getAllURls = async (html, baseUrl) => {
  const doc = new JSDOM(html);
  const redirectUrl = [];
  const imgUrl = [];
  let docHead;
  try {
    doc.window.document.querySelectorAll('a').forEach((link) => {
      const regrx = new RegExp(/(^https|Home|showpublisheddocument|^http?:\/\/[^\s]+)/g);
      if (regrx.test(link.href)) {
        redirectUrl.push(link.href);
      }
    });
    doc.window.document.querySelectorAll('img').forEach((image) => {
      const regexMedia = new RegExp(/(https|http)/g);
      if (regexMedia.test(image.src)) {
        imgUrl.push(image.src);
      } else {
        imgUrl.push(baseUrl + image.src);
      }
    });
    docHead = doc.window?.document?.querySelector('title')?.textContent;
    return { anchorUrl: redirectUrl, imgUrl, header: docHead };
  } catch (error) {
    return CommonUtils.throwError(error);
  }
};

// Recursively scrape a URL: follow nested links and collect matching docs into scrappedData
const getScrappingData = async (reqData) => {
  try {
    const {
      search, url,
    } = reqData;

    if (url) {
      const urlDetails = new URL(url);
      const baseUrl = urlDetails.host;
      const htmlBody = await getBody(url);
      if (htmlBody) {
        const urls = await getAllURls(htmlBody, baseUrl);
        const imgTags = urls.imgUrl;
        nestedUrls = urls.anchorUrl;
        const docHead = urls.header;
        const key = docHead?.split('|');
        nestedUrls.map((u) => {
          if (!(u in maper) && !maper[u]) {
            maper[u] = false;
          }
          return maper[u];
        });
        const notInclude = ['pdf', 'facebook', 'google', 'instagram', 'twitter', 'youtube', 'mailto', 'showpublisheddocument', 'recruit', 'home', 'Home', 'html'];
        const allowMedia = ['showpublisheddocument', 'pdf', 'Home'];
        const promise = nestedUrls.map(async (u) => {
          if (!notInclude.some((el) => u.includes(el))) {
            // CONDITION FOR WEB URL
            if (u in maper && !maper[u] && scrappingCount < 1) {
              maper[u] = true;
              const prepareData = {
                url: u,
                search,
              };
              await getScrappingData(prepareData);
            }
          } else if (allowMedia.some((el) => u.includes(el))) {
            // CONDITION FOR MEDIA DOCS
            console.log('------------------------------------------->', key[0]);
            if (key[NUMBER.ZERO].includes(search)) {
              const relativePath = new RegExp(/(^https|^http)/g);
              if (relativePath.test(u)) {
                docs.push(u);
              } else {
                docs.push(baseUrl + u);
              }
              newAddedUrl.push(url);
              if (preDocs.length === 0) {
                preDocs.push(1);
                preHead = docHead;
                preImg = imgTags;
              }
              newAddedUrl.forEach((x) => {
                if (x !== url && scrappingCount < 1) {
                  scrappingCount += 1;
                  scrappedData.push({
                    title: preHead,
                    domain: x,
                    pdf: docs,
                    images: preImg,
                  });
                  docs.length = 0;
                }
              });
            }
          }
          if ((nestedUrls.length - 1 === nestedUrls.indexOf(u)) && scrappingCount === 1 && !getData) {
            scrappedData.push({
              title: docHead,
              domain: url,
              pdf: docs,
              images: imgTags,
            });
            console.log('-------------------EXIT-----------------------------');
            getData = true;
          }
        });
        const result = await Promise.all(promise);
        if (result) {
          if (getData) {
            console.log(scrappedData);
            return scrappedData;
          }
        }
      } else {
        for (let i = 0; i < nestedUrls.length; i += 1) {
          if (nestedUrls[i] === url) {
            const data = {
              url: nestedUrls[i + 1], search,
            };
            // getScrappingData(data);
            break;
          }
        }
      }
    } else {
      return { message: MESSAGE.INVALID_URI };
    }
    if (getData) {
      console.log(scrappedData);
      return scrappedData;
    }
  } catch (error) {
    console.log(error);
    return CommonUtils.throwError(error);
  }
};

/**
 * @description Get landGrabber scraping data
 * @returns {object} return object
 */
const mainFun = async (scrappingData) => {
  try {
    const { search, type } = scrappingData;
    const googleUrls = [];
    let returnObject;
    let { url } = scrappingData;
    const prepareData = {
      url, search, type,
    };
    returnObject = await getScrappingData(prepareData);
    console.log('-----------------------FINAL RESULT------------------------');
    
    console.log('EXIT----->');
    return returnObject;
  } catch (err) {
    return CommonUtils.throwError(err);
  }
};
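
For reference, this is roughly how I invoke it. The values are placeholders and I have omitted my actual route handler; mainFun is not exported above, this is just the shape of the call.

// Hypothetical invocation with placeholder arguments
mainFun({
  url: 'https://www.example.com', // placeholder URL
  search: 'budget', // placeholder search term
  type: 'web', // placeholder type value
}).then((result) => {
  console.log(result); // I expect scrappedData here, but nothing ever comes back
});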



