I have created a recursive function for web scraping in Node.js, and for the scraping part I have used Promise.all recursively.
But the problem is that after processing all the data via getScrappingData I can see the final result in the terminal with console.log, but I am not able to return the result from the getScrappingData function — it keeps running in a loop.
I don't know why, but when I try to return the result, control never reaches the calling function.
Can anyone please help me with this? What is wrong in my code?
import axios from 'axios';
import jsdom from 'jsdom';
import CommonUtils from '../utils/common';
import {
MESSAGE, REQUEST_TYPE, NUMBER,
} from '../utils/constant';
const { JSDOM } = jsdom;
// --- Shared module-level mutable state -------------------------------------
// NOTE(review): everything below is read and mutated by getScrappingData
// across its recursive calls, and nothing ever resets it between requests.
// A second call to mainFun in the same process will therefore see stale
// data from the first. Consider moving this state into a per-request
// context object passed down the recursion — TODO confirm with callers.
// Visited-URL map: url -> true once a URL has been (or is being) scraped.
const maper = {};
// Media/document URLs (pdf etc.) collected for the page currently processed.
const docs = [];
// Guard counter: recursion stops descending once this reaches 1.
let scrappingCount = 0;
// Accumulated final results; this is what getScrappingData ultimately returns.
const scrappedData = [];
// NOTE(review): declared but never used in the visible code — verify before removing.
const landGrabberService = {};
// Anchor URLs of the most recently scraped page. Reassigned (not appended)
// by every recursive call, so outer frames observe the innermost page's list.
let nestedUrls = [];
// Pages on which at least one matching media document was found.
const newAddedUrl = [];
// Completion flag: set once the final result has been pushed to scrappedData.
let getData = false;
// One-shot latch (length 0 -> 1) used to capture the first page's header/images.
const preDocs = [];
// Title of the first page that produced a media match.
let preHead;
// Image URLs of the first page that produced a media match.
let preImg;
/**
 * Fetch the raw HTML body of a URL.
 *
 * Fixes the original's promise bugs: the hand-rolled `new Promise` wrapper
 * only resolved on HTTP 200 and its `.catch((err) => 0)` swallowed failures
 * without ever rejecting, so on any error or non-200 status the returned
 * promise never settled and callers awaited forever (the reported
 * "running in loop" symptom). Wrapping an existing promise in `new Promise`
 * is also the explicit-construction anti-pattern; plain `await` suffices.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string|number>} The response body on HTTP 200, or the
 *   sentinel 0 on any failure or non-200 status (the sentinel the original
 *   error path already used, so callers' `if (htmlBody)` checks still work).
 */
const getBody = async (url) => {
  try {
    axios.defaults.headers.post['Content-Type'] = 'application/x-www-form-urlencoded';
    // Disable default JSON parsing so the body comes back as a raw string.
    axios.defaults.transformResponse = [];
    const response = await axios({ url, method: 'GET' });
    return response.status === 200 ? response.data : 0;
  } catch (error) {
    console.log('42');
    return 0;
  }
};
/**
 * Parse an HTML document and collect anchor URLs, image URLs and the page title.
 *
 * @param {string} html - Raw HTML to parse with JSDOM.
 * @param {string} baseUrl - Host used to absolutize relative image paths.
 *   NOTE(review): this is a bare host (from `new URL(url).host`, no scheme),
 *   so `baseUrl + image.src` yields e.g. "example.com/img.png" without
 *   "https://" — confirm that is intended before changing it.
 * @returns {Promise<{anchorUrl: string[], imgUrl: string[], header: (string|undefined)}>}
 *   Collected URLs and the <title> text (undefined when absent).
 */
const getAllURls = async (html, baseUrl) => {
  const doc = new JSDOM(html);
  const redirectUrl = [];
  const imgUrl = [];
  let docHead;
  // Hoisted out of the loops: the original rebuilt each RegExp per element
  // and used the /g flag, which makes RegExp#test stateful (lastIndex
  // advances between calls) — a latent bug for repeated membership checks.
  // Neither the per-element construction nor the flag is needed here.
  const anchorPattern = /(^https|Home|showpublisheddocument|^http?:\/\/[^\s]+)/;
  const absolutePattern = /(https|http)/;
  try {
    // Keep anchors that look like absolute links or known document routes.
    doc.window.document.querySelectorAll('a').forEach((link) => {
      if (anchorPattern.test(link.href)) {
        redirectUrl.push(link.href);
      }
    });
    // Collect image sources, prefixing relative paths with the host.
    doc.window.document.querySelectorAll('img').forEach((image) => {
      if (absolutePattern.test(image.src)) {
        imgUrl.push(image.src);
      } else {
        imgUrl.push(baseUrl + image.src);
      }
    });
    docHead = doc.window?.document?.querySelector('title')?.textContent;
    return { anchorUrl: redirectUrl, imgUrl, header: docHead };
  } catch (error) {
    return CommonUtils.throwError(error);
  }
};
/**
 * Recursively scrape a URL and its discovered links, accumulating results
 * into the module-level `scrappedData` array.
 *
 * NOTE(review): this function communicates almost entirely through shared
 * module-level state (`maper`, `docs`, `nestedUrls`, `scrappedData`,
 * `getData`, `scrappingCount`, `newAddedUrl`, `preDocs`, `preHead`,
 * `preImg`). The return value of each recursive call is discarded (see the
 * `await getScrappingData(prepareData)` below), so results only reach the
 * caller via `scrappedData` once the `getData` flag has been set. If any
 * awaited promise inside the recursion never settles (e.g. getBody's
 * unsettled promise on an HTTP error), `Promise.all` below never resolves
 * and this function never returns — the likely cause of the reported hang.
 *
 * @param {{url: string, search: string}} reqData - Target URL and search term.
 * @returns {Promise<object[]|{message: string}|undefined>} The accumulated
 *   scrape results once `getData` is set, an invalid-URI message when `url`
 *   is falsy, or undefined when the recursion unwinds before completion.
 */
const getScrappingData = async (reqData) => {
try {
const {
search, url,
} = reqData;
if (url) {
const urlDetails = new URL(url);
// NOTE(review): host only, no scheme — affects the baseUrl + path joins below.
const baseUrl = urlDetails.host;
const htmlBody = await getBody(url);
if (htmlBody) {
const urls = await getAllURls(htmlBody, baseUrl);
const imgTags = urls.imgUrl;
// Reassigns the MODULE-LEVEL list: after a recursive call returns, this
// frame's later reads of `nestedUrls` see the inner page's links, not its own.
nestedUrls = urls.anchorUrl;
const docHead = urls.header;
const key = docHead?.split('|');
// Seed unseen URLs as "not yet visited".
// NOTE(review): .map is used purely for side effects here; forEach would
// express the intent (left unchanged to keep the block byte-identical).
nestedUrls.map((u) => {
if (!(u in maper) && !maper[u]) {
maper[u] = false;
}
return maper[u];
});
const notInclude = ['pdf', 'facebook', 'google', 'instagram', 'twitter', 'youtube', 'mailto', 'showpublisheddocument', 'recruit', 'home', 'Home', 'html'];
const allowMedia = ['showpublisheddocument', 'pdf', 'Home'];
// Kick off one async task per discovered URL; all run concurrently and
// race on the shared module state above.
const promise = nestedUrls.map(async (u) => {
if (!notInclude.some((el) => u.includes(el))) {
// CONDITION FOR WEB URL
if (u in maper && !maper[u] && scrappingCount < 1) {
maper[u] = true;
const prepareData = {
url: u,
search,
};
// NOTE(review): the recursive result is discarded — only the shared
// state carries data back up the call chain.
await getScrappingData(prepareData);
}
} else if (allowMedia.some((el) => u.includes(el))) {
// CONDITION FOR MEDIA DOCS
console.log('------------------------------------------->', key[0]);
// NOTE(review): `key` is undefined when the page has no <title>; this
// indexing would then throw into the outer catch — confirm acceptable.
if (key[NUMBER.ZERO].includes(search)) {
const relativePath = new RegExp(/(^https|^http)/g);
if (relativePath.test(u)) {
docs.push(u);
} else {
docs.push(baseUrl + u);
}
newAddedUrl.push(url);
// One-shot latch: remember the first matching page's title and images.
if (preDocs.length === 0) {
preDocs.push(1);
preHead = docHead;
preImg = imgTags;
}
newAddedUrl.forEach((x) => {
if (x !== url && scrappingCount < 1) {
scrappingCount += 1;
scrappedData.push({
title: preHead,
domain: x,
pdf: docs,
images: preImg,
});
// NOTE(review): `docs` was pushed by reference and is cleared right
// after — the pushed entry's pdf list is emptied too. Verify intended.
docs.length = 0;
}
});
}
}
// Last-element check signals completion; relies on `nestedUrls` still
// being this frame's list, which recursion may have reassigned.
if ((nestedUrls.length - 1 === nestedUrls.indexOf(u)) && scrappingCount === 1 && !getData) {
scrappedData.push({
title: docHead,
domain: url,
pdf: docs,
images: imgTags,
});
console.log('-------------------EXIT-----------------------------');
getData = true;
}
});
// Hangs forever if any task's awaited promise never settles.
const result = await Promise.all(promise);
if (result) {
if (getData) {
console.log(scrappedData);
return scrappedData;
}
}
} else {
// Body fetch failed: try the URL after this one in the shared list.
// NOTE(review): the recursive continuation is commented out, so this
// branch currently does nothing but scan and break.
for (let i = 0; i < nestedUrls.length; i += 1) {
if (nestedUrls[i] === url) {
const data = {
url: nestedUrls[i + 1], search,
};
// getScrappingData(data);
break;
}
}
}
} else {
return { message: MESSAGE.INVALID_URI };
}
// Final exit path for frames that reach here after `getData` was set.
if (getData) {
console.log(scrappedData);
return scrappedData;
}
} catch (error) {
console.log(error);
return CommonUtils.throwError(error);
}
};
/**
 * @description Get landGrabber scrapping data
 * @param {{search: string, url: string, type: string}} scrappingData - request
 *   payload; `url` is the page to scrape, `search` the term to match against
 *   page titles, `type` is forwarded unchanged to getScrappingData.
 * @returns {object} return object - whatever getScrappingData resolves to
 *   (the accumulated results array, an invalid-URI message, or undefined).
 */
const mainFun = async (scrappingData) => {
  try {
    // Removed the unused `googleUrls` local and collapsed the reassignable
    // `let` declarations — nothing here is ever mutated after assignment.
    const { search, type, url } = scrappingData;
    const prepareData = { url, search, type };
    const returnObject = await getScrappingData(prepareData);
    console.log('-----------------------FINAL RESULT------------------------');
    console.log('EXIT----->');
    return returnObject;
  } catch (err) {
    return CommonUtils.throwError(err);
  }
};
Aucun commentaire:
Enregistrer un commentaire