Wednesday, November 17, 2021

How to store data into an array from forEach in a recursive function

Here is my code. When I execute it, the result is null: the function does not wait for the forEach inside the recursive function to complete. return scrappedData gives null, but it should return all the data scraped by the recursive calls. A sketch of one possible fix follows the listing.

import axios from 'axios';
import jsdom from 'jsdom';

const {
    JSDOM
} = jsdom;
const maper = {};
const docs = [];
let scrappingCount = 0;
let port = 0;
const scrappedData = [];
const landGrabberService = {};
// GET HTML BODY
const getBody = async (url) => {

    const options = {
        url,
        method: 'GET',
    };

    axios.defaults.headers.post['Content-Type'] = 'application/x-www-form-urlencoded';
    const promise = axios(options);
    const dataPromise = promise.then((response) => response.data);
    promise.catch((error) => {
        console.log('ERROR');
        console.log(error.errno);
    });
    return dataPromise;
};
// GET ALL ANCHORS
const getAllURls = async (html, baseUrl) => {
    const doc = new JSDOM(html);
    const redirectUrl = [];
    const imgUrl = [];
    let docHead;
    try {
        doc.window.document.querySelectorAll('a').forEach((link) => {
            const regrx = new RegExp(/(^https|Home|showpublisheddocument|^http:\/\/[^\s]+)/g);
            if (regrx.test(link.href)) {
                redirectUrl.push(link.href);
            }
        });
        doc.window.document.querySelectorAll('img').forEach((image) => {
            const regexMedia = new RegExp(/(https|http)/g);
            if (regexMedia.test(image.src)) {
                imgUrl.push(image.src);
            } else {
                imgUrl.push(baseUrl + image.src);
            }
        });
        docHead = doc.window.document.querySelector('title').textContent;
        return {
            anchorUrl: redirectUrl,
            imgUrl,
            header: docHead
        };
    } catch (error) {
        // ignore pages that fail to parse; the caller receives undefined
    }
};
// FOR WEB SCRAPPING
const getScrappingData = async (preparedObj) => {
    try {
        const {
            urls,
            search,
            baseUrl,
            url,
        } = preparedObj;
        const imgTags = urls.imgUrl;
        const nestedUrls = urls.anchorUrl;
        const docHead = urls.header;
        const key = docHead.split('|');
        nestedUrls.forEach((u) => {
            if (!(u in maper)) {
                maper[u] = false;
            }
        });
        const notInclude = ['pdf', 'facebook', 'google', 'instagram', 'twitter', 'youtube', 'mailto', 'showpublisheddocument', 'recruit', 'home', 'Home', 'vimeo', 'visionkershaw2030'];
        const allowMedia = ['showpublisheddocument', 'pdf', 'Home', 'home'];
        console.log(nestedUrls);
        nestedUrls.forEach((u) => {
            if (!notInclude.some((el) => u.includes(el))) {
                if (u in maper && !maper[u] && scrappingCount < 2) {
                    maper[u] = true;
                    const prepareData = {
                        url: u,
                        search,
                    };
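                    // NOTE: this recursive call returns a promise that is
                    // never awaited, so forEach moves on immediately.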
                    getLandGrabberDetails(prepareData);
                }
            } else if (allowMedia.some((el) => u.includes(el))) {
                if (key[0].includes(search)) {
                    const relativePath = new RegExp(/(^https|^http)/g);
                    if (relativePath.test(u)) {
                        docs.push(u);
                    } else {
                        docs.push(baseUrl + u);
                    }
                    scrappingCount += 1;
                    scrappedData.push({
                        title: docHead,
                        domain: url,
                        pdf: docs,
                        images: imgTags,
                    });
                    console.log(scrappedData);
                    if (scrappingCount > 1) {
                        process.exit(0);
                    }
                }
            }
        });
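        // forEach has already finished here even though the recursive
        // scrapes may still be running, so scrappedData can be empty.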
        return scrappedData;
    } catch (err) {
        console.log(err);
    }
};
// FOR PREPARESCRAPPING DATA
const prepareScrapping = async (scrappingData) => {
    let responseObj;
    const {
        url,
        search
    } = scrappingData;
    if (url) {
        const urlDetails = new URL(url);
        const baseUrl = urlDetails.host;
        const htmlBody = await getBody(url);
        const urls = await getAllURls(htmlBody, baseUrl);
        const preparedObj = {
            urls,
            search,
            baseUrl,
            url,
        };
        responseObj = await getScrappingData(preparedObj);
    } else {
        responseObj = {
            message: 'invalid'
        };
    }
    return responseObj;
};

// STARTS
const getLandGrabberDetails = async (scrappingData) => {
    try {
        const {
            url,
            search,
            type,
        } = scrappingData;
        const prepareData = {
            url,
            search,
            type,
        };
        return await prepareScrapping(prepareData);
    } catch (err) {
        console.log(err);
    }
};
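
Why scrappedData comes back empty: Array.prototype.forEach ignores the promise returned by an async callback, so getScrappingData reaches return scrappedData before any of the recursive getLandGrabberDetails calls have finished. Below is a minimal sketch of one fix, assuming the module-level state above (maper, scrappedData, scrappingCount, notInclude) and using a hypothetical helper name scrapeNestedUrls: switch to a for...of loop and await each recursive call, so nothing is returned until every branch is done.

// Sketch only: mirrors the anchor-URL branch of getScrappingData and
// assumes maper, scrappedData, scrappingCount and notInclude are the
// module-level values defined earlier.
const scrapeNestedUrls = async (nestedUrls, search) => {
    for (const u of nestedUrls) {
        if (!notInclude.some((el) => u.includes(el))
            && u in maper && !maper[u] && scrappingCount < 2) {
            maper[u] = true;
            // Awaiting keeps the recursion sequential and, crucially,
            // keeps the promise from being dropped.
            await getLandGrabberDetails({ url: u, search });
        }
    }
    return scrappedData;
};

If the nested pages are independent, the same idea also works concurrently: build the promises with map and await Promise.all(...) before returning scrappedData. Either way, the return only happens after every recursive scrape has settled.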


