mardi 16 novembre 2021

How do I scrape images from udemy using NodeJS and puppeteer

This is my code - scraping courses titles works ok, but I have problem with images

const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
const fs = require('fs/promises')
function sleep(ms)
{
    return new Promise(resolve => setTimeout(resolve, ms));
}
async function start()
{
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto("https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL");
    await sleep(5000);
    const names = await page.evaluate(() => {
        return Array.from(document.querySelectorAll(".course-list--container--3zXPS div.udlite-focus-visible-target.udlite-heading-md.course-card--course-title--vVEjC")).map(x => x.textContent)
    })
    const images = await page.evaluate(() => {
        return Array.from(
            document.querySelectorAll(".course-list--container--3zXPS div.course-card--image-wrapper--1F9ny")
        ).map((image) => image.getAttribute(`src`));
    });


    let m = ";";
    for (let i = 0; i < names.length; i++)
    {
        names[i] = i+m+names[i]+m+images[i]
    }
    await fs.writeFile("courses.txt", names.join("\r\n"))
    await page.screenshot({ path: "udemy.png", fullPage: true });
    await browser.close();
}
start()

Now it returns null instead images url, if I change src to srcset nothing changes. The page that I want to scrape the images from is https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL

On a screenshot that this script takes I can see that courses icons are blacked out.




Aucun commentaire:

Enregistrer un commentaire