I need to scrape a list of URLs, collect the Title, ImageURL, and Content from each one, and write the results to a JSON file for import.
It worked when I scraped a single URL, but I can't get it to work since adding the for loop.
Could I get some help with this?
const puppeteer = require('puppeteer');
const fs = require('fs');

async function scrapPamphlets() {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    const urls = [
        'https://www.bibleed.com/the-divine-origin-of-the-bible.html',
        'https://www.bibleed.com/god-and-creation.html'
    ];

    // accumulator declared before the loop so it is still in scope after it,
    // and so each page's results are appended rather than overwritten
    const allPamphletData = [];

    for (const pamphletURL of urls) {
        await page.goto(pamphletURL, {waitUntil: 'networkidle2'});

        // this callback runs in the browser context, not in Node
        const pamphletData = await page.evaluate(() => {
            const data = [];
            // get the page elements
            const pamphletElms = document.querySelectorAll('div[class="wsite-section-elements"]');
            // get the pamphlet data
            pamphletElms.forEach((pamphletElement) => {
                const pamphletJson = {};
                try {
                    pamphletJson.title = pamphletElement.querySelector('h2').innerText;
                    pamphletJson.imgURL = 'https://www.bibleed.com' + pamphletElement.querySelector('div div.wsite-image a img').getAttribute('src');
                    pamphletJson.txtContent = pamphletElement.querySelector('.paragraph').innerText;
                } catch (exception) {
                    // a missing element leaves that field undefined; keep the partial record
                }
                data.push(pamphletJson);
            });
            return data;
        });

        allPamphletData.push(...pamphletData);
    }

    // save data to json file, once, after every URL has been visited
    fs.writeFile('pamphletData.json', JSON.stringify(allPamphletData), err => {
        if (err) console.log(err);
    });

    await browser.close();
}

scrapPamphlets();
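The key detail is scope: a `let` declared inside the `for` body no longer exists after the loop, so a single write call placed after the loop has to use an accumulator declared before it, as `allPamphletData` is above; pushing into it also stops each iteration from discarding the previous page's results. As a quick sanity check, a minimal sketch that reads the file back once the scraper has finished (the only assumption is the `pamphletData.json` filename used above):

const fs = require('fs');

// read the scraped file back and list what was captured
const saved = JSON.parse(fs.readFileSync('pamphletData.json', 'utf8'));
console.log(`captured ${saved.length} pamphlet records`);
saved.forEach(p => console.log(p.title));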