Skip to main content

Using Node.js and Puppeteer to Scrape a Site


Install Puppeteer: npm i puppeteer (this also downloads a bundled Chromium browser.)

// require the puppeteer package
const puppeteer = require("puppeteer");
// create self invoking function
(async () => {
const browser = await puppeteer.launch({
headless: false, // default is true
defaultViewport: null, // use full page width
});
const page = await browser.newPage();
await page.goto(
"https://clicks.co.za/beauty/fragrances/c/OH200004?q=%3Arelevance%3Acategory%3AOH300032&text=&count=30"
);
let nextBtnIsAvailable = true;
let allResults = [];
while (nextBtnIsAvailable === true) {
// The evaluate function is where you can interact with the DOM.
let data = await page.evaluate(async () => {
let results = [];
const items = Array.from(document.querySelectorAll("div.productBlock"));
items.forEach((item) => {
results.push({
src: item.querySelector("a > img").getAttribute("src"),
price: item
.querySelector(".detailContent > .price-wrap > .price")
.textContent.replace(/[\r\n]/g, "")
.trim(),
brand: item.querySelector(".detailContent a h5").getAttribute("title"),
description: item.querySelector(".detailContent .product-name p").textContent,
onPromotion: item.querySelector(".badges.promotionsSticker img") ? true : false,
inStock: !item
.querySelector(".add_to_cart_form button.btn")
.textContent.includes("Out of stock"),
});
});
return results;
});
// Merge the array data from each page into the allResults array.
allResults = [...allResults, ...data];
// checks if the next page link is available
try {
const nextBtn = await page.waitForSelector("div.lastBox.nextBtn > a", {
timeout: 5000,
});
if (nextBtn) {
nextBtnIsAvailable = true;
} else {
nextBtnIsAvailable = false;
}
} catch (err) {
nextBtnIsAvailable = false;
}
// If the next page link is available click it.
if (nextBtnIsAvailable === true) {
await page.click("div.lastBox.nextBtn > a");
}
// Wait for 4 sec before before paging again.
await page.waitFor(4000);
}
console.log(allResults);
browser.close();
})();
Last updated on