const fs = require("fs");
const { chromium } = require("playwright");

// Pass --headed on the command line to run with a visible browser window.
const headed = process.argv.includes("--headed");

// Product pages to scrape, visited one after another in this order.
const productUrls = [
  "https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black"
];

// Path of the JSON report written at the end of the run.
const outputJson = "prismatic_powders.json";

// Navigation timeout for page.goto, in milliseconds.
const navigationTimeoutMs = 60000;

// Fixed pause (ms) used after each navigation and between products.
// NOTE(review): presumably gives client-side rendering time to finish and
// throttles requests — confirm the value is still appropriate.
const settleDelayMs = 3000;

/**
 * Collapse every run of whitespace to a single space and trim the ends.
 *
 * @param {string|null|undefined} text - Raw text; any falsy value is treated as "".
 * @returns {string} The normalized text.
 */
function clean(text) {
  return (text || "").replace(/\s+/g, " ").trim();
}

/**
 * Resolve a possibly-relative URL against a base URL.
 *
 * @param {string} baseUrl - URL the reference is relative to.
 * @param {string} maybeUrl - Absolute or relative reference.
 * @returns {string} The absolute URL, "" when `maybeUrl` is empty, or
 *   `maybeUrl` unchanged when it cannot be parsed.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  if (!maybeUrl) return "";
  try {
    return new URL(maybeUrl, baseUrl).href;
  } catch {
    // Best effort: hand back the raw value rather than losing it.
    return maybeUrl;
  }
}

/**
 * Normalize a list of strings with `clean` and drop empties and duplicates,
 * keeping first-seen order.
 *
 * @param {Array<string|null|undefined>} items - Raw values.
 * @returns {string[]} Distinct, non-empty, normalized strings.
 */
function unique(items) {
  return [...new Set(items.filter(Boolean).map(clean).filter(Boolean))];
}

/**
 * Find the first anchor on the page whose visible text matches any of the
 * given patterns and return its absolute URL.
 *
 * @param {import("playwright").Page} page - Page to search.
 * @param {string[]} patterns - Regular-expression sources, matched
 *   case-insensitively against each anchor's whitespace-normalized text.
 * @returns {Promise<string>} Absolute URL of the first matching anchor that
 *   has an href, or "" when none matches.
 */
async function getLinkByText(page, patterns) {
  // Compile once instead of once per anchor. No "g" flag, so test() is stateless.
  const matchers = patterns.map((source) => new RegExp(source, "i"));

  // This callback runs inside the browser, so it cannot call clean().
  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map((a) => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  for (const { text, href } of links) {
    // An anchor without an href cannot yield a URL; keep looking rather than
    // returning "" and hiding a later usable match.
    if (!href.trim()) continue;
    if (matchers.some((matcher) => matcher.test(text))) {
      return absoluteUrl(page.url(), href);
    }
  }
  return "";
}

/**
 * Extract weight-based price tiers from page text. Recognizes entries shaped
 * like "1 - 4 lbs $12.50" (bounded range) and "55+ lbs $9.75" (open-ended).
 *
 * @param {string} plainText - Text to scan.
 * @returns {Array<{min: number, max: number|null, price: number}>} One entry
 *   per tier in order of appearance; `max` is null for "N+" tiers.
 */
function parsePriceTiers(plainText) {
  // Groups: 1-2 = range bounds, 3 = open-ended minimum, 4 = price.
  // The price accepts thousands separators ("$1,250.00") and a bare
  // fraction ("$.50"), but never a lone "." which would parse to NaN.
  const tierPattern =
    /(?:(\d+)\s*-\s*(\d+)|(\d+)\s*\+)\s*lbs\s*\$(\d[\d,]*(?:\.\d+)?|\.\d+)/gi;

  return [...plainText.matchAll(tierPattern)].map((match) => {
    const [, rangeMin, rangeMax, openMin, rawPrice] = match;
    const price = Number.parseFloat(rawPrice.replace(/,/g, ""));

    if (openMin !== undefined) {
      return { min: Number.parseInt(openMin, 10), max: null, price };
    }
    return {
      min: Number.parseInt(rangeMin, 10),
      max: Number.parseInt(rangeMax, 10),
      price
    };
  });
}

/**
 * Pick the most likely product sample image from the page's <img> elements.
 *
 * Preference order: an images.nicindustries.com URL without "thumbnail" in
 * it, then any images.nicindustries.com URL, then any URL containing
 * "prismatic", "powder" or "color".
 * NOTE(review): images.nicindustries.com is presumably the product image
 * host — confirm.
 *
 * @param {import("playwright").Page} page - Page to inspect.
 * @returns {Promise<string>} The chosen image URL, or "" when nothing fits.
 */
async function getSampleImageUrl(page) {
  const imageUrls = await page.locator("img").evaluateAll((imgs) =>
    imgs
      .map((img) =>
        img.currentSrc || img.src || img.getAttribute("src") || img.getAttribute("data-src") || ""
      )
      .filter(Boolean)
  );

  const imageHost = /images\.nicindustries\.com/i;
  return (
    imageUrls.find((src) => imageHost.test(src) && !/thumbnail/i.test(src)) ||
    imageUrls.find((src) => imageHost.test(src)) ||
    imageUrls.find((src) => /prismatic|powder|color/i.test(src)) ||
    ""
  );
}

/**
 * Load one product page and extract its details.
 *
 * @param {import("playwright").Page} page - Page used for the navigation.
 * @param {string} url - Product page URL.
 * @returns {Promise<object>} Record with sku, color_name, description,
 *   price_tiers, the three document URLs, sample_image_url, product_url and
 *   an ISO-8601 scraped_at timestamp. Fields that cannot be found are "" (or
 *   [] for price_tiers).
 * @throws {Error} When navigation fails, or the site answers with an HTTP
 *   error status or a "403 Forbidden" page.
 */
async function parseProduct(page, url) {
  console.log(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: navigationTimeoutMs
  });
  await page.waitForTimeout(settleDelayMs);

  // goto() can resolve with null; report that as status 0.
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  console.log(`HTTP status: ${status}`);
  console.log(`Page title: ${pageTitle}`);

  // Do not silently output a fake product if blocked.
  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }
  // Any other error status (404, 429, 5xx, ...) would likewise produce an
  // empty record, so report it as a failure too.
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  // The description runs until the next known section marker, or to the end.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, [
    "Tech Data Sheet",
    "Technical Data Sheet",
    "\\bTDS\\b"
  ]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}

/**
 * Scrape every URL in `productUrls` and write the collected records and
 * per-URL failures to `outputJson`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const results = [];
  const errors = [];

  const browser = await chromium.launch({ headless: !headed });
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });
    const page = await context.newPage();

    for (const [index, url] of productUrls.entries()) {
      try {
        const row = await parseProduct(page, url);
        results.push(row);
        // Pause before the next product; nothing follows the last one.
        if (index < productUrls.length - 1) {
          await page.waitForTimeout(settleDelayMs);
        }
      } catch (err) {
        // One failing product must not abort the whole run; record it instead.
        console.warn(`Failed ${url}: ${err.message}`);
        errors.push({
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        });
      }
    }
  } finally {
    // Always release the browser process, even if setup or scraping throws.
    await browser.close();
  }

  // If you prefer only the array, change this to JSON.stringify(results, null, 2)
  const output = { results, errors };
  fs.writeFileSync(outputJson, JSON.stringify(output, null, 2), "utf8");
  console.log(`Done. Output: ${outputJson}`);
}

main().catch((err) => {
  // Surface unexpected failures and signal them through the exit code.
  console.error(err);
  process.exitCode = 1;
});