const fs = require("fs");
const { chromium } = require("playwright");

// ---------------------------------------------------------------------------
// Command-line options
// ---------------------------------------------------------------------------

const headed = process.argv.includes("--headed");
const retryErrors = process.argv.includes("--retry-errors");

/**
 * Reads a `--name=value` option from the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} defaultValue - Returned when the option is absent or empty.
 * @returns {string} The text after `=`, or `defaultValue`.
 */
function getArgValue(name, defaultValue) {
  const prefix = `--${name}=`;
  const found = process.argv.find(x => x.startsWith(prefix));
  return found ? found.slice(prefix.length) : defaultValue;
}

/**
 * Reads an integer `--name=value` option.
 *
 * A non-numeric value falls back to the default (with a warning) instead of
 * yielding NaN: a NaN delay would turn every `setTimeout` into a near-zero
 * wait and silently disable throttling.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} defaultValue - Used when the option is absent or not numeric.
 * @returns {number} The parsed integer, or `defaultValue`.
 */
function getIntArgValue(name, defaultValue) {
  const raw = getArgValue(name, String(defaultValue));
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    console.warn(`Ignoring non-numeric value for --${name}: "${raw}". Using ${defaultValue}.`);
    return defaultValue;
  }
  return parsed;
}

const inputFile = getArgValue("input-file", "product-urls.txt");
const outputJson = getArgValue("output-json", "prismatic_powders.json");
const progressLog = getArgValue("progress-log", "prismatic-scrape-progress.log");
const minDelaySeconds = getIntArgValue("min-delay-seconds", 8);
const maxDelaySeconds = getIntArgValue("max-delay-seconds", 18);
const pageSettleSeconds = getIntArgValue("page-settle-seconds", 4);
const maxProducts = getIntArgValue("max-products", 0);

// ---------------------------------------------------------------------------
// Small helpers
// ---------------------------------------------------------------------------

/**
 * Collapses every run of whitespace to a single space and trims the ends.
 *
 * @param {string|null|undefined} text
 * @returns {string} Normalized text; "" for null/undefined/empty input.
 */
function clean(text) {
  return (text || "").replace(/\s+/g, " ").trim();
}

/**
 * Drops the query string and fragment from a URL and trims whitespace.
 *
 * @param {string|null|undefined} url
 * @returns {string} Everything before the first `?` or `#`; "" for empty input.
 */
function cleanUrl(url) {
  return (url || "").split("?")[0].split("#")[0].trim();
}

/**
 * Promise-based `setTimeout`.
 *
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Picks a random whole number of milliseconds in the inclusive range
 * [minDelaySeconds, maxDelaySeconds]. A negative minimum is clamped to 0 and a
 * maximum below the minimum is raised to the minimum.
 *
 * @returns {number}
 */
function randomDelayMs() {
  const minMs = Math.max(0, minDelaySeconds * 1000);
  const maxMs = Math.max(minMs, maxDelaySeconds * 1000);
  return Math.floor(minMs + Math.random() * (maxMs - minMs + 1));
}

/**
 * Writes a timestamped message to stdout and appends it to the progress log.
 *
 * @param {string} message
 */
function logLine(message) {
  const line = `[${new Date().toISOString()}] ${message}`;
  console.log(line);
  fs.appendFileSync(progressLog, line + "\r\n", "utf8");
}

/**
 * Resolves a possibly-relative URL against a base URL.
 *
 * @param {string} baseUrl
 * @param {string} maybeUrl
 * @returns {string} The absolute href; "" when `maybeUrl` is empty; `maybeUrl`
 *   unchanged when it cannot be parsed.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  if (!maybeUrl) return "";

  try {
    return new URL(maybeUrl, baseUrl).href;
  } catch {
    // Best effort: an unparseable href is still more useful than nothing.
    return maybeUrl;
  }
}

// ---------------------------------------------------------------------------
// Input / output files
// ---------------------------------------------------------------------------

/**
 * Loads the list of product URLs to scrape from `inputFile`.
 *
 * Each line is normalized with `cleanUrl`, so blank lines and anything from a
 * `#` onward (including whole-line comments) disappear. Only URLs containing a
 * `/shop/powder-coating-colors/<id>/` path are kept, and duplicates are
 * removed while preserving first-seen order.
 *
 * @returns {string[]}
 * @throws {Error} When the input file does not exist.
 */
function loadInputUrls() {
  if (!fs.existsSync(inputFile)) {
    throw new Error(`Input file not found: ${inputFile}`);
  }

  const urls = fs.readFileSync(inputFile, "utf8")
    .split(/\r?\n/)
    .map(cleanUrl)
    .filter(Boolean)
    .filter(x => /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i.test(x));

  return [...new Set(urls)];
}

/**
 * Loads previously saved scrape state from `outputJson`.
 *
 * Accepts either the current `{ results, errors }` shape or a bare array of
 * results. Any other valid JSON value yields empty lists.
 *
 * @returns {{results: object[], errors: object[]}}
 * @throws {Error} When the file exists but is not valid JSON. The file is
 *   copied to a timestamped `.bak` first so nothing is lost.
 */
function loadOutput() {
  if (!fs.existsSync(outputJson)) {
    return { results: [], errors: [] };
  }

  try {
    const parsed = JSON.parse(fs.readFileSync(outputJson, "utf8"));

    if (Array.isArray(parsed)) {
      return { results: parsed, errors: [] };
    }

    // `?.` so that a file containing JSON `null` is treated like any other
    // non-object value instead of being reported as a parse failure.
    return {
      results: Array.isArray(parsed?.results) ? parsed.results : [],
      errors: Array.isArray(parsed?.errors) ? parsed.errors : []
    };
  } catch (err) {
    const backup = `${outputJson}.invalid-${Date.now()}.bak`;
    fs.copyFileSync(outputJson, backup);
    throw new Error(`Could not parse existing ${outputJson}. Backed it up to ${backup}. Error: ${err.message}`);
  }
}

/**
 * Persists scrape state to `outputJson`.
 *
 * Writes to a sibling `.tmp` file and renames it over the target so an
 * interrupted run does not leave a half-written JSON file behind.
 *
 * @param {{results: object[], errors: object[]}} data
 */
function saveOutput(data) {
  const tempFile = `${outputJson}.tmp`;
  fs.writeFileSync(tempFile, JSON.stringify(data, null, 2), "utf8");
  fs.renameSync(tempFile, outputJson);
}

// ---------------------------------------------------------------------------
// Page parsing
// ---------------------------------------------------------------------------

/**
 * Extracts weight-based price tiers such as `1 - 4 lbs $12.50` or
 * `55+ lbs $9.00` from page text.
 *
 * @param {string} plainText - Whitespace-normalized text of the page.
 * @returns {{min: number|null, max: number|null, price: number}[]} One entry
 *   per tier in document order; `max` is null for open-ended `N+` tiers.
 */
function parsePriceTiers(plainText) {
  const priceMatches = [...plainText.matchAll(/(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)/gi)];

  return priceMatches.map(m => {
    const rangeText = clean(m[1]);
    const price = parseFloat(m[2]);

    let min = null;
    let max = null;

    const rangeMatch = rangeText.match(/(\d+)\s*-\s*(\d+)/);
    if (rangeMatch) {
      min = parseInt(rangeMatch[1], 10);
      max = parseInt(rangeMatch[2], 10);
    }

    const plusMatch = rangeText.match(/(\d+)\s*\+/);
    if (plusMatch) {
      min = parseInt(plusMatch[1], 10);
      max = null;
    }

    return { min, max, price };
  });
}

/**
 * Snapshots every `<a>` element on the page as `{ text, href }`.
 *
 * @param {import("playwright").Page} page
 * @returns {Promise<{text: string, href: string}[]>} Anchors in DOM order;
 *   `text` is whitespace-normalized and `href` is the raw attribute ("" if absent).
 */
async function collectLinks(page) {
  // The callback runs inside the browser, so it cannot reference `clean`.
  return page.locator("a").evaluateAll((anchors) =>
    anchors.map(a => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );
}

/**
 * Finds the first link whose text matches any of the given patterns.
 *
 * @param {{text: string, href: string}[]} links - Anchors in DOM order.
 * @param {string} baseUrl - Base for resolving relative hrefs.
 * @param {string[]} patterns - Regular-expression sources, matched case-insensitively.
 * @returns {string} Absolute URL of the first matching link, or "" when none
 *   matches (or the matching link has no href).
 */
function findLinkByText(links, baseUrl, patterns) {
  // Compile once instead of once per link per pattern.
  const matchers = patterns.map(p => new RegExp(p, "i"));
  const hit = links.find(link => matchers.some(re => re.test(link.text)));
  return hit ? absoluteUrl(baseUrl, hit.href) : "";
}

/**
 * Returns the absolute URL of the first link on the page whose visible text
 * matches any of `patterns`.
 *
 * @param {import("playwright").Page} page
 * @param {string[]} patterns - Regular-expression sources, matched case-insensitively.
 * @returns {Promise<string>} The absolute URL, or "" when nothing matches.
 */
async function getLinkByText(page, patterns) {
  const links = await collectLinks(page);
  return findLinkByText(links, page.url(), patterns);
}

/**
 * Picks the most likely product sample image on the page.
 *
 * Preference order: a non-thumbnail image hosted on images.nicindustries.com,
 * then any image on that host, then any image whose URL mentions
 * "prismatic", "powder" or "color".
 *
 * @param {import("playwright").Page} page
 * @returns {Promise<string>} The image URL, or "" when no candidate exists.
 */
async function getSampleImageUrl(page) {
  const imageUrls = await page.locator("img").evaluateAll((imgs) =>
    imgs.map(img =>
      img.currentSrc ||
      img.src ||
      img.getAttribute("src") ||
      img.getAttribute("data-src") ||
      ""
    ).filter(Boolean)
  );

  return (
    imageUrls.find(src => /images\.nicindustries\.com/i.test(src) && !/thumbnail/i.test(src)) ||
    imageUrls.find(src => /images\.nicindustries\.com/i.test(src)) ||
    imageUrls.find(src => /prismatic|powder|color/i.test(src)) ||
    ""
  );
}

/**
 * Navigates to a product page and extracts its data.
 *
 * @param {import("playwright").Page} page - Page reused across products.
 * @param {string} url - Product URL to load.
 * @returns {Promise<object>} A result row with `sku`, `color_name`,
 *   `description`, `price_tiers`, the three document URLs,
 *   `sample_image_url`, `product_url` and `scraped_at`.
 * @throws {Error} On a 403 or 404 response (by status or by page content),
 *   when neither a SKU nor a title can be found, or when navigation fails.
 */
async function parseProduct(page, url) {
  logLine(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Give client-side rendering time to finish before reading the DOM.
  await page.waitForTimeout(pageSettleSeconds * 1000);

  const status = response ? response.status() : 0;
  // Title/body reads are best-effort; a failure is treated as empty text.
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  logLine(`HTTP status ${status}; title "${pageTitle}"`);

  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Word boundaries keep "404" embedded in a longer number (e.g. a SKU such
  // as "4040") from being mistaken for an error page.
  if (status === 404 || /\b404\b|Page Not Found/i.test(pageTitle)) {
    throw new Error("404 Not Found returned by site.");
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  if (!sku && !title) {
    throw new Error("Could not find SKU or title on product page.");
  }

  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  // Enumerate the anchors once and reuse the snapshot for all three lookups.
  const links = await collectLinks(page);
  const baseUrl = page.url();
  const safetyDataSheetUrl = findLinkByText(links, baseUrl, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = findLinkByText(links, baseUrl, ["Application Guide"]);
  const technicalDataSheetUrl = findLinkByText(links, baseUrl, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);

  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}

// ---------------------------------------------------------------------------
// Entry point
// ---------------------------------------------------------------------------

/**
 * Scrapes every not-yet-completed product URL, saving state after each one so
 * the run can be interrupted and resumed.
 *
 * URLs already present in the results are skipped; URLs that previously
 * errored are skipped unless `--retry-errors` is given.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const allUrls = loadInputUrls();
  const data = loadOutput();

  const completedUrls = new Set(data.results.map(r => cleanUrl(r.product_url)).filter(Boolean));
  const errorUrls = new Set(data.errors.map(e => cleanUrl(e.product_url)).filter(Boolean));

  let remainingUrls = allUrls.filter(url => {
    if (completedUrls.has(url)) return false;
    if (!retryErrors && errorUrls.has(url)) return false;
    return true;
  });

  if (maxProducts > 0) {
    remainingUrls = remainingUrls.slice(0, maxProducts);
  }

  logLine(`Input URLs: ${allUrls.length}`);
  logLine(`Already scraped: ${completedUrls.size}`);
  logLine(`Existing errors: ${errorUrls.size}`);
  logLine(`Retry errors: ${retryErrors ? "yes" : "no"}`);
  logLine(`This run target count: ${remainingUrls.length}`);
  logLine(`Delay range: ${minDelaySeconds}-${maxDelaySeconds} seconds; page settle: ${pageSettleSeconds} seconds`);

  if (remainingUrls.length === 0) {
    logLine("Nothing to scrape. Done.");
    saveOutput(data);
    return;
  }

  const browser = await chromium.launch({
    headless: !headed
  });

  // try/finally so the browser process is closed even if saving or logging
  // throws part-way through the run.
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    let processedThisRun = 0;

    for (const [index, url] of remainingUrls.entries()) {
      try {
        const row = await parseProduct(page, url);

        // If retrying an old error, keep the old error history but avoid duplicate successful result.
        if (!completedUrls.has(url)) {
          data.results.push(row);
          completedUrls.add(url);
        }

        processedThisRun++;
        saveOutput(data);

        logLine(`Saved result ${processedThisRun}/${remainingUrls.length}: ${row.sku || "(no sku)"} ${row.color_name || ""}`);
      } catch (err) {
        const errorRecord = {
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        };

        data.errors.push(errorRecord);
        saveOutput(data);

        logLine(`ERROR ${url}: ${err.message}`);
      }

      // Throttle between products only; nothing follows the last one.
      if (index < remainingUrls.length - 1) {
        const delay = randomDelayMs();
        logLine(`Waiting ${(delay / 1000).toFixed(1)} seconds before next product...`);
        await sleep(delay);
      }
    }
  } finally {
    await browser.close();
  }

  logLine(`Done. Results: ${data.results.length}; Errors: ${data.errors.length}; Output: ${outputJson}`);
}

main().catch(err => {
  // Surface fatal errors (missing input file, corrupt output JSON, browser
  // launch failure) and signal failure to the calling shell.
  console.error(err);
  process.exitCode = 1;
});