# Crawl and Index Prismatic Colors - Known-Good Style JSON.ps1
#
# Rollback to the earlier working browser pattern:
# - Playwright Chromium
# - Full Chrome-style User-Agent
# - JSON output
# - Structured price tiers
# - Color matches from #collection-list
#   (NOTE(review): the scraper below does not extract color matches yet)
#
# First-time setup:
#   Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
#   .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -InstallPlaywright
#
# Normal run:
#   .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1
#
# Watch browser:
#   .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -Headed
#
# Files written to the current directory:
#   prismatic-browser-scraper.js  - generated Node.js scraper
#   prismatic_powders.json        - scraper output: { results, errors }

param(
    # Force (re)installation of the Playwright package and Chromium browser.
    [switch]$InstallPlaywright,
    # Launch the browser with a visible window instead of headless.
    [switch]$Headed
)

$ErrorActionPreference = "Stop"

# Throws when the most recent native command (npm/npx/node) exited non-zero.
# $ErrorActionPreference does not cover native executables, so their exit
# codes have to be checked explicitly.
function Assert-LastExitCode {
    param([string]$Action)

    if ($LASTEXITCODE -ne 0) {
        throw "$Action failed with exit code $LASTEXITCODE."
    }
}

# Throws with an actionable message when node or npm is not on PATH.
function Ensure-NodeAvailable {
    if (-not (Get-Command node -ErrorAction SilentlyContinue)) {
        throw "Node.js is required. Install Node.js LTS from https://nodejs.org/"
    }

    if (-not (Get-Command npm -ErrorAction SilentlyContinue)) {
        throw "npm is required. It usually comes with Node.js."
    }
}

# Installs the Playwright npm package and the Chromium browser into the
# current directory when explicitly requested or when the package is missing.
function Install-PlaywrightIfNeeded {
    param([bool]$Requested)

    Ensure-NodeAvailable

    if ($Requested -or -not (Test-Path ".\node_modules\playwright")) {
        Write-Host "Installing Playwright package locally..."

        # Only initialise a package.json when there is none, so an existing
        # one is not rewritten by "npm init -y".
        if (-not (Test-Path ".\package.json")) {
            npm init -y | Out-Null
            Assert-LastExitCode "npm init"
        }

        npm install playwright | Out-Null
        Assert-LastExitCode "npm install playwright"

        Write-Host "Installing Playwright Chromium browser..."
        npx playwright install chromium
        Assert-LastExitCode "npx playwright install chromium"
    }
}

# Writes the Node.js scraper to .\prismatic-browser-scraper.js.
function Write-NodeScraper {
    # Single-quoted here-string prevents PowerShell from interpreting JavaScript regex/template strings.
    $js = @'
const fs = require("fs");
const { chromium } = require("playwright");

const headed = process.argv.includes("--headed");

const productUrls = [
  "https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black"
];

const outputJson = "prismatic_powders.json";

/** Collapses runs of whitespace to single spaces and trims; nullish/empty input yields "". */
function clean(text) {
  return (text || "").replace(/\s+/g, " ").trim();
}

/** Resolves maybeUrl against baseUrl; returns "" for empty input and the raw value if it cannot be parsed. */
function absoluteUrl(baseUrl, maybeUrl) {
  if (!maybeUrl) return "";

  try {
    return new URL(maybeUrl, baseUrl).href;
  } catch {
    return maybeUrl;
  }
}

/**
 * Returns the absolute href of the first anchor on the page whose text matches
 * any of the given patterns (regex sources, case-insensitive), or "" if none match.
 */
async function getLinkByText(page, patterns) {
  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map((a) => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  // Compile once instead of once per link.
  const matchers = patterns.map((p) => new RegExp(p, "i"));

  for (const link of links) {
    if (matchers.some((re) => re.test(link.text))) {
      return absoluteUrl(page.url(), link.href);
    }
  }

  return "";
}

/**
 * Extracts weight-based price tiers such as "1 - 4 lbs $12.50" or "55+ lbs $9.00"
 * from the page text. Returns [{ min, max, price }]; max is null for "N+" tiers.
 */
function parsePriceTiers(plainText) {
  const priceMatches = [
    ...plainText.matchAll(/(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)/gi)
  ];

  return priceMatches.map((m) => {
    const rangeText = clean(m[1]);
    const price = Number.parseFloat(m[2]);

    let min = null;
    let max = null;

    const rangeMatch = rangeText.match(/(\d+)\s*-\s*(\d+)/);
    if (rangeMatch) {
      min = Number.parseInt(rangeMatch[1], 10);
      max = Number.parseInt(rangeMatch[2], 10);
    }

    const plusMatch = rangeText.match(/(\d+)\s*\+/);
    if (plusMatch) {
      min = Number.parseInt(plusMatch[1], 10);
      max = null;
    }

    return { min, max, price };
  });
}

/**
 * Picks an image URL for the product. Preference order: a non-thumbnail image on
 * images.nicindustries.com, any image on that host, then any image whose URL
 * mentions prismatic/powder/color. Returns "" when nothing matches.
 */
async function getSampleImageUrl(page) {
  const imageUrls = await page.locator("img").evaluateAll((imgs) =>
    imgs
      .map((img) =>
        img.currentSrc ||
        img.src ||
        img.getAttribute("src") ||
        img.getAttribute("data-src") ||
        ""
      )
      .filter(Boolean)
  );

  return (
    imageUrls.find((src) => /images\.nicindustries\.com/i.test(src) && !/thumbnail/i.test(src)) ||
    imageUrls.find((src) => /images\.nicindustries\.com/i.test(src)) ||
    imageUrls.find((src) => /prismatic|powder|color/i.test(src)) ||
    ""
  );
}

/**
 * Navigates to a product page and returns one result row.
 * Throws when the site answers with 403 or any other HTTP error status, so that
 * the caller records an error instead of an empty product.
 */
async function parseProduct(page, url) {
  console.log(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Fixed pause after DOMContentLoaded before reading the page.
  await page.waitForTimeout(3000);

  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  console.log(`HTTP status: ${status}`);
  console.log(`Page title: ${pageTitle}`);

  // Do not silently output a fake product if blocked.
  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Same reasoning for every other HTTP error (404, 429, 5xx, ...).
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}

/**
 * Scrapes every URL in productUrls with one shared page and writes
 * { results, errors } to outputJson. Per-URL failures are collected in
 * errors and do not abort the run.
 */
async function main() {
  const browser = await chromium.launch({ headless: !headed });

  const results = [];
  const errors = [];

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    for (const url of productUrls) {
      try {
        const row = await parseProduct(page, url);
        results.push(row);
        await page.waitForTimeout(3000);
      } catch (err) {
        console.warn(`Failed ${url}: ${err.message}`);
        errors.push({
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        });
      }
    }
  } finally {
    // Always release the browser, even when context/page creation throws.
    await browser.close();
  }

  // If you prefer only the array, change this to JSON.stringify(results, null, 2)
  const output = { results, errors };

  fs.writeFileSync(outputJson, JSON.stringify(output, null, 2), "utf8");
  console.log(`Done. Output: ${outputJson}`);
}

main().catch((err) => {
  // Fatal errors (browser launch, file write) surface as a non-zero exit code
  // so the calling PowerShell script can detect them.
  console.error(`Fatal: ${err.stack || err.message}`);
  process.exitCode = 1;
});
'@

    Set-Content -Path ".\prismatic-browser-scraper.js" -Value $js -Encoding UTF8
}

try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeScraper

    Write-Host "Running browser scraper..."
    if ($Headed) {
        node .\prismatic-browser-scraper.js --headed
    }
    else {
        node .\prismatic-browser-scraper.js
    }
    Assert-LastExitCode "Browser scraper"
}
catch {
    # With $ErrorActionPreference = "Stop", a plain Write-Error would itself be
    # terminating and "exit 1" would never run; force a non-terminating write.
    Write-Error $_.Exception.Message -ErrorAction Continue
    exit 1
}