Files
PowderCoatingLogix/scripts/Web Scraping/backup/Get-Product-Info.ps1
T

266 lines
7.0 KiB
PowerShell

# Crawl and Index Prismatic Colors - Known-Good Style JSON.ps1
#
# Rollback to the earlier working browser pattern:
# - Playwright Chromium
# - Full Chrome-style User-Agent
# - JSON output
# - Structured price tiers
# - Color matches from #collection-list (NOTE: the scraper below does not extract these yet)
#
# First-time setup:
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -InstallPlaywright
#
# Normal run:
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1
#
# Watch browser:
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -Headed
param(
    # When set, the Playwright npm package and Chromium browser are (re)installed
    # even if .\node_modules\playwright already exists.
    [switch] $InstallPlaywright,

    # When set, the Node scraper is started with --headed so the browser window is visible.
    [switch] $Headed
)

# Turn non-terminating cmdlet errors into terminating ones for the rest of the script.
$ErrorActionPreference = 'Stop'
function Ensure-NodeAvailable {
    # Confirms that both `node` and `npm` can be resolved by Get-Command.
    # Throws the matching message for the first tool that is missing
    # (node is checked before npm).
    $requiredTools = [ordered]@{
        node = "Node.js is required. Install Node.js LTS from https://nodejs.org/"
        npm  = "npm is required. It usually comes with Node.js."
    }

    foreach ($toolName in $requiredTools.Keys) {
        $resolved = Get-Command $toolName -ErrorAction SilentlyContinue
        if ($null -eq $resolved) {
            throw $requiredTools[$toolName]
        }
    }
}
function Install-PlaywrightIfNeeded {
    # Installs the Playwright npm package and its Chromium browser into the current
    # directory when explicitly requested or when .\node_modules\playwright is missing.
    #
    # Parameters:
    #   -Requested  $true forces installation even if the package folder already exists.
    #
    # Throws if node/npm are unavailable or if any npm/npx step exits non-zero.
    param([bool]$Requested)

    Ensure-NodeAvailable

    $needsInstall = $Requested -or -not (Test-Path ".\node_modules\playwright")
    if (-not $needsInstall) {
        return
    }

    Write-Host "Installing Playwright package locally..."

    # Only scaffold a package.json when there is none, so an existing one is left untouched.
    if (-not (Test-Path ".\package.json")) {
        npm init -y | Out-Null
        # $ErrorActionPreference does not cover native commands, so the exit code
        # has to be checked explicitly after each one.
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }
    }

    npm install playwright | Out-Null
    if ($LASTEXITCODE -ne 0) {
        throw "npm install playwright failed with exit code $LASTEXITCODE."
    }

    Write-Host "Installing Playwright Chromium browser..."
    npx playwright install chromium
    if ($LASTEXITCODE -ne 0) {
        throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
    }
}
function Write-NodeScraper {
    # Writes the Node.js/Playwright scraper to .\prismatic-browser-scraper.js (UTF-8).
    # The file is regenerated on every call, overwriting any previous copy.
    #
    # Single-quoted here-string prevents PowerShell from interpreting JavaScript regex/template strings.
    # The closing '@ marker must stay at the very start of its line.
    $js = @'
const fs = require("fs");
const { chromium } = require("playwright");

// `--headed` on the command line shows the browser window instead of running headless.
const headed = process.argv.includes("--headed");

const productUrls = [
  "https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black"
];

const outputJson = "prismatic_powders.json";

// Collapse every whitespace run to one space and trim; null/undefined become "".
function clean(text) {
  return (text || "").replace(/\s+/g, " ").trim();
}

// Resolve maybeUrl against baseUrl; returns "" for empty input and the raw value if it cannot be parsed.
function absoluteUrl(baseUrl, maybeUrl) {
  if (!maybeUrl) return "";
  try {
    return new URL(maybeUrl, baseUrl).href;
  } catch {
    return maybeUrl;
  }
}

// De-duplicate a list of strings after cleaning, dropping empty entries.
// Not called anywhere in this script at the moment.
function unique(items) {
  return [...new Set(items.filter(Boolean).map(clean).filter(Boolean))];
}

// Return the absolute href of the first <a> whose text matches any of the
// given regex source strings (case-insensitive), or "" when none match.
async function getLinkByText(page, patterns) {
  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map(a => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  for (const link of links) {
    if (patterns.some(p => new RegExp(p, "i").test(link.text))) {
      return absoluteUrl(page.url(), link.href);
    }
  }

  return "";
}

// Extract price tiers such as "1 - 4 lbs $12.50" or "55+ lbs $9.75" from page text.
// Each tier is { min, max, price }; max is null for open-ended "N+ lbs" tiers.
function parsePriceTiers(plainText) {
  const priceMatches = [...plainText.matchAll(/(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)/gi)];

  return priceMatches.map(m => {
    const rangeText = clean(m[1]);
    const price = parseFloat(m[2]);
    let min = null;
    let max = null;

    const rangeMatch = rangeText.match(/(\d+)\s*-\s*(\d+)/);
    if (rangeMatch) {
      min = parseInt(rangeMatch[1], 10);
      max = parseInt(rangeMatch[2], 10);
    }

    const plusMatch = rangeText.match(/(\d+)\s*\+/);
    if (plusMatch) {
      min = parseInt(plusMatch[1], 10);
      max = null;
    }

    return {
      min,
      max,
      price
    };
  });
}

// Pick the most likely product swatch image URL from all <img> elements.
// Preference order: non-thumbnail images.nicindustries.com image, any
// images.nicindustries.com image, any URL containing prismatic/powder/color, else "".
async function getSampleImageUrl(page) {
  const imageUrls = await page.locator("img").evaluateAll((imgs) =>
    imgs.map(img =>
      img.currentSrc ||
      img.src ||
      img.getAttribute("src") ||
      img.getAttribute("data-src") ||
      ""
    ).filter(Boolean)
  );

  return (
    imageUrls.find(src => /images\.nicindustries\.com/i.test(src) && !/thumbnail/i.test(src)) ||
    imageUrls.find(src => /images\.nicindustries\.com/i.test(src)) ||
    imageUrls.find(src => /prismatic|powder|color/i.test(src)) ||
    ""
  );
}

// Load one product page and return a flat record describing it.
// Throws when the site answers with an HTTP error so that the caller records
// the URL under `errors` instead of emitting an empty product.
async function parseProduct(page, url) {
  console.log(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Fixed pause after DOMContentLoaded before the page text is read.
  await page.waitForTimeout(3000);

  // goto() can resolve without a response object; that case is reported as status 0.
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  console.log(`HTTP status: ${status}`);
  console.log(`Page title: ${pageTitle}`);

  // Do not silently output a fake product if blocked.
  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Any other 4xx/5xx (not found, rate limited, server error) is not a product page either.
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}

(async () => {
  const results = [];
  const errors = [];

  const browser = await chromium.launch({
    headless: !headed
  });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    for (const url of productUrls) {
      try {
        const row = await parseProduct(page, url);
        results.push(row);
        // Pause between products.
        await page.waitForTimeout(3000);
      } catch (err) {
        console.warn(`Failed ${url}: ${err.message}`);
        errors.push({
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        });
      }
    }
  } finally {
    // Always release the browser, even if context/page creation failed.
    await browser.close();
  }

  // If you prefer only the array, change this to JSON.stringify(results, null, 2)
  const output = {
    results,
    errors
  };

  fs.writeFileSync(outputJson, JSON.stringify(output, null, 2), "utf8");
  console.log(`Done. Output: ${outputJson}`);

  // Signal failure to the caller when every URL failed; the JSON above still lists the errors.
  if (results.length === 0 && errors.length > 0) {
    process.exitCode = 1;
  }
})().catch((err) => {
  // Anything that escapes the per-URL handling (e.g. browser launch failure) ends up here.
  console.error(`Scraper failed: ${err && err.message ? err.message : err}`);
  process.exitCode = 1;
});
'@

    Set-Content -Path ".\prismatic-browser-scraper.js" -Value $js -Encoding UTF8
}
# Entry point: make sure Playwright is installed, regenerate the Node scraper,
# run it, and exit with code 1 if any step fails.
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeScraper

    Write-Host "Running browser scraper..."

    $nodeArgs = @(".\prismatic-browser-scraper.js")
    if ($Headed) {
        $nodeArgs += "--headed"
    }

    node @nodeArgs

    # Native commands do not raise PowerShell errors on failure; check the exit code explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "Browser scraper exited with code $LASTEXITCODE."
    }
}
catch {
    # With $ErrorActionPreference = "Stop", a plain Write-Error would itself throw here
    # and the exit below would never run; -ErrorAction Continue keeps it non-terminating.
    Write-Error -Message $_.Exception.Message -ErrorAction Continue
    exit 1
}