Commit misc scripts, feature specs, SQL deploy scripts, and settings updates
This commit is contained in:
@@ -0,0 +1,410 @@
|
||||
# Get-Product-Info-Resumable.ps1
|
||||
#
|
||||
# Resumable, slow/polite Prismatic Powders product scraper.
|
||||
#
|
||||
# Inputs:
|
||||
# .\product-urls.txt
|
||||
#
|
||||
# Outputs:
|
||||
# .\prismatic_powders.json
|
||||
# .\prismatic-scrape-progress.log
|
||||
#
|
||||
# First-time setup:
|
||||
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
# .\Get-Product-Info-Resumable.ps1 -InstallPlaywright -Headed -MaxProducts 5
|
||||
#
|
||||
# Normal full run:
|
||||
# .\Get-Product-Info-Resumable.ps1
|
||||
#
|
||||
# Test first 25 remaining:
|
||||
# .\Get-Product-Info-Resumable.ps1 -MaxProducts 25 -Headed
|
||||
#
|
||||
# Retry failed URLs too:
|
||||
# .\Get-Product-Info-Resumable.ps1 -RetryErrors
|
||||
#
|
||||
# Slow it down more:
|
||||
# .\Get-Product-Info-Resumable.ps1 -MinDelaySeconds 12 -MaxDelaySeconds 25
|
||||
|
||||
param(
    # Install the Playwright npm package and Chromium browser even when
    # .\node_modules\playwright already exists.
    [switch]$InstallPlaywright,

    # Show the browser window; forwarded to the Node scraper as --headed.
    [switch]$Headed,

    # Text file of product URLs to scrape.
    [string]$InputFile = ".\product-urls.txt",

    # JSON file that accumulates results and errors across runs.
    [string]$OutputJson = ".\prismatic_powders.json",

    # Log file the scraper appends a timestamped line to for every step.
    [string]$ProgressLog = ".\prismatic-scrape-progress.log",

    # Lower bound, in seconds, of the random wait between products.
    [int]$MinDelaySeconds = 4,

    # Upper bound, in seconds, of the random wait between products.
    [int]$MaxDelaySeconds = 10,

    # Seconds to wait after navigation before the page is read.
    [int]$PageSettleSeconds = 4,

    # Maximum number of remaining URLs to process in this run. 0 means no limit.
    [int]$MaxProducts = 0,

    # By default, URLs already recorded under "errors" are skipped on resume.
    # Use -RetryErrors to try failed URLs again.
    [switch]$RetryErrors
)

# Turn non-terminating cmdlet errors into terminating ones for the whole script.
$ErrorActionPreference = "Stop"
|
||||
|
||||
function Ensure-NodeAvailable {
    # Verifies that both `node` and `npm` resolve as commands; throws a
    # descriptive error for the first one that does not.
    $nodeCommand = Get-Command node -ErrorAction SilentlyContinue
    if ($null -eq $nodeCommand) {
        throw "Node.js is required. Install Node.js LTS from https://nodejs.org/"
    }

    $npmCommand = Get-Command npm -ErrorAction SilentlyContinue
    if ($null -eq $npmCommand) {
        throw "npm is required. It usually comes with Node.js."
    }
}
|
||||
|
||||
function Install-PlaywrightIfNeeded {
    # Installs the Playwright npm package and its Chromium browser into the
    # current directory when explicitly requested or when
    # .\node_modules\playwright is missing.
    #
    # -Requested: $true forces the installation even if the package folder exists.
    #
    # Throws when node/npm are unavailable or when any install step exits with a
    # non-zero code. Native commands do not honour $ErrorActionPreference, so
    # their exit codes are checked explicitly via $LASTEXITCODE.
    param([bool]$Requested)

    Ensure-NodeAvailable

    if ($Requested -or -not (Test-Path ".\node_modules\playwright")) {
        Write-Host "Installing Playwright package locally..."

        npm init -y | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }

        npm install playwright | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm install playwright failed with exit code $LASTEXITCODE."
        }

        Write-Host "Installing Playwright Chromium browser..."

        npx playwright install chromium
        if ($LASTEXITCODE -ne 0) {
            throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
        }
    }
}
|
||||
|
||||
function Write-NodeScraper {
|
||||
$js = @'
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
// Boolean command-line switches: present anywhere in argv means "on".
const hasFlag = (flag) => process.argv.some((arg) => arg === flag);

const headed = hasFlag("--headed");
const retryErrors = hasFlag("--retry-errors");
|
||||
|
||||
/**
 * Reads a `--name=value` option from the process arguments.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} defaultValue - Value returned when the option is absent.
 * @returns {string} Text after `=` of the first matching argument, or
 *   `defaultValue` when no argument starts with `--name=`.
 */
function getArgValue(name, defaultValue) {
  const marker = `--${name}=`;

  for (const arg of process.argv) {
    if (arg.startsWith(marker)) {
      return arg.slice(marker.length);
    }
  }

  return defaultValue;
}
|
||||
|
||||
/**
 * Reads an integer `--name=value` option.
 *
 * A value that does not parse as an integer falls back to `defaultValue`
 * instead of becoming NaN. A NaN delay would make the wait between products
 * fire immediately, which removes the politeness delay entirely.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} defaultValue - Used when the option is absent or invalid.
 * @returns {number} The parsed integer or `defaultValue`.
 */
function getIntArg(name, defaultValue) {
  const parsed = Number.parseInt(getArgValue(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}

// File locations; the defaults apply when the script is run without options.
const inputFile = getArgValue("input-file", "product-urls.txt");
const outputJson = getArgValue("output-json", "prismatic_powders.json");
const progressLog = getArgValue("progress-log", "prismatic-scrape-progress.log");

// Pacing and limits. The PowerShell wrapper always passes explicit values for
// these, so the defaults below only matter when this script is run directly.
const minDelaySeconds = getIntArg("min-delay-seconds", 8);
const maxDelaySeconds = getIntArg("max-delay-seconds", 18);
const pageSettleSeconds = getIntArg("page-settle-seconds", 4);
const maxProducts = getIntArg("max-products", 0);
|
||||
|
||||
/**
 * Collapses every run of whitespace into a single space and trims the ends.
 *
 * @param {string|null|undefined} text - Raw text; falsy values count as "".
 * @returns {string} The normalized text.
 */
function clean(text) {
  const raw = text ? text : "";
  return raw.split(/\s+/).join(" ").trim();
}
|
||||
|
||||
/**
 * Strips the query string and fragment from a URL and trims whitespace.
 *
 * @param {string|null|undefined} url - Raw URL text; falsy values count as "".
 * @returns {string} Everything before the first `?` or `#`, trimmed.
 */
function cleanUrl(url) {
  const raw = url ? url : "";
  const cutAt = raw.search(/[?#]/);
  const base = cutAt === -1 ? raw : raw.slice(0, cutAt);
  return base.trim();
}
|
||||
|
||||
/**
 * Returns a promise that resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Picks a random whole number of milliseconds between the configured minimum
 * and maximum delay (both inclusive).
 *
 * A negative minimum is treated as 0, and a maximum below the minimum is
 * raised to the minimum.
 *
 * @returns {number} Delay in milliseconds.
 */
function randomDelayMs() {
  const lowerMs = Math.max(0, minDelaySeconds * 1000);
  const upperMs = Math.max(lowerMs, maxDelaySeconds * 1000);
  const spanMs = upperMs - lowerMs + 1;

  return Math.floor(lowerMs + Math.random() * spanMs);
}
|
||||
|
||||
/**
 * Writes a timestamped message to stdout and appends it to the progress log.
 *
 * @param {string} message - Text to record.
 */
function logLine(message) {
  const stamp = new Date().toISOString();
  const entry = `[${stamp}] ${message}`;

  console.log(entry);
  // Log lines are terminated with CRLF.
  fs.appendFileSync(progressLog, `${entry}\r\n`, "utf8");
}
|
||||
|
||||
/**
 * Resolves a possibly-relative URL against a base URL.
 *
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @param {string} maybeUrl - Absolute or relative URL; may be empty.
 * @returns {string} The absolute URL, "" when `maybeUrl` is falsy, or
 *   `maybeUrl` unchanged when it cannot be resolved.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  let resolved = "";

  if (maybeUrl) {
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unresolvable input is passed through rather than dropped.
      resolved = maybeUrl;
    }
  }

  return resolved;
}
|
||||
|
||||
/**
 * Reads the input file and returns the unique product URLs it lists.
 *
 * Each line is normalized with cleanUrl. Blank lines, lines starting with "#"
 * and lines that do not look like a powder-coating-colors product URL are
 * dropped. First-seen order is preserved.
 *
 * @returns {string[]} De-duplicated product URLs.
 * @throws {Error} When the input file does not exist.
 */
function loadInputUrls() {
  if (!fs.existsSync(inputFile)) {
    throw new Error(`Input file not found: ${inputFile}`);
  }

  const productUrlPattern = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  const unique = new Set();

  for (const rawLine of fs.readFileSync(inputFile, "utf8").split(/\r?\n/)) {
    const url = cleanUrl(rawLine);

    if (!url || url.startsWith("#")) {
      continue;
    }

    if (productUrlPattern.test(url)) {
      unique.add(url);
    }
  }

  return [...unique];
}
|
||||
|
||||
/**
 * Loads previously saved scrape data so a run can resume.
 *
 * Accepts either a bare array (treated as the results list) or an object with
 * `results` / `errors` arrays; anything that is not an array becomes [].
 *
 * @returns {{results: object[], errors: object[]}} Existing data, or empty
 *   lists when the output file does not exist yet.
 * @throws {Error} When the file exists but cannot be read as the expected
 *   JSON; the file is copied to a timestamped `.bak` first.
 */
function loadOutput() {
  if (!fs.existsSync(outputJson)) {
    return { results: [], errors: [] };
  }

  const asArray = (value) => (Array.isArray(value) ? value : []);

  try {
    const parsed = JSON.parse(fs.readFileSync(outputJson, "utf8"));

    return Array.isArray(parsed)
      ? { results: parsed, errors: [] }
      : { results: asArray(parsed.results), errors: asArray(parsed.errors) };
  } catch (err) {
    // Keep a copy of the unreadable file before reporting the failure.
    const backup = `${outputJson}.invalid-${Date.now()}.bak`;
    fs.copyFileSync(outputJson, backup);
    throw new Error(`Could not parse existing ${outputJson}. Backed it up to ${backup}. Error: ${err.message}`);
  }
}
|
||||
|
||||
/**
 * Persists the scrape data as pretty-printed JSON.
 *
 * The JSON is written to a sibling `.tmp` file and then renamed over the
 * output file, so the output file is never written to directly.
 *
 * @param {object} data - The value to serialize.
 */
function saveOutput(data) {
  const serialized = JSON.stringify(data, null, 2);
  const stagingPath = `${outputJson}.tmp`;

  fs.writeFileSync(stagingPath, serialized, "utf8");
  fs.renameSync(stagingPath, outputJson);
}
|
||||
|
||||
/**
 * Extracts weight-based price tiers such as "1 - 4 lbs $12.50" or
 * "55+ lbs $9.75" from the page text.
 *
 * @param {string} plainText - Whitespace-normalized page text.
 * @returns {{min: number|null, max: number|null, price: number}[]} One entry
 *   per tier found, in document order. `max` is null for open-ended "N+" tiers.
 */
function parsePriceTiers(plainText) {
  const tierPattern = /(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)/gi;
  const tiers = [];

  for (const match of plainText.matchAll(tierPattern)) {
    const label = clean(match[1]);
    const tier = { min: null, max: null, price: parseFloat(match[2]) };

    // Bounded range: "<min> - <max>".
    const bounded = label.match(/(\d+)\s*-\s*(\d+)/);
    if (bounded) {
      tier.min = parseInt(bounded[1], 10);
      tier.max = parseInt(bounded[2], 10);
    }

    // Open-ended range: "<min>+".
    const openEnded = label.match(/(\d+)\s*\+/);
    if (openEnded) {
      tier.min = parseInt(openEnded[1], 10);
      tier.max = null;
    }

    tiers.push(tier);
  }

  return tiers;
}
|
||||
|
||||
/**
 * Finds the first link on the page whose visible text matches any of the
 * given patterns and returns its absolute URL.
 *
 * Anchors without an href are skipped, so a text-only match (for example a
 * script-driven anchor) does not hide a later anchor that carries the URL.
 *
 * @param {object} page - Playwright page to inspect.
 * @param {string[]} patterns - Regular-expression sources, matched
 *   case-insensitively against each link's whitespace-normalized text.
 * @returns {Promise<string>} Absolute URL of the first match, or "" when no
 *   anchor with an href matches.
 */
async function getLinkByText(page, patterns) {
  // The callback runs inside the browser, so it must stay self-contained.
  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map((a) => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  // Compile each pattern once instead of once per link.
  const matchers = patterns.map((pattern) => new RegExp(pattern, "i"));

  const match = links.find(
    (link) => link.href !== "" && matchers.some((matcher) => matcher.test(link.text))
  );

  return match ? absoluteUrl(page.url(), match.href) : "";
}
|
||||
|
||||
/**
 * Picks the most likely product swatch image on the page.
 *
 * Preference order:
 *   1. an images.nicindustries.com URL that is not a thumbnail,
 *   2. any images.nicindustries.com URL,
 *   3. any URL containing "prismatic", "powder" or "color".
 *
 * @param {object} page - Playwright page to inspect.
 * @returns {Promise<string>} The chosen image URL, or "" when none qualifies.
 */
async function getSampleImageUrl(page) {
  // The callback runs inside the browser, so it must stay self-contained.
  const sources = await page.locator("img").evaluateAll((elements) => {
    const found = [];
    for (const el of elements) {
      const src =
        el.currentSrc ||
        el.src ||
        el.getAttribute("src") ||
        el.getAttribute("data-src") ||
        "";
      if (src) {
        found.push(src);
      }
    }
    return found;
  });

  const isNicHosted = (src) => /images\.nicindustries\.com/i.test(src);
  const preferences = [
    (src) => isNicHosted(src) && !/thumbnail/i.test(src),
    (src) => isNicHosted(src),
    (src) => /prismatic|powder|color/i.test(src)
  ];

  for (const accepts of preferences) {
    const hit = sources.find((src) => accepts(src));
    if (hit) {
      return hit;
    }
  }

  return "";
}
|
||||
|
||||
/**
 * Loads one product page and extracts its details.
 *
 * @param {object} page - Playwright page used for navigation.
 * @param {string} url - Product URL to scrape.
 * @returns {Promise<object>} Record with sku, color_name, description,
 *   price_tiers, document/image URLs, product_url and scraped_at.
 * @throws {Error} When the site answers with an error status (403, 404 or any
 *   other status >= 400), when the page looks like a 403/404 page, or when
 *   neither a SKU nor a title can be found.
 */
async function parseProduct(page, url) {
  logLine(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Give client-side rendering time to finish before reading the DOM.
  await page.waitForTimeout(pageSettleSeconds * 1000);

  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  logLine(`HTTP status ${status}; title "${pageTitle}"`);

  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Require "404" as a standalone token so a title that merely contains those
  // digits (for example inside a SKU such as "4040") is not taken for an
  // error page.
  if (status === 404 || /\b404\b|Page Not Found/i.test(pageTitle)) {
    throw new Error("404 Not Found returned by site.");
  }

  // Any other client/server error (429, 500, 503, ...) must not be stored as a
  // successful product: it would be marked completed and never retried.
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  if (!sku && !title) {
    throw new Error("Could not find SKU or title on product page.");
  }

  // The description runs until the warning/match sections or the end of text.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}
|
||||
|
||||
// Entry point: work out which URLs still need scraping, scrape them one at a
// time with a random pause in between, and save after every product so an
// interrupted run can be resumed.
(async () => {
  const allUrls = loadInputUrls();
  const data = loadOutput();

  const completedUrls = new Set(data.results.map(r => cleanUrl(r.product_url)).filter(Boolean));
  const errorUrls = new Set(data.errors.map(e => cleanUrl(e.product_url)).filter(Boolean));

  let remainingUrls = allUrls.filter(url => {
    if (completedUrls.has(url)) return false;
    if (!retryErrors && errorUrls.has(url)) return false;
    return true;
  });

  if (maxProducts > 0) {
    remainingUrls = remainingUrls.slice(0, maxProducts);
  }

  logLine(`Input URLs: ${allUrls.length}`);
  logLine(`Already scraped: ${completedUrls.size}`);
  logLine(`Existing errors: ${errorUrls.size}`);
  logLine(`Retry errors: ${retryErrors ? "yes" : "no"}`);
  logLine(`This run target count: ${remainingUrls.length}`);
  logLine(`Delay range: ${minDelaySeconds}-${maxDelaySeconds} seconds; page settle: ${pageSettleSeconds} seconds`);

  if (remainingUrls.length === 0) {
    logLine("Nothing to scrape. Done.");
    saveOutput(data);
    return;
  }

  const browser = await chromium.launch({
    headless: !headed
  });

  // Close the browser even when something below throws, so no Chromium
  // process is left behind.
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    let processedThisRun = 0;

    for (const [index, url] of remainingUrls.entries()) {
      try {
        const row = await parseProduct(page, url);

        // If retrying an old error, keep the old error history but avoid duplicate successful result.
        if (!completedUrls.has(url)) {
          data.results.push(row);
          completedUrls.add(url);
        }

        processedThisRun++;
        saveOutput(data);

        logLine(`Saved result ${processedThisRun}/${remainingUrls.length}: ${row.sku || "(no sku)"} ${row.color_name || ""}`);
      } catch (err) {
        const errorRecord = {
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        };

        data.errors.push(errorRecord);
        saveOutput(data);

        logLine(`ERROR ${url}: ${err.message}`);
      }

      // Pause only when another product follows; waiting after the last one
      // just delays the end of the run.
      if (index < remainingUrls.length - 1) {
        const delay = randomDelayMs();
        logLine(`Waiting ${(delay / 1000).toFixed(1)} seconds before next product...`);
        await sleep(delay);
      }
    }
  } finally {
    await browser.close();
  }

  logLine(`Done. Results: ${data.results.length}; Errors: ${data.errors.length}; Output: ${outputJson}`);
})().catch((err) => {
  // Report fatal errors and signal failure to the caller through the exit code.
  console.error(err);
  process.exitCode = 1;
});
|
||||
'@
|
||||
|
||||
Set-Content -Path ".\prismatic-browser-scraper.js" -Value $js -Encoding UTF8
|
||||
}
|
||||
|
||||
# Main flow: make sure Playwright is installed, write the Node scraper next to
# this script's working directory, then run it with the selected options.
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeScraper

    Write-Host "Running resumable browser scraper..."

    $nodeArgs = @(
        ".\prismatic-browser-scraper.js",
        "--input-file=$InputFile",
        "--output-json=$OutputJson",
        "--progress-log=$ProgressLog",
        "--min-delay-seconds=$MinDelaySeconds",
        "--max-delay-seconds=$MaxDelaySeconds",
        "--page-settle-seconds=$PageSettleSeconds",
        "--max-products=$MaxProducts"
    )

    if ($Headed) {
        $nodeArgs += "--headed"
    }

    if ($RetryErrors) {
        $nodeArgs += "--retry-errors"
    }

    node @nodeArgs

    # Native commands do not raise PowerShell errors on failure, so the exit
    # code has to be checked for the failure to reach the caller.
    if ($LASTEXITCODE -ne 0) {
        throw "Scraper exited with code $LASTEXITCODE."
    }
}
catch {
    # With $ErrorActionPreference = "Stop" a plain Write-Error would terminate
    # the script here and "exit 1" would never run; -ErrorAction Continue
    # reports the message and lets the explicit exit code be set.
    Write-Error $_.Exception.Message -ErrorAction Continue
    exit 1
}
|
||||
Reference in New Issue
Block a user