Commit misc scripts, feature specs, SQL deploy scripts, and settings updates
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,49 @@
|
||||
-- EF Core migration 20260428164026_AddGuidedActivationFields (product version 8.0.11).
--
-- Adds the onboarding / first-workflow tracking columns to [CompanyPreferences]
-- and sets [CreatedAt] on the [PricingTiers] rows with Id 1, 2 and 3.
--
-- Every batch is guarded by a lookup in [__EFMigrationsHistory], so once the
-- migration row has been inserted each batch becomes a no-op and the script can
-- be re-run safely (the unguarded form failed on the first ALTER TABLE when the
-- columns already existed).
--
-- Run with a client that stops on the first error (for example `sqlcmd -b`):
-- the GO separators otherwise allow later batches to run after an earlier
-- batch has failed.
BEGIN TRANSACTION;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    ALTER TABLE [CompanyPreferences] ADD [FirstInvoiceCreatedAt] datetime2 NULL;
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    ALTER TABLE [CompanyPreferences] ADD [FirstJobCreatedAt] datetime2 NULL;
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    ALTER TABLE [CompanyPreferences] ADD [FirstQuoteCreatedAt] datetime2 NULL;
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    -- NOT NULL column: existing rows are back-filled with 0 (false) by the default.
    ALTER TABLE [CompanyPreferences] ADD [FirstWorkflowCompleted] bit NOT NULL DEFAULT CAST(0 AS bit);
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    ALTER TABLE [CompanyPreferences] ADD [FirstWorkflowCompletedAt] datetime2 NULL;
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    ALTER TABLE [CompanyPreferences] ADD [GuidedActivationDismissedAt] datetime2 NULL;
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    ALTER TABLE [CompanyPreferences] ADD [OnboardingPath] nvarchar(max) NULL;
END;
GO

-- NOTE(review): the three updates below overwrite [CreatedAt] on existing
-- [PricingTiers] rows with timestamps that look like the migration's generation
-- time. Presumably this comes from seed data that uses a non-constant timestamp;
-- confirm that rewriting these values in production is intended.
IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    UPDATE [PricingTiers] SET [CreatedAt] = '2026-04-28T16:40:22.3595055Z'
    WHERE [Id] = 1;
    SELECT @@ROWCOUNT;
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    UPDATE [PricingTiers] SET [CreatedAt] = '2026-04-28T16:40:22.3595063Z'
    WHERE [Id] = 2;
    SELECT @@ROWCOUNT;
END;
GO

IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    UPDATE [PricingTiers] SET [CreatedAt] = '2026-04-28T16:40:22.3595065Z'
    WHERE [Id] = 3;
    SELECT @@ROWCOUNT;
END;
GO

-- Record the migration last so every guard above stays open until all steps ran.
IF NOT EXISTS (
    SELECT * FROM [__EFMigrationsHistory]
    WHERE [MigrationId] = N'20260428164026_AddGuidedActivationFields'
)
BEGIN
    INSERT INTO [__EFMigrationsHistory] ([MigrationId], [ProductVersion])
    VALUES (N'20260428164026_AddGuidedActivationFields', N'8.0.11');
END;
GO

COMMIT;
GO
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,319 @@
|
||||
# Discover-Prismatic-Product-Urls-By-ColorParam.ps1
|
||||
#
|
||||
# Discovers Prismatic Powders product URLs by visiting color filter URLs like:
|
||||
# https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_red
|
||||
#
|
||||
# Outputs:
|
||||
# .\product-urls.txt
|
||||
# .\color-discovery-log.json
|
||||
#
|
||||
# First-time setup:
|
||||
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
# .\Discover-Prismatic-Product-Urls-By-ColorParam.ps1 -InstallPlaywright -Headed
|
||||
#
|
||||
# Normal run:
|
||||
# .\Discover-Prismatic-Product-Urls-By-ColorParam.ps1
|
||||
#
|
||||
# Watch browser:
|
||||
# .\Discover-Prismatic-Product-Urls-By-ColorParam.ps1 -Headed
|
||||
|
||||
param(
|
||||
[switch]$InstallPlaywright,
|
||||
[switch]$Headed,
|
||||
[int]$MaxScrollsPerColor = 180,
|
||||
[int]$StopAfterNoNewScrolls = 10
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Verifies that both `node` and `npm` can be resolved as commands.
# Throws a descriptive message for the first tool that is missing
# (node is checked before npm).
function Ensure-NodeAvailable {
    $requiredTools = [ordered]@{
        node = "Node.js is required. Install Node.js LTS from https://nodejs.org/"
        npm  = "npm is required. It usually comes with Node.js."
    }

    foreach ($tool in $requiredTools.Keys) {
        $resolved = Get-Command $tool -ErrorAction SilentlyContinue
        if (-not $resolved) {
            throw $requiredTools[$tool]
        }
    }
}
|
||||
|
||||
# Installs the Playwright npm package and its Chromium browser into the current
# directory when explicitly requested or when .\node_modules\playwright is absent.
#
# Parameters:
#   -Requested  $true forces a (re)install even if the package folder exists.
#
# Throws when node/npm are unavailable or when any npm/npx step exits non-zero.
function Install-PlaywrightIfNeeded {
    param([bool]$Requested)

    Ensure-NodeAvailable

    if (-not $Requested -and (Test-Path ".\node_modules\playwright")) {
        return
    }

    Write-Host "Installing Playwright package locally..."

    # Only scaffold a package.json when there is none; an existing one is left alone.
    if (-not (Test-Path ".\package.json")) {
        npm init -y | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }
    }

    # $ErrorActionPreference does not apply to native commands, so the exit code
    # has to be checked explicitly after each one.
    npm install playwright | Out-Null
    if ($LASTEXITCODE -ne 0) {
        throw "npm install playwright failed with exit code $LASTEXITCODE."
    }

    Write-Host "Installing Playwright Chromium browser..."
    npx playwright install chromium
    if ($LASTEXITCODE -ne 0) {
        throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
    }
}
|
||||
|
||||
function Write-NodeDiscoveryScript {
|
||||
$js = @'
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
// True when the browser window should be visible (bare --headed flag).
const headed = process.argv.includes("--headed");

/**
 * Returns the text after `--<name>=` from the first matching command-line
 * argument, or `defaultValue` when the option was not supplied.
 */
function getArgValue(name, defaultValue) {
  const prefix = `--${name}=`;
  const found = process.argv.find(x => x.startsWith(prefix));
  return found ? found.slice(prefix.length) : defaultValue;
}

/**
 * Reads an integer option. Falls back to `defaultValue` (with a warning) when
 * the supplied text is not an integer or is below `min`, so a typo cannot turn
 * a loop bound into NaN and silently skip all work.
 */
function getIntArg(name, defaultValue, min = 0) {
  const raw = getArgValue(name, null);
  if (raw === null) return defaultValue;

  const parsed = Number.parseInt(raw, 10);
  if (Number.isInteger(parsed) && parsed >= min) return parsed;

  console.warn(`Ignoring invalid --${name}=${raw}; using ${defaultValue}.`);
  return defaultValue;
}

// At least one scroll pass is required, otherwise no links are ever collected.
const maxScrollsPerColor = getIntArg("max-scrolls-per-color", 180, 1);
// 0 is allowed: it means "collect once and stop without scrolling".
const stopAfterNoNewScrolls = getIntArg("stop-after-no-new-scrolls", 10, 0);
|
||||
|
||||
// Listing page that accepts a `?color=<param>` filter.
const baseUrl = "https://www.prismaticpowders.com/shop/powder-coating-colors";

// Files written into the current working directory.
const outputFile = "product-urls.txt";
const logFile = "color-discovery-log.json";

// Color filter values, built as `pris_<shade>`.
// Update the shade list if you find more color params in the site HTML.
const colorParams = [
  "black", "blue", "bronze", "brown", "clear", "copper",
  "gold", "gray", "green", "orange", "pink", "purple",
  "red", "silver", "tan", "white", "yellow"
].map(shade => `pris_${shade}`);
|
||||
|
||||
/**
 * Normalizes a URL for comparison: drops everything from the first "?" and
 * the first "#", then trims surrounding whitespace. Falsy input yields "".
 */
function cleanUrl(url) {
  const raw = url || "";
  const [withoutQuery] = raw.split("?");
  const [withoutFragment] = withoutQuery.split("#");
  return withoutFragment.trim();
}
|
||||
|
||||
/**
 * Tells whether `url` contains a product path of the form
 * `/shop/powder-coating-colors/<slug>/` (case-insensitive; slug is letters,
 * digits and hyphens). Falsy input is treated as an empty string.
 */
function isProductUrl(url) {
  const productPath = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  const candidate = url ? url : "";
  return productPath.test(candidate);
}
|
||||
|
||||
/**
 * Loads the URLs already saved in `outputFile`, one per line.
 * Each line is normalized with `cleanUrl`; lines that normalize to an empty
 * string are dropped. Returns [] when the file does not exist.
 */
function readExistingUrls() {
  const collected = [];

  if (fs.existsSync(outputFile)) {
    const text = fs.readFileSync(outputFile, "utf8");
    for (const line of text.split(/\r?\n/)) {
      const normalized = cleanUrl(line);
      if (normalized) {
        collected.push(normalized);
      }
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Persists `urls` (any iterable of strings) to `outputFile`, sorted, one URL
 * per line with CRLF line endings.
 *
 * The content is written to `<outputFile>.tmp` first and then renamed over the
 * target, so an interrupted run cannot leave a half-written URL list behind.
 * An empty collection produces an empty file.
 */
function writeUrls(urls) {
  const sorted = [...urls].sort();
  const content = sorted.length > 0 ? `${sorted.join("\r\n")}\r\n` : "";

  const tempFile = `${outputFile}.tmp`;
  fs.writeFileSync(tempFile, content, "utf8");
  fs.renameSync(tempFile, outputFile);
}
|
||||
|
||||
/**
 * Loads the discovery log from `logFile`.
 *
 * Always returns an object with a `completed_colors` object and a `runs`
 * array, even when the file is missing, unparsable, or lacks those keys, so
 * callers can index into both without further checks. Other keys found in a
 * valid log are preserved.
 *
 * An unparsable log is copied to `<logFile>.invalid-<timestamp>.bak` before an
 * empty log is returned, because the next `writeLog` would otherwise overwrite
 * it without a trace.
 */
function readLog() {
  const emptyLog = () => ({ completed_colors: {}, runs: [] });
  const isRecord = value =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  if (!fs.existsSync(logFile)) {
    return emptyLog();
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(logFile, "utf8"));
  } catch (err) {
    const backup = `${logFile}.invalid-${Date.now()}.bak`;
    try {
      fs.copyFileSync(logFile, backup);
      console.warn(`Could not parse ${logFile} (${err.message}); backed it up to ${backup} and starting a fresh log.`);
    } catch (copyErr) {
      console.warn(`Could not parse ${logFile} (${err.message}) and could not back it up (${copyErr.message}); starting a fresh log.`);
    }
    return emptyLog();
  }

  if (!isRecord(parsed)) {
    return emptyLog();
  }

  return {
    ...parsed,
    completed_colors: isRecord(parsed.completed_colors) ? parsed.completed_colors : {},
    runs: Array.isArray(parsed.runs) ? parsed.runs : []
  };
}
|
||||
|
||||
/**
 * Saves the discovery log to `logFile` as pretty-printed JSON (2-space indent).
 *
 * Writes to `<logFile>.tmp` and renames it over the target so an interrupted
 * run cannot leave a truncated log that would fail to parse on resume.
 */
function writeLog(log) {
  const tempFile = `${logFile}.tmp`;
  fs.writeFileSync(tempFile, JSON.stringify(log, null, 2), "utf8");
  fs.renameSync(tempFile, logFile);
}
|
||||
|
||||
/**
 * Collects the product URLs currently present in the page's anchors.
 *
 * Reads every anchor's resolved `href` in the browser, then filters with the
 * shared `isProductUrl` check and normalizes with `cleanUrl`, so the product
 * pattern lives in one place. Returns each normalized URL once, in first-seen
 * order.
 */
async function collectProductLinks(page) {
  const hrefs = await page.locator("a").evaluateAll(anchors =>
    anchors
      .map(a => a.href)
      // Non-string hrefs (e.g. on SVG anchors) cannot be product links.
      .filter(h => typeof h === "string" && h.length > 0)
  );

  const unique = new Set();
  for (const href of hrefs) {
    if (!isProductUrl(href)) continue;

    const normalized = cleanUrl(href);
    if (normalized) {
      unique.add(normalized);
    }
  }

  return [...unique];
}
|
||||
|
||||
/**
 * Repeatedly harvests product links from `page` while scrolling down.
 *
 * Each pass adds the links found by `collectProductLinks` to the shared `urls`
 * set, then scrolls by 2500 px and waits 1.5 s. The loop ends after
 * `maxScrollsPerColor` passes or once `stopAfterNoNewScrolls` consecutive
 * passes added nothing.
 *
 * The URL file is rewritten only on passes that actually added something;
 * rewriting an unchanged list on every idle scroll was wasted sorting and I/O.
 *
 * @param page  Playwright page already showing the listing.
 * @param urls  Set of known URLs; mutated in place.
 * @param label Text used as the prefix of progress messages.
 * @returns Number of URLs this call added to `urls`.
 */
async function scrollAndCollect(page, urls, label) {
  let noNewScrolls = 0;
  let totalAddedForThisColor = 0;

  for (let i = 0; i < maxScrollsPerColor; i++) {
    const before = urls.size;

    for (const link of await collectProductLinks(page)) {
      urls.add(link);
    }

    const after = urls.size;
    const added = after - before;
    totalAddedForThisColor += added;

    if (added > 0) {
      noNewScrolls = 0;
      // Persist progress as soon as there is something new to save.
      writeUrls(urls);
    } else {
      noNewScrolls++;
    }

    console.log(`[${label}] Scroll ${i + 1}/${maxScrollsPerColor}: +${added}, total ${after}, no-new ${noNewScrolls}`);

    if (noNewScrolls >= stopAfterNoNewScrolls) {
      break;
    }

    await page.mouse.wheel(0, 2500);
    await page.waitForTimeout(1500);
  }

  return totalAddedForThisColor;
}
|
||||
|
||||
// Entry point: visits each color filter page, harvests product URLs, and keeps
// the URL file and the discovery log up to date after every color.
//
// Resume behavior: colors already present in log.completed_colors are skipped.
// A color is only recorded there when its page loaded without an HTTP error
// status, so a blocked or failed request is retried on the next run instead of
// being skipped forever.
(async () => {
  const existingUrls = readExistingUrls();
  const urls = new Set(existingUrls);
  // Distinct count, so duplicate lines in the file do not skew "new this run".
  const startingCount = urls.size;
  const log = readLog();

  console.log(`Existing URLs in ${outputFile}: ${startingCount}`);

  const runRecord = {
    started_at: new Date().toISOString(),
    existing_at_start: startingCount,
    colors_attempted: []
  };

  const browser = await chromium.launch({ headless: !headed });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    for (const color of colorParams) {
      if (log.completed_colors[color]) {
        console.log(`Skipping completed color: ${color}`);
        continue;
      }

      const url = `${baseUrl}?color=${encodeURIComponent(color)}`;
      console.log("");
      console.log(`Opening color filter: ${color}`);
      console.log(url);

      try {
        const response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000
        });

        const status = response ? response.status() : "unknown";
        console.log(`HTTP status: ${status}`);

        // An error page has no product links; treating it as "completed" would
        // hide the color from every later run.
        if (typeof status === "number" && status >= 400) {
          throw new Error(`HTTP ${status} returned for color filter.`);
        }

        // Give client-side rendering time to populate the listing.
        await page.waitForTimeout(5000);

        const before = urls.size;
        const addedDuringScroll = await scrollAndCollect(page, urls, color);
        const after = urls.size;
        const netAdded = after - before;

        log.completed_colors[color] = {
          url,
          http_status: status,
          added: netAdded,
          added_during_scroll: addedDuringScroll,
          total_after: after,
          completed_at: new Date().toISOString()
        };

        runRecord.colors_attempted.push({
          color,
          url,
          http_status: status,
          added: netAdded,
          total_after: after
        });

        writeLog(log);
        writeUrls(urls);

        console.log(`Color complete: ${color}; added ${netAdded}; total ${after}`);

        // Polite pause between filters.
        await page.waitForTimeout(3000);
      } catch (err) {
        console.log(`Color failed: ${color}; ${err.message}`);

        runRecord.colors_attempted.push({
          color,
          url,
          added: 0,
          error: err.message
        });

        writeLog(log);
      }
    }
  } finally {
    // Always release the browser, even when something unexpected was thrown.
    await browser.close().catch(closeErr => {
      console.warn(`Could not close browser cleanly: ${closeErr.message}`);
    });
  }

  runRecord.finished_at = new Date().toISOString();
  runRecord.final_total = urls.size;
  runRecord.new_this_run = urls.size - startingCount;

  log.runs.push(runRecord);
  writeLog(log);
  writeUrls(urls);

  console.log("");
  console.log("Color-param discovery complete.");
  console.log(`Existing at start: ${startingCount}`);
  console.log(`Final total: ${urls.size}`);
  console.log(`New this run: ${urls.size - startingCount}`);
  console.log(`Output: ${outputFile}`);
  console.log(`Log: ${logFile}`);
})().catch(err => {
  // Report fatal errors and signal failure to the calling script.
  console.error(`Color-param discovery failed: ${err.stack || err.message}`);
  process.exitCode = 1;
});
|
||||
'@
|
||||
|
||||
Set-Content -Path ".\discover-prismatic-by-color-param.js" -Value $js -Encoding UTF8
|
||||
}
|
||||
|
||||
# Script entry: make sure Playwright is installed, write the embedded Node
# script next to this file's working directory, and run it with the options
# taken from this script's parameters. Exits with code 1 on any failure.
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeDiscoveryScript

    Write-Host "Running color-param URL discovery..."

    $nodeArgs = @(
        ".\discover-prismatic-by-color-param.js",
        "--max-scrolls-per-color=$MaxScrollsPerColor",
        "--stop-after-no-new-scrolls=$StopAfterNoNewScrolls"
    )

    if ($Headed) {
        $nodeArgs += "--headed"
    }

    node @nodeArgs

    # Native commands do not raise PowerShell errors; surface a failed run explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "Color-param URL discovery failed (node exited with code $LASTEXITCODE)."
    }
}
catch {
    # -ErrorAction Continue: with $ErrorActionPreference = "Stop" a plain
    # Write-Error would terminate here and the explicit exit code below would
    # never be reached.
    Write-Error $_.Exception.Message -ErrorAction Continue
    exit 1
}
|
||||
@@ -0,0 +1,410 @@
|
||||
# Get-Product-Info-Resumable.ps1
|
||||
#
|
||||
# Resumable, slow/polite Prismatic Powders product scraper.
|
||||
#
|
||||
# Inputs:
|
||||
# .\product-urls.txt
|
||||
#
|
||||
# Outputs:
|
||||
# .\prismatic_powders.json
|
||||
# .\prismatic-scrape-progress.log
|
||||
#
|
||||
# First-time setup:
|
||||
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
# .\Get-Product-Info-Resumable.ps1 -InstallPlaywright -Headed -MaxProducts 5
|
||||
#
|
||||
# Normal full run:
|
||||
# .\Get-Product-Info-Resumable.ps1
|
||||
#
|
||||
# Test first 25 remaining:
|
||||
# .\Get-Product-Info-Resumable.ps1 -MaxProducts 25 -Headed
|
||||
#
|
||||
# Retry failed URLs too:
|
||||
# .\Get-Product-Info-Resumable.ps1 -RetryErrors
|
||||
#
|
||||
# Slow it down more:
|
||||
# .\Get-Product-Info-Resumable.ps1 -MinDelaySeconds 12 -MaxDelaySeconds 25
|
||||
|
||||
param(
|
||||
[switch]$InstallPlaywright,
|
||||
[switch]$Headed,
|
||||
|
||||
[string]$InputFile = ".\product-urls.txt",
|
||||
[string]$OutputJson = ".\prismatic_powders.json",
|
||||
[string]$ProgressLog = ".\prismatic-scrape-progress.log",
|
||||
|
||||
[int]$MinDelaySeconds = 4,
|
||||
[int]$MaxDelaySeconds = 10,
|
||||
[int]$PageSettleSeconds = 4,
|
||||
|
||||
# 0 means no limit.
|
||||
[int]$MaxProducts = 0,
|
||||
|
||||
# By default, URLs in errors are skipped on resume.
|
||||
# Use -RetryErrors to try failed URLs again.
|
||||
[switch]$RetryErrors
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Verifies that both `node` and `npm` can be resolved as commands.
# Throws a descriptive message for the first tool that is missing
# (node is checked before npm).
function Ensure-NodeAvailable {
    $requiredTools = [ordered]@{
        node = "Node.js is required. Install Node.js LTS from https://nodejs.org/"
        npm  = "npm is required. It usually comes with Node.js."
    }

    foreach ($tool in $requiredTools.Keys) {
        $resolved = Get-Command $tool -ErrorAction SilentlyContinue
        if (-not $resolved) {
            throw $requiredTools[$tool]
        }
    }
}
|
||||
|
||||
# Installs the Playwright npm package and its Chromium browser into the current
# directory when explicitly requested or when .\node_modules\playwright is absent.
#
# Parameters:
#   -Requested  $true forces a (re)install even if the package folder exists.
#
# Throws when node/npm are unavailable or when any npm/npx step exits non-zero.
function Install-PlaywrightIfNeeded {
    param([bool]$Requested)

    Ensure-NodeAvailable

    if (-not $Requested -and (Test-Path ".\node_modules\playwright")) {
        return
    }

    Write-Host "Installing Playwright package locally..."

    # Only scaffold a package.json when there is none; an existing one is left alone.
    if (-not (Test-Path ".\package.json")) {
        npm init -y | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }
    }

    # $ErrorActionPreference does not apply to native commands, so the exit code
    # has to be checked explicitly after each one.
    npm install playwright | Out-Null
    if ($LASTEXITCODE -ne 0) {
        throw "npm install playwright failed with exit code $LASTEXITCODE."
    }

    Write-Host "Installing Playwright Chromium browser..."
    npx playwright install chromium
    if ($LASTEXITCODE -ne 0) {
        throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
    }
}
|
||||
|
||||
function Write-NodeScraper {
|
||||
$js = @'
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
// Bare flags.
const headed = process.argv.includes("--headed");
const retryErrors = process.argv.includes("--retry-errors");

/**
 * Returns the text after `--<name>=` from the first matching command-line
 * argument, or `defaultValue` when the option was not supplied.
 */
function getArgValue(name, defaultValue) {
  const prefix = `--${name}=`;
  const found = process.argv.find(x => x.startsWith(prefix));
  return found ? found.slice(prefix.length) : defaultValue;
}

/**
 * Reads an integer option. Falls back to `defaultValue` (with a warning) when
 * the supplied text is not an integer or is below `min`. Without this, a typo
 * such as `--min-delay-seconds=x` became NaN and the wait between requests
 * collapsed to nothing.
 */
function getIntArg(name, defaultValue, min = 0) {
  const raw = getArgValue(name, null);
  if (raw === null) return defaultValue;

  const parsed = Number.parseInt(raw, 10);
  if (Number.isInteger(parsed) && parsed >= min) return parsed;

  console.warn(`Ignoring invalid --${name}=${raw}; using ${defaultValue}.`);
  return defaultValue;
}

// File locations (relative paths resolve against the working directory).
const inputFile = getArgValue("input-file", "product-urls.txt");
const outputJson = getArgValue("output-json", "prismatic_powders.json");
const progressLog = getArgValue("progress-log", "prismatic-scrape-progress.log");

// Pacing, in whole seconds.
const minDelaySeconds = getIntArg("min-delay-seconds", 8);
const maxDelaySeconds = getIntArg("max-delay-seconds", 18);
const pageSettleSeconds = getIntArg("page-settle-seconds", 4);
// 0 means "no limit".
const maxProducts = getIntArg("max-products", 0);
|
||||
|
||||
/**
 * Collapses every run of whitespace in `text` to a single space and removes
 * leading/trailing whitespace. Falsy input yields "".
 */
function clean(text) {
  const source = text || "";
  const words = source.split(/\s+/).filter(Boolean);
  return words.join(" ");
}
|
||||
|
||||
/**
 * Normalizes a URL for comparison: drops everything from the first "?" and
 * the first "#", then trims surrounding whitespace. Falsy input yields "".
 */
function cleanUrl(url) {
  const raw = url || "";
  const [withoutQuery] = raw.split("?");
  const [withoutFragment] = withoutQuery.split("#");
  return withoutFragment.trim();
}
|
||||
|
||||
/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise(wake => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
/**
 * Picks a random whole number of milliseconds between `minDelaySeconds` and
 * `maxDelaySeconds` (both inclusive). A negative minimum is treated as 0 and a
 * maximum below the minimum is raised to the minimum.
 */
function randomDelayMs() {
  const floorMs = Math.max(0, minDelaySeconds * 1000);
  const ceilingMs = Math.max(floorMs, maxDelaySeconds * 1000);
  const span = ceilingMs - floorMs + 1;
  return Math.floor(floorMs + Math.random() * span);
}
|
||||
|
||||
/**
 * Prints `message` prefixed with the current ISO-8601 timestamp and appends
 * the same line (CRLF-terminated) to the `progressLog` file.
 */
function logLine(message) {
  const stamp = new Date().toISOString();
  const line = `[${stamp}] ${message}`;

  console.log(line);
  fs.appendFileSync(progressLog, `${line}\r\n`, "utf8");
}
|
||||
|
||||
/**
 * Resolves `maybeUrl` against `baseUrl` and returns the absolute URL.
 * Returns "" for a falsy `maybeUrl`, and the input unchanged when it cannot be
 * resolved (for example because `baseUrl` is not a valid URL).
 */
function absoluteUrl(baseUrl, maybeUrl) {
  if (maybeUrl) {
    let resolved = maybeUrl;
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unresolvable: fall through with the raw value.
    }
    return resolved;
  }

  return "";
}
|
||||
|
||||
/**
 * Reads `inputFile` and returns the distinct product URLs it lists, in
 * first-seen order. Each line is normalized with `cleanUrl`; blank lines,
 * lines starting with "#", and lines that do not contain a
 * `/shop/powder-coating-colors/<slug>/` path are ignored.
 *
 * @throws Error when `inputFile` does not exist.
 */
function loadInputUrls() {
  if (!fs.existsSync(inputFile)) {
    throw new Error(`Input file not found: ${inputFile}`);
  }

  const productPattern = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  const seen = new Set();

  for (const rawLine of fs.readFileSync(inputFile, "utf8").split(/\r?\n/)) {
    const candidate = cleanUrl(rawLine);
    if (!candidate || candidate.startsWith("#")) continue;

    if (productPattern.test(candidate)) {
      seen.add(candidate);
    }
  }

  return [...seen];
}
|
||||
|
||||
/**
 * Loads previously saved scrape data from `outputJson`.
 *
 * Returns `{ results, errors }`, both arrays. A missing file yields two empty
 * arrays; a file holding a bare JSON array is treated as the results list; in
 * an object, a `results`/`errors` value that is not an array is replaced by [].
 *
 * @throws Error when the file exists but cannot be read as the shapes above;
 *         the file is first copied to `<outputJson>.invalid-<timestamp>.bak`.
 */
function loadOutput() {
  if (!fs.existsSync(outputJson)) {
    return { results: [], errors: [] };
  }

  const arrayOrEmpty = value => (Array.isArray(value) ? value : []);

  try {
    const parsed = JSON.parse(fs.readFileSync(outputJson, "utf8"));

    return Array.isArray(parsed)
      ? { results: parsed, errors: [] }
      : { results: arrayOrEmpty(parsed.results), errors: arrayOrEmpty(parsed.errors) };
  } catch (err) {
    const backup = `${outputJson}.invalid-${Date.now()}.bak`;
    fs.copyFileSync(outputJson, backup);
    throw new Error(`Could not parse existing ${outputJson}. Backed it up to ${backup}. Error: ${err.message}`);
  }
}
|
||||
|
||||
/**
 * Writes `data` to `outputJson` as pretty-printed JSON (2-space indent).
 * The content goes to `<outputJson>.tmp` first and is then renamed over the
 * target file.
 */
function saveOutput(data) {
  const serialized = JSON.stringify(data, null, 2);
  const stagingFile = `${outputJson}.tmp`;

  fs.writeFileSync(stagingFile, serialized, "utf8");
  fs.renameSync(stagingFile, outputJson);
}
|
||||
|
||||
/**
 * Extracts weight-based price tiers from page text.
 *
 * Recognizes "<min> - <max> lbs $<price>" (hyphen, en dash or em dash) and
 * "<min>+ lbs $<price>" (open-ended, `max` is null). Prices may contain
 * thousands separators ("$1,250.50"), which previously parsed as 1.
 *
 * @param plainText Page text; null/undefined is treated as empty.
 * @returns Array of `{ min, max, price }` in order of appearance.
 */
function parsePriceTiers(plainText) {
  const tierPattern =
    /(?:(\d+)\s*[-\u2013\u2014]\s*(\d+)|(\d+)\s*\+)\s*lbs\s*\$((?:\d[\d,]*(?:\.\d+)?|\.\d+))/gi;

  return [...(plainText || "").matchAll(tierPattern)].map(match => {
    const [, rangeMin, rangeMax, openMin, priceText] = match;
    const isOpenEnded = openMin !== undefined;

    return {
      min: parseInt(isOpenEnded ? openMin : rangeMin, 10),
      max: isOpenEnded ? null : parseInt(rangeMax, 10),
      price: parseFloat(priceText.replace(/,/g, ""))
    };
  });
}
|
||||
|
||||
/**
 * Finds the first anchor on the page whose visible text matches any of
 * `patterns` and returns its link target as an absolute URL.
 *
 * Anchors without an `href` are skipped, so a script-driven placeholder with
 * matching text no longer hides a real link that follows it.
 *
 * @param page     Playwright page.
 * @param patterns Regular-expression source strings, matched case-insensitively.
 * @returns Absolute URL of the first usable match, or "" when none is found.
 */
async function getLinkByText(page, patterns) {
  // Compile once rather than once per link and pattern.
  const matchers = patterns.map(p => new RegExp(p, "i"));

  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map(a => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  const hit = links.find(link => link.href && matchers.some(re => re.test(link.text)));
  return hit ? absoluteUrl(page.url(), hit.href) : "";
}
|
||||
|
||||
/**
 * Chooses a sample image URL from the page's <img> elements.
 *
 * For each image the first non-empty of `currentSrc`, `src`, the `src`
 * attribute and the `data-src` attribute is used. Preference order:
 *   1. an images.nicindustries.com URL that does not mention "thumbnail",
 *   2. any images.nicindustries.com URL,
 *   3. any URL containing "prismatic", "powder" or "color".
 * Returns "" when nothing qualifies.
 */
async function getSampleImageUrl(page) {
  const sources = await page.locator("img").evaluateAll((imgs) => {
    const found = [];
    for (const img of imgs) {
      const src =
        img.currentSrc ||
        img.src ||
        img.getAttribute("src") ||
        img.getAttribute("data-src") ||
        "";
      if (src) found.push(src);
    }
    return found;
  });

  const preferences = [
    src => /images\.nicindustries\.com/i.test(src) && !/thumbnail/i.test(src),
    src => /images\.nicindustries\.com/i.test(src),
    src => /prismatic|powder|color/i.test(src)
  ];

  for (const accepts of preferences) {
    const match = sources.find(src => accepts(src));
    if (match) return match;
  }

  return "";
}
|
||||
|
||||
/**
 * Opens a product page and extracts its details.
 *
 * @param page Playwright page used for navigation.
 * @param url  Product URL to load.
 * @returns Object with sku, color_name, description, price_tiers, the three
 *          document URLs, sample_image_url, product_url and scraped_at.
 * @throws Error when the site answers with an HTTP error status, when the page
 *         identifies itself as a 403/404 page, or when neither a SKU nor a
 *         title can be found.
 */
async function parseProduct(page, url) {
  logLine(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Let client-side rendering finish before reading the DOM.
  await page.waitForTimeout(pageSettleSeconds * 1000);

  // 0 stands for "no response object available".
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  logLine(`HTTP status ${status}; title "${pageTitle}"`);

  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Only a title that starts with 404 or says "Page Not Found" counts; a bare
  // /404/ test also rejected valid products whose title contains e.g. "PSS-4040".
  if (status === 404 || /^404\b|\bPage Not Found\b/i.test(pageTitle)) {
    throw new Error("404 Not Found returned by site.");
  }

  // Any other error status (429, 5xx, ...) must not be stored as a product:
  // it would be treated as done and never retried.
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  if (!sku && !title) {
    throw new Error("Could not find SKU or title on product page.");
  }

  // Description runs up to the next known section marker or the end of the text.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}
|
||||
|
||||
// Entry point: scrapes every input URL that has no saved result yet (and, by
// default, no saved error), saving the output file after each product so the
// run can be interrupted and resumed.
(async () => {
  const allUrls = loadInputUrls();
  const data = loadOutput();

  const completedUrls = new Set(data.results.map(r => cleanUrl(r.product_url)).filter(Boolean));
  const errorUrls = new Set(data.errors.map(e => cleanUrl(e.product_url)).filter(Boolean));

  let remainingUrls = allUrls.filter(url => {
    if (completedUrls.has(url)) return false;
    if (!retryErrors && errorUrls.has(url)) return false;
    return true;
  });

  if (maxProducts > 0) {
    remainingUrls = remainingUrls.slice(0, maxProducts);
  }

  logLine(`Input URLs: ${allUrls.length}`);
  logLine(`Already scraped: ${completedUrls.size}`);
  logLine(`Existing errors: ${errorUrls.size}`);
  logLine(`Retry errors: ${retryErrors ? "yes" : "no"}`);
  logLine(`This run target count: ${remainingUrls.length}`);
  logLine(`Delay range: ${minDelaySeconds}-${maxDelaySeconds} seconds; page settle: ${pageSettleSeconds} seconds`);

  if (remainingUrls.length === 0) {
    logLine("Nothing to scrape. Done.");
    saveOutput(data);
    return;
  }

  const browser = await chromium.launch({
    headless: !headed
  });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    let processedThisRun = 0;

    for (const [index, url] of remainingUrls.entries()) {
      try {
        const row = await parseProduct(page, url);

        // If retrying an old error, keep the old error history but avoid duplicate successful result.
        if (!completedUrls.has(url)) {
          data.results.push(row);
          completedUrls.add(url);
        }

        processedThisRun++;
        saveOutput(data);

        logLine(`Saved result ${processedThisRun}/${remainingUrls.length}: ${row.sku || "(no sku)"} ${row.color_name || ""}`);
      } catch (err) {
        const errorRecord = {
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        };

        data.errors.push(errorRecord);
        saveOutput(data);

        logLine(`ERROR ${url}: ${err.message}`);
      }

      // The pause only exists to space out requests; skip it after the last one.
      if (index < remainingUrls.length - 1) {
        const delay = randomDelayMs();
        logLine(`Waiting ${(delay / 1000).toFixed(1)} seconds before next product...`);
        await sleep(delay);
      }
    }
  } finally {
    // Always release the browser, even when something unexpected was thrown.
    await browser.close().catch(closeErr => {
      console.warn(`Could not close browser cleanly: ${closeErr.message}`);
    });
  }

  logLine(`Done. Results: ${data.results.length}; Errors: ${data.errors.length}; Output: ${outputJson}`);
})().catch(err => {
  // Report fatal errors and signal failure to the calling script.
  console.error(`Scraper aborted: ${err.stack || err.message}`);
  process.exitCode = 1;
});
|
||||
'@
|
||||
|
||||
Set-Content -Path ".\prismatic-browser-scraper.js" -Value $js -Encoding UTF8
|
||||
}
|
||||
|
||||
# Script entry: make sure Playwright is installed, write the embedded Node
# scraper into the working directory, and run it with the options taken from
# this script's parameters. Exits with code 1 on any failure.
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeScraper

    Write-Host "Running resumable browser scraper..."

    $nodeArgs = @(
        ".\prismatic-browser-scraper.js",
        "--input-file=$InputFile",
        "--output-json=$OutputJson",
        "--progress-log=$ProgressLog",
        "--min-delay-seconds=$MinDelaySeconds",
        "--max-delay-seconds=$MaxDelaySeconds",
        "--page-settle-seconds=$PageSettleSeconds",
        "--max-products=$MaxProducts"
    )

    if ($Headed) {
        $nodeArgs += "--headed"
    }

    if ($RetryErrors) {
        $nodeArgs += "--retry-errors"
    }

    node @nodeArgs

    # Native commands do not raise PowerShell errors; surface a failed run explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "Browser scraper failed (node exited with code $LASTEXITCODE)."
    }
}
catch {
    # -ErrorAction Continue: with $ErrorActionPreference = "Stop" a plain
    # Write-Error would terminate here and the explicit exit code below would
    # never be reached.
    Write-Error $_.Exception.Message -ErrorAction Continue
    exit 1
}
|
||||
@@ -0,0 +1,410 @@
|
||||
# Get-Product-Info-Resumable.ps1
|
||||
#
|
||||
# Resumable, slow/polite Prismatic Powders product scraper.
|
||||
#
|
||||
# Inputs:
|
||||
# .\product-urls.txt
|
||||
#
|
||||
# Outputs:
|
||||
# .\prismatic_powders.json
|
||||
# .\prismatic-scrape-progress.log
|
||||
#
|
||||
# First-time setup:
|
||||
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
# .\Get-Product-Info-Resumable.ps1 -InstallPlaywright -Headed -MaxProducts 5
|
||||
#
|
||||
# Normal full run:
|
||||
# .\Get-Product-Info-Resumable.ps1
|
||||
#
|
||||
# Test first 25 remaining:
|
||||
# .\Get-Product-Info-Resumable.ps1 -MaxProducts 25 -Headed
|
||||
#
|
||||
# Retry failed URLs too:
|
||||
# .\Get-Product-Info-Resumable.ps1 -RetryErrors
|
||||
#
|
||||
# Slow it down more:
|
||||
# .\Get-Product-Info-Resumable.ps1 -MinDelaySeconds 12 -MaxDelaySeconds 25
|
||||
|
||||
param(
|
||||
[switch]$InstallPlaywright,
|
||||
[switch]$Headed,
|
||||
|
||||
[string]$InputFile = ".\product-urls.txt",
|
||||
[string]$OutputJson = ".\prismatic_powders.json",
|
||||
[string]$ProgressLog = ".\prismatic-scrape-progress.log",
|
||||
|
||||
[int]$MinDelaySeconds = 8,
|
||||
[int]$MaxDelaySeconds = 18,
|
||||
[int]$PageSettleSeconds = 4,
|
||||
|
||||
# 0 means no limit.
|
||||
[int]$MaxProducts = 0,
|
||||
|
||||
# By default, URLs in errors are skipped on resume.
|
||||
# Use -RetryErrors to try failed URLs again.
|
||||
[switch]$RetryErrors
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Verifies that both `node` and `npm` can be resolved as commands.
# Throws a descriptive message for the first tool that is missing
# (node is checked before npm).
function Ensure-NodeAvailable {
    $requiredTools = [ordered]@{
        node = "Node.js is required. Install Node.js LTS from https://nodejs.org/"
        npm  = "npm is required. It usually comes with Node.js."
    }

    foreach ($tool in $requiredTools.Keys) {
        $resolved = Get-Command $tool -ErrorAction SilentlyContinue
        if (-not $resolved) {
            throw $requiredTools[$tool]
        }
    }
}
|
||||
|
||||
# Installs the Playwright npm package and its Chromium browser into the current
# directory when explicitly requested or when .\node_modules\playwright is absent.
#
# Parameters:
#   -Requested  $true forces a (re)install even if the package folder exists.
#
# Throws when node/npm are unavailable or when any npm/npx step exits non-zero.
function Install-PlaywrightIfNeeded {
    param([bool]$Requested)

    Ensure-NodeAvailable

    if (-not $Requested -and (Test-Path ".\node_modules\playwright")) {
        return
    }

    Write-Host "Installing Playwright package locally..."

    # Only scaffold a package.json when there is none; an existing one is left alone.
    if (-not (Test-Path ".\package.json")) {
        npm init -y | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }
    }

    # $ErrorActionPreference does not apply to native commands, so the exit code
    # has to be checked explicitly after each one.
    npm install playwright | Out-Null
    if ($LASTEXITCODE -ne 0) {
        throw "npm install playwright failed with exit code $LASTEXITCODE."
    }

    Write-Host "Installing Playwright Chromium browser..."
    npx playwright install chromium
    if ($LASTEXITCODE -ne 0) {
        throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
    }
}
|
||||
|
||||
function Write-NodeScraper {
|
||||
$js = @'
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
// Bare flags.
const headed = process.argv.includes("--headed");
const retryErrors = process.argv.includes("--retry-errors");

/**
 * Returns the text after `--<name>=` from the first matching command-line
 * argument, or `defaultValue` when the option was not supplied.
 */
function getArgValue(name, defaultValue) {
  const prefix = `--${name}=`;
  const found = process.argv.find(x => x.startsWith(prefix));
  return found ? found.slice(prefix.length) : defaultValue;
}

/**
 * Reads an integer option. Falls back to `defaultValue` (with a warning) when
 * the supplied text is not an integer or is below `min`. Without this, a typo
 * such as `--min-delay-seconds=x` became NaN and the wait between requests
 * collapsed to nothing.
 */
function getIntArg(name, defaultValue, min = 0) {
  const raw = getArgValue(name, null);
  if (raw === null) return defaultValue;

  const parsed = Number.parseInt(raw, 10);
  if (Number.isInteger(parsed) && parsed >= min) return parsed;

  console.warn(`Ignoring invalid --${name}=${raw}; using ${defaultValue}.`);
  return defaultValue;
}

// File locations (relative paths resolve against the working directory).
const inputFile = getArgValue("input-file", "product-urls.txt");
const outputJson = getArgValue("output-json", "prismatic_powders.json");
const progressLog = getArgValue("progress-log", "prismatic-scrape-progress.log");

// Pacing, in whole seconds.
const minDelaySeconds = getIntArg("min-delay-seconds", 8);
const maxDelaySeconds = getIntArg("max-delay-seconds", 18);
const pageSettleSeconds = getIntArg("page-settle-seconds", 4);
// 0 means "no limit".
const maxProducts = getIntArg("max-products", 0);
|
||||
|
||||
/**
 * Collapses every run of whitespace in `text` to a single space and removes
 * leading/trailing whitespace. Falsy input yields "".
 */
function clean(text) {
  const source = text || "";
  const words = source.split(/\s+/).filter(Boolean);
  return words.join(" ");
}

/**
 * Normalizes a URL for comparison: drops everything from the first "?" and
 * the first "#", then trims surrounding whitespace. Falsy input yields "".
 */
function cleanUrl(url) {
  const raw = url || "";
  const [withoutQuery] = raw.split("?");
  const [withoutFragment] = withoutQuery.split("#");
  return withoutFragment.trim();
}

/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise(wake => {
    setTimeout(wake, ms);
  });
}

/**
 * Picks a random whole number of milliseconds between `minDelaySeconds` and
 * `maxDelaySeconds` (both inclusive). A negative minimum is treated as 0 and a
 * maximum below the minimum is raised to the minimum.
 */
function randomDelayMs() {
  const floorMs = Math.max(0, minDelaySeconds * 1000);
  const ceilingMs = Math.max(floorMs, maxDelaySeconds * 1000);
  const span = ceilingMs - floorMs + 1;
  return Math.floor(floorMs + Math.random() * span);
}

/**
 * Prints `message` prefixed with the current ISO-8601 timestamp and appends
 * the same line (CRLF-terminated) to the `progressLog` file.
 */
function logLine(message) {
  const stamp = new Date().toISOString();
  const line = `[${stamp}] ${message}`;

  console.log(line);
  fs.appendFileSync(progressLog, `${line}\r\n`, "utf8");
}

/**
 * Resolves `maybeUrl` against `baseUrl` and returns the absolute URL.
 * Returns "" for a falsy `maybeUrl`, and the input unchanged when it cannot be
 * resolved (for example because `baseUrl` is not a valid URL).
 */
function absoluteUrl(baseUrl, maybeUrl) {
  if (maybeUrl) {
    let resolved = maybeUrl;
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unresolvable: fall through with the raw value.
    }
    return resolved;
  }

  return "";
}
|
||||
|
||||
/**
 * Reads `inputFile` and returns the distinct product URLs it lists, in
 * first-seen order. Each line is normalized with `cleanUrl`; blank lines,
 * lines starting with "#", and lines that do not contain a
 * `/shop/powder-coating-colors/<slug>/` path are ignored.
 *
 * @throws Error when `inputFile` does not exist.
 */
function loadInputUrls() {
  if (!fs.existsSync(inputFile)) {
    throw new Error(`Input file not found: ${inputFile}`);
  }

  const productPattern = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  const seen = new Set();

  for (const rawLine of fs.readFileSync(inputFile, "utf8").split(/\r?\n/)) {
    const candidate = cleanUrl(rawLine);
    if (!candidate || candidate.startsWith("#")) continue;

    if (productPattern.test(candidate)) {
      seen.add(candidate);
    }
  }

  return [...seen];
}

/**
 * Loads previously saved scrape data from `outputJson`.
 *
 * Returns `{ results, errors }`, both arrays. A missing file yields two empty
 * arrays; a file holding a bare JSON array is treated as the results list; in
 * an object, a `results`/`errors` value that is not an array is replaced by [].
 *
 * @throws Error when the file exists but cannot be read as the shapes above;
 *         the file is first copied to `<outputJson>.invalid-<timestamp>.bak`.
 */
function loadOutput() {
  if (!fs.existsSync(outputJson)) {
    return { results: [], errors: [] };
  }

  const arrayOrEmpty = value => (Array.isArray(value) ? value : []);

  try {
    const parsed = JSON.parse(fs.readFileSync(outputJson, "utf8"));

    return Array.isArray(parsed)
      ? { results: parsed, errors: [] }
      : { results: arrayOrEmpty(parsed.results), errors: arrayOrEmpty(parsed.errors) };
  } catch (err) {
    const backup = `${outputJson}.invalid-${Date.now()}.bak`;
    fs.copyFileSync(outputJson, backup);
    throw new Error(`Could not parse existing ${outputJson}. Backed it up to ${backup}. Error: ${err.message}`);
  }
}

/**
 * Writes `data` to `outputJson` as pretty-printed JSON (2-space indent).
 * The content goes to `<outputJson>.tmp` first and is then renamed over the
 * target file.
 */
function saveOutput(data) {
  const serialized = JSON.stringify(data, null, 2);
  const stagingFile = `${outputJson}.tmp`;

  fs.writeFileSync(stagingFile, serialized, "utf8");
  fs.renameSync(stagingFile, outputJson);
}
|
||||
|
||||
/**
 * Extracts weight-based price tiers from page text.
 *
 * Recognizes "<min> - <max> lbs $<price>" (hyphen, en dash or em dash) and
 * "<min>+ lbs $<price>" (open-ended, `max` is null). Prices may contain
 * thousands separators ("$1,250.50"), which previously parsed as 1.
 *
 * @param plainText Page text; null/undefined is treated as empty.
 * @returns Array of `{ min, max, price }` in order of appearance.
 */
function parsePriceTiers(plainText) {
  const tierPattern =
    /(?:(\d+)\s*[-\u2013\u2014]\s*(\d+)|(\d+)\s*\+)\s*lbs\s*\$((?:\d[\d,]*(?:\.\d+)?|\.\d+))/gi;

  return [...(plainText || "").matchAll(tierPattern)].map(match => {
    const [, rangeMin, rangeMax, openMin, priceText] = match;
    const isOpenEnded = openMin !== undefined;

    return {
      min: parseInt(isOpenEnded ? openMin : rangeMin, 10),
      max: isOpenEnded ? null : parseInt(rangeMax, 10),
      price: parseFloat(priceText.replace(/,/g, ""))
    };
  });
}
|
||||
|
||||
/**
 * Finds the first anchor on the page whose visible text matches any of
 * `patterns` and returns its link target as an absolute URL.
 *
 * Anchors without an `href` are skipped, so a script-driven placeholder with
 * matching text no longer hides a real link that follows it.
 *
 * @param page     Playwright page.
 * @param patterns Regular-expression source strings, matched case-insensitively.
 * @returns Absolute URL of the first usable match, or "" when none is found.
 */
async function getLinkByText(page, patterns) {
  // Compile once rather than once per link and pattern.
  const matchers = patterns.map(p => new RegExp(p, "i"));

  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map(a => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  const hit = links.find(link => link.href && matchers.some(re => re.test(link.text)));
  return hit ? absoluteUrl(page.url(), hit.href) : "";
}
|
||||
|
||||
/**
 * Chooses a sample image URL from the page's <img> elements.
 *
 * For each image the first non-empty of `currentSrc`, `src`, the `src`
 * attribute and the `data-src` attribute is used. Preference order:
 *   1. an images.nicindustries.com URL that does not mention "thumbnail",
 *   2. any images.nicindustries.com URL,
 *   3. any URL containing "prismatic", "powder" or "color".
 * Returns "" when nothing qualifies.
 */
async function getSampleImageUrl(page) {
  const sources = await page.locator("img").evaluateAll((imgs) => {
    const found = [];
    for (const img of imgs) {
      const src =
        img.currentSrc ||
        img.src ||
        img.getAttribute("src") ||
        img.getAttribute("data-src") ||
        "";
      if (src) found.push(src);
    }
    return found;
  });

  const preferences = [
    src => /images\.nicindustries\.com/i.test(src) && !/thumbnail/i.test(src),
    src => /images\.nicindustries\.com/i.test(src),
    src => /prismatic|powder|color/i.test(src)
  ];

  for (const accepts of preferences) {
    const match = sources.find(src => accepts(src));
    if (match) return match;
  }

  return "";
}
|
||||
|
||||
/**
 * Load one product page and extract its details.
 *
 * @param {object} page - Playwright page used for navigation and DOM queries.
 * @param {string} url - Product page URL.
 * @returns {Promise<object>} Row with sku, color_name, description,
 *   price_tiers, document and image URLs, product_url and an ISO scraped_at
 *   timestamp.
 * @throws {Error} On a 403 or 404 response/page, or when neither a SKU nor a
 *   title could be found.
 */
async function parseProduct(page, url) {
  logLine(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Fixed settle delay before the page content is read.
  await page.waitForTimeout(pageSettleSeconds * 1000);

  // A missing response object is reported as status 0.
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  logLine(`HTTP status ${status}; title "${pageTitle}"`);

  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Match "404" only as a standalone number so a title that merely contains
  // those digits (for example an item code such as "PSS-4040") is not
  // mistaken for an error page.
  if (status === 404 || /\b404\b|Page Not Found/i.test(pageTitle)) {
    throw new Error("404 Not Found returned by site.");
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  // A page with neither value is not treated as a product.
  if (!sku && !title) {
    throw new Error("Could not find SKU or title on product page.");
  }

  // The description runs until the first known following section, or to the
  // end of the text.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}
|
||||
|
||||
/**
 * Entry point: work out which input URLs still need scraping, scrape them
 * one by one, and save the output file after every product (success or
 * failure) so an interrupted run keeps its progress.
 */
(async () => {
  const allUrls = loadInputUrls();
  const data = loadOutput();

  const completedUrls = new Set(data.results.map(r => cleanUrl(r.product_url)).filter(Boolean));
  const errorUrls = new Set(data.errors.map(e => cleanUrl(e.product_url)).filter(Boolean));

  // Previously failed URLs are only retried when --retry-errors was given.
  let remainingUrls = allUrls.filter(url => {
    if (completedUrls.has(url)) return false;
    if (!retryErrors && errorUrls.has(url)) return false;
    return true;
  });

  if (maxProducts > 0) {
    remainingUrls = remainingUrls.slice(0, maxProducts);
  }

  logLine(`Input URLs: ${allUrls.length}`);
  logLine(`Already scraped: ${completedUrls.size}`);
  logLine(`Existing errors: ${errorUrls.size}`);
  logLine(`Retry errors: ${retryErrors ? "yes" : "no"}`);
  logLine(`This run target count: ${remainingUrls.length}`);
  logLine(`Delay range: ${minDelaySeconds}-${maxDelaySeconds} seconds; page settle: ${pageSettleSeconds} seconds`);

  if (remainingUrls.length === 0) {
    logLine("Nothing to scrape. Done.");
    saveOutput(data);
    return;
  }

  const browser = await chromium.launch({
    headless: !headed
  });

  // Close the browser even when something outside the per-product try/catch
  // fails (for example saving the output file).
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    let processedThisRun = 0;

    for (const [index, url] of remainingUrls.entries()) {
      try {
        const row = await parseProduct(page, url);

        // If retrying an old error, keep the old error history but avoid duplicate successful result.
        if (!completedUrls.has(url)) {
          data.results.push(row);
          completedUrls.add(url);
        }

        processedThisRun++;
        saveOutput(data);

        logLine(`Saved result ${processedThisRun}/${remainingUrls.length}: ${row.sku || "(no sku)"} ${row.color_name || ""}`);
      } catch (err) {
        const errorRecord = {
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        };

        data.errors.push(errorRecord);
        saveOutput(data);

        logLine(`ERROR ${url}: ${err.message}`);
      }

      // The delay only separates requests; nothing follows the last product.
      if (index === remainingUrls.length - 1) break;

      const delay = randomDelayMs();
      logLine(`Waiting ${(delay / 1000).toFixed(1)} seconds before next product...`);
      await sleep(delay);
    }
  } finally {
    await browser.close();
  }

  logLine(`Done. Results: ${data.results.length}; Errors: ${data.errors.length}; Output: ${outputJson}`);
})().catch((err) => {
  // Report fatal errors explicitly and signal failure to the calling script.
  console.error(err);
  process.exitCode = 1;
});
|
||||
'@
|
||||
|
||||
Set-Content -Path ".\prismatic-browser-scraper.js" -Value $js -Encoding UTF8
|
||||
}
|
||||
|
||||
# Entry point: make sure Playwright is installed, regenerate the Node scraper
# script, then run it with the options given to this script.
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeScraper

    Write-Host "Running resumable browser scraper..."

    $nodeArgs = @(
        ".\prismatic-browser-scraper.js",
        "--input-file=$InputFile",
        "--output-json=$OutputJson",
        "--progress-log=$ProgressLog",
        "--min-delay-seconds=$MinDelaySeconds",
        "--max-delay-seconds=$MaxDelaySeconds",
        "--page-settle-seconds=$PageSettleSeconds",
        "--max-products=$MaxProducts"
    )

    if ($Headed) {
        $nodeArgs += "--headed"
    }

    if ($RetryErrors) {
        $nodeArgs += "--retry-errors"
    }

    node @nodeArgs

    # A non-zero exit code from a native command does not raise an error by
    # itself, so a failed scraper run has to be surfaced explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "Node scraper exited with code $LASTEXITCODE."
    }
}
catch {
    Write-Error $_.Exception.Message
    exit 1
}
|
||||
@@ -0,0 +1,265 @@
|
||||
# Crawl and Index Prismatic Colors - Known-Good Style JSON.ps1
|
||||
#
|
||||
# Rollback to the earlier working browser pattern:
|
||||
# - Playwright Chromium
|
||||
# - Full Chrome-style User-Agent
|
||||
# - JSON output
|
||||
# - Structured price tiers
|
||||
# - Color matches from #collection-list (NOTE: not implemented in this version; nothing below reads #collection-list)
|
||||
#
|
||||
# First-time setup:
|
||||
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -InstallPlaywright
|
||||
#
|
||||
# Normal run:
|
||||
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1
|
||||
#
|
||||
# Watch browser:
|
||||
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -Headed
|
||||
|
||||
param(
|
||||
[switch]$InstallPlaywright,
|
||||
[switch]$Headed
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Verifies that both `node` and `npm` resolve as commands and throws a
# descriptive error for the first one that does not.
function Ensure-NodeAvailable {
    $nodeCommand = Get-Command node -ErrorAction SilentlyContinue
    if ($null -eq $nodeCommand) {
        throw "Node.js is required. Install Node.js LTS from https://nodejs.org/"
    }

    $npmCommand = Get-Command npm -ErrorAction SilentlyContinue
    if ($null -eq $npmCommand) {
        throw "npm is required. It usually comes with Node.js."
    }
}
|
||||
|
||||
# Installs the Playwright npm package and its Chromium browser into the
# current directory when explicitly requested, or when
# .\node_modules\playwright does not exist yet.
#
# Parameters:
#   Requested - $true forces the installation even if Playwright is present.
#
# Throws when Node.js/npm are missing or when any npm/npx step fails.
function Install-PlaywrightIfNeeded {
    param([bool]$Requested)

    Ensure-NodeAvailable

    if (-not $Requested -and (Test-Path ".\node_modules\playwright")) {
        return
    }

    Write-Host "Installing Playwright package locally..."

    # Leave an existing package.json alone; only scaffold one when absent.
    if (-not (Test-Path ".\package.json")) {
        npm init -y | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }
    }

    # Native commands do not raise errors on a non-zero exit code, so each
    # step is checked explicitly instead of continuing with a broken install.
    npm install playwright | Out-Null
    if ($LASTEXITCODE -ne 0) {
        throw "npm install playwright failed with exit code $LASTEXITCODE."
    }

    Write-Host "Installing Playwright Chromium browser..."
    npx playwright install chromium
    if ($LASTEXITCODE -ne 0) {
        throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
    }
}
|
||||
|
||||
function Write-NodeScraper {
|
||||
# Single-quoted here-string prevents PowerShell from interpreting JavaScript regex/template strings.
|
||||
$js = @'
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
const headed = process.argv.includes("--headed");
|
||||
|
||||
const productUrls = [
|
||||
"https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black"
|
||||
];
|
||||
|
||||
const outputJson = "prismatic_powders.json";
|
||||
|
||||
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param {string} text - Text to normalize; falsy input yields "".
 * @returns {string} The normalized text.
 */
function clean(text) {
  const raw = text ? text : "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Resolve a possibly relative URL against a base URL.
 *
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @param {string} maybeUrl - Absolute or relative URL; falsy input yields "".
 * @returns {string} The resolved absolute URL, or `maybeUrl` unchanged when
 *   it cannot be resolved.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  let resolved = "";

  if (maybeUrl) {
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unresolvable input is handed back untouched rather than dropped.
      resolved = maybeUrl;
    }
  }

  return resolved;
}
|
||||
|
||||
/**
 * Normalize each item with clean() and drop empties and duplicates.
 *
 * @param {Array<string>} items - Raw values; falsy entries are ignored.
 * @returns {string[]} Distinct normalized values in first-seen order.
 */
function unique(items) {
  const seen = new Set();

  for (const item of items) {
    if (!item) continue;

    const normalized = clean(item);
    if (normalized) seen.add(normalized);
  }

  return [...seen];
}
|
||||
|
||||
/**
 * Find the first anchor on the page whose visible text matches any pattern
 * and that actually carries an href.
 *
 * @param {object} page - Playwright page to query.
 * @param {string[]} patterns - Regular-expression sources, matched
 *   case-insensitively against each anchor's whitespace-normalized text.
 * @returns {Promise<string>} The anchor's href resolved against the current
 *   page URL via absoluteUrl(), or "" when nothing matches.
 */
async function getLinkByText(page, patterns) {
  // The callback is evaluated in the page, so it must stay self-contained.
  const anchors = await page.locator("a").evaluateAll((nodes) =>
    nodes.map((node) => ({
      text: (node.innerText || node.textContent || "").replace(/\s+/g, " ").trim(),
      href: node.getAttribute("href") || ""
    }))
  );

  // Compile each pattern once instead of once per anchor.
  const matchers = patterns.map((source) => new RegExp(source, "i"));

  // An anchor without an href cannot yield a URL; skip it so a later anchor
  // with the same text can still be used.
  const hit = anchors.find(
    (anchor) => anchor.href && matchers.some((matcher) => matcher.test(anchor.text))
  );

  return hit ? absoluteUrl(page.url(), hit.href) : "";
}
|
||||
|
||||
/**
 * Extract weight-based price tiers from the visible text of a product page.
 *
 * Recognizes "<min> - <max> lbs $<price>" and "<min>+ lbs $<price>"
 * (case-insensitive, flexible whitespace). Prices may carry thousands
 * separators ("$1,234.50"); they are stripped before parsing so such a
 * price is not truncated at the first comma.
 *
 * @param {string} plainText - Page text; null/undefined is treated as empty.
 * @returns {{min: number|null, max: number|null, price: number}[]} One entry
 *   per tier in page order; `max` is null for open-ended "N+ lbs" tiers.
 */
function parsePriceTiers(plainText) {
  // Groups: 1/2 = closed range bounds, 3 = open-ended minimum, 4 = price text.
  // The price accepts properly grouped thousands first and otherwise falls
  // back to plain digits and dots.
  const tierPattern =
    /(?:(\d+)\s*-\s*(\d+)|(\d+)\s*\+)\s*lbs\s*\$(\d{1,3}(?:,\d{3})+(?:\.\d+)?|[\d.]+)/gi;

  return [...String(plainText || "").matchAll(tierPattern)].map((match) => {
    const [, rangeMin, rangeMax, openMin, priceText] = match;
    const price = parseFloat(priceText.replace(/,/g, ""));

    if (openMin !== undefined) {
      return { min: parseInt(openMin, 10), max: null, price };
    }

    return { min: parseInt(rangeMin, 10), max: parseInt(rangeMax, 10), price };
  });
}
|
||||
|
||||
/**
 * Choose the most likely product sample image on the page.
 *
 * Preference order: an images.nicindustries.com URL that is not a thumbnail,
 * then any images.nicindustries.com URL, then any URL mentioning
 * "prismatic", "powder" or "color".
 *
 * @param {object} page - Playwright page to query.
 * @returns {Promise<string>} The chosen image URL, or "" when none qualifies.
 */
async function getSampleImageUrl(page) {
  // Evaluated in the page, so the callback cannot use anything from Node scope.
  const sources = await page.locator("img").evaluateAll((elements) => {
    const found = [];
    for (const el of elements) {
      const src =
        el.currentSrc || el.src || el.getAttribute("src") || el.getAttribute("data-src") || "";
      if (src) found.push(src);
    }
    return found;
  });

  const onImageHost = (src) => /images\.nicindustries\.com/i.test(src);
  const preferences = [
    (src) => onImageHost(src) && !/thumbnail/i.test(src),
    (src) => onImageHost(src),
    (src) => /prismatic|powder|color/i.test(src)
  ];

  for (const accepts of preferences) {
    const match = sources.find(accepts);
    if (match) return match;
  }

  return "";
}
|
||||
|
||||
/**
 * Load one product page and extract its details.
 *
 * @param {object} page - Playwright page used for navigation and DOM queries.
 * @param {string} url - Product page URL.
 * @returns {Promise<object>} Row with sku, color_name, description,
 *   price_tiers, document and image URLs, product_url and an ISO scraped_at
 *   timestamp.
 * @throws {Error} On a 403 or 404 response/page, or when neither a SKU nor a
 *   title could be found.
 */
async function parseProduct(page, url) {
  console.log(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Fixed settle delay before the page content is read.
  await page.waitForTimeout(3000);

  // A missing response object is reported as status 0.
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  console.log(`HTTP status: ${status}`);
  console.log(`Page title: ${pageTitle}`);

  // Do not silently output a fake product if blocked.
  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Same reasoning for missing pages. "404" must stand alone so item codes
  // that merely contain those digits do not trigger it.
  if (status === 404 || /\b404\b|Page Not Found/i.test(pageTitle)) {
    throw new Error("404 Not Found returned by site.");
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  // A page with neither value is reported as an error instead of being
  // written out as an empty product row.
  if (!sku && !title) {
    throw new Error("Could not find SKU or title on product page.");
  }

  // The description runs until the first known following section, or to the
  // end of the text.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}
|
||||
|
||||
/**
 * Entry point: scrape every URL in `productUrls` with one browser page and
 * write the collected results and errors to `outputJson`.
 */
(async () => {
  const browser = await chromium.launch({
    headless: !headed
  });

  const results = [];
  const errors = [];

  // Close the browser even if context/page setup or the loop fails.
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    for (const [index, url] of productUrls.entries()) {
      try {
        results.push(await parseProduct(page, url));
      } catch (err) {
        console.warn(`Failed ${url}: ${err.message}`);
        errors.push({
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        });
      }

      // Pause between products (after failures too); nothing follows the
      // last one, so no pause is needed there.
      if (index < productUrls.length - 1) {
        await page.waitForTimeout(3000);
      }
    }
  } finally {
    await browser.close();
  }

  // If you prefer only the array, change this to JSON.stringify(results, null, 2)
  const output = {
    results,
    errors
  };

  fs.writeFileSync(outputJson, JSON.stringify(output, null, 2), "utf8");

  console.log(`Done. Output: ${outputJson}`);
})().catch((err) => {
  // Report fatal errors explicitly and signal failure to the calling script.
  console.error(err);
  process.exitCode = 1;
});
|
||||
'@
|
||||
|
||||
Set-Content -Path ".\prismatic-browser-scraper.js" -Value $js -Encoding UTF8
|
||||
}
|
||||
|
||||
# Entry point: make sure Playwright is installed, regenerate the Node scraper
# script, then run it (with --headed when -Headed was given).
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeScraper

    Write-Host "Running browser scraper..."

    $nodeArgs = @(".\prismatic-browser-scraper.js")
    if ($Headed) {
        $nodeArgs += "--headed"
    }

    node @nodeArgs

    # A non-zero exit code from a native command does not raise an error by
    # itself, so a failed scraper run has to be surfaced explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "Node scraper exited with code $LASTEXITCODE."
    }
}
catch {
    Write-Error $_.Exception.Message
    exit 1
}
|
||||
@@ -0,0 +1,319 @@
|
||||
# Discover-Prismatic-Product-Urls-By-ColorParam.ps1
|
||||
#
|
||||
# Discovers Prismatic Powders product URLs by visiting color filter URLs like:
|
||||
# https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_red
|
||||
#
|
||||
# Outputs:
|
||||
# .\product-urls.txt
|
||||
# .\color-discovery-log.json
|
||||
#
|
||||
# First-time setup:
|
||||
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
# .\Discover-Prismatic-Product-Urls-By-ColorParam.ps1 -InstallPlaywright -Headed
|
||||
#
|
||||
# Normal run:
|
||||
# .\Discover-Prismatic-Product-Urls-By-ColorParam.ps1
|
||||
#
|
||||
# Watch browser:
|
||||
# .\Discover-Prismatic-Product-Urls-By-ColorParam.ps1 -Headed
|
||||
|
||||
param(
|
||||
[switch]$InstallPlaywright,
|
||||
[switch]$Headed,
|
||||
[int]$MaxScrollsPerColor = 180,
|
||||
[int]$StopAfterNoNewScrolls = 10
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Verifies that both `node` and `npm` resolve as commands and throws a
# descriptive error for the first one that does not.
function Ensure-NodeAvailable {
    $nodeCommand = Get-Command node -ErrorAction SilentlyContinue
    if ($null -eq $nodeCommand) {
        throw "Node.js is required. Install Node.js LTS from https://nodejs.org/"
    }

    $npmCommand = Get-Command npm -ErrorAction SilentlyContinue
    if ($null -eq $npmCommand) {
        throw "npm is required. It usually comes with Node.js."
    }
}
|
||||
|
||||
# Installs the Playwright npm package and its Chromium browser into the
# current directory when explicitly requested, or when
# .\node_modules\playwright does not exist yet.
#
# Parameters:
#   Requested - $true forces the installation even if Playwright is present.
#
# Throws when Node.js/npm are missing or when any npm/npx step fails.
function Install-PlaywrightIfNeeded {
    param([bool]$Requested)

    Ensure-NodeAvailable

    if (-not $Requested -and (Test-Path ".\node_modules\playwright")) {
        return
    }

    Write-Host "Installing Playwright package locally..."

    # Leave an existing package.json alone; only scaffold one when absent.
    if (-not (Test-Path ".\package.json")) {
        npm init -y | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }
    }

    # Native commands do not raise errors on a non-zero exit code, so each
    # step is checked explicitly instead of continuing with a broken install.
    npm install playwright | Out-Null
    if ($LASTEXITCODE -ne 0) {
        throw "npm install playwright failed with exit code $LASTEXITCODE."
    }

    Write-Host "Installing Playwright Chromium browser..."
    npx playwright install chromium
    if ($LASTEXITCODE -ne 0) {
        throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
    }
}
|
||||
|
||||
function Write-NodeDiscoveryScript {
|
||||
$js = @'
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
const headed = process.argv.includes("--headed");
|
||||
|
||||
/**
 * Read the value of a `--name=value` command-line argument.
 *
 * @param {string} name - Argument name without the leading dashes.
 * @param {string} defaultValue - Returned when the argument is absent.
 * @returns {string} Text after "=" of the first matching argument (possibly
 *   empty), or `defaultValue`.
 */
function getArgValue(name, defaultValue) {
  const flag = `--${name}=`;

  for (const arg of process.argv) {
    if (arg.startsWith(flag)) {
      return arg.slice(flag.length);
    }
  }

  return defaultValue;
}
|
||||
|
||||
const maxScrollsPerColor = parseInt(getArgValue("max-scrolls-per-color", "180"), 10);
|
||||
const stopAfterNoNewScrolls = parseInt(getArgValue("stop-after-no-new-scrolls", "10"), 10);
|
||||
|
||||
const baseUrl = "https://www.prismaticpowders.com/shop/powder-coating-colors";
|
||||
const outputFile = "product-urls.txt";
|
||||
const logFile = "color-discovery-log.json";
|
||||
|
||||
// Update this list if you find more color params in the site HTML.
|
||||
const colorParams = [
|
||||
"pris_black",
|
||||
"pris_blue",
|
||||
"pris_bronze",
|
||||
"pris_brown",
|
||||
"pris_clear",
|
||||
"pris_copper",
|
||||
"pris_gold",
|
||||
"pris_gray",
|
||||
"pris_green",
|
||||
"pris_orange",
|
||||
"pris_pink",
|
||||
"pris_purple",
|
||||
"pris_red",
|
||||
"pris_silver",
|
||||
"pris_tan",
|
||||
"pris_white",
|
||||
"pris_yellow"
|
||||
];
|
||||
|
||||
/**
 * Strip the query string and fragment from a URL and trim whitespace.
 *
 * @param {string} url - URL text; falsy input yields "".
 * @returns {string} Everything before the first "?" and, within that,
 *   before the first "#", trimmed.
 */
function cleanUrl(url) {
  const text = url || "";

  const queryAt = text.indexOf("?");
  const withoutQuery = queryAt === -1 ? text : text.slice(0, queryAt);

  const hashAt = withoutQuery.indexOf("#");
  const withoutHash = hashAt === -1 ? withoutQuery : withoutQuery.slice(0, hashAt);

  return withoutHash.trim();
}
|
||||
|
||||
/**
 * Tell whether a URL contains a product path of the form
 * "/shop/powder-coating-colors/<code>/", where <code> consists of letters,
 * digits and dashes (case-insensitive).
 *
 * @param {string} url - URL to test; falsy input is treated as "".
 * @returns {boolean} True when the URL contains such a path.
 */
function isProductUrl(url) {
  const productPath = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  const candidate = url || "";
  return productPath.test(candidate);
}
|
||||
|
||||
/**
 * Load the URLs already stored in `outputFile`.
 *
 * @returns {string[]} One cleanUrl()-normalized entry per non-empty line, in
 *   file order; an empty array when the file does not exist.
 */
function readExistingUrls() {
  const urls = [];

  if (fs.existsSync(outputFile)) {
    const lines = fs.readFileSync(outputFile, "utf8").split(/\r?\n/);

    for (const line of lines) {
      const url = cleanUrl(line);
      if (url) urls.push(url);
    }
  }

  return urls;
}
|
||||
|
||||
/**
 * Overwrite `outputFile` with the given URLs, sorted, one per CRLF-terminated
 * line.
 *
 * @param {Iterable<string>} urls - URLs to store; the argument itself is not
 *   reordered because a copy is sorted.
 */
function writeUrls(urls) {
  const lines = Array.from(urls);
  lines.sort();

  const body = `${lines.join("\r\n")}\r\n`;
  fs.writeFileSync(outputFile, body, "utf8");
}
|
||||
|
||||
/**
 * Load the discovery log from `logFile`.
 *
 * Always returns an object with a plain-object `completed_colors` and an
 * array `runs`, which the code using the log relies on. A missing file, an
 * unparsable file, or JSON of the wrong shape yields empty defaults for the
 * affected parts; an unparsable file is reported with a warning instead of
 * being discarded silently.
 *
 * @returns {{completed_colors: object, runs: Array}} The log.
 */
function readLog() {
  const emptyLog = () => ({ completed_colors: {}, runs: [] });
  const isPlainObject = (value) =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  if (!fs.existsSync(logFile)) {
    return emptyLog();
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(logFile, "utf8"));
  } catch (err) {
    // Still best effort (start over), but say so: the next log write will
    // replace the unreadable file.
    console.warn(`Could not parse ${logFile}; starting with an empty log. Error: ${err.message}`);
    return emptyLog();
  }

  if (!isPlainObject(parsed)) {
    return emptyLog();
  }

  // Keep any extra fields, but guarantee the two the script depends on.
  return {
    ...parsed,
    completed_colors: isPlainObject(parsed.completed_colors) ? parsed.completed_colors : {},
    runs: Array.isArray(parsed.runs) ? parsed.runs : []
  };
}
|
||||
|
||||
/**
 * Overwrite `logFile` with the log as pretty-printed JSON (2-space indent).
 *
 * @param {object} log - Discovery log to store.
 */
function writeLog(log) {
  const serialized = JSON.stringify(log, null, 2);
  fs.writeFileSync(logFile, serialized, "utf8");
}
|
||||
|
||||
/**
 * Collect the product-page links currently present on the page.
 *
 * @param {object} page - Playwright page to query.
 * @returns {Promise<string[]>} cleanUrl()-normalized hrefs of all anchors
 *   whose href contains a "/shop/powder-coating-colors/<code>/" path, in
 *   document order (duplicates are not removed here).
 */
async function collectProductLinks(page) {
  // Evaluated in the page, so the pattern has to live inside the callback.
  const hrefs = await page.locator("a").evaluateAll((nodes) => {
    const productPath = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
    const matches = [];

    for (const node of nodes) {
      const href = node.href;
      if (href && productPath.test(href)) matches.push(href);
    }

    return matches;
  });

  const cleaned = [];
  for (const href of hrefs) {
    const url = cleanUrl(href);
    if (url) cleaned.push(url);
  }

  return cleaned;
}
|
||||
|
||||
/**
 * Repeatedly collect product links from the page and scroll down, adding
 * every link to `urls` and saving the URL file after each pass.
 *
 * Stops after `maxScrollsPerColor` passes, or earlier once
 * `stopAfterNoNewScrolls` consecutive passes added nothing new.
 *
 * @param {object} page - Playwright page showing a listing.
 * @param {Set<string>} urls - Accumulated URLs; mutated in place.
 * @param {string} label - Prefix used in the progress log lines.
 * @returns {Promise<number>} How many URLs were added to `urls` by this call.
 */
async function scrollAndCollect(page, urls, label) {
  let idleStreak = 0;
  let gainedTotal = 0;
  let pass = 0;

  while (pass < maxScrollsPerColor) {
    pass += 1;

    const sizeBefore = urls.size;
    const links = await collectProductLinks(page);
    links.forEach((link) => urls.add(link));

    const sizeAfter = urls.size;
    const gained = sizeAfter - sizeBefore;
    gainedTotal += gained;

    idleStreak = gained === 0 ? idleStreak + 1 : 0;

    writeUrls(urls);

    console.log(`[${label}] Scroll ${pass}/${maxScrollsPerColor}: +${gained}, total ${sizeAfter}, no-new ${idleStreak}`);

    if (idleStreak >= stopAfterNoNewScrolls) {
      break;
    }

    // Scroll further down and give the page a moment before the next pass.
    await page.mouse.wheel(0, 2500);
    await page.waitForTimeout(1500);
  }

  return gainedTotal;
}
|
||||
|
||||
/**
 * Entry point: visit every color filter that is not yet marked complete in
 * the log, scroll its listing to collect product URLs, and persist the URL
 * list and the discovery log as it goes.
 */
(async () => {
  const existingUrls = readExistingUrls();
  const urls = new Set(existingUrls);
  const log = readLog();

  console.log(`Existing URLs in ${outputFile}: ${existingUrls.length}`);

  const browser = await chromium.launch({ headless: !headed });

  // Close the browser even if something outside the per-color try/catch fails.
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    const runRecord = {
      started_at: new Date().toISOString(),
      existing_at_start: existingUrls.length,
      colors_attempted: []
    };

    for (const color of colorParams) {
      if (log.completed_colors[color]) {
        console.log(`Skipping completed color: ${color}`);
        continue;
      }

      const url = `${baseUrl}?color=${encodeURIComponent(color)}`;
      console.log("");
      console.log(`Opening color filter: ${color}`);
      console.log(url);

      try {
        const response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000
        });

        const status = response ? response.status() : "unknown";
        console.log(`HTTP status: ${status}`);

        // An error response (for example a 403 block page) must not be
        // recorded as a completed color, otherwise every later run would
        // skip it. Treat it as a failure so the color is retried next time.
        if (typeof status === "number" && status >= 400) {
          throw new Error(`HTTP ${status} returned for color filter.`);
        }

        await page.waitForTimeout(5000);

        const before = urls.size;
        const addedDuringScroll = await scrollAndCollect(page, urls, color);
        const after = urls.size;
        const netAdded = after - before;

        log.completed_colors[color] = {
          url,
          http_status: status,
          added: netAdded,
          added_during_scroll: addedDuringScroll,
          total_after: after,
          completed_at: new Date().toISOString()
        };

        runRecord.colors_attempted.push({
          color,
          url,
          http_status: status,
          added: netAdded,
          total_after: after
        });

        writeLog(log);
        writeUrls(urls);

        console.log(`Color complete: ${color}; added ${netAdded}; total ${after}`);

        // Polite pause between filters.
        await page.waitForTimeout(3000);
      } catch (err) {
        console.log(`Color failed: ${color}; ${err.message}`);

        runRecord.colors_attempted.push({
          color,
          url,
          added: 0,
          error: err.message
        });

        writeLog(log);
      }
    }

    runRecord.finished_at = new Date().toISOString();
    runRecord.final_total = urls.size;
    runRecord.new_this_run = urls.size - existingUrls.length;

    log.runs.push(runRecord);
    writeLog(log);
    writeUrls(urls);

    console.log("");
    console.log("Color-param discovery complete.");
    console.log(`Existing at start: ${existingUrls.length}`);
    console.log(`Final total: ${urls.size}`);
    console.log(`New this run: ${urls.size - existingUrls.length}`);
    console.log(`Output: ${outputFile}`);
    console.log(`Log: ${logFile}`);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Report fatal errors explicitly and signal failure to the calling script.
  console.error(err);
  process.exitCode = 1;
});
|
||||
'@
|
||||
|
||||
Set-Content -Path ".\discover-prismatic-by-color-param.js" -Value $js -Encoding UTF8
|
||||
}
|
||||
|
||||
# Entry point: make sure Playwright is installed, regenerate the Node
# discovery script, then run it with the options given to this script.
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeDiscoveryScript

    Write-Host "Running color-param URL discovery..."

    $nodeArgs = @(
        ".\discover-prismatic-by-color-param.js",
        "--max-scrolls-per-color=$MaxScrollsPerColor",
        "--stop-after-no-new-scrolls=$StopAfterNoNewScrolls"
    )

    if ($Headed) {
        $nodeArgs += "--headed"
    }

    node @nodeArgs

    # A non-zero exit code from a native command does not raise an error by
    # itself, so a failed discovery run has to be surfaced explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "Node discovery script exited with code $LASTEXITCODE."
    }
}
catch {
    Write-Error $_.Exception.Message
    exit 1
}
|
||||
@@ -0,0 +1,265 @@
|
||||
# Crawl and Index Prismatic Colors - Known-Good Style JSON.ps1
|
||||
#
|
||||
# Rollback to the earlier working browser pattern:
|
||||
# - Playwright Chromium
|
||||
# - Full Chrome-style User-Agent
|
||||
# - JSON output
|
||||
# - Structured price tiers
|
||||
# - Color matches from #collection-list (NOTE: not implemented in this version; nothing below reads #collection-list)
|
||||
#
|
||||
# First-time setup:
|
||||
# Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
|
||||
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -InstallPlaywright
|
||||
#
|
||||
# Normal run:
|
||||
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1
|
||||
#
|
||||
# Watch browser:
|
||||
# .\Crawl-and-Index-Prismatic-colors-known-good-json.ps1 -Headed
|
||||
|
||||
param(
|
||||
[switch]$InstallPlaywright,
|
||||
[switch]$Headed
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Verifies that both `node` and `npm` resolve as commands and throws a
# descriptive error for the first one that does not.
function Ensure-NodeAvailable {
    $nodeCommand = Get-Command node -ErrorAction SilentlyContinue
    if ($null -eq $nodeCommand) {
        throw "Node.js is required. Install Node.js LTS from https://nodejs.org/"
    }

    $npmCommand = Get-Command npm -ErrorAction SilentlyContinue
    if ($null -eq $npmCommand) {
        throw "npm is required. It usually comes with Node.js."
    }
}
|
||||
|
||||
# Installs the Playwright npm package and its Chromium browser into the
# current directory when explicitly requested, or when
# .\node_modules\playwright does not exist yet.
#
# Parameters:
#   Requested - $true forces the installation even if Playwright is present.
#
# Throws when Node.js/npm are missing or when any npm/npx step fails.
function Install-PlaywrightIfNeeded {
    param([bool]$Requested)

    Ensure-NodeAvailable

    if (-not $Requested -and (Test-Path ".\node_modules\playwright")) {
        return
    }

    Write-Host "Installing Playwright package locally..."

    # Leave an existing package.json alone; only scaffold one when absent.
    if (-not (Test-Path ".\package.json")) {
        npm init -y | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "npm init failed with exit code $LASTEXITCODE."
        }
    }

    # Native commands do not raise errors on a non-zero exit code, so each
    # step is checked explicitly instead of continuing with a broken install.
    npm install playwright | Out-Null
    if ($LASTEXITCODE -ne 0) {
        throw "npm install playwright failed with exit code $LASTEXITCODE."
    }

    Write-Host "Installing Playwright Chromium browser..."
    npx playwright install chromium
    if ($LASTEXITCODE -ne 0) {
        throw "npx playwright install chromium failed with exit code $LASTEXITCODE."
    }
}
|
||||
|
||||
function Write-NodeScraper {
|
||||
# Single-quoted here-string prevents PowerShell from interpreting JavaScript regex/template strings.
|
||||
$js = @'
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
const headed = process.argv.includes("--headed");
|
||||
|
||||
const productUrls = [
|
||||
"https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black"
|
||||
];
|
||||
|
||||
const outputJson = "prismatic_powders.json";
|
||||
|
||||
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param {string} text - Text to normalize; falsy input yields "".
 * @returns {string} The normalized text.
 */
function clean(text) {
  const raw = text ? text : "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Resolve a possibly relative URL against a base URL.
 *
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @param {string} maybeUrl - Absolute or relative URL; falsy input yields "".
 * @returns {string} The resolved absolute URL, or `maybeUrl` unchanged when
 *   it cannot be resolved.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  let resolved = "";

  if (maybeUrl) {
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unresolvable input is handed back untouched rather than dropped.
      resolved = maybeUrl;
    }
  }

  return resolved;
}
|
||||
|
||||
/**
 * Normalize each item with clean() and drop empties and duplicates.
 *
 * @param {Array<string>} items - Raw values; falsy entries are ignored.
 * @returns {string[]} Distinct normalized values in first-seen order.
 */
function unique(items) {
  const seen = new Set();

  for (const item of items) {
    if (!item) continue;

    const normalized = clean(item);
    if (normalized) seen.add(normalized);
  }

  return [...seen];
}
|
||||
|
||||
/**
 * Find the first anchor on the page whose visible text matches any pattern
 * and that actually carries an href.
 *
 * @param {object} page - Playwright page to query.
 * @param {string[]} patterns - Regular-expression sources, matched
 *   case-insensitively against each anchor's whitespace-normalized text.
 * @returns {Promise<string>} The anchor's href resolved against the current
 *   page URL via absoluteUrl(), or "" when nothing matches.
 */
async function getLinkByText(page, patterns) {
  // The callback is evaluated in the page, so it must stay self-contained.
  const anchors = await page.locator("a").evaluateAll((nodes) =>
    nodes.map((node) => ({
      text: (node.innerText || node.textContent || "").replace(/\s+/g, " ").trim(),
      href: node.getAttribute("href") || ""
    }))
  );

  // Compile each pattern once instead of once per anchor.
  const matchers = patterns.map((source) => new RegExp(source, "i"));

  // An anchor without an href cannot yield a URL; skip it so a later anchor
  // with the same text can still be used.
  const hit = anchors.find(
    (anchor) => anchor.href && matchers.some((matcher) => matcher.test(anchor.text))
  );

  return hit ? absoluteUrl(page.url(), hit.href) : "";
}
|
||||
|
||||
/**
 * Extract weight-based price tiers from the visible text of a product page.
 *
 * Recognizes "<min> - <max> lbs $<price>" and "<min>+ lbs $<price>"
 * (case-insensitive, flexible whitespace). Prices may carry thousands
 * separators ("$1,234.50"); they are stripped before parsing so such a
 * price is not truncated at the first comma.
 *
 * @param {string} plainText - Page text; null/undefined is treated as empty.
 * @returns {{min: number|null, max: number|null, price: number}[]} One entry
 *   per tier in page order; `max` is null for open-ended "N+ lbs" tiers.
 */
function parsePriceTiers(plainText) {
  // Groups: 1/2 = closed range bounds, 3 = open-ended minimum, 4 = price text.
  // The price accepts properly grouped thousands first and otherwise falls
  // back to plain digits and dots.
  const tierPattern =
    /(?:(\d+)\s*-\s*(\d+)|(\d+)\s*\+)\s*lbs\s*\$(\d{1,3}(?:,\d{3})+(?:\.\d+)?|[\d.]+)/gi;

  return [...String(plainText || "").matchAll(tierPattern)].map((match) => {
    const [, rangeMin, rangeMax, openMin, priceText] = match;
    const price = parseFloat(priceText.replace(/,/g, ""));

    if (openMin !== undefined) {
      return { min: parseInt(openMin, 10), max: null, price };
    }

    return { min: parseInt(rangeMin, 10), max: parseInt(rangeMax, 10), price };
  });
}
|
||||
|
||||
/**
 * Choose the most likely product sample image on the page.
 *
 * Preference order: an images.nicindustries.com URL that is not a thumbnail,
 * then any images.nicindustries.com URL, then any URL mentioning
 * "prismatic", "powder" or "color".
 *
 * @param {object} page - Playwright page to query.
 * @returns {Promise<string>} The chosen image URL, or "" when none qualifies.
 */
async function getSampleImageUrl(page) {
  // Evaluated in the page, so the callback cannot use anything from Node scope.
  const sources = await page.locator("img").evaluateAll((elements) => {
    const found = [];
    for (const el of elements) {
      const src =
        el.currentSrc || el.src || el.getAttribute("src") || el.getAttribute("data-src") || "";
      if (src) found.push(src);
    }
    return found;
  });

  const onImageHost = (src) => /images\.nicindustries\.com/i.test(src);
  const preferences = [
    (src) => onImageHost(src) && !/thumbnail/i.test(src),
    (src) => onImageHost(src),
    (src) => /prismatic|powder|color/i.test(src)
  ];

  for (const accepts of preferences) {
    const match = sources.find(accepts);
    if (match) return match;
  }

  return "";
}
|
||||
|
||||
/**
 * Load one product page and extract its details.
 *
 * @param {object} page - Playwright page used for navigation and DOM queries.
 * @param {string} url - Product page URL.
 * @returns {Promise<object>} Row with sku, color_name, description,
 *   price_tiers, document and image URLs, product_url and an ISO scraped_at
 *   timestamp.
 * @throws {Error} On a 403 or 404 response/page, or when neither a SKU nor a
 *   title could be found.
 */
async function parseProduct(page, url) {
  console.log(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Fixed settle delay before the page content is read.
  await page.waitForTimeout(3000);

  // A missing response object is reported as status 0.
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  console.log(`HTTP status: ${status}`);
  console.log(`Page title: ${pageTitle}`);

  // Do not silently output a fake product if blocked.
  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Same reasoning for missing pages. "404" must stand alone so item codes
  // that merely contain those digits do not trigger it.
  if (status === 404 || /\b404\b|Page Not Found/i.test(pageTitle)) {
    throw new Error("404 Not Found returned by site.");
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  // A page with neither value is reported as an error instead of being
  // written out as an empty product row.
  if (!sku && !title) {
    throw new Error("Could not find SKU or title on product page.");
  }

  // The description runs until the first known following section, or to the
  // end of the text.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}
|
||||
|
||||
/**
 * Entry point: scrape every URL in `productUrls` with one browser page and
 * write the collected results and errors to `outputJson`.
 */
(async () => {
  const browser = await chromium.launch({
    headless: !headed
  });

  const results = [];
  const errors = [];

  // Close the browser even if context/page setup or the loop fails.
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    for (const [index, url] of productUrls.entries()) {
      try {
        results.push(await parseProduct(page, url));
      } catch (err) {
        console.warn(`Failed ${url}: ${err.message}`);
        errors.push({
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        });
      }

      // Pause between products (after failures too); nothing follows the
      // last one, so no pause is needed there.
      if (index < productUrls.length - 1) {
        await page.waitForTimeout(3000);
      }
    }
  } finally {
    await browser.close();
  }

  // If you prefer only the array, change this to JSON.stringify(results, null, 2)
  const output = {
    results,
    errors
  };

  fs.writeFileSync(outputJson, JSON.stringify(output, null, 2), "utf8");

  console.log(`Done. Output: ${outputJson}`);
})().catch((err) => {
  // Report fatal errors explicitly and signal failure to the calling script.
  console.error(err);
  process.exitCode = 1;
});
|
||||
'@
|
||||
|
||||
Set-Content -Path ".\prismatic-browser-scraper.js" -Value $js -Encoding UTF8
|
||||
}
|
||||
|
||||
# Script entry point: optionally install Playwright, (re)generate the Node scraper
# file via Write-NodeScraper, then run it with node. Any failure ends the script
# with exit code 1.
try {
    Install-PlaywrightIfNeeded -Requested:$InstallPlaywright
    Write-NodeScraper

    Write-Host "Running browser scraper..."

    # Build the argument list once so the node invocation is not duplicated per mode.
    $nodeArgs = @(".\prismatic-browser-scraper.js")
    if ($Headed) {
        $nodeArgs += "--headed"
    }

    node @nodeArgs

    # Native commands do not raise a PowerShell error on failure, so a crashed
    # scraper would otherwise be reported as success. Check the exit code explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "Node scraper exited with code $LASTEXITCODE."
    }
}
catch {
    Write-Error $_.Exception.Message
    exit 1
}
|
||||
Binary file not shown.
@@ -0,0 +1,270 @@
|
||||
{
|
||||
"completed_colors": {
|
||||
"pris_black": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_black",
|
||||
"http_status": 200,
|
||||
"added": 472,
|
||||
"added_during_scroll": 472,
|
||||
"total_after": 472,
|
||||
"completed_at": "2026-04-30T00:47:46.289Z"
|
||||
},
|
||||
"pris_blue": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_blue",
|
||||
"http_status": 200,
|
||||
"added": 948,
|
||||
"added_during_scroll": 948,
|
||||
"total_after": 1420,
|
||||
"completed_at": "2026-04-30T00:49:25.145Z"
|
||||
},
|
||||
"pris_bronze": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_bronze",
|
||||
"http_status": 200,
|
||||
"added": 358,
|
||||
"added_during_scroll": 358,
|
||||
"total_after": 1778,
|
||||
"completed_at": "2026-04-30T00:50:18.466Z"
|
||||
},
|
||||
"pris_brown": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_brown",
|
||||
"http_status": 200,
|
||||
"added": 373,
|
||||
"added_during_scroll": 373,
|
||||
"total_after": 2151,
|
||||
"completed_at": "2026-04-30T00:51:18.033Z"
|
||||
},
|
||||
"pris_clear": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_clear",
|
||||
"http_status": 200,
|
||||
"added": 19,
|
||||
"added_during_scroll": 19,
|
||||
"total_after": 2170,
|
||||
"completed_at": "2026-04-30T00:51:42.889Z"
|
||||
},
|
||||
"pris_copper": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_copper",
|
||||
"http_status": 200,
|
||||
"added": 1094,
|
||||
"added_during_scroll": 1094,
|
||||
"total_after": 3264,
|
||||
"completed_at": "2026-04-30T00:56:34.934Z"
|
||||
},
|
||||
"pris_gold": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gold",
|
||||
"http_status": 200,
|
||||
"added": 152,
|
||||
"added_during_scroll": 152,
|
||||
"total_after": 3416,
|
||||
"completed_at": "2026-04-30T00:57:26.775Z"
|
||||
},
|
||||
"pris_gray": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gray",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"added_during_scroll": 0,
|
||||
"total_after": 3416,
|
||||
"completed_at": "2026-04-30T00:57:49.624Z"
|
||||
},
|
||||
"pris_green": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_green",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"added_during_scroll": 0,
|
||||
"total_after": 3416,
|
||||
"completed_at": "2026-04-30T00:58:12.277Z"
|
||||
},
|
||||
"pris_orange": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_orange",
|
||||
"http_status": 200,
|
||||
"added": 233,
|
||||
"added_during_scroll": 233,
|
||||
"total_after": 3649,
|
||||
"completed_at": "2026-04-30T00:59:06.776Z"
|
||||
},
|
||||
"pris_pink": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_pink",
|
||||
"http_status": 200,
|
||||
"added": 169,
|
||||
"added_during_scroll": 169,
|
||||
"total_after": 3818,
|
||||
"completed_at": "2026-04-30T00:59:49.323Z"
|
||||
},
|
||||
"pris_purple": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_purple",
|
||||
"http_status": 200,
|
||||
"added": 182,
|
||||
"added_during_scroll": 182,
|
||||
"total_after": 4000,
|
||||
"completed_at": "2026-04-30T01:00:38.111Z"
|
||||
},
|
||||
"pris_red": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_red",
|
||||
"http_status": 200,
|
||||
"added": 346,
|
||||
"added_during_scroll": 346,
|
||||
"total_after": 4346,
|
||||
"completed_at": "2026-04-30T01:01:51.910Z"
|
||||
},
|
||||
"pris_silver": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_silver",
|
||||
"http_status": 200,
|
||||
"added": 210,
|
||||
"added_during_scroll": 210,
|
||||
"total_after": 4556,
|
||||
"completed_at": "2026-04-30T01:02:51.835Z"
|
||||
},
|
||||
"pris_tan": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_tan",
|
||||
"http_status": 200,
|
||||
"added": 219,
|
||||
"added_during_scroll": 219,
|
||||
"total_after": 4775,
|
||||
"completed_at": "2026-04-30T01:03:43.244Z"
|
||||
},
|
||||
"pris_white": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_white",
|
||||
"http_status": 200,
|
||||
"added": 218,
|
||||
"added_during_scroll": 218,
|
||||
"total_after": 4993,
|
||||
"completed_at": "2026-04-30T01:04:39.931Z"
|
||||
},
|
||||
"pris_yellow": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_yellow",
|
||||
"http_status": 200,
|
||||
"added": 199,
|
||||
"added_during_scroll": 199,
|
||||
"total_after": 5192,
|
||||
"completed_at": "2026-04-30T01:05:31.945Z"
|
||||
}
|
||||
},
|
||||
"runs": [
|
||||
{
|
||||
"started_at": "2026-04-30T00:46:47.692Z",
|
||||
"existing_at_start": 0,
|
||||
"colors_attempted": [
|
||||
{
|
||||
"color": "pris_black",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_black",
|
||||
"http_status": 200,
|
||||
"added": 472,
|
||||
"total_after": 472
|
||||
},
|
||||
{
|
||||
"color": "pris_blue",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_blue",
|
||||
"http_status": 200,
|
||||
"added": 948,
|
||||
"total_after": 1420
|
||||
},
|
||||
{
|
||||
"color": "pris_bronze",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_bronze",
|
||||
"http_status": 200,
|
||||
"added": 358,
|
||||
"total_after": 1778
|
||||
},
|
||||
{
|
||||
"color": "pris_brown",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_brown",
|
||||
"http_status": 200,
|
||||
"added": 373,
|
||||
"total_after": 2151
|
||||
},
|
||||
{
|
||||
"color": "pris_clear",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_clear",
|
||||
"http_status": 200,
|
||||
"added": 19,
|
||||
"total_after": 2170
|
||||
},
|
||||
{
|
||||
"color": "pris_copper",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_copper",
|
||||
"http_status": 200,
|
||||
"added": 1094,
|
||||
"total_after": 3264
|
||||
},
|
||||
{
|
||||
"color": "pris_gold",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gold",
|
||||
"http_status": 200,
|
||||
"added": 152,
|
||||
"total_after": 3416
|
||||
},
|
||||
{
|
||||
"color": "pris_gray",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gray",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"total_after": 3416
|
||||
},
|
||||
{
|
||||
"color": "pris_green",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_green",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"total_after": 3416
|
||||
},
|
||||
{
|
||||
"color": "pris_orange",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_orange",
|
||||
"http_status": 200,
|
||||
"added": 233,
|
||||
"total_after": 3649
|
||||
},
|
||||
{
|
||||
"color": "pris_pink",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_pink",
|
||||
"http_status": 200,
|
||||
"added": 169,
|
||||
"total_after": 3818
|
||||
},
|
||||
{
|
||||
"color": "pris_purple",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_purple",
|
||||
"http_status": 200,
|
||||
"added": 182,
|
||||
"total_after": 4000
|
||||
},
|
||||
{
|
||||
"color": "pris_red",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_red",
|
||||
"http_status": 200,
|
||||
"added": 346,
|
||||
"total_after": 4346
|
||||
},
|
||||
{
|
||||
"color": "pris_silver",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_silver",
|
||||
"http_status": 200,
|
||||
"added": 210,
|
||||
"total_after": 4556
|
||||
},
|
||||
{
|
||||
"color": "pris_tan",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_tan",
|
||||
"http_status": 200,
|
||||
"added": 219,
|
||||
"total_after": 4775
|
||||
},
|
||||
{
|
||||
"color": "pris_white",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_white",
|
||||
"http_status": 200,
|
||||
"added": 218,
|
||||
"total_after": 4993
|
||||
},
|
||||
{
|
||||
"color": "pris_yellow",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_yellow",
|
||||
"http_status": 200,
|
||||
"added": 199,
|
||||
"total_after": 5192
|
||||
}
|
||||
],
|
||||
"finished_at": "2026-04-30T01:05:34.987Z",
|
||||
"final_total": 5192,
|
||||
"new_this_run": 5192
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,237 @@
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
// `--headed` launches a visible browser window instead of headless mode.
const headed = process.argv.includes("--headed");

/**
 * Looks up a `--name=value` command-line option.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} defaultValue - Value returned when the option is absent.
 * @returns {string} The text after `=` for the first matching argument, or `defaultValue`.
 */
function getArgValue(name, defaultValue) {
  const prefix = `--${name}=`;
  const found = process.argv.find(x => x.startsWith(prefix));
  return found ? found.slice(prefix.length) : defaultValue;
}

/**
 * Reads a `--name=value` option as a non-negative integer.
 *
 * A value that does not parse (NaN) or is negative would silently break the
 * scroll loop (`i < NaN` is never true, `count >= NaN` is never true), so such
 * values are rejected with a warning and the default is used instead.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} defaultValue - Fallback used when the option is absent or invalid.
 * @returns {number} The parsed integer, or `defaultValue`.
 */
function getNonNegativeIntArg(name, defaultValue) {
  const raw = getArgValue(name, String(defaultValue));
  const parsed = Number.parseInt(raw, 10);
  if (Number.isInteger(parsed) && parsed >= 0) {
    return parsed;
  }
  console.warn(`Ignoring invalid --${name}=${raw}; using ${defaultValue}.`);
  return defaultValue;
}

// Upper bound on scroll steps per color filter page.
const maxScrollsPerColor = getNonNegativeIntArg("max-scrolls-per-color", 180);
// Stop scrolling a color page after this many consecutive steps that found no new URL.
const stopAfterNoNewScrolls = getNonNegativeIntArg("stop-after-no-new-scrolls", 10);

const baseUrl = "https://www.prismaticpowders.com/shop/powder-coating-colors";
const outputFile = "product-urls.txt";
const logFile = "color-discovery-log.json";
|
||||
|
||||
// Values sent as the `color` query parameter, one filter page per entry.
// Update this list if you find more color params in the site HTML.
const colorParams = [
  "pris_black", "pris_blue", "pris_bronze", "pris_brown",
  "pris_clear", "pris_copper", "pris_gold", "pris_gray",
  "pris_green", "pris_orange", "pris_pink", "pris_purple",
  "pris_red", "pris_silver", "pris_tan", "pris_white",
  "pris_yellow"
];
|
||||
|
||||
/**
 * Strips the query string and fragment from a URL and trims whitespace.
 *
 * @param {string|null|undefined} url - Raw URL; a falsy value is treated as "".
 * @returns {string} The text before the first "?" and "#", trimmed.
 */
function cleanUrl(url) {
  const raw = url || "";
  const [withoutQuery] = raw.split("?");
  const [withoutFragment] = withoutQuery.split("#");
  return withoutFragment.trim();
}
|
||||
|
||||
/**
 * Tells whether a URL contains a product path of the form
 * `/shop/powder-coating-colors/<SKU>/` (case-insensitive).
 *
 * @param {string|null|undefined} url - URL to test; a falsy value is treated as "".
 * @returns {boolean} True when the product path pattern occurs in the URL.
 */
function isProductUrl(url) {
  const productPath = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  const candidate = url || "";
  return productPath.test(candidate);
}
|
||||
|
||||
/**
 * Loads the URLs saved by a previous run from `outputFile`.
 *
 * Each line is normalized with `cleanUrl`, blank lines are dropped, and
 * duplicates are removed so that the caller's "existing at start" and
 * "new this run" counts match the size of the Set built from this list.
 * A leading UTF-8 byte-order mark is stripped so it cannot become part of
 * the first URL.
 *
 * @returns {string[]} Unique cleaned URLs in file order; empty when the file does not exist.
 */
function readExistingUrls() {
  if (!fs.existsSync(outputFile)) return [];

  const cleaned = fs.readFileSync(outputFile, "utf8")
    .replace(/^\uFEFF/, "")
    .split(/\r?\n/)
    .map(cleanUrl)
    .filter(Boolean);

  return [...new Set(cleaned)];
}
|
||||
|
||||
/**
 * Overwrites `outputFile` with the given URLs, sorted, one per line,
 * using CRLF line endings and a trailing CRLF.
 *
 * @param {Iterable<string>} urls - URLs to persist.
 */
function writeUrls(urls) {
  const lines = Array.from(urls);
  lines.sort();
  const body = `${lines.join("\r\n")}\r\n`;
  fs.writeFileSync(outputFile, body, "utf8");
}
|
||||
|
||||
/**
 * Loads the discovery log from `logFile`.
 *
 * The caller indexes `completed_colors` and pushes onto `runs`, so the
 * returned object always has both: a missing file, unparsable JSON, or a
 * non-object document yields a fresh empty log, and a missing or wrongly
 * typed field is replaced with an empty one. Unreadable content is reported
 * with a warning rather than discarded silently.
 *
 * @returns {{completed_colors: Object, runs: Array}} The stored log, normalized.
 */
function readLog() {
  const emptyLog = () => ({ completed_colors: {}, runs: [] });
  const isPlainObject = value =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  if (!fs.existsSync(logFile)) {
    return emptyLog();
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(logFile, "utf8"));
  } catch (err) {
    console.warn(`Could not read ${logFile} (${err.message}); starting with an empty log.`);
    return emptyLog();
  }

  if (!isPlainObject(parsed)) {
    console.warn(`${logFile} does not contain a JSON object; starting with an empty log.`);
    return emptyLog();
  }

  if (!isPlainObject(parsed.completed_colors)) {
    parsed.completed_colors = {};
  }
  if (!Array.isArray(parsed.runs)) {
    parsed.runs = [];
  }

  return parsed;
}
|
||||
|
||||
/**
 * Overwrites `logFile` with the log serialized as 2-space-indented JSON.
 *
 * @param {Object} log - Discovery log to persist.
 */
function writeLog(log) {
  const serialized = JSON.stringify(log, null, 2);
  fs.writeFileSync(logFile, serialized, "utf8");
}
|
||||
|
||||
/**
 * Collects the product URLs currently linked from the page.
 *
 * Every anchor's resolved `href` is read in the browser, then cleaned and
 * filtered on the Node side with the shared `cleanUrl` / `isProductUrl`
 * helpers. Filtering happens AFTER the query string and fragment are
 * removed, so a link to some other page that merely carries a product path
 * in its query string is not mistaken for a product URL.
 *
 * @param {object} page - Playwright page (uses `page.locator("a").evaluateAll`).
 * @returns {Promise<string[]>} Cleaned product URLs; may contain duplicates.
 */
async function collectProductLinks(page) {
  const hrefs = await page.locator("a").evaluateAll(anchors =>
    anchors
      // Keep string hrefs only; anything else cannot be cleaned as a URL.
      .map(a => (typeof a.href === "string" ? a.href : ""))
      .filter(Boolean)
  );

  return hrefs.map(cleanUrl).filter(isProductUrl);
}
|
||||
|
||||
/**
 * Repeatedly harvests product links from the page and scrolls down, adding
 * every link to `urls`.
 *
 * The loop ends after `maxScrollsPerColor` steps, or earlier once
 * `stopAfterNoNewScrolls` consecutive steps added nothing. The URL file is
 * rewritten only on steps that actually added something; rewriting it on
 * every idle step would re-sort and rewrite the whole set for no change.
 *
 * @param {object} page - Playwright page already showing the listing.
 * @param {Set<string>} urls - Shared set of known URLs; mutated in place.
 * @param {string} label - Tag used in progress log lines.
 * @returns {Promise<number>} Count of URLs newly added to `urls` by this call.
 */
async function scrollAndCollect(page, urls, label) {
  let noNewScrolls = 0;
  let totalAddedForThisColor = 0;

  for (let i = 0; i < maxScrollsPerColor; i++) {
    const before = urls.size;

    for (const link of await collectProductLinks(page)) {
      urls.add(link);
    }

    const after = urls.size;
    const added = after - before;
    totalAddedForThisColor += added;

    if (added === 0) {
      noNewScrolls++;
    } else {
      noNewScrolls = 0;
      // Persist progress as soon as there is something new to save.
      writeUrls(urls);
    }

    console.log(`[${label}] Scroll ${i + 1}/${maxScrollsPerColor}: +${added}, total ${after}, no-new ${noNewScrolls}`);

    if (noNewScrolls >= stopAfterNoNewScrolls) {
      break;
    }

    await page.mouse.wheel(0, 2500);
    await page.waitForTimeout(1500);
  }

  return totalAddedForThisColor;
}
|
||||
|
||||
/**
 * Entry point: visits each color filter page that is not yet marked
 * completed in the log, harvests product URLs while scrolling, and persists
 * both the URL list and the log after every color.
 *
 * A color is marked completed only when its page loaded without an HTTP
 * error status. An error page contains no product links, and recording it as
 * completed would make every later run skip that color. The browser is
 * closed even when the run aborts, and a fatal error sets a non-zero exit code.
 */
(async () => {
  const existingUrls = readExistingUrls();
  const urls = new Set(existingUrls);
  const log = readLog();

  console.log(`Existing URLs in ${outputFile}: ${existingUrls.length}`);

  const browser = await chromium.launch({ headless: !headed });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    const runRecord = {
      started_at: new Date().toISOString(),
      existing_at_start: existingUrls.length,
      colors_attempted: []
    };

    for (const color of colorParams) {
      if (log.completed_colors[color]) {
        console.log(`Skipping completed color: ${color}`);
        continue;
      }

      const url = `${baseUrl}?color=${encodeURIComponent(color)}`;
      console.log("");
      console.log(`Opening color filter: ${color}`);
      console.log(url);

      try {
        const response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000
        });

        const status = response ? response.status() : "unknown";
        console.log(`HTTP status: ${status}`);

        // Treat an error response as a failure so the color is retried next run.
        if (typeof status === "number" && status >= 400) {
          throw new Error(`HTTP ${status} returned for color filter.`);
        }

        // Give the listing time to render before harvesting links.
        await page.waitForTimeout(5000);

        const before = urls.size;
        const addedDuringScroll = await scrollAndCollect(page, urls, color);
        const after = urls.size;
        const netAdded = after - before;

        log.completed_colors[color] = {
          url,
          http_status: status,
          added: netAdded,
          added_during_scroll: addedDuringScroll,
          total_after: after,
          completed_at: new Date().toISOString()
        };

        runRecord.colors_attempted.push({
          color,
          url,
          http_status: status,
          added: netAdded,
          total_after: after
        });

        writeLog(log);
        writeUrls(urls);

        console.log(`Color complete: ${color}; added ${netAdded}; total ${after}`);

        // Polite pause between filters.
        await page.waitForTimeout(3000);
      } catch (err) {
        console.log(`Color failed: ${color}; ${err.message}`);

        runRecord.colors_attempted.push({
          color,
          url,
          added: 0,
          error: err.message
        });

        writeLog(log);
      }
    }

    runRecord.finished_at = new Date().toISOString();
    runRecord.final_total = urls.size;
    runRecord.new_this_run = urls.size - existingUrls.length;

    log.runs.push(runRecord);
    writeLog(log);
    writeUrls(urls);

    console.log("");
    console.log("Color-param discovery complete.");
    console.log(`Existing at start: ${existingUrls.length}`);
    console.log(`Final total: ${urls.size}`);
    console.log(`New this run: ${urls.size - existingUrls.length}`);
    console.log(`Output: ${outputFile}`);
    console.log(`Log: ${logFile}`);
  } finally {
    // Always release the browser process, including on an unexpected error.
    await browser.close();
  }
})().catch(err => {
  console.error(`Color-param discovery failed: ${err.message}`);
  process.exitCode = 1;
});
|
||||
+60
@@ -0,0 +1,60 @@
|
||||
{
|
||||
"name": "web-scraping",
|
||||
"version": "1.0.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "web-scraping",
|
||||
"version": "1.0.0",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"playwright": "^1.59.1"
|
||||
}
|
||||
},
|
||||
"node_modules/fsevents": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
||||
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright": {
|
||||
"version": "1.59.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
|
||||
"integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"playwright-core": "1.59.1"
|
||||
},
|
||||
"bin": {
|
||||
"playwright": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"fsevents": "2.3.2"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright-core": {
|
||||
"version": "1.59.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz",
|
||||
"integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==",
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"playwright-core": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "web-scraping",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"type": "commonjs",
|
||||
"dependencies": {
|
||||
"playwright": "^1.59.1"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,189 @@
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
const headed = process.argv.includes("--headed");
|
||||
|
||||
const productUrls = [
|
||||
"https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black"
|
||||
];
|
||||
|
||||
const outputJson = "prismatic_powders.json";
|
||||
|
||||
/**
 * Collapses every run of whitespace to a single space and trims the ends.
 *
 * @param {string|null|undefined} text - Input text; a falsy value is treated as "".
 * @returns {string} The normalized text.
 */
function clean(text) {
  const value = text || "";
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Resolves a possibly relative URL against a base URL.
 *
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @param {string} maybeUrl - Absolute or relative URL; a falsy value yields "".
 * @returns {string} The resolved absolute URL, or `maybeUrl` unchanged when it cannot be resolved.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  if (maybeUrl) {
    let resolved = maybeUrl;
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unresolvable input is deliberately passed through as-is.
    }
    return resolved;
  }

  return "";
}
|
||||
|
||||
/**
 * Returns the distinct non-empty values of `items` after normalizing each
 * one with `clean`, in first-seen order.
 *
 * @param {Array<string|null|undefined>} items - Values to normalize and de-duplicate.
 * @returns {string[]} Distinct cleaned values.
 */
function unique(items) {
  const seen = new Set();

  items.forEach(item => {
    if (!item) return;
    const normalized = clean(item);
    if (normalized) seen.add(normalized);
  });

  return Array.from(seen);
}
|
||||
|
||||
/**
 * Finds the first anchor on the page (in document order) whose visible text
 * matches any of the given patterns and returns its absolute URL.
 *
 * Anchors without an `href` are skipped: returning one would yield an empty
 * result even when a later anchor with the same label carries the real link.
 * The patterns are compiled once instead of once per anchor.
 *
 * @param {object} page - Playwright page (uses `locator("a").evaluateAll` and `url()`).
 * @param {string[]} patterns - Regular-expression sources, matched case-insensitively.
 * @returns {Promise<string>} Absolute URL of the first match, or "" when nothing matches.
 */
async function getLinkByText(page, patterns) {
  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map(a => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  // No "g" flag, so reusing these across .test() calls carries no lastIndex state.
  const matchers = patterns.map(p => new RegExp(p, "i"));

  const hit = links.find(link =>
    link.href && matchers.some(matcher => matcher.test(link.text))
  );

  return hit ? absoluteUrl(page.url(), hit.href) : "";
}
|
||||
|
||||
/**
 * Extracts weight-based price tiers such as "1 - 49 lbs $12.47" or
 * "200+ lbs $11.22" from the page text.
 *
 * The price pattern accepts thousands separators ("$1,234.50") and requires
 * at least one digit, so a price is never truncated at a comma and a lone
 * "." is never captured as a (NaN) price.
 *
 * @param {string} plainText - Whitespace-normalized page text.
 * @returns {Array<{min: number|null, max: number|null, price: number}>}
 *   One entry per tier in text order; `max` is null for open-ended ("N+") tiers.
 */
function parsePriceTiers(plainText) {
  const tierPattern = /(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$(\d+(?:,\d{3})*(?:\.\d+)?|\.\d+)/gi;

  return [...(plainText || "").matchAll(tierPattern)].map(([, rangeText, priceText]) => {
    const price = parseFloat(priceText.replace(/,/g, ""));

    // Bounded tier: "min - max lbs".
    const bounded = rangeText.match(/(\d+)\s*-\s*(\d+)/);
    if (bounded) {
      return {
        min: parseInt(bounded[1], 10),
        max: parseInt(bounded[2], 10),
        price
      };
    }

    // Open-ended tier: "min+ lbs".
    const openEnded = rangeText.match(/(\d+)\s*\+/);
    return {
      min: openEnded ? parseInt(openEnded[1], 10) : null,
      max: null,
      price
    };
  });
}
|
||||
|
||||
/**
 * Picks the most likely product sample image from the page's <img> elements.
 *
 * Preference order: an image hosted on images.nicindustries.com that is not a
 * thumbnail, then any image from that host, then any image whose URL mentions
 * "prismatic", "powder" or "color".
 *
 * @param {object} page - Playwright page (uses `locator("img").evaluateAll`).
 * @returns {Promise<string>} The chosen image URL, or "" when none qualifies.
 */
async function getSampleImageUrl(page) {
  const sources = await page.locator("img").evaluateAll((imgs) => {
    const found = [];
    for (const img of imgs) {
      const src =
        img.currentSrc ||
        img.src ||
        img.getAttribute("src") ||
        img.getAttribute("data-src") ||
        "";
      if (src) found.push(src);
    }
    return found;
  });

  const fromImageHost = sources.filter(src => /images\.nicindustries\.com/i.test(src));

  const fullSize = fromImageHost.find(src => !/thumbnail/i.test(src));
  if (fullSize) return fullSize;

  if (fromImageHost.length > 0) return fromImageHost[0];

  const keywordMatch = sources.find(src => /prismatic|powder|color/i.test(src));
  return keywordMatch || "";
}
|
||||
|
||||
/**
 * Loads one product page and extracts its details.
 *
 * Any HTTP error status (not only 403) is rejected, so a 404/429/5xx page is
 * reported as a failure instead of being written out as an empty product.
 * The description stops at the "Proposition 65 Warning" label as well as the
 * existing markers, so that label no longer trails the description text.
 *
 * @param {object} page - Playwright page used for navigation and queries.
 * @param {string} url - Product page URL.
 * @returns {Promise<object>} Record with sku, color_name, description,
 *   price_tiers, document/image URLs, product_url and scraped_at (ISO timestamp).
 * @throws {Error} When the site answers 403 / shows a "403 Forbidden" page,
 *   or responds with any other status of 400 or above.
 */
async function parseProduct(page, url) {
  console.log(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  // Fixed delay after navigation before the page is read.
  await page.waitForTimeout(3000);

  // A missing response object is reported as status 0.
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  console.log(`HTTP status: ${status}`);
  console.log(`Page title: ${pageTitle}`);

  // Do not silently output a fake product if blocked.
  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // Same reasoning for every other error response (404, 429, 5xx, ...).
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  const descMatch = plainText.match(
    /Description:\s*(.*?)(WARNING:|Proposition 65 Warning|What does this match\?|$)/is
  );
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}
|
||||
|
||||
/**
 * Entry point: scrapes every URL in `productUrls` with a single browser page
 * and writes `{ results, errors }` to `outputJson`.
 *
 * The browser is closed even when something unexpected throws, the pause
 * between requests also applies after a failed request (so a blocking site is
 * not hit again immediately), and a fatal error sets a non-zero exit code
 * instead of surfacing as an unhandled rejection.
 */
(async () => {
  // Timer-based pause: it cannot fail, so it never adds an error entry of its own.
  const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));

  const results = [];
  const errors = [];

  const browser = await chromium.launch({
    headless: !headed
  });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    for (const url of productUrls) {
      try {
        const row = await parseProduct(page, url);
        results.push(row);
      } catch (err) {
        console.warn(`Failed ${url}: ${err.message}`);
        errors.push({
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        });
      }

      // Polite pause between product pages, on success and on failure alike.
      await pause(3000);
    }
  } finally {
    await browser.close();
  }

  // If you prefer only the array, change this to JSON.stringify(results, null, 2)
  const output = {
    results,
    errors
  };

  fs.writeFileSync(outputJson, JSON.stringify(output, null, 2), "utf8");

  console.log(`Done. Output: ${outputJson}`);
})().catch(err => {
  console.error(`Scraper failed: ${err.message}`);
  process.exitCode = 1;
});
|
||||
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"sku": "PSS-11248",
|
||||
"color_name": "High Gloss Black",
|
||||
"description": "Prismatic Powders developed High Gloss Black to be the only high gloss black powder coating you will ever need. It has an incredibly deep, mirror like finish with amazing flow out, yielding the highest gloss, true black finish available in a powder coating. High Gloss Black is a polyester solid tone and is the best option to use as a base coat with all of our clear metallics, because of its mirror-like finish. Gloss Units: 85+ Proposition 65 Warning",
|
||||
"price_tiers": [
|
||||
{
|
||||
"min": 1,
|
||||
"max": 49,
|
||||
"price": 12.47
|
||||
},
|
||||
{
|
||||
"min": 50,
|
||||
"max": 199,
|
||||
"price": 11.85
|
||||
},
|
||||
{
|
||||
"min": 200,
|
||||
"max": null,
|
||||
"price": 11.22
|
||||
}
|
||||
],
|
||||
"safety_data_sheet_url": "https://images.nicindustries.com/prismatic/documents/8099/prismatic-powders-p-series-sds-dt20260126212318272746.pdf?1769462600",
|
||||
"technical_data_sheet_url": "https://images.nicindustries.com/prismatic/documents/5536/pss-11248-high-gloss-black-tds-dt20240111230300658308.pdf?1705014182",
|
||||
"application_guide_url": "https://images.nicindustries.com/prismatic/documents/2274/prismatic-powders-application-guide-dt20230508192819506132.pdf?1683574101",
|
||||
"sample_image_url": "https://images.nicindustries.com/prismatic/products/15027/high-gloss-black-pss-11248-dt20251107212621375559-thumbnail.jpg?size=600",
|
||||
"product_url": "https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black",
|
||||
"scraped_at": "2026-04-30T12:52:36.244Z"
|
||||
}
|
||||
],
|
||||
"errors": []
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,270 @@
|
||||
{
|
||||
"completed_colors": {
|
||||
"pris_black": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_black",
|
||||
"http_status": 200,
|
||||
"added": 472,
|
||||
"added_during_scroll": 472,
|
||||
"total_after": 472,
|
||||
"completed_at": "2026-04-30T00:47:46.289Z"
|
||||
},
|
||||
"pris_blue": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_blue",
|
||||
"http_status": 200,
|
||||
"added": 948,
|
||||
"added_during_scroll": 948,
|
||||
"total_after": 1420,
|
||||
"completed_at": "2026-04-30T00:49:25.145Z"
|
||||
},
|
||||
"pris_bronze": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_bronze",
|
||||
"http_status": 200,
|
||||
"added": 358,
|
||||
"added_during_scroll": 358,
|
||||
"total_after": 1778,
|
||||
"completed_at": "2026-04-30T00:50:18.466Z"
|
||||
},
|
||||
"pris_brown": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_brown",
|
||||
"http_status": 200,
|
||||
"added": 373,
|
||||
"added_during_scroll": 373,
|
||||
"total_after": 2151,
|
||||
"completed_at": "2026-04-30T00:51:18.033Z"
|
||||
},
|
||||
"pris_clear": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_clear",
|
||||
"http_status": 200,
|
||||
"added": 19,
|
||||
"added_during_scroll": 19,
|
||||
"total_after": 2170,
|
||||
"completed_at": "2026-04-30T00:51:42.889Z"
|
||||
},
|
||||
"pris_copper": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_copper",
|
||||
"http_status": 200,
|
||||
"added": 1094,
|
||||
"added_during_scroll": 1094,
|
||||
"total_after": 3264,
|
||||
"completed_at": "2026-04-30T00:56:34.934Z"
|
||||
},
|
||||
"pris_gold": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gold",
|
||||
"http_status": 200,
|
||||
"added": 152,
|
||||
"added_during_scroll": 152,
|
||||
"total_after": 3416,
|
||||
"completed_at": "2026-04-30T00:57:26.775Z"
|
||||
},
|
||||
"pris_gray": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gray",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"added_during_scroll": 0,
|
||||
"total_after": 3416,
|
||||
"completed_at": "2026-04-30T00:57:49.624Z"
|
||||
},
|
||||
"pris_green": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_green",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"added_during_scroll": 0,
|
||||
"total_after": 3416,
|
||||
"completed_at": "2026-04-30T00:58:12.277Z"
|
||||
},
|
||||
"pris_orange": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_orange",
|
||||
"http_status": 200,
|
||||
"added": 233,
|
||||
"added_during_scroll": 233,
|
||||
"total_after": 3649,
|
||||
"completed_at": "2026-04-30T00:59:06.776Z"
|
||||
},
|
||||
"pris_pink": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_pink",
|
||||
"http_status": 200,
|
||||
"added": 169,
|
||||
"added_during_scroll": 169,
|
||||
"total_after": 3818,
|
||||
"completed_at": "2026-04-30T00:59:49.323Z"
|
||||
},
|
||||
"pris_purple": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_purple",
|
||||
"http_status": 200,
|
||||
"added": 182,
|
||||
"added_during_scroll": 182,
|
||||
"total_after": 4000,
|
||||
"completed_at": "2026-04-30T01:00:38.111Z"
|
||||
},
|
||||
"pris_red": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_red",
|
||||
"http_status": 200,
|
||||
"added": 346,
|
||||
"added_during_scroll": 346,
|
||||
"total_after": 4346,
|
||||
"completed_at": "2026-04-30T01:01:51.910Z"
|
||||
},
|
||||
"pris_silver": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_silver",
|
||||
"http_status": 200,
|
||||
"added": 210,
|
||||
"added_during_scroll": 210,
|
||||
"total_after": 4556,
|
||||
"completed_at": "2026-04-30T01:02:51.835Z"
|
||||
},
|
||||
"pris_tan": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_tan",
|
||||
"http_status": 200,
|
||||
"added": 219,
|
||||
"added_during_scroll": 219,
|
||||
"total_after": 4775,
|
||||
"completed_at": "2026-04-30T01:03:43.244Z"
|
||||
},
|
||||
"pris_white": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_white",
|
||||
"http_status": 200,
|
||||
"added": 218,
|
||||
"added_during_scroll": 218,
|
||||
"total_after": 4993,
|
||||
"completed_at": "2026-04-30T01:04:39.931Z"
|
||||
},
|
||||
"pris_yellow": {
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_yellow",
|
||||
"http_status": 200,
|
||||
"added": 199,
|
||||
"added_during_scroll": 199,
|
||||
"total_after": 5192,
|
||||
"completed_at": "2026-04-30T01:05:31.945Z"
|
||||
}
|
||||
},
|
||||
"runs": [
|
||||
{
|
||||
"started_at": "2026-04-30T00:46:47.692Z",
|
||||
"existing_at_start": 0,
|
||||
"colors_attempted": [
|
||||
{
|
||||
"color": "pris_black",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_black",
|
||||
"http_status": 200,
|
||||
"added": 472,
|
||||
"total_after": 472
|
||||
},
|
||||
{
|
||||
"color": "pris_blue",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_blue",
|
||||
"http_status": 200,
|
||||
"added": 948,
|
||||
"total_after": 1420
|
||||
},
|
||||
{
|
||||
"color": "pris_bronze",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_bronze",
|
||||
"http_status": 200,
|
||||
"added": 358,
|
||||
"total_after": 1778
|
||||
},
|
||||
{
|
||||
"color": "pris_brown",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_brown",
|
||||
"http_status": 200,
|
||||
"added": 373,
|
||||
"total_after": 2151
|
||||
},
|
||||
{
|
||||
"color": "pris_clear",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_clear",
|
||||
"http_status": 200,
|
||||
"added": 19,
|
||||
"total_after": 2170
|
||||
},
|
||||
{
|
||||
"color": "pris_copper",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_copper",
|
||||
"http_status": 200,
|
||||
"added": 1094,
|
||||
"total_after": 3264
|
||||
},
|
||||
{
|
||||
"color": "pris_gold",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gold",
|
||||
"http_status": 200,
|
||||
"added": 152,
|
||||
"total_after": 3416
|
||||
},
|
||||
{
|
||||
"color": "pris_gray",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_gray",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"total_after": 3416
|
||||
},
|
||||
{
|
||||
"color": "pris_green",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_green",
|
||||
"http_status": 200,
|
||||
"added": 0,
|
||||
"total_after": 3416
|
||||
},
|
||||
{
|
||||
"color": "pris_orange",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_orange",
|
||||
"http_status": 200,
|
||||
"added": 233,
|
||||
"total_after": 3649
|
||||
},
|
||||
{
|
||||
"color": "pris_pink",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_pink",
|
||||
"http_status": 200,
|
||||
"added": 169,
|
||||
"total_after": 3818
|
||||
},
|
||||
{
|
||||
"color": "pris_purple",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_purple",
|
||||
"http_status": 200,
|
||||
"added": 182,
|
||||
"total_after": 4000
|
||||
},
|
||||
{
|
||||
"color": "pris_red",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_red",
|
||||
"http_status": 200,
|
||||
"added": 346,
|
||||
"total_after": 4346
|
||||
},
|
||||
{
|
||||
"color": "pris_silver",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_silver",
|
||||
"http_status": 200,
|
||||
"added": 210,
|
||||
"total_after": 4556
|
||||
},
|
||||
{
|
||||
"color": "pris_tan",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_tan",
|
||||
"http_status": 200,
|
||||
"added": 219,
|
||||
"total_after": 4775
|
||||
},
|
||||
{
|
||||
"color": "pris_white",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_white",
|
||||
"http_status": 200,
|
||||
"added": 218,
|
||||
"total_after": 4993
|
||||
},
|
||||
{
|
||||
"color": "pris_yellow",
|
||||
"url": "https://www.prismaticpowders.com/shop/powder-coating-colors?color=pris_yellow",
|
||||
"http_status": 200,
|
||||
"added": 199,
|
||||
"total_after": 5192
|
||||
}
|
||||
],
|
||||
"finished_at": "2026-04-30T01:05:34.987Z",
|
||||
"final_total": 5192,
|
||||
"new_this_run": 5192
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,237 @@
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
const headed = process.argv.includes("--headed");
|
||||
|
||||
/**
 * Looks up a `--name=value` style command-line option in `process.argv`.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} defaultValue - Value returned when the option is absent.
 * @returns {string} The text after `=` of the first matching argument
 *   (possibly an empty string), or `defaultValue` when none matches.
 */
function getArgValue(name, defaultValue) {
  const flag = `--${name}=`;

  for (const arg of process.argv) {
    if (arg.startsWith(flag)) {
      // First occurrence wins; later duplicates are ignored.
      return arg.slice(flag.length);
    }
  }

  return defaultValue;
}
|
||||
|
||||
/**
 * Reads an integer `--name=<n>` option and rejects unusable values up front.
 *
 * A non-numeric value would otherwise parse to NaN; with NaN as the scroll
 * limit the scroll loop never runs, yet the color is still recorded as
 * completed and skipped on every later run.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} defaultValue - Value used when the option is absent.
 * @param {number} minimum - Smallest accepted value.
 * @returns {number} The parsed integer.
 * @throws {Error} When the value is not an integer or is below `minimum`.
 */
function getIntArgAtLeast(name, defaultValue, minimum) {
  const raw = getArgValue(name, String(defaultValue));
  const value = parseInt(raw, 10);

  if (Number.isNaN(value) || value < minimum) {
    throw new Error(`--${name} must be an integer >= ${minimum}, got "${raw}"`);
  }

  return value;
}

// Upper bound on scroll iterations per color filter page.
const maxScrollsPerColor = getIntArgAtLeast("max-scrolls-per-color", 180, 1);

// Stop scrolling a color after this many consecutive scrolls that add no URLs.
const stopAfterNoNewScrolls = getIntArgAtLeast("stop-after-no-new-scrolls", 10, 0);

const baseUrl = "https://www.prismaticpowders.com/shop/powder-coating-colors";

// Both files are written to the current working directory.
const outputFile = "product-urls.txt";
const logFile = "color-discovery-log.json";
|
||||
|
||||
// Values for the `color` query parameter, visited in this order.
// Extend this list if more color params turn up in the site HTML.
const colorParams = [
  "pris_black", "pris_blue", "pris_bronze", "pris_brown",
  "pris_clear", "pris_copper", "pris_gold", "pris_gray",
  "pris_green", "pris_orange", "pris_pink", "pris_purple",
  "pris_red", "pris_silver", "pris_tan", "pris_white",
  "pris_yellow"
];
|
||||
|
||||
/**
 * Strips the query string and fragment from a URL and trims whitespace.
 *
 * @param {string} [url] - Raw URL text; falsy values are treated as "".
 * @returns {string} The part before the first `?` and the first `#`, trimmed.
 */
function cleanUrl(url) {
  const raw = url || "";
  const [withoutQuery] = raw.split("?");
  const [withoutFragment] = withoutQuery.split("#");
  return withoutFragment.trim();
}
|
||||
|
||||
/**
 * Tells whether a URL points at an individual product page, i.e. contains
 * `/shop/powder-coating-colors/<segment>/` where the segment is made of
 * letters, digits and dashes (case-insensitive).
 *
 * @param {string} [url] - URL to test; falsy values are treated as "".
 * @returns {boolean} True when the product path pattern is present.
 */
function isProductUrl(url) {
  const productPath = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  const candidate = url || "";
  return productPath.test(candidate);
}
|
||||
|
||||
/**
 * Loads the URLs saved by a previous run from `outputFile`.
 *
 * @returns {string[]} One cleaned URL per non-empty line, in file order
 *   (duplicates are kept); an empty array when the file does not exist.
 */
function readExistingUrls() {
  const found = [];

  if (fs.existsSync(outputFile)) {
    const lines = fs.readFileSync(outputFile, "utf8").split(/\r?\n/);

    for (const line of lines) {
      const url = cleanUrl(line);
      if (url) {
        found.push(url);
      }
    }
  }

  return found;
}
|
||||
|
||||
/**
 * Overwrites `outputFile` with the given URLs, sorted, one per line.
 *
 * @param {Iterable<string>} urls - URLs to persist.
 * @returns {void}
 */
function writeUrls(urls) {
  const lines = Array.from(urls);
  lines.sort();

  // CRLF line endings, including one after the last URL.
  const eol = "\r\n";
  fs.writeFileSync(outputFile, `${lines.join(eol)}${eol}`, "utf8");
}
|
||||
|
||||
/**
 * Loads the discovery log from `logFile`.
 *
 * The result always has an object `completed_colors` and an array `runs`,
 * so callers can index and push without further checks. Unknown extra keys
 * in the stored log are preserved.
 *
 * An unreadable or unparsable log is still treated as "start fresh", but the
 * bad file is first copied to `<logFile>.invalid-<timestamp>.bak` and a
 * warning is printed, because the next `writeLog()` overwrites it.
 *
 * @returns {{completed_colors: Object, runs: Array}} The stored or fresh log.
 */
function readLog() {
  const emptyLog = () => ({ completed_colors: {}, runs: [] });
  const isPlainObject = value =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  if (!fs.existsSync(logFile)) {
    return emptyLog();
  }

  let parsed;

  try {
    parsed = JSON.parse(fs.readFileSync(logFile, "utf8"));
  } catch (err) {
    const backup = `${logFile}.invalid-${Date.now()}.bak`;

    try {
      fs.copyFileSync(logFile, backup);
      console.warn(`Could not parse ${logFile} (${err.message}); backed it up to ${backup} and starting a fresh log.`);
    } catch (copyErr) {
      console.warn(`Could not parse ${logFile} (${err.message}) and could not back it up (${copyErr.message}); starting a fresh log.`);
    }

    return emptyLog();
  }

  // Valid JSON of the wrong shape (null, array, missing keys) must not crash the caller.
  if (!isPlainObject(parsed)) {
    return emptyLog();
  }

  return {
    ...parsed,
    completed_colors: isPlainObject(parsed.completed_colors) ? parsed.completed_colors : {},
    runs: Array.isArray(parsed.runs) ? parsed.runs : []
  };
}
|
||||
|
||||
/**
 * Overwrites `logFile` with the log serialized as 2-space-indented JSON.
 *
 * @param {Object} log - Discovery log to persist.
 * @returns {void}
 */
function writeLog(log) {
  const serialized = JSON.stringify(log, null, 2);
  fs.writeFileSync(logFile, serialized, "utf8");
}
|
||||
|
||||
/**
 * Collects the product-page links currently present in the page's anchors.
 *
 * @param {object} page - Playwright page to inspect.
 * @returns {Promise<string[]>} Cleaned product URLs in document order;
 *   duplicates are not removed here.
 */
async function collectProductLinks(page) {
  // The callback is evaluated in the page, so it carries its own copy of the
  // product-path pattern instead of referencing anything from this file.
  const hrefs = await page.locator("a").evaluateAll(anchors => {
    const productPath = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
    const matches = [];

    for (const anchor of anchors) {
      const href = anchor.href;
      if (href && productPath.test(href)) {
        matches.push(href);
      }
    }

    return matches;
  });

  const cleaned = [];

  for (const href of hrefs) {
    const url = cleanUrl(href);
    if (url) {
      cleaned.push(url);
    }
  }

  return cleaned;
}
|
||||
|
||||
/**
 * Repeatedly harvests product links from the page and scrolls down until
 * either `maxScrollsPerColor` iterations have run or `stopAfterNoNewScrolls`
 * consecutive iterations added nothing new.
 *
 * New URLs are added to `urls` in place. The URL file is rewritten only on
 * iterations that actually added something; sorting and rewriting the whole
 * file on every idle scroll produced identical content each time.
 *
 * @param {object} page - Playwright page already showing a color filter.
 * @param {Set<string>} urls - Accumulated URL set; mutated.
 * @param {string} label - Prefix for progress messages (the color name).
 * @returns {Promise<number>} Number of URLs newly added by this call.
 */
async function scrollAndCollect(page, urls, label) {
  let noNewScrolls = 0;
  let totalAddedForThisColor = 0;

  for (let scroll = 1; scroll <= maxScrollsPerColor; scroll++) {
    const before = urls.size;

    for (const link of await collectProductLinks(page)) {
      urls.add(link);
    }

    const after = urls.size;
    const added = after - before;
    totalAddedForThisColor += added;

    if (added > 0) {
      noNewScrolls = 0;
      // Persist progress right away so an interrupted run keeps what it found.
      writeUrls(urls);
    } else {
      noNewScrolls++;
    }

    console.log(`[${label}] Scroll ${scroll}/${maxScrollsPerColor}: +${added}, total ${after}, no-new ${noNewScrolls}`);

    if (noNewScrolls >= stopAfterNoNewScrolls) {
      break;
    }

    // Scroll further down and give lazily loaded items time to render.
    await page.mouse.wheel(0, 2500);
    await page.waitForTimeout(1500);
  }

  return totalAddedForThisColor;
}
|
||||
|
||||
/**
 * Entry point: visits each color filter page that the log does not yet list
 * as completed, scrolls it to collect product URLs, and persists both the URL
 * list and the log after every color so an interrupted run can resume.
 *
 * A color is recorded as completed only when the page loaded without an HTTP
 * error status. Error responses (status >= 400) are recorded as failed
 * attempts in the run record, so the color is retried on the next run rather
 * than being skipped with zero URLs.
 *
 * The browser is closed even when the run fails, and a fatal error sets a
 * non-zero exit code instead of surfacing as an unhandled rejection.
 */
(async () => {
  const existingUrls = readExistingUrls();
  const urls = new Set(existingUrls);
  const log = readLog();

  console.log(`Existing URLs in ${outputFile}: ${existingUrls.length}`);

  const browser = await chromium.launch({ headless: !headed });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    const runRecord = {
      started_at: new Date().toISOString(),
      existing_at_start: existingUrls.length,
      colors_attempted: []
    };

    for (const color of colorParams) {
      if (log.completed_colors[color]) {
        console.log(`Skipping completed color: ${color}`);
        continue;
      }

      const url = `${baseUrl}?color=${encodeURIComponent(color)}`;
      console.log("");
      console.log(`Opening color filter: ${color}`);
      console.log(url);

      try {
        const response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000
        });

        const status = response ? response.status() : "unknown";
        console.log(`HTTP status: ${status}`);

        // An error page has no product links; treating it as a finished color
        // would permanently skip that color on later runs.
        if (typeof status === "number" && status >= 400) {
          throw new Error(`HTTP ${status} returned for ${url}`);
        }

        // Let the initial product grid render before harvesting links.
        await page.waitForTimeout(5000);

        const before = urls.size;
        const addedDuringScroll = await scrollAndCollect(page, urls, color);
        const after = urls.size;
        const netAdded = after - before;

        log.completed_colors[color] = {
          url,
          http_status: status,
          added: netAdded,
          added_during_scroll: addedDuringScroll,
          total_after: after,
          completed_at: new Date().toISOString()
        };

        runRecord.colors_attempted.push({
          color,
          url,
          http_status: status,
          added: netAdded,
          total_after: after
        });

        writeLog(log);
        writeUrls(urls);

        console.log(`Color complete: ${color}; added ${netAdded}; total ${after}`);

        // Polite pause between filters.
        await page.waitForTimeout(3000);
      } catch (err) {
        console.log(`Color failed: ${color}; ${err.message}`);

        runRecord.colors_attempted.push({
          color,
          url,
          added: 0,
          error: err.message
        });

        writeLog(log);
      }
    }

    runRecord.finished_at = new Date().toISOString();
    runRecord.final_total = urls.size;
    runRecord.new_this_run = urls.size - existingUrls.length;

    log.runs.push(runRecord);
    writeLog(log);
    writeUrls(urls);

    console.log("");
    console.log("Color-param discovery complete.");
    console.log(`Existing at start: ${existingUrls.length}`);
    console.log(`Final total: ${urls.size}`);
    console.log(`New this run: ${urls.size - existingUrls.length}`);
    console.log(`Output: ${outputFile}`);
    console.log(`Log: ${logFile}`);
  } finally {
    await browser.close();
  }
})().catch(err => {
  console.error(`Color-param discovery failed: ${err && err.stack ? err.stack : err}`);
  process.exitCode = 1;
});
|
||||
Generated
+60
@@ -0,0 +1,60 @@
|
||||
{
|
||||
"name": "web-scraping",
|
||||
"version": "1.0.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "web-scraping",
|
||||
"version": "1.0.0",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"playwright": "^1.59.1"
|
||||
}
|
||||
},
|
||||
"node_modules/fsevents": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
||||
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright": {
|
||||
"version": "1.59.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
|
||||
"integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"playwright-core": "1.59.1"
|
||||
},
|
||||
"bin": {
|
||||
"playwright": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"fsevents": "2.3.2"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright-core": {
|
||||
"version": "1.59.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz",
|
||||
"integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==",
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"playwright-core": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "web-scraping",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"type": "commonjs",
|
||||
"dependencies": {
|
||||
"playwright": "^1.59.1"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,298 @@
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
const headed = process.argv.includes("--headed");
|
||||
const retryErrors = process.argv.includes("--retry-errors");
|
||||
|
||||
/**
 * Returns the value of a `--name=value` command-line option.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} defaultValue - Fallback used when the option is not given.
 * @returns {string} Text following `=` in the first matching argument, or
 *   `defaultValue` when no argument matches.
 */
function getArgValue(name, defaultValue) {
  const marker = `--${name}=`;
  const position = process.argv.findIndex(arg => arg.startsWith(marker));

  if (position === -1) {
    return defaultValue;
  }

  return process.argv[position].substring(marker.length);
}
|
||||
|
||||
/**
 * Reads an integer `--name=<n>` option and fails fast on non-numeric input.
 *
 * A NaN delay would make `randomDelayMs()` return NaN, which removes the
 * polite delay between requests without any warning.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} defaultValue - Value used when the option is absent.
 * @returns {number} The parsed integer.
 * @throws {Error} When the supplied value does not start with an integer.
 */
function getIntArg(name, defaultValue) {
  const raw = getArgValue(name, String(defaultValue));
  const value = parseInt(raw, 10);

  if (Number.isNaN(value)) {
    throw new Error(`--${name} expects an integer, got "${raw}"`);
  }

  return value;
}

const inputFile = getArgValue("input-file", "product-urls.txt");
const outputJson = getArgValue("output-json", "prismatic_powders.json");
const progressLog = getArgValue("progress-log", "prismatic-scrape-progress.log");

// Bounds, in seconds, of the random pause between product requests.
const minDelaySeconds = getIntArg("min-delay-seconds", 8);
const maxDelaySeconds = getIntArg("max-delay-seconds", 18);

// Seconds to wait after navigation before reading the page.
const pageSettleSeconds = getIntArg("page-settle-seconds", 4);

// Cap on products scraped in one run; values <= 0 mean no cap.
const maxProducts = getIntArg("max-products", 0);
|
||||
|
||||
/**
 * Normalizes whitespace: every run of whitespace becomes a single space and
 * leading/trailing whitespace is removed.
 *
 * @param {string} [text] - Text to normalize; falsy values are treated as "".
 * @returns {string} The normalized text.
 */
function clean(text) {
  const source = text || "";
  const words = source.split(/\s+/).filter(Boolean);
  return words.join(" ");
}
|
||||
|
||||
/**
 * Removes the query string and fragment from a URL and trims whitespace.
 *
 * @param {string} [url] - Raw URL text; falsy values are treated as "".
 * @returns {string} Everything before the first `?` and the first `#`, trimmed.
 */
function cleanUrl(url) {
  let value = url || "";

  // Cut at the query marker first, then at the fragment marker.
  for (const marker of ["?", "#"]) {
    const cutAt = value.indexOf(marker);
    if (cutAt !== -1) {
      value = value.slice(0, cutAt);
    }
  }

  return value.trim();
}
|
||||
|
||||
/**
 * Creates a promise that settles after a timer delay.
 *
 * @param {number} ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise(done => {
    setTimeout(done, ms);
  });
}
|
||||
|
||||
/**
 * Picks a random pause length from the configured delay range.
 *
 * The lower bound is `minDelaySeconds` clamped to at least 0; the upper bound
 * is `maxDelaySeconds` clamped to at least the lower bound.
 *
 * @returns {number} Whole milliseconds between the two bounds, inclusive.
 */
function randomDelayMs() {
  const floorMs = Math.max(0, minDelaySeconds * 1000);
  const ceilingMs = Math.max(floorMs, maxDelaySeconds * 1000);

  // +1 so the upper bound itself can be drawn after flooring.
  const span = ceilingMs - floorMs + 1;

  return Math.floor(floorMs + Math.random() * span);
}
|
||||
|
||||
/**
 * Prints a timestamped message to the console and appends the same line
 * (CRLF-terminated) to the `progressLog` file.
 *
 * @param {string} message - Text to record.
 * @returns {void}
 */
function logLine(message) {
  const stamp = new Date().toISOString();
  const entry = `[${stamp}] ${message}`;

  console.log(entry);
  fs.appendFileSync(progressLog, `${entry}\r\n`, "utf8");
}
|
||||
|
||||
/**
 * Resolves a possibly relative URL against a base URL.
 *
 * @param {string} baseUrl - URL used as the base for resolution.
 * @param {string} [maybeUrl] - Relative or absolute URL.
 * @returns {string} The resolved absolute URL; "" when `maybeUrl` is falsy;
 *   `maybeUrl` unchanged when it cannot be resolved.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  let resolved = "";

  if (maybeUrl) {
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unresolvable input is passed through as-is.
      resolved = maybeUrl;
    }
  }

  return resolved;
}
|
||||
|
||||
/**
 * Reads the product URLs to scrape from `inputFile`.
 *
 * Each line is cleaned with `cleanUrl()`; only lines containing the product
 * path `/shop/powder-coating-colors/<segment>/` are kept, and duplicates are
 * removed while preserving first-seen order.
 *
 * @returns {string[]} Unique product URLs in file order.
 * @throws {Error} When `inputFile` does not exist.
 */
function loadInputUrls() {
  if (!fs.existsSync(inputFile)) {
    throw new Error(`Input file not found: ${inputFile}`);
  }

  const productPath = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;

  const urls = fs.readFileSync(inputFile, "utf8")
    .split(/\r?\n/)
    // cleanUrl() cuts at the first "#", so "# comment" lines come back as ""
    // and are dropped by the truthiness check below; the former separate
    // startsWith("#") filter could never match anything.
    .map(line => cleanUrl(line))
    .filter(url => Boolean(url) && productPath.test(url));

  return [...new Set(urls)];
}
|
||||
|
||||
/**
 * Loads previously scraped data from `outputJson`.
 *
 * Accepts either a bare array (treated as the results list) or an object
 * with `results` / `errors` arrays; any missing or non-array field becomes [].
 *
 * @returns {{results: Array, errors: Array}} Stored data, or empty lists when
 *   the file does not exist.
 * @throws {Error} When the file exists but cannot be read into that shape;
 *   the file is first copied to `<outputJson>.invalid-<timestamp>.bak`.
 */
function loadOutput() {
  const outputExists = fs.existsSync(outputJson);

  if (!outputExists) {
    return { results: [], errors: [] };
  }

  let data;

  try {
    const parsed = JSON.parse(fs.readFileSync(outputJson, "utf8"));
    const asArray = value => (Array.isArray(value) ? value : []);

    data = Array.isArray(parsed)
      ? { results: parsed, errors: [] }
      : { results: asArray(parsed.results), errors: asArray(parsed.errors) };
  } catch (err) {
    // Keep a copy of the unusable file before refusing to continue.
    const backup = `${outputJson}.invalid-${Date.now()}.bak`;
    fs.copyFileSync(outputJson, backup);
    throw new Error(`Could not parse existing ${outputJson}. Backed it up to ${backup}. Error: ${err.message}`);
  }

  return data;
}
|
||||
|
||||
/**
 * Persists scraped data to `outputJson` as 2-space-indented JSON.
 *
 * The JSON is written to `<outputJson>.tmp` first and then renamed over the
 * target file.
 *
 * @param {Object} data - Data to persist.
 * @returns {void}
 */
function saveOutput(data) {
  const serialized = JSON.stringify(data, null, 2);
  const stagingPath = `${outputJson}.tmp`;

  fs.writeFileSync(stagingPath, serialized, "utf8");
  fs.renameSync(stagingPath, outputJson);
}
|
||||
|
||||
/**
 * Extracts quantity-based price tiers from page text.
 *
 * Recognizes `<min>-<max> lbs $<price>` and `<min>+ lbs $<price>`
 * (case-insensitive, optional whitespace around the separators).
 *
 * @param {string} plainText - Text to scan.
 * @returns {Array<{min: number, max: (number|null), price: number}>} Tiers in
 *   order of appearance; `max` is null for open-ended "N+" tiers.
 */
function parsePriceTiers(plainText) {
  // Groups: 1 = range min, 2 = range max, 3 = open-ended min, 4 = price.
  const tierPattern = /(?:(\d+)\s*-\s*(\d+)\s*lbs|(\d+)\s*\+\s*lbs)\s*\$([\d.]+)/gi;
  const tiers = [];

  for (const [, low, high, openEndedLow, amount] of plainText.matchAll(tierPattern)) {
    const price = parseFloat(amount);

    if (openEndedLow === undefined) {
      tiers.push({ min: parseInt(low, 10), max: parseInt(high, 10), price });
    } else {
      tiers.push({ min: parseInt(openEndedLow, 10), max: null, price });
    }
  }

  return tiers;
}
|
||||
|
||||
/**
 * Finds the first anchor on the page whose visible text matches any of the
 * given patterns and returns its target as an absolute URL.
 *
 * Patterns are compiled once per call rather than once per anchor. Anchors
 * that match by text but carry no `href` are skipped, so a script-only
 * control no longer hides a real link further down the page.
 *
 * @param {object} page - Playwright page to search.
 * @param {string[]} patterns - Regular-expression sources, matched
 *   case-insensitively against each anchor's whitespace-normalized text.
 * @returns {Promise<string>} Absolute URL of the first match, or "" if none.
 */
async function getLinkByText(page, patterns) {
  const matchers = patterns.map(pattern => new RegExp(pattern, "i"));

  // The callback is evaluated in the page, so it must stay self-contained.
  const links = await page.locator("a").evaluateAll(anchors =>
    anchors.map(a => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || ""
    }))
  );

  const match = links.find(
    link => link.href && matchers.some(matcher => matcher.test(link.text))
  );

  return match ? absoluteUrl(page.url(), match.href) : "";
}
|
||||
|
||||
/**
 * Picks the most likely product sample image from the page's `<img>` tags.
 *
 * Each image's source is taken from `currentSrc`, `src`, the `src` attribute
 * or the `data-src` attribute, in that order. Candidates are then ranked:
 *   1. hosted on images.nicindustries.com and not a thumbnail,
 *   2. hosted on images.nicindustries.com,
 *   3. URL mentions "prismatic", "powder" or "color".
 *
 * @param {object} page - Playwright page to inspect.
 * @returns {Promise<string>} First image URL of the best available rank, or "".
 */
async function getSampleImageUrl(page) {
  // The callback is evaluated in the page, so it must stay self-contained.
  const sources = await page.locator("img").evaluateAll(imgs => {
    const found = [];

    for (const img of imgs) {
      const src =
        img.currentSrc ||
        img.src ||
        img.getAttribute("src") ||
        img.getAttribute("data-src") ||
        "";

      if (src) {
        found.push(src);
      }
    }

    return found;
  });

  const isNicHosted = src => /images\.nicindustries\.com/i.test(src);

  const preferences = [
    src => isNicHosted(src) && !/thumbnail/i.test(src),
    src => isNicHosted(src),
    src => /prismatic|powder|color/i.test(src)
  ];

  for (const prefers of preferences) {
    const match = sources.find(src => prefers(src));
    if (match) {
      return match;
    }
  }

  return "";
}
|
||||
|
||||
/**
 * Loads one product page and extracts its data.
 *
 * Error detection:
 *  - 403 status, or a title/body that is exactly "403 Forbidden", fails.
 *  - 404 status, or a title that reads like a not-found page, fails. A bare
 *    "404" substring is no longer enough, so product titles that merely
 *    contain those digits (e.g. a SKU such as PSS-4040) are not rejected.
 *  - Any other status >= 400 fails instead of being parsed as a product.
 *
 * @param {object} page - Playwright page used for navigation.
 * @param {string} url - Product page URL.
 * @returns {Promise<Object>} Product record with sku, color_name,
 *   description, price_tiers, document/image URLs, product_url, scraped_at.
 * @throws {Error} On an HTTP error page, or when neither SKU nor title is found.
 */
async function parseProduct(page, url) {
  logLine(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000
  });

  await page.waitForTimeout(pageSettleSeconds * 1000);

  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  logLine(`HTTP status ${status}; title "${pageTitle}"`);

  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // "404" only counts when the title is just that, or also says "not found" / "error".
  const titleLooksNotFound =
    /page not found/i.test(pageTitle) ||
    /^404$/.test(pageTitle) ||
    (/\b404\b/.test(pageTitle) && /not found|error/i.test(pageTitle));

  if (status === 404 || titleLooksNotFound) {
    throw new Error("404 Not Found returned by site.");
  }

  // Rate-limit and server-error pages must not be stored as products.
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  if (!sku && !title) {
    throw new Error("Could not find SKU or title on product page.");
  }

  // Description runs up to the warning block, the "match" section, or end of text.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  const safetyDataSheetUrl = await getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]);
  const applicationGuideUrl = await getLinkByText(page, ["Application Guide"]);
  const technicalDataSheetUrl = await getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]);
  const sampleImageUrl = await getSampleImageUrl(page);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString()
  };
}
|
||||
|
||||
/**
 * Entry point: scrapes every input URL that has no stored result yet (and,
 * unless `--retry-errors` is given, no stored error), saving after each
 * product so an interrupted run can resume.
 *
 * The browser is closed even when the run fails, the polite delay is skipped
 * after the final product (there is no next request to space out), and a
 * fatal error sets a non-zero exit code instead of surfacing as an unhandled
 * rejection.
 */
(async () => {
  const allUrls = loadInputUrls();
  const data = loadOutput();

  const completedUrls = new Set(data.results.map(r => cleanUrl(r.product_url)).filter(Boolean));
  const errorUrls = new Set(data.errors.map(e => cleanUrl(e.product_url)).filter(Boolean));

  let remainingUrls = allUrls.filter(url => {
    if (completedUrls.has(url)) return false;
    if (!retryErrors && errorUrls.has(url)) return false;
    return true;
  });

  if (maxProducts > 0) {
    remainingUrls = remainingUrls.slice(0, maxProducts);
  }

  logLine(`Input URLs: ${allUrls.length}`);
  logLine(`Already scraped: ${completedUrls.size}`);
  logLine(`Existing errors: ${errorUrls.size}`);
  logLine(`Retry errors: ${retryErrors ? "yes" : "no"}`);
  logLine(`This run target count: ${remainingUrls.length}`);
  logLine(`Delay range: ${minDelaySeconds}-${maxDelaySeconds} seconds; page settle: ${pageSettleSeconds} seconds`);

  if (remainingUrls.length === 0) {
    logLine("Nothing to scrape. Done.");
    saveOutput(data);
    return;
  }

  const browser = await chromium.launch({
    headless: !headed
  });

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });

    const page = await context.newPage();

    let processedThisRun = 0;

    for (const [index, url] of remainingUrls.entries()) {
      try {
        const row = await parseProduct(page, url);

        // If retrying an old error, keep the old error history but avoid duplicate successful result.
        if (!completedUrls.has(url)) {
          data.results.push(row);
          completedUrls.add(url);
        }

        processedThisRun++;
        saveOutput(data);

        logLine(`Saved result ${processedThisRun}/${remainingUrls.length}: ${row.sku || "(no sku)"} ${row.color_name || ""}`);
      } catch (err) {
        const errorRecord = {
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString()
        };

        data.errors.push(errorRecord);
        saveOutput(data);

        logLine(`ERROR ${url}: ${err.message}`);
      }

      // Only pause when another request follows.
      if (index < remainingUrls.length - 1) {
        const delay = randomDelayMs();
        logLine(`Waiting ${(delay / 1000).toFixed(1)} seconds before next product...`);
        await sleep(delay);
      }
    }
  } finally {
    await browser.close();
  }

  logLine(`Done. Results: ${data.results.length}; Errors: ${data.errors.length}; Output: ${outputJson}`);
})().catch(err => {
  console.error(`Scrape run failed: ${err && err.stack ? err.stack : err}`);
  process.exitCode = 1;
});
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user