// Source file: PowderCoatingLogix/scripts/Web Scraping/discover-prismatic-by-color-param.js
// File-viewer metadata at capture time: 238 lines, 5.8 KiB, JavaScript.

const fs = require("fs");
const { chromium } = require("playwright");
// True when the script was started with the bare `--headed` flag; the browser
// is launched headless otherwise.
const headed = process.argv.indexOf("--headed") !== -1;
/**
 * Looks up a `--name=value` command-line option.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {*} defaultValue - Returned unchanged when the option is absent.
 * @returns {*} The text after `=` of the first matching argument (possibly an
 *   empty string), or `defaultValue` when no argument matches.
 */
function getArgValue(name, defaultValue) {
  const flag = `--${name}=`;
  for (const arg of process.argv) {
    if (arg.startsWith(flag)) {
      // First occurrence wins; later duplicates are ignored.
      return arg.slice(flag.length);
    }
  }
  return defaultValue;
}
/**
 * Reads a numeric `--name=value` option and rejects anything that is not a
 * positive integer.
 *
 * Failing fast matters here: a NaN scroll limit would make the scroll loop run
 * zero times, and a NaN "no new" threshold would never trigger the early stop.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} defaultValue - Value used when the option is not supplied.
 * @returns {number} The parsed positive integer.
 * @throws {Error} When the supplied text does not parse to an integer >= 1.
 */
function getPositiveIntArg(name, defaultValue) {
  const raw = getArgValue(name, String(defaultValue));
  const value = parseInt(raw, 10);
  if (Number.isNaN(value) || value < 1) {
    throw new Error(`--${name} must be a positive integer, got "${raw}"`);
  }
  return value;
}
// Upper bound on collection passes for a single color filter page.
const maxScrollsPerColor = getPositiveIntArg("max-scrolls-per-color", 180);
// Number of consecutive passes without new links after which a color is abandoned.
const stopAfterNoNewScrolls = getPositiveIntArg("stop-after-no-new-scrolls", 10);
// Listing page; a `?color=<param>` query is appended to it for each color.
const baseUrl = "https://www.prismaticpowders.com/shop/powder-coating-colors";
// Text file that receives the discovered product URLs, one per line.
const outputFile = "product-urls.txt";
// JSON file holding per-color completion records and the run history.
const logFile = "color-discovery-log.json";
// Color filter params are the names below with a "pris_" prefix.
// Add a name here if you find more color params in the site HTML.
const colorParams = [
  "black",
  "blue",
  "bronze",
  "brown",
  "clear",
  "copper",
  "gold",
  "gray",
  "green",
  "orange",
  "pink",
  "purple",
  "red",
  "silver",
  "tan",
  "white",
  "yellow"
].map(colorName => `pris_${colorName}`);
/**
 * Strips the query string, the fragment and surrounding whitespace from a URL.
 *
 * @param {string|null|undefined} url - Raw URL text; falsy values count as "".
 * @returns {string} Everything before the first "?" and the first "#", trimmed.
 */
function cleanUrl(url) {
  const raw = url || "";
  const [withoutQuery] = raw.split("?");
  const [withoutFragment] = withoutQuery.split("#");
  return withoutFragment.trim();
}
/**
 * Tells whether a URL points below the color listing, i.e. contains
 * `/shop/powder-coating-colors/<segment>/` where the segment is made of
 * letters, digits and dashes (matched case-insensitively).
 *
 * @param {string|null|undefined} url - Candidate URL.
 * @returns {boolean} True when the pattern is found; false for falsy input.
 */
function isProductUrl(url) {
  if (!url) {
    return false;
  }
  const productPathPattern = /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i;
  return productPathPattern.test(url);
}
/**
 * Loads the URLs already stored in the output file.
 *
 * @returns {string[]} One cleaned URL per non-empty line, in file order;
 *   an empty array when the output file does not exist.
 */
function readExistingUrls() {
  if (!fs.existsSync(outputFile)) {
    return [];
  }
  const contents = fs.readFileSync(outputFile, "utf8");
  const collected = [];
  // Accept both LF and CRLF line endings.
  for (const line of contents.split(/\r?\n/)) {
    const url = cleanUrl(line);
    if (url) {
      collected.push(url);
    }
  }
  return collected;
}
/**
 * Overwrites the output file with the given URLs, sorted with the default
 * string ordering, CRLF-separated and terminated by a final CRLF.
 *
 * @param {Iterable<string>} urls - URLs to persist (the input is not modified).
 */
function writeUrls(urls) {
  const ordered = [...urls];
  ordered.sort();
  const body = `${ordered.join("\r\n")}\r\n`;
  fs.writeFileSync(outputFile, body, "utf8");
}
/**
 * Loads the discovery log, always returning an object that has a
 * `completed_colors` object and a `runs` array.
 *
 * A missing file, unreadable/invalid JSON, or a top-level value that is not a
 * plain object yields a fresh empty log. A parsed object with a missing or
 * wrongly-typed `completed_colors` / `runs` gets just that field reset, so
 * callers can index `completed_colors` and push onto `runs` without checks.
 *
 * @returns {{completed_colors: Object, runs: Array}} The usable log object;
 *   any other keys found in the file are kept.
 */
function readLog() {
  const emptyLog = () => ({ completed_colors: {}, runs: [] });
  const isPlainObject = value =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  if (!fs.existsSync(logFile)) {
    return emptyLog();
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(logFile, "utf8"));
  } catch (err) {
    // Best effort: keep going with an empty log, but say so, because the next
    // write will replace whatever is in the file.
    console.warn(`Could not read ${logFile} (${err.message}); starting with an empty log.`);
    return emptyLog();
  }

  if (!isPlainObject(parsed)) {
    console.warn(`${logFile} does not contain a JSON object; starting with an empty log.`);
    return emptyLog();
  }
  if (!isPlainObject(parsed.completed_colors)) {
    parsed.completed_colors = {};
  }
  if (!Array.isArray(parsed.runs)) {
    parsed.runs = [];
  }
  return parsed;
}
/**
 * Overwrites the log file with the given object serialized as JSON using a
 * two-space indent.
 *
 * @param {*} log - Value to serialize.
 */
function writeLog(log) {
  const indentWidth = 2;
  const serialized = JSON.stringify(log, null, indentWidth);
  fs.writeFileSync(logFile, serialized, "utf8");
}
/**
 * Gathers the product links currently present on the page.
 *
 * @param {object} page - Playwright page (only `locator("a").evaluateAll` is used).
 * @returns {Promise<string[]>} Cleaned hrefs of every anchor whose href matches
 *   the product path pattern; duplicates are kept.
 */
async function collectProductLinks(page) {
  // Playwright executes this callback inside the page, so it has to stay
  // self-contained: the pattern is repeated here instead of calling a
  // Node-side helper.
  const matchingHrefs = await page.locator("a").evaluateAll(anchors => {
    const matches = [];
    for (const anchor of anchors) {
      const href = anchor.href;
      if (href && /\/shop\/powder-coating-colors\/[A-Z0-9-]+\//i.test(href)) {
        matches.push(href);
      }
    }
    return matches;
  });

  const cleaned = [];
  for (const href of matchingHrefs) {
    const url = cleanUrl(href);
    if (url) {
      cleaned.push(url);
    }
  }
  return cleaned;
}
/**
 * Repeatedly harvests product links from the page and scrolls down to load
 * more, adding every link to `urls`.
 *
 * Each pass collects first and scrolls afterwards. The loop ends after
 * `maxScrollsPerColor` passes, or earlier once `stopAfterNoNewScrolls`
 * consecutive passes added nothing. The URL file is rewritten only after a
 * pass that actually grew the set.
 *
 * @param {object} page - Playwright page, already on the listing to harvest.
 * @param {Set<string>} urls - Accumulator shared across colors; mutated in place.
 * @param {string} label - Prefix used in the progress log lines.
 * @returns {Promise<number>} How many URLs this call added to `urls`.
 */
async function scrollAndCollect(page, urls, label) {
  let noNewScrolls = 0;
  let totalAddedForThisColor = 0;
  for (let i = 0; i < maxScrollsPerColor; i++) {
    const before = urls.size;
    for (const link of await collectProductLinks(page)) {
      urls.add(link);
    }
    const after = urls.size;
    const added = after - before;
    totalAddedForThisColor += added;
    if (added === 0) {
      noNewScrolls++;
    } else {
      noNewScrolls = 0;
      // Persist progress right away, but only when there is something new:
      // rewriting an unchanged set would just re-sort and re-write the file.
      writeUrls(urls);
    }
    console.log(`[${label}] Scroll ${i + 1}/${maxScrollsPerColor}: +${added}, total ${after}, no-new ${noNewScrolls}`);
    if (noNewScrolls >= stopAfterNoNewScrolls) {
      break;
    }
    if (i === maxScrollsPerColor - 1) {
      // Last pass: nothing would be collected after another scroll, so skip
      // the scroll and its wait.
      break;
    }
    await page.mouse.wheel(0, 2500);
    // Give lazily loaded items time to appear before the next pass.
    await page.waitForTimeout(1500);
  }
  return totalAddedForThisColor;
}
/**
 * Script entry point.
 *
 * Loads the URLs and log from previous runs, opens the listing once per color
 * filter that is not yet marked completed, harvests product links from each,
 * and persists the URL list and the log as it goes. A color is marked
 * completed only when its page loaded without an HTTP error status, so failed
 * colors are retried on the next run. The browser is closed even when the run
 * aborts, and an aborted run sets a non-zero exit code.
 */
(async () => {
  const existingUrls = readExistingUrls();
  const urls = new Set(existingUrls);
  // Count unique URLs: the file may hold duplicate lines, and every delta
  // below is measured against the Set rather than the raw line list.
  const existingCount = urls.size;
  const log = readLog();
  console.log(`Existing URLs in ${outputFile}: ${existingCount}`);
  const runRecord = {
    started_at: new Date().toISOString(),
    existing_at_start: existingCount,
    colors_attempted: []
  };
  const browser = await chromium.launch({ headless: !headed });
  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York"
    });
    const page = await context.newPage();
    for (const color of colorParams) {
      if (log.completed_colors[color]) {
        console.log(`Skipping completed color: ${color}`);
        continue;
      }
      const url = `${baseUrl}?color=${encodeURIComponent(color)}`;
      console.log("");
      console.log(`Opening color filter: ${color}`);
      console.log(url);
      let status = "unknown";
      try {
        const response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000
        });
        status = response ? response.status() : "unknown";
        console.log(`HTTP status: ${status}`);
        if (typeof status === "number" && status >= 400) {
          // Treat error responses as failures so the color is not recorded as
          // completed and gets retried on the next run.
          throw new Error(`HTTP ${status} for ${url}`);
        }
        // Let the initial product grid render before harvesting.
        await page.waitForTimeout(5000);
        const before = urls.size;
        const addedDuringScroll = await scrollAndCollect(page, urls, color);
        const after = urls.size;
        const netAdded = after - before;
        log.completed_colors[color] = {
          url,
          http_status: status,
          added: netAdded,
          added_during_scroll: addedDuringScroll,
          total_after: after,
          completed_at: new Date().toISOString()
        };
        runRecord.colors_attempted.push({
          color,
          url,
          http_status: status,
          added: netAdded,
          total_after: after
        });
        writeLog(log);
        writeUrls(urls);
        console.log(`Color complete: ${color}; added ${netAdded}; total ${after}`);
      } catch (err) {
        console.log(`Color failed: ${color}; ${err.message}`);
        runRecord.colors_attempted.push({
          color,
          url,
          http_status: status,
          added: 0,
          error: err.message
        });
        writeLog(log);
      }
      // Polite pause between filters, after failures as well. A plain timer is
      // used so the pause does not depend on the page still being usable.
      await new Promise(resolve => setTimeout(resolve, 3000));
    }
  } finally {
    // Always release the browser, even if the run aborted part-way.
    await browser.close();
  }
  runRecord.finished_at = new Date().toISOString();
  runRecord.final_total = urls.size;
  runRecord.new_this_run = urls.size - existingCount;
  log.runs.push(runRecord);
  writeLog(log);
  writeUrls(urls);
  console.log("");
  console.log("Color-param discovery complete.");
  console.log(`Existing at start: ${existingCount}`);
  console.log(`Final total: ${urls.size}`);
  console.log(`New this run: ${urls.size - existingCount}`);
  console.log(`Output: ${outputFile}`);
  console.log(`Log: ${logFile}`);
})().catch(err => {
  console.error(`Color-param discovery aborted: ${err && err.stack ? err.stack : err}`);
  process.exitCode = 1;
});