Commit misc scripts, feature specs, SQL deploy scripts, and settings updates
This commit is contained in:
@@ -0,0 +1,189 @@
|
||||
const fs = require("fs");
|
||||
const { chromium } = require("playwright");
|
||||
|
||||
// CLI switch: pass --headed to run the browser with a visible window.
const headed = process.argv.some((arg) => arg === "--headed");

// Product detail pages to scrape; they are visited in the order listed.
const productUrls = [
  "https://www.prismaticpowders.com/shop/powder-coating-colors/PSS-11248/high-gloss-black",
];

// File the scrape output is written to.
const outputJson = "prismatic_powders.json";
|
||||
|
||||
/**
 * Normalize whitespace: collapse every run of whitespace to a single space
 * and strip leading/trailing whitespace.
 *
 * @param {string|null|undefined} text - Raw text; any falsy value is treated as "".
 * @returns {string} The normalized text.
 */
function clean(text) {
  const raw = text ? text : "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Resolve a possibly-relative URL against a base URL.
 *
 * @param {string} baseUrl - URL that relative references are resolved against.
 * @param {string|null|undefined} maybeUrl - Absolute or relative URL; may be empty.
 * @returns {string} The absolute URL, "" when `maybeUrl` is falsy, or
 *   `maybeUrl` unchanged when it cannot be resolved.
 */
function absoluteUrl(baseUrl, maybeUrl) {
  if (maybeUrl) {
    let resolved;
    try {
      resolved = new URL(maybeUrl, baseUrl).href;
    } catch {
      // Unparseable input: hand the raw value back rather than failing.
      resolved = maybeUrl;
    }
    return resolved;
  }
  return "";
}
|
||||
|
||||
/**
 * De-duplicate a list of strings after whitespace normalization.
 *
 * Falsy entries are dropped, each remaining entry is passed through `clean`,
 * entries that normalize to "" are dropped, and the first occurrence of each
 * distinct value is kept in its original order.
 *
 * @param {Array<string|null|undefined>} items - Raw values.
 * @returns {string[]} Distinct, normalized, non-empty values.
 */
function unique(items) {
  const seen = items.reduce((acc, item) => {
    if (item) {
      const normalized = clean(item);
      if (normalized) {
        acc.add(normalized);
      }
    }
    return acc;
  }, new Set());

  return Array.from(seen);
}
|
||||
|
||||
/**
 * Find the first anchor on the page whose text matches any of the given
 * patterns and return its target as an absolute URL.
 *
 * Anchors are examined in the order the locator returns them. Anchors
 * without an `href` are skipped, so a matching placeholder link does not hide
 * a later link that carries the real URL.
 *
 * @param {object} page - Page-like object exposing `locator(selector)` and `url()`.
 * @param {string[]} patterns - Regular-expression sources, matched case-insensitively
 *   against each anchor's whitespace-normalized text.
 * @returns {Promise<string>} Absolute URL of the first matching anchor, or ""
 *   when nothing matches.
 */
async function getLinkByText(page, patterns) {
  // Compile each pattern once up front instead of once per anchor.
  const matchers = patterns.map((pattern) => new RegExp(pattern, "i"));

  // This callback runs in the page, so it must not reference outer helpers.
  const links = await page.locator("a").evaluateAll((anchors) =>
    anchors.map((a) => ({
      text: (a.innerText || a.textContent || "").replace(/\s+/g, " ").trim(),
      href: a.getAttribute("href") || "",
    }))
  );

  for (const { text, href } of links) {
    // Nothing useful to return for an anchor with no target; keep searching.
    if (!href) continue;

    if (matchers.some((matcher) => matcher.test(text))) {
      return absoluteUrl(page.url(), href);
    }
  }

  return "";
}
|
||||
|
||||
/**
 * Extract weight-based price tiers from a page's plain text.
 *
 * Recognizes two tier shapes, case-insensitively:
 *   - bounded:    "<min> - <max> lbs $<price>"
 *   - open-ended: "<min>+ lbs $<price>"
 * Prices may carry thousands separators ("$1,234.50").
 *
 * @param {string|null|undefined} plainText - Text to scan; falsy input yields [].
 * @returns {Array<{min: number, max: (number|null), price: number}>} One entry
 *   per tier in order of appearance; `max` is null for open-ended tiers.
 */
function parsePriceTiers(plainText) {
  if (!plainText) return [];

  // Groups: 1 = min weight, 2 = max weight (absent for "N+"), 3 = price.
  // The price allows properly grouped thousands ("1,234") so a comma that
  // merely follows the price is never absorbed into the number.
  const tierPattern =
    /(\d+)\s*(?:-\s*(\d+)|\+)\s*lbs\s*\$((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?|\.\d+)/gi;

  return Array.from(
    plainText.matchAll(tierPattern),
    ([, minText, maxText, priceText]) => ({
      min: Number.parseInt(minText, 10),
      max: maxText === undefined ? null : Number.parseInt(maxText, 10),
      price: Number.parseFloat(priceText.replace(/,/g, "")),
    })
  );
}
|
||||
|
||||
/**
 * Pick the most likely product-sample image URL from the page's <img> tags.
 *
 * Each image contributes its first non-empty value among `currentSrc`,
 * `src`, the `src` attribute and the `data-src` attribute. Candidates are
 * then tried against these rules in priority order:
 *   1. hosted on images.nicindustries.com and not a thumbnail
 *   2. hosted on images.nicindustries.com
 *   3. URL mentions "prismatic", "powder" or "color"
 *
 * @param {object} page - Page-like object exposing `locator(selector)`.
 * @returns {Promise<string>} The chosen image URL, or "" when none qualifies.
 */
async function getSampleImageUrl(page) {
  // This callback runs in the page, so it must stay self-contained.
  const sources = await page.locator("img").evaluateAll((imgs) => {
    const found = [];
    for (const img of imgs) {
      const src =
        img.currentSrc ||
        img.src ||
        img.getAttribute("src") ||
        img.getAttribute("data-src") ||
        "";
      if (src) {
        found.push(src);
      }
    }
    return found;
  });

  const preferences = [
    (src) => /images\.nicindustries\.com/i.test(src) && !/thumbnail/i.test(src),
    (src) => /images\.nicindustries\.com/i.test(src),
    (src) => /prismatic|powder|color/i.test(src),
  ];

  for (const accepts of preferences) {
    const hit = sources.find(accepts);
    if (hit) {
      return hit;
    }
  }

  return "";
}
|
||||
|
||||
/**
 * Navigate to a product page and extract its details.
 *
 * @param {object} page - Playwright-style page used for navigation and queries.
 * @param {string} url - Product page URL to load.
 * @returns {Promise<object>} Record with `sku`, `color_name`, `description`,
 *   `price_tiers`, `safety_data_sheet_url`, `technical_data_sheet_url`,
 *   `application_guide_url`, `sample_image_url`, `product_url` and an ISO
 *   `scraped_at` timestamp. Fields that cannot be found are "" (or [] for tiers).
 * @throws {Error} When the site answers with 403 (by status, title or body
 *   text) or any other HTTP error status, or when navigation itself fails.
 */
async function parseProduct(page, url) {
  console.log(`Scraping ${url}`);

  const response = await page.goto(url, {
    waitUntil: "domcontentloaded",
    timeout: 60000,
  });

  // Fixed pause after DOMContentLoaded; presumably gives client-side
  // rendering time to settle before the page is read.
  await page.waitForTimeout(3000);

  // goto may resolve without a response object; treat that as status 0.
  const status = response ? response.status() : 0;
  const pageTitle = clean(await page.title().catch(() => ""));
  const plainText = clean(await page.locator("body").innerText().catch(() => ""));

  console.log(`HTTP status: ${status}`);
  console.log(`Page title: ${pageTitle}`);

  // Do not silently output a fake product if blocked.
  if (status === 403 || /^403 Forbidden$/i.test(pageTitle) || /^403 Forbidden$/i.test(plainText)) {
    throw new Error("403 Forbidden returned by site.");
  }

  // The same applies to every other error response (404, 429, 5xx, ...):
  // an error page must not be recorded as an empty product.
  if (status >= 400) {
    throw new Error(`HTTP ${status} returned by site.`);
  }

  const title = clean(await page.locator("h1").first().innerText().catch(() => ""));

  const skuMatch = plainText.match(/Item:\s*([A-Z0-9-]+)/i);
  const sku = skuMatch ? skuMatch[1] : "";

  // Description runs until the first known trailer or the end of the text.
  const descMatch = plainText.match(/Description:\s*(.*?)(WARNING:|What does this match\?|$)/is);
  const description = descMatch ? clean(descMatch[1]) : "";

  const priceTiers = parsePriceTiers(plainText);

  // These lookups only read the page and do not depend on one another,
  // so they are issued together instead of one after another.
  const [safetyDataSheetUrl, applicationGuideUrl, technicalDataSheetUrl, sampleImageUrl] =
    await Promise.all([
      getLinkByText(page, ["Safety Data Sheet", "\\bSDS\\b"]),
      getLinkByText(page, ["Application Guide"]),
      getLinkByText(page, ["Tech Data Sheet", "Technical Data Sheet", "\\bTDS\\b"]),
      getSampleImageUrl(page),
    ]);

  return {
    sku,
    color_name: title,
    description,
    price_tiers: priceTiers,
    safety_data_sheet_url: safetyDataSheetUrl,
    technical_data_sheet_url: technicalDataSheetUrl,
    application_guide_url: applicationGuideUrl,
    sample_image_url: sampleImageUrl,
    product_url: url,
    scraped_at: new Date().toISOString(),
  };
}
|
||||
|
||||
// Entry point: scrape every URL in `productUrls` with a single browser page,
// collect successes and per-URL failures, and write both to `outputJson`.
(async () => {
  const browser = await chromium.launch({
    headless: !headed,
  });

  const results = [];
  const errors = [];

  try {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1365, height: 900 },
      locale: "en-US",
      timezoneId: "America/New_York",
    });

    const page = await context.newPage();

    for (const url of productUrls) {
      try {
        const row = await parseProduct(page, url);
        results.push(row);
        // Pause before moving on to the next product page.
        await page.waitForTimeout(3000);
      } catch (err) {
        // One failing product must not abort the rest of the run.
        console.warn(`Failed ${url}: ${err.message}`);
        errors.push({
          product_url: url,
          error: err.message,
          scraped_at: new Date().toISOString(),
        });
      }
    }
  } finally {
    // Always release the browser, even if context or page setup throws.
    await browser.close();
  }

  // If you prefer only the array, change this to JSON.stringify(results, null, 2)
  const output = {
    results,
    errors,
  };

  fs.writeFileSync(outputJson, JSON.stringify(output, null, 2), "utf8");

  console.log(`Done. Output: ${outputJson}`);
})().catch((err) => {
  // Report fatal failures (launch, close, file write) and signal them to the shell.
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user