Files
PowderCoatingLogix/scripts/Prismatic Data Scraper/Program.cs
T
spouliot c59d55529f Add PrismaticSync console tool for unattended Prismatic catalog sync
Standalone .NET 8 console app (not part of the main solution) that scrapes the
Prismatic Powders catalog via Playwright and pushes it into the app's catalog
import. Prismatic has no API, so this runs on a workstation (Task Scheduler),
never the deployed server.

- Discovery: incremental newest-first via ?category=created_at (stops once it
  reaches already-known URLs — cheap, finds new colors) and a full all-colors
  crawl for occasional reconcile.
- Scraper: resumable product-page scrape (sku/color/description/price tiers/
  SDS/TDS/app-guide/image), with --refresh-older-than to re-scrape stale
  products and catch price changes. Output matches the app import format so it
  flows through the same shared upsert as the Columbia sync.
- Resilience: brisk randomized base delay, escalating 403 cooldown-and-retry to
  avoid hard bans, periodic rest. All configurable.
- Visibility: streams every product + the inter-product wait to the console
  (colored) and a log file, with an up-front ETA.
- Push: token-authenticated POST to the app import endpoint (skips to manual
  upload when unconfigured).

The app-side token import endpoint is a separate follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 11:30:47 -04:00

107 lines
4.3 KiB
C#

using Microsoft.Extensions.Configuration;
using PrismaticSync.Infrastructure;
using PrismaticSync.Services;
// ── Help ──────────────────────────────────────────────────────────────────────
// Answer an explicit help request before doing anything else. Without this check a
// leading "--help" is treated as an option, the command falls back to "run", and the
// tool would start a full discover + scrape + push instead of printing usage.
var helpFlags = new[] { "--help", "-h", "-?", "/?" };
if (args.Any(a => helpFlags.Contains(a, StringComparer.OrdinalIgnoreCase)))
{
    PrintUsage();
    return 0;
}

// ── Load config ───────────────────────────────────────────────────────────────
// appsettings.json is required (optional: false) and is resolved next to the executable.
var configRoot = new ConfigurationBuilder()
    .SetBasePath(AppContext.BaseDirectory)
    .AddJsonFile("appsettings.json", optional: false)
    .Build();

// Fall back to a default-constructed SyncConfig when the "Sync" section does not bind.
var config = configRoot.GetSection("Sync").Get<SyncConfig>() ?? new SyncConfig();
Log.Configure(config.LogFile);

// ── Parse args ────────────────────────────────────────────────────────────────
// The command is the first argument unless that argument is an option ("--…"), in which
// case the default command "run" is used. Ordinal comparison: "--" is a literal marker,
// not linguistic text.
var command = args.Length > 0 && !args[0].StartsWith("--", StringComparison.Ordinal)
    ? args[0].ToLowerInvariant()
    : "run";

// Boolean switches are matched case-insensitively, the same way GetIntArg matches
// the names of valued options.
var headed = args.Contains("--headed", StringComparer.OrdinalIgnoreCase);
var retryErrors = args.Contains("--retry-errors", StringComparer.OrdinalIgnoreCase);
var maxProducts = GetIntArg("--max-products", 0);

// "run" refreshes products older than 30 days by default; explicit commands default to new-only.
var refreshOlderThanDays = GetIntArg("--refresh-older-than", command == "run" ? 30 : 0);

Log.Info($"PrismaticSync — command '{command}' (headed={headed}, refreshOlderThan={refreshOlderThanDays}d, maxProducts={maxProducts})");
// Run the selected command. Exit code 0 on success; 1 for an unrecognised command
// (after printing usage) or when any exception escapes the command.
try
{
    // Map the command name to the work it performs; null marks an unrecognised command.
    Func<Task>? job = command switch
    {
        "discover-new" => () => WithBrowser(browser => new PrismaticDiscoverer(browser, config).DiscoverNewAsync()),
        "discover-full" => () => WithBrowser(browser => new PrismaticDiscoverer(browser, config).DiscoverFullAsync()),
        "scrape" => () => WithBrowser(browser => new PrismaticScraper(browser, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors)),
        "push" => async () => await new CatalogPusher(config).PushAsync(),
        "run" => async () =>
        {
            // The scheduled default: find new colors, scrape new + stale, then push.
            // Discovery and scraping share one browser session; the push runs after it is disposed.
            await WithBrowser(async browser =>
            {
                await new PrismaticDiscoverer(browser, config).DiscoverNewAsync();
                await new PrismaticScraper(browser, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors);
            });
            await new CatalogPusher(config).PushAsync();
        },
        _ => null,
    };

    if (job is null)
    {
        PrintUsage();
        return 1;
    }

    await job();
    Log.Info("Done.");
    return 0;
}
catch (Exception failure)
{
    // Log the full exception (type, message, stack trace) and signal failure to the caller.
    Log.Error($"Fatal: {failure}");
    return 1;
}
// ── Helpers ───────────────────────────────────────────────────────────────────
// Creates a browser session (using the parsed --headed switch), hands it to the given
// action, and disposes the session asynchronously once the action has finished or thrown.
async Task WithBrowser(Func<BrowserSession, Task> action)
{
    await using (var browser = await BrowserSession.CreateAsync(headed))
    {
        await action(browser);
    }
}
// Reads an integer option written as "name=value" (e.g. --max-products=50) from the
// command-line arguments.
//   name     — option name including its leading dashes; matched case-insensitively.
//   fallback — returned when the option is absent or its value is not a valid integer.
// Only the first matching argument is considered.
int GetIntArg(string name, int fallback)
{
    var prefix = name + "=";
    var found = args.FirstOrDefault(a => a.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));
    if (found is null)
    {
        return fallback;
    }

    var raw = found[prefix.Length..];

    // Command-line numbers are machine-readable input, so parse them with the invariant
    // culture rather than whatever culture the workstation happens to run under.
    if (int.TryParse(
            raw,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out var value))
    {
        return value;
    }

    // Do not swallow a typo silently: for --max-products the fallback means "no cap",
    // so a mistyped value would otherwise quietly turn into an unbounded scrape.
    Console.Error.WriteLine($"Warning: ignoring invalid value '{raw}' for {name}; using {fallback}.");
    return fallback;
}
// Writes the command-line reference (commands, options, config and first-run notes)
// to standard output.
void PrintUsage()
{
    const string usageText =
        """
        PrismaticSync scrape Prismatic Powders and push to the app catalog.
        Usage: PrismaticSync [command] [options]
        Commands:
        run (default) discover-new + scrape (new + stale) + push
        discover-new Incremental discovery via newest-first sort (cheap; finds new colors)
        discover-full Full discovery across all color filters (heavy; reconciles the whole set)
        scrape Scrape product pages from the URL list (resumable)
        push Push the scraped JSON to the import endpoint
        Options:
        --refresh-older-than=N Re-scrape products whose data is older than N days (default 30 for 'run')
        --max-products=N Cap products scraped this run (0 = no cap)
        --retry-errors Retry URLs previously recorded as errors
        --headed Show the browser window (debugging)
        Config: appsettings.json (delays, file paths, import endpoint + token).
        First run on a new machine: dotnet build, then `pwsh bin/Debug/net8.0/playwright.ps1 install chromium`.
        """;

    Console.WriteLine(usageText);
}