Add PrismaticSync console tool for unattended Prismatic catalog sync

Standalone .NET 8 console app (not part of the main solution) that scrapes the
Prismatic Powders catalog via Playwright and pushes it into the app's catalog
import. Prismatic has no API, so this runs on a workstation (Task Scheduler),
never the deployed server.

- Discovery: incremental newest-first via ?category=created_at (stops once it
  reaches already-known URLs — cheap, finds new colors) and a full all-colors
  crawl for occasional reconcile.
- Scraper: resumable product-page scrape (sku/color/description/price tiers/
  SDS/TDS/app-guide/image), with --refresh-older-than to re-scrape stale
  products and catch price changes. Output matches the app import format so it
  flows through the same shared upsert as the Columbia sync.
- Resilience: brisk randomized base delay, escalating 403 cooldown-and-retry to
  avoid hard bans, periodic rest. All configurable.
- Visibility: streams every product + the inter-product wait to the console
  (colored) and a log file, with an up-front ETA.
- Push: token-authenticated POST to the app import endpoint (skips to manual
  upload when unconfigured).

The app-side token import endpoint is a separate follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-18 11:30:47 -04:00
parent f752abad86
commit c59d55529f
13 changed files with 1037 additions and 0 deletions
@@ -0,0 +1,43 @@
using Microsoft.Playwright;
namespace PrismaticSync.Infrastructure;
/// <summary>
/// A Chromium session (headless unless a headed run is requested) with a realistic desktop
/// fingerprint (UA, viewport, locale, timezone) so it looks like a normal browser.
/// Owns the Playwright driver, the browser, and the browser context; dispose it to release all three.
/// </summary>
public sealed class BrowserSession : IAsyncDisposable
{
    private IPlaywright? _pw;
    private IBrowser? _browser;
    private IBrowserContext? _context;

    /// <summary>The single page opened in this session's context. Set by <see cref="CreateAsync"/>.</summary>
    public IPage Page { get; private set; } = null!;

    /// <summary>
    /// Starts Playwright, launches Chromium, and opens one context and one page.
    /// </summary>
    /// <param name="headed">When true the browser window is shown; otherwise it runs headless.</param>
    /// <returns>A ready-to-use session whose <see cref="Page"/> is open.</returns>
    public static async Task<BrowserSession> CreateAsync(bool headed)
    {
        var session = new BrowserSession();
        try
        {
            session._pw = await Playwright.CreateAsync();
            session._browser = await session._pw.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
            {
                Headless = !headed
            });
            session._context = await session._browser.NewContextAsync(new BrowserNewContextOptions
            {
                UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
                ViewportSize = new ViewportSize { Width = 1365, Height = 900 },
                Locale = "en-US",
                TimezoneId = "America/New_York"
            });
            session.Page = await session._context.NewPageAsync();
            return session;
        }
        catch
        {
            // A later step failed: release whatever was already started so a failed launch
            // does not leave a driver/browser process behind.
            try
            {
                await session.DisposeAsync();
            }
            catch
            {
                // Keep the original launch failure as the reported error.
            }

            throw;
        }
    }

    /// <summary>
    /// Closes the context, then the browser, then disposes the Playwright driver.
    /// Each step runs even if an earlier one throws, and calling this twice is harmless.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        // Detach the fields first so a second call finds nothing left to release.
        var context = _context;
        var browser = _browser;
        var pw = _pw;
        _context = null;
        _browser = null;
        _pw = null;

        try
        {
            if (context is not null) await context.CloseAsync();
        }
        finally
        {
            try
            {
                if (browser is not null) await browser.CloseAsync();
            }
            finally
            {
                pw?.Dispose();
            }
        }
    }
}
@@ -0,0 +1,65 @@
using System.Text.Json;
using PrismaticSync.Models;
namespace PrismaticSync.Infrastructure;
/// <summary>Loads/saves the scrape output and the URL list, with atomic writes so a crash mid-save can't corrupt them.</summary>
public static class JsonStore
{
    private static readonly JsonSerializerOptions WriteOptions = new() { WriteIndented = true };
    private static readonly JsonSerializerOptions ReadOptions = new() { PropertyNameCaseInsensitive = true };

    /// <summary>
    /// Reads the scrape output at <paramref name="path"/>. A missing file yields an empty output.
    /// </summary>
    /// <param name="path">Path of the JSON output file.</param>
    /// <returns>The parsed output, or a new empty <see cref="ScrapeOutput"/> when the file does not exist.</returns>
    /// <exception cref="InvalidOperationException">
    /// The file exists but could not be parsed. A timestamped copy of the file is attempted first;
    /// the original failure is attached as the inner exception.
    /// </exception>
    public static ScrapeOutput LoadOutput(string path)
    {
        if (!File.Exists(path))
            return new ScrapeOutput();

        var json = File.ReadAllText(path);
        try
        {
            // Tolerate a bare array (older output format) as well as { results, errors }.
            if (json.TrimStart().StartsWith('['))
            {
                var results = JsonSerializer.Deserialize<List<ProductRecord>>(json, ReadOptions) ?? new();
                return new ScrapeOutput { Results = results };
            }

            return JsonSerializer.Deserialize<ScrapeOutput>(json, ReadOptions) ?? new ScrapeOutput();
        }
        catch (Exception ex)
        {
            var backup = $"{path}.invalid-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}.bak";

            // The backup is best-effort: if the copy itself fails, still report the parse error
            // (the real problem) rather than letting the copy failure replace it.
            string backupNote;
            try
            {
                File.Copy(path, backup, overwrite: true);
                backupNote = $"Backed it up to {backup}.";
            }
            catch (Exception copyEx) when (copyEx is IOException or UnauthorizedAccessException)
            {
                backupNote = $"Could not back it up to {backup} ({copyEx.Message}).";
            }

            // Pass the original exception along so its type and stack trace are not lost.
            throw new InvalidOperationException($"Could not parse {path}. {backupNote} {ex.Message}", ex);
        }
    }

    /// <summary>Serializes <paramref name="data"/> as indented JSON and replaces the file at <paramref name="path"/>.</summary>
    public static void SaveOutput(string path, ScrapeOutput data) =>
        WriteAtomically(path, JsonSerializer.Serialize(data, WriteOptions));

    /// <summary>
    /// Reads one URL per line, cleaned with <see cref="CleanUrl"/> and de-duplicated case-insensitively.
    /// A missing file yields an empty list.
    /// </summary>
    public static List<string> LoadUrls(string path)
    {
        if (!File.Exists(path))
            return new List<string>();

        // CleanUrl drops everything from the first '#', so blank lines and "# comment" lines
        // both collapse to an empty string and are filtered out here.
        return File.ReadAllLines(path)
            .Select(CleanUrl)
            .Where(u => u.Length > 0)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>Writes the distinct URLs, sorted case-insensitively, one per line with a trailing newline.</summary>
    public static void SaveUrls(string path, IEnumerable<string> urls)
    {
        var sorted = urls
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(u => u, StringComparer.OrdinalIgnoreCase);
        WriteAtomically(path, string.Join(Environment.NewLine, sorted) + Environment.NewLine);
    }

    /// <summary>Strips the query string and fragment from a URL and trims whitespace; null becomes empty.</summary>
    public static string CleanUrl(string? url) =>
        (url ?? string.Empty).Split('?')[0].Split('#')[0].Trim();

    /// <summary>Writes to a sibling ".tmp" file and then moves it over the target, so the target is never half-written.</summary>
    private static void WriteAtomically(string path, string content)
    {
        var tmp = path + ".tmp";
        File.WriteAllText(tmp, content);
        File.Move(tmp, path, overwrite: true);
    }
}
@@ -0,0 +1,49 @@
namespace PrismaticSync.Infrastructure;
/// <summary>
/// Minimal timestamped logger — writes to the console and appends to a single log file (no
/// rotation) so an unattended (Task Scheduler) run leaves an audit trail. Intentionally
/// dependency-free.
/// </summary>
public static class Log
{
    private static string _logFile = "prismatic-sync.log";
    private static readonly object Gate = new();

    /// <summary>Sets the file that subsequent log lines are appended to.</summary>
    public static void Configure(string logFile) => _logFile = logFile;

    /// <summary>Logs an informational line (default console color).</summary>
    public static void Info(string message) => Write("INFO", message);

    /// <summary>Logs a warning line (yellow on the console).</summary>
    public static void Warn(string message) => Write("WARN", message);

    /// <summary>Logs an error line (red on the console).</summary>
    public static void Error(string message) => Write("ERROR", message);

    /// <summary>
    /// Formats one line as "[UTC timestamp] LEVEL message", prints it, and appends it to the log file.
    /// </summary>
    private static void Write(string level, string message)
    {
        // Format with the invariant culture: under the current culture the ':' specifier becomes
        // that culture's time separator (and the calendar may not be Gregorian), which would make
        // the audit trail vary with the workstation's regional settings.
        var stamp = DateTime.UtcNow.ToString(
            "yyyy-MM-ddTHH:mm:ssZ",
            System.Globalization.CultureInfo.InvariantCulture);
        var line = $"[{stamp}] {level,-5} {message}";

        // Live console stream (visible on a manual run); color-code so warnings/errors stand out.
        lock (Gate)
        {
            ConsoleColor? color = level switch
            {
                "WARN" => ConsoleColor.Yellow,
                "ERROR" => ConsoleColor.Red,
                _ => null
            };

            if (color is { } c)
            {
                var previous = Console.ForegroundColor;
                Console.ForegroundColor = c;
                try
                {
                    Console.WriteLine(line);
                }
                finally
                {
                    // Restore even if the write throws, so later output isn't left tinted.
                    Console.ForegroundColor = previous;
                }
            }
            else
            {
                Console.WriteLine(line);
            }

            // File trail — never let logging break a run.
            try
            {
                File.AppendAllText(_logFile, line + Environment.NewLine);
            }
            catch
            {
                // Deliberately ignored: a locked or unwritable log file must not abort the sync.
            }
        }
    }
}
@@ -0,0 +1,69 @@
namespace PrismaticSync.Infrastructure;
/// <summary>Strongly-typed config bound from the "Sync" section of appsettings.json.</summary>
public class SyncConfig
{
    /// <summary>Site root; a trailing slash is tolerated when building <see cref="ColorsUrl"/>.</summary>
    public string BaseUrl { get; set; } = "https://www.prismaticpowders.com";

    /// <summary>Path of the colors listing under <see cref="BaseUrl"/>; a leading slash is optional.</summary>
    public string ColorsPath { get; set; } = "/shop/powder-coating-colors";

    /// <summary>File holding the known product URLs.</summary>
    public string ProductUrlsFile { get; set; } = "product-urls.txt";

    /// <summary>File the scrape output JSON is written to.</summary>
    public string OutputJsonFile { get; set; } = "prismatic_powders.json";

    /// <summary>File that log lines are appended to.</summary>
    public string LogFile { get; set; } = "prismatic-sync.log";

    /// <summary>Politeness delay between product scrapes (randomized within the range).</summary>
    public int MinDelaySeconds { get; set; } = 6;

    /// <summary>Upper end of the randomized politeness delay, in seconds.</summary>
    public int MaxDelaySeconds { get; set; } = 14;

    /// <summary>On a 403/block, cool down this many seconds × the consecutive-block count, then retry.</summary>
    public int BlockedCooldownSeconds { get; set; } = 120;

    /// <summary>Upper bound on a single cooldown so escalation can't run away.</summary>
    public int BlockedCooldownMaxSeconds { get; set; } = 600;

    /// <summary>How many times to cool-down-and-retry a blocked product before recording it as an error.</summary>
    public int BlockedMaxRetries { get; set; } = 3;

    /// <summary>Take a longer rest after this many products (0 disables). Eases load and looks less robotic.</summary>
    public int LongRestEveryProducts { get; set; } = 150;

    /// <summary>Length of the periodic long rest, in seconds.</summary>
    public int LongRestSeconds { get; set; } = 45;

    /// <summary>Extra settle time after a product page loads before reading it.</summary>
    public int PageSettleSeconds { get; set; } = 4;

    /// <summary>Pause after each scroll while a listing lazy-loads more items.</summary>
    public int ScrollWaitMs { get; set; } = 1500;

    /// <summary>Hard cap on scrolls per listing, as a safety stop.</summary>
    public int MaxScrolls { get; set; } = 400;

    /// <summary>Full discovery: stop a listing after this many scrolls add no new links.</summary>
    public int StopAfterNoNewScrolls { get; set; } = 10;

    /// <summary>
    /// Incremental discovery: stop the newest-first listing after this many consecutive scrolls
    /// that surfaced only already-known URLs — i.e. we've scrolled past the new products.
    /// </summary>
    public int StopAfterKnownScrolls { get; set; } = 8;

    /// <summary>Color filter params used by full discovery.</summary>
    public string[] ColorParams { get; set; } = Array.Empty<string>();

    /// <summary>Settings for pushing the scraped catalog into the app.</summary>
    public ImportConfig Import { get; set; } = new();

    /// <summary>
    /// <see cref="BaseUrl"/> and <see cref="ColorsPath"/> joined with exactly one slash, so a
    /// configured path without a leading '/' (or a base with a trailing '/') still yields a valid
    /// URL. An empty path yields the base URL alone.
    /// </summary>
    public string ColorsUrl
    {
        get
        {
            // Null is tolerated in case configuration binding supplies it.
            var root = (BaseUrl ?? string.Empty).TrimEnd('/');
            var path = (ColorsPath ?? string.Empty).TrimStart('/');
            return path.Length == 0 ? root : $"{root}/{path}";
        }
    }
}
/// <summary>Destination and credentials for pushing the scraped catalog into the app.</summary>
public class ImportConfig
{
    /// <summary>
    /// Full URL of the app's token-authenticated catalog import endpoint. Empty by default.
    /// </summary>
    public string EndpointUrl { get; set; } = string.Empty;

    /// <summary>
    /// Shared secret sent in the X-Import-Token header. Must match the app's config. Empty by default.
    /// </summary>
    public string Token { get; set; } = string.Empty;

    /// <summary>
    /// Vendor name applied to every record on import.
    /// </summary>
    public string VendorName { get; set; } = "Prismatic Powders";
}