Add PrismaticSync console tool for unattended Prismatic catalog sync

Standalone .NET 8 console app (not part of the main solution) that scrapes the
Prismatic Powders catalog via Playwright and pushes it into the app's catalog
import. Prismatic has no API, so this runs on a workstation (Task Scheduler),
never the deployed server.

- Discovery: incremental newest-first via ?category=created_at (stops once it
  reaches already-known URLs — cheap, finds new colors) and a full all-colors
  crawl for occasional reconcile.
- Scraper: resumable product-page scrape (sku/color/description/price tiers/
  SDS/TDS/app-guide/image), with --refresh-older-than to re-scrape stale
  products and catch price changes. Output matches the app import format so it
  flows through the same shared upsert as the Columbia sync.
- Resilience: brisk randomized base delay, escalating 403 cooldown-and-retry to
  avoid hard bans, periodic rest. All configurable.
- Visibility: streams every product + the inter-product wait to the console
  (colored) and a log file, with an up-front ETA.
- Push: token-authenticated POST to the app import endpoint (skips to manual
  upload when unconfigured).

The app-side token import endpoint is a separate follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-18 11:30:47 -04:00
parent f752abad86
commit c59d55529f
13 changed files with 1037 additions and 0 deletions
@@ -0,0 +1,138 @@
using System.Text.RegularExpressions;
using Microsoft.Playwright;
using PrismaticSync.Infrastructure;
namespace PrismaticSync.Services;
/// <summary>
/// Discovers product URLs from the Prismatic color listing (infinite-scroll). Two modes:
/// incremental (newest-first via <c>?category=created_at</c>, stop once we reach already-known
/// URLs) for cheap frequent runs, and full (every color filter to the bottom) for occasional
/// reconciliation. Both modes write the merged URL set back to the URL list file after every
/// scroll, so progress made before an interruption is kept.
/// </summary>
public class PrismaticDiscoverer
{
    /// <summary>Upper bound, in milliseconds, for a listing-page navigation.</summary>
    private const int NavigationTimeoutMs = 60000;

    /// <summary>Pause, in milliseconds, after finishing one color filter in full discovery.</summary>
    private const int PauseBetweenColorsMs = 3000;

    /// <summary>Vertical wheel delta used for each scroll step on the listing page.</summary>
    private const int ScrollWheelDeltaY = 2500;

    /// <summary>
    /// Page-side script returning the <c>href</c> of every anchor. Only non-empty string values
    /// are kept: an SVG <c>&lt;a&gt;</c> exposes <c>href</c> as an SVGAnimatedString object, which
    /// is truthy but would not map cleanly onto a <c>string[]</c> result.
    /// </summary>
    private const string CollectHrefsScript =
        "els => els.map(a => a.href).filter(h => typeof h === 'string' && h.length > 0)";

    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;

    /// <summary>Creates a discoverer that drives <paramref name="session"/>'s page.</summary>
    /// <param name="session">Browser session whose <c>Page</c> is navigated and scrolled.</param>
    /// <param name="config">Sync settings (listing URL, scroll limits, waits, URL list file).</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public PrismaticDiscoverer(BrowserSession session, SyncConfig config)
    {
        // Fail at construction rather than with a NullReferenceException in the middle of a crawl.
        ArgumentNullException.ThrowIfNull(session);
        ArgumentNullException.ThrowIfNull(config);

        _session = session;
        _config = config;
    }

    /// <summary>
    /// Incremental discovery: crawl the newest-first listing and stop once a run of consecutive
    /// scrolls surfaces only already-known URLs — meaning we've scrolled past the new products.
    /// </summary>
    /// <returns>The number of URLs found that were not already in the URL list file.</returns>
    public async Task<int> DiscoverNewAsync()
    {
        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Incremental discovery (newest first). Known URLs: {startCount}");

        await GotoAsync($"{_config.ColorsUrl}?category=created_at");

        var knownStreak = 0;
        for (var i = 0; i < _config.MaxScrolls; i++)
        {
            var addedNew = await HarvestVisibleLinksAsync(known);

            // Any new URL resets the streak; only consecutive "nothing new" scrolls count.
            knownStreak = addedNew == 0 ? knownStreak + 1 : 0;
            Log.Info($"Scroll {i + 1}: +{addedNew} new, total {known.Count}, known-streak {knownStreak}");

            if (knownStreak >= _config.StopAfterKnownScrolls)
            {
                Log.Info("Reached known territory — stopping incremental discovery.");
                break;
            }

            await ScrollAsync();
        }

        var newCount = known.Count - startCount;
        Log.Info($"Incremental discovery done. New URLs: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>
    /// Full discovery: crawl every color filter to the bottom. Heavier — use occasionally to
    /// reconcile the whole set (e.g. to notice colors that have been removed). A failure on one
    /// color filter is logged as a warning and the crawl continues with the next filter.
    /// </summary>
    /// <returns>The number of URLs found that were not already in the URL list file.</returns>
    public async Task<int> DiscoverFullAsync()
    {
        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Full discovery across {_config.ColorParams.Length} color filters. Known URLs: {startCount}");

        foreach (var color in _config.ColorParams)
        {
            Log.Info($"Color filter: {color}");
            try
            {
                await GotoAsync($"{_config.ColorsUrl}?color={Uri.EscapeDataString(color)}");

                var noNew = 0;
                for (var i = 0; i < _config.MaxScrolls; i++)
                {
                    var added = await HarvestVisibleLinksAsync(known);

                    noNew = added == 0 ? noNew + 1 : 0;
                    if (noNew >= _config.StopAfterNoNewScrolls)
                    {
                        break;
                    }

                    await ScrollAsync();
                }

                Log.Info($"Color {color} done. Total {known.Count}");
                await _session.Page.WaitForTimeoutAsync(PauseBetweenColorsMs);
            }
            catch (Exception ex)
            {
                // Deliberate best-effort: one broken filter must not abort the remaining ones.
                Log.Warn($"Color {color} failed: {ex.Message}");
            }
        }

        var newCount = known.Count - startCount;
        Log.Info($"Full discovery done. New this run: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>Loads the persisted URL list into a case-insensitive set.</summary>
    private HashSet<string> LoadKnownUrls() =>
        new(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Adds the product links currently on the page to <paramref name="known"/>, writes the set
    /// to the URL list file, and returns how many of the links were new.
    /// </summary>
    private async Task<int> HarvestVisibleLinksAsync(HashSet<string> known)
    {
        var added = 0;
        foreach (var link in await CollectProductLinksAsync())
        {
            if (known.Add(link))
            {
                added++;
            }
        }

        // Saved after every scroll, including scrolls that added nothing.
        JsonStore.SaveUrls(_config.ProductUrlsFile, known);
        return added;
    }

    /// <summary>Navigates to <paramref name="url"/> and waits the configured settle time.</summary>
    private async Task GotoAsync(string url)
    {
        await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = NavigationTimeoutMs
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
    }

    /// <summary>Scrolls the page down one step and waits the configured scroll delay.</summary>
    private async Task ScrollAsync()
    {
        await _session.Page.Mouse.WheelAsync(0, ScrollWheelDeltaY);
        await _session.Page.WaitForTimeoutAsync(_config.ScrollWaitMs);
    }

    /// <summary>
    /// Returns the cleaned product URLs of all anchors currently on the page: hrefs matching the
    /// product-URL pattern, passed through <c>JsonStore.CleanUrl</c>, with empty results dropped.
    /// The list may contain duplicates; callers de-duplicate through their URL set.
    /// </summary>
    private async Task<List<string>> CollectProductLinksAsync()
    {
        var hrefs = await _session.Page.EvalOnSelectorAllAsync<string[]>("a", CollectHrefsScript);

        return hrefs
            .Where(h => ProductUrlRegex.IsMatch(h))
            .Select(JsonStore.CleanUrl)
            .Where(u => u.Length > 0)
            .ToList();
    }
}