Add PrismaticSync console tool for unattended Prismatic catalog sync

Standalone .NET 8 console app (not part of the main solution) that scrapes the
Prismatic Powders catalog via Playwright and pushes it into the app's catalog
import. Prismatic has no API, so this runs on a workstation (Task Scheduler),
never the deployed server.

- Discovery: incremental newest-first via ?category=created_at (stops once it
  reaches already-known URLs — cheap, finds new colors) and a full all-colors
  crawl for occasional reconcile.
- Scraper: resumable product-page scrape (sku/color/description/price tiers/
  SDS/TDS/app-guide/image), with --refresh-older-than to re-scrape stale
  products and catch price changes. Output matches the app import format so it
  flows through the same shared upsert as the Columbia sync.
- Resilience: brisk randomized base delay, escalating 403 cooldown-and-retry to
  avoid hard bans, periodic rest. All configurable.
- Visibility: streams every product + the inter-product wait to the console
  (colored) and a log file, with an up-front ETA.
- Push: token-authenticated POST to the app import endpoint (skips to manual
  upload when unconfigured).

The app-side token import endpoint is a separate follow-up.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-18 11:30:47 -04:00
parent f752abad86
commit c59d55529f
13 changed files with 1037 additions and 0 deletions
@@ -0,0 +1,295 @@
using System.Diagnostics;
using System.Globalization;
using System.Text.RegularExpressions;
using Microsoft.Playwright;
using PrismaticSync.Infrastructure;
using PrismaticSync.Models;
namespace PrismaticSync.Services;
/// <summary>
/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s. Resumable (skips
/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
/// records get re-scraped to catch price changes. Saves after every product so a long run can be
/// stopped and resumed safely, and logs continuously — including the delay between products — so a
/// manual run always shows it's alive.
/// </summary>
public class PrismaticScraper
{
    /// <summary>Rough per-page navigation cost in seconds; only used for the up-front ETA.</summary>
    private const int EstimatedPageLoadSeconds = 2;

    /// <summary>
    /// Field separator (U+0001) the in-page script puts between a link's text and its href.
    /// Written as an escape so the control character is visible and survives editors/diffs.
    /// </summary>
    private const char LinkFieldSeparator = '\u0001';

    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex SkuRegex =
        new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex DescRegex =
        new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex PriceTierRegex =
        new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
    private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled);
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    // Page-title checks for error pages that may be served without a matching HTTP status.
    private static readonly Regex ForbiddenTitleRegex =
        new(@"^403 Forbidden$", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // Word boundaries keep a title that merely contains "404" inside a longer number
    // (e.g. "...4040...") from being treated as a not-found page.
    private static readonly Regex NotFoundTitleRegex =
        new(@"\b404\b|Page Not Found", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Image-source preferences, in the order GetSampleImageUrlAsync applies them.
    private static readonly Regex NicImageRegex =
        new(@"images\.nicindustries\.com", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex ThumbnailRegex =
        new("thumbnail", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex GenericImageRegex =
        new("prismatic|powder|color", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;
    private readonly Random _random = new();

    public PrismaticScraper(BrowserSession session, SyncConfig config)
    {
        _session = session;
        _config = config;
    }

    /// <summary>
    /// Scrapes products needing work: those not yet scraped, plus (when <paramref name="refreshOlderThanDays"/>
    /// &gt; 0) any whose data is older than that window. Returns (scraped, errors).
    /// </summary>
    /// <param name="refreshOlderThanDays">Re-scrape results older than this many days; 0 or less disables refreshing.</param>
    /// <param name="maxProducts">Upper bound on products handled this run; 0 or less means no limit.</param>
    /// <param name="retryErrors">When true, URLs that previously failed (and have no result) are attempted again.</param>
    /// <returns>The number of products saved and the number that ended in a recorded error.</returns>
    public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
    {
        // Distinct: a URL listed twice in the file must not be fetched twice in one run.
        var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
            .Where(u => ProductUrlRegex.IsMatch(u))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
        var data = JsonStore.LoadOutput(_config.OutputJsonFile);

        // Index existing results by URL (keep the most recent if the file has dupes).
        // NOTE(review): lookups below use the URL exactly as loaded, while the keys are passed
        // through JsonStore.CleanUrl — this presumes LoadUrls already yields cleaned URLs; confirm.
        var resultByUrl = data.Results
            .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
            .ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);
        var errorUrls = new HashSet<string>(
            data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);
        var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));

        var toScrape = new List<string>();
        foreach (var url in allUrls)
        {
            if (resultByUrl.TryGetValue(url, out var existing))
            {
                if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
                {
                    toScrape.Add(url); // stale → refresh for price changes
                }
            }
            else if (retryErrors || !errorUrls.Contains(url))
            {
                toScrape.Add(url); // never scraped (skip known errors unless retrying)
            }
        }

        if (maxProducts > 0)
        {
            toScrape = toScrape.Take(maxProducts).ToList();
        }

        var total = toScrape.Count;
        Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
        Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");
        if (total == 0)
        {
            Log.Info("Nothing to scrape. Done.");
            return (0, 0);
        }

        // ETA = per-product cost (average delay + settle + navigation) plus the periodic long rests.
        // A rest happens after every LongRestEveryProducts-th product except the last one.
        var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
        var restCount = _config.LongRestEveryProducts > 0 ? (total - 1) / _config.LongRestEveryProducts : 0;
        var etaSeconds = total * (avgDelaySec + _config.PageSettleSeconds + EstimatedPageLoadSeconds)
                         + (double)restCount * _config.LongRestSeconds;
        Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaSeconds / 60.0))} " +
                 $"(grab a coffee if that's a while — it saves after every product and is resumable).");

        var stopwatch = Stopwatch.StartNew();
        int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;
        foreach (var url in toScrape)
        {
            index++;

            // Retry loop for the current product; exits via break on success or on a recorded error.
            for (var attempt = 1; ; attempt++)
            {
                try
                {
                    var row = await ParseProductAsync(url, index, total);

                    // Replace the indexed record in place when it is still in the list; otherwise append.
                    var replaceAt = -1;
                    if (resultByUrl.TryGetValue(url, out var previous))
                    {
                        replaceAt = data.Results.IndexOf(previous);
                    }

                    if (replaceAt >= 0)
                    {
                        data.Results[replaceAt] = row;
                    }
                    else
                    {
                        data.Results.Add(row);
                    }

                    resultByUrl[url] = row;
                    // A success clears any earlier failure recorded for this URL.
                    data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
                    scraped++;
                    consecutiveBlocks = 0;
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);

                    var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
                    Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
                             $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
                    break;
                }
                catch (Exception ex) when (IsBlocked(ex) && attempt <= _config.BlockedMaxRetries)
                {
                    // Site pushed back — back off (escalating) and retry the SAME product rather
                    // than barreling on, which is how an unattended run gets hard-banned.
                    consecutiveBlocks++;
                    var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
                    Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
                    await Task.Delay(TimeSpan.FromSeconds(cooldown));
                }
                catch (Exception ex)
                {
                    // Keep at most one error entry per URL so repeated failing runs don't pile up duplicates.
                    data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
                    data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);
                    errors++;
                    Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
                    break;
                }
            }

            // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
            if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
            {
                Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
                await Task.Delay(TimeSpan.FromSeconds(_config.LongRestSeconds));
            }

            if (index < total)
            {
                var delayMs = RandomDelayMs();
                Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
                await Task.Delay(delayMs);
            }
        }

        Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
                 $"Took {FormatDuration(stopwatch.Elapsed)}.");
        return (scraped, errors);
    }

    /// <summary>
    /// Navigates to <paramref name="url"/>, waits the configured settle time and extracts one
    /// <see cref="ProductRecord"/> from the rendered page.
    /// </summary>
    /// <exception cref="SiteBlockedException">The response status is 403 or the page title is "403 Forbidden".</exception>
    /// <exception cref="InvalidOperationException">The page is a 404, or neither a SKU nor an h1 title was found.</exception>
    private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
    {
        Log.Info($"[{index}/{total}] Scraping {url}");
        var page = _session.Page;
        var response = await page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = 60000
        });
        await page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);

        // Decide block / not-found from the status and title before reading the rest of the page.
        var status = response?.Status ?? 0;
        var title = Clean(await SafeTextAsync(() => page.TitleAsync()));
        if (status == 403 || ForbiddenTitleRegex.IsMatch(title))
        {
            throw new SiteBlockedException("403 Forbidden returned by site.");
        }

        if (status == 404 || NotFoundTitleRegex.IsMatch(title))
        {
            throw new InvalidOperationException("404 Not Found returned by site.");
        }

        var plainText = Clean(await SafeTextAsync(() => page.Locator("body").InnerTextAsync()));
        var colorName = Clean(await SafeTextAsync(() => page.Locator("h1").First.InnerTextAsync()));
        var skuMatch = SkuRegex.Match(plainText);
        var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";
        if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
        {
            throw new InvalidOperationException("Could not find SKU or title on product page.");
        }

        var descMatch = DescRegex.Match(plainText);
        var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";

        return new ProductRecord
        {
            Sku = sku,
            ColorName = colorName,
            Description = description,
            PriceTiers = ParsePriceTiers(plainText),
            SafetyDataSheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" }),
            TechnicalDataSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" }),
            ApplicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" }),
            SampleImageUrl = await GetSampleImageUrlAsync(),
            ProductUrl = url,
            ScrapedAt = DateTime.UtcNow
        };
    }

    /// <summary>
    /// Extracts price tiers of the form "N - M lbs $P" or "N + lbs $P" from the page text.
    /// Open-ended ("N +") tiers get a null <c>Max</c>. Matches whose price or weight bounds
    /// cannot be parsed are skipped instead of failing the whole product.
    /// </summary>
    private static List<PriceTier> ParsePriceTiers(string text)
    {
        var tiers = new List<PriceTier>();
        foreach (Match m in PriceTierRegex.Matches(text))
        {
            // The price capture is greedy over digits and dots, so drop a sentence-ending dot.
            var priceText = m.Groups[2].Value.TrimEnd('.');
            if (!decimal.TryParse(priceText, NumberStyles.Any, CultureInfo.InvariantCulture, out var price))
            {
                continue;
            }

            // The tier pattern guarantees the range text is either "N - M" or "N +", never both.
            var rangeText = Clean(m.Groups[1].Value);
            int? min;
            int? max = null;
            var plus = PlusRegex.Match(rangeText);
            if (plus.Success)
            {
                if (!TryParseWeight(plus.Groups[1].Value, out var floor))
                {
                    continue;
                }

                min = floor;
            }
            else
            {
                var range = RangeRegex.Match(rangeText);
                if (!range.Success)
                {
                    continue;
                }

                if (!TryParseWeight(range.Groups[1].Value, out var lower) ||
                    !TryParseWeight(range.Groups[2].Value, out var upper))
                {
                    continue;
                }

                min = lower;
                max = upper;
            }

            tiers.Add(new PriceTier { Min = min, Max = max, Price = price });
        }

        return tiers;
    }

    /// <summary>Parses a plain digit run as a weight; false on overflow or non-ASCII digits.</summary>
    private static bool TryParseWeight(string digits, out int weight) =>
        int.TryParse(digits, NumberStyles.None, CultureInfo.InvariantCulture, out weight);

    /// <summary>Returns the href of the first link whose text matches any pattern (case-insensitive
    /// regex), or "" when none does. Uses a single eval returning "text + U+0001 + href" strings to
    /// avoid object deserialization quirks.</summary>
    private async Task<string> GetLinkByTextAsync(string[] patterns)
    {
        var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a",
            "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
            "+ String.fromCharCode(1) + (a.href || ''))");

        foreach (var entry in combined)
        {
            // Split on the first separator only: text before it, href after it.
            var parts = entry.Split(LinkFieldSeparator, 2);
            var text = parts.Length > 0 ? parts[0] : "";
            var href = parts.Length > 1 ? parts[1] : "";
            if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
            {
                return href;
            }
        }

        return "";
    }

    /// <summary>
    /// Picks a sample image URL from the page's img elements. Preference order: an
    /// images.nicindustries.com source that is not a thumbnail, then any images.nicindustries.com
    /// source, then any source mentioning prismatic/powder/color; "" when nothing matches.
    /// </summary>
    private async Task<string> GetSampleImageUrlAsync()
    {
        var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "img",
            "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
            ".filter(Boolean)");

        return srcs.FirstOrDefault(s => NicImageRegex.IsMatch(s) && !ThumbnailRegex.IsMatch(s))
            ?? srcs.FirstOrDefault(s => NicImageRegex.IsMatch(s))
            ?? srcs.FirstOrDefault(s => GenericImageRegex.IsMatch(s))
            ?? "";
    }

    /// <summary>
    /// True only for the block signal raised by <see cref="ParseProductAsync"/>. Matching on the
    /// exception type (not on "403" appearing somewhere in a message) keeps unrelated failures whose
    /// text happens to contain "403" — e.g. a URL or SKU — from triggering cooldown retries.
    /// </summary>
    private static bool IsBlocked(Exception ex) => ex is SiteBlockedException;

    /// <summary>Best-effort text read: returns "" when the page query throws.</summary>
    private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
    {
        try
        {
            return await fn();
        }
        catch
        {
            // Deliberate: a missing element or title is treated as empty text, not a failure.
            return "";
        }
    }

    /// <summary>Collapses every whitespace run to one space and trims; null becomes "".</summary>
    private static string Clean(string? text) => WhitespaceRegex.Replace(text ?? "", " ").Trim();

    /// <summary>
    /// Random inter-product delay in milliseconds, inclusive of both configured bounds.
    /// A negative minimum is treated as 0 and a maximum below the minimum is raised to it.
    /// </summary>
    private int RandomDelayMs()
    {
        var min = Math.Max(0, _config.MinDelaySeconds * 1000);
        var max = Math.Max(min, _config.MaxDelaySeconds * 1000);
        return _random.Next(min, max + 1);
    }

    /// <summary>Formats a duration as "Hh Mm", "Mm Ss" or "Ss" depending on its magnitude.</summary>
    private static string FormatDuration(TimeSpan t) =>
        t.TotalHours >= 1 ? $"{(int)t.TotalHours}h {t.Minutes}m" :
        t.TotalMinutes >= 1 ? $"{(int)t.TotalMinutes}m {t.Seconds}s" :
        $"{t.Seconds}s";

    /// <summary>Signals that the site refused the request (HTTP 403 or a "403 Forbidden" page).</summary>
    private sealed class SiteBlockedException : Exception
    {
        public SiteBlockedException(string message)
            : base(message)
        {
        }
    }
}