Add PrismaticSync console tool for unattended Prismatic catalog sync
Standalone .NET 8 console app (not part of the main solution) that scrapes the Prismatic Powders catalog via Playwright and pushes it into the app's catalog import. Prismatic has no API, so this runs on a workstation (Task Scheduler), never the deployed server. - Discovery: incremental newest-first via ?category=created_at (stops once it reaches already-known URLs — cheap, finds new colors) and a full all-colors crawl for occasional reconcile. - Scraper: resumable product-page scrape (sku/color/description/price tiers/SDS/TDS/app-guide/image), with --refresh-older-than to re-scrape stale products and catch price changes. Output matches the app import format so it flows through the same shared upsert as the Columbia sync. - Resilience: brisk randomized base delay, escalating 403 cooldown-and-retry to avoid hard bans, periodic rest. All configurable. - Visibility: streams every product + the inter-product wait to the console (colored) and a log file, with an up-front ETA. - Push: token-authenticated POST to the app import endpoint (skips to manual upload when unconfigured). The app-side token import endpoint is a separate follow-up. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
using System.Text.Json;
|
||||
using PrismaticSync.Models;
|
||||
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>
/// Loads and saves the scrape output (JSON) and the URL list (plain text, one URL per line).
/// Every save is written to a sibling <c>.tmp</c> file first and then moved over the target,
/// so a crash mid-save can't leave a half-written file in place.
/// </summary>
public static class JsonStore
{
    // Indented output keeps the saved JSON readable and diff-friendly.
    private static readonly JsonSerializerOptions WriteOptions = new() { WriteIndented = true };

    // Property names are matched case-insensitively when reading.
    private static readonly JsonSerializerOptions ReadOptions = new() { PropertyNameCaseInsensitive = true };

    /// <summary>Reads the scrape output stored at <paramref name="path"/>.</summary>
    /// <param name="path">Path of the JSON output file.</param>
    /// <returns>
    /// The deserialized output, or an empty <see cref="ScrapeOutput"/> when the file does not
    /// exist or deserializes to <c>null</c>. A file whose content starts with <c>[</c> is read
    /// as a bare list of records (older output format) and wrapped in a new output object.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The file could not be parsed. A copy of the file is attempted first; the message says
    /// where the copy went (or why it failed) and the original error is the inner exception.
    /// </exception>
    public static ScrapeOutput LoadOutput(string path)
    {
        if (!File.Exists(path))
        {
            return new ScrapeOutput();
        }

        var json = File.ReadAllText(path);

        try
        {
            // Tolerate a bare array (older output format) as well as { results, errors }.
            // The char overload compares ordinally; StartsWith(string) would be culture-sensitive.
            if (json.TrimStart().StartsWith('['))
            {
                var results = JsonSerializer.Deserialize<List<ProductRecord>>(json, ReadOptions) ?? new();
                return new ScrapeOutput { Results = results };
            }

            return JsonSerializer.Deserialize<ScrapeOutput>(json, ReadOptions) ?? new ScrapeOutput();
        }
        catch (Exception ex)
        {
            var backup = $"{path}.invalid-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}.bak";

            // The backup is best-effort: a failed copy must not hide the parse error,
            // which is what the caller actually needs to see.
            string backupNote;
            try
            {
                File.Copy(path, backup, overwrite: true);
                backupNote = $"Backed it up to {backup}.";
            }
            catch (Exception copyEx) when (copyEx is IOException or UnauthorizedAccessException)
            {
                backupNote = $"Could not back it up to {backup} ({copyEx.Message}).";
            }

            // Keep the original exception as InnerException so its type and stack trace survive.
            throw new InvalidOperationException($"Could not parse {path}. {backupNote} {ex.Message}", ex);
        }
    }

    /// <summary>Serializes <paramref name="data"/> as indented JSON and saves it to <paramref name="path"/>.</summary>
    /// <param name="path">Destination file; replaced if it already exists.</param>
    /// <param name="data">The scrape output to persist.</param>
    public static void SaveOutput(string path, ScrapeOutput data) =>
        WriteViaTempFile(path, JsonSerializer.Serialize(data, WriteOptions));

    /// <summary>Reads the URL list stored at <paramref name="path"/>, one URL per line.</summary>
    /// <param name="path">Path of the URL list file.</param>
    /// <returns>
    /// The cleaned (see <see cref="CleanUrl"/>), non-empty lines with case-insensitive duplicates
    /// removed; an empty list when the file does not exist.
    /// </returns>
    public static List<string> LoadUrls(string path)
    {
        if (!File.Exists(path))
        {
            return new List<string>();
        }

        // CleanUrl cuts each line at its first '#', so a "# comment" line comes back empty
        // and is dropped by the length check; no separate comment filter is needed.
        return File.ReadAllLines(path)
            .Select(CleanUrl)
            .Where(u => u.Length > 0)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Saves <paramref name="urls"/> to <paramref name="path"/>, one per line, with
    /// case-insensitive duplicates removed and sorted case-insensitively.
    /// </summary>
    /// <param name="path">Destination file; replaced if it already exists.</param>
    /// <param name="urls">URLs to persist. They are written as given (not passed through <see cref="CleanUrl"/>).</param>
    public static void SaveUrls(string path, IEnumerable<string> urls)
    {
        var sorted = urls
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(u => u, StringComparer.OrdinalIgnoreCase);

        // The trailing newline is always written, even for an empty list.
        WriteViaTempFile(path, string.Join(Environment.NewLine, sorted) + Environment.NewLine);
    }

    /// <summary>
    /// Strips the query string and fragment from <paramref name="url"/> and trims surrounding whitespace.
    /// </summary>
    /// <param name="url">The raw URL; <c>null</c> is treated as empty.</param>
    /// <returns>Everything before the first <c>?</c> or <c>#</c>, trimmed; never <c>null</c>.</returns>
    public static string CleanUrl(string? url)
    {
        if (url is null)
        {
            return string.Empty;
        }

        // Cutting at the first '?' or '#' gives the same result as splitting on '?' and then
        // on '#', without allocating the intermediate arrays.
        var cut = url.IndexOfAny(['?', '#']);
        return (cut < 0 ? url : url[..cut]).Trim();
    }

    /// <summary>Writes <paramref name="contents"/> to <c>path + ".tmp"</c>, then moves that file over <paramref name="path"/>.</summary>
    private static void WriteViaTempFile(string path, string contents)
    {
        var tmp = path + ".tmp";
        File.WriteAllText(tmp, contents);
        File.Move(tmp, path, overwrite: true);
    }
}
|
||||
Reference in New Issue
Block a user