diff --git a/scripts/Prismatic Data Scraper/.gitignore b/scripts/Prismatic Data Scraper/.gitignore
new file mode 100644
index 0000000..657abbc
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/.gitignore
@@ -0,0 +1,8 @@
+# Build output
+bin/
+obj/
+
+# Transient scrape artifacts
+*.tmp
+*.invalid-*.bak
+prismatic-sync.log
diff --git a/scripts/Prismatic Data Scraper/Infrastructure/BrowserSession.cs b/scripts/Prismatic Data Scraper/Infrastructure/BrowserSession.cs
new file mode 100644
index 0000000..0443aa5
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Infrastructure/BrowserSession.cs
@@ -0,0 +1,68 @@
+using Microsoft.Playwright;
+
+namespace PrismaticSync.Infrastructure;
+
+/// <summary>
+/// A headless Chromium session with a realistic desktop fingerprint (UA, viewport, locale,
+/// timezone) — matching the original scraper's settings to look like a normal browser.
+/// </summary>
+public sealed class BrowserSession : IAsyncDisposable
+{
+    private IPlaywright? _pw;
+    private IBrowser? _browser;
+    private IBrowserContext? _context;
+
+    /// <summary>The page opened by <see cref="CreateAsync"/>; all navigation goes through it.</summary>
+    public IPage Page { get; private set; } = null!;
+
+    /// <summary>
+    /// Starts Playwright, launches Chromium and opens one page in a new browser context.
+    /// </summary>
+    /// <param name="headed">True to show the browser window; false to run headless.</param>
+    /// <returns>The started session. Dispose it to shut the browser down.</returns>
+    public static async Task<BrowserSession> CreateAsync(bool headed)
+    {
+        var session = new BrowserSession();
+        try
+        {
+            session._pw = await Playwright.CreateAsync();
+            session._browser = await session._pw.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
+            {
+                Headless = !headed
+            });
+            session._context = await session._browser.NewContextAsync(new BrowserNewContextOptions
+            {
+                UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
+                            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+                ViewportSize = new ViewportSize { Width = 1365, Height = 900 },
+                Locale = "en-US",
+                TimezoneId = "America/New_York"
+            });
+            session.Page = await session._context.NewPageAsync();
+            return session;
+        }
+        catch
+        {
+            // Startup failed part-way (e.g. Chromium not installed). Nobody else holds the
+            // session yet, so release whatever was started before surfacing the failure.
+            try { await session.DisposeAsync(); }
+            catch { /* keep the original startup exception */ }
+            throw;
+        }
+    }
+
+    /// <summary>Closes the context and browser, then disposes the Playwright instance.</summary>
+    public async ValueTask DisposeAsync()
+    {
+        try
+        {
+            if (_context is not null) await _context.CloseAsync();
+            if (_browser is not null) await _browser.CloseAsync();
+        }
+        finally
+        {
+            // Dispose Playwright even when closing the context or browser throws.
+            _pw?.Dispose();
+        }
+    }
+}
diff --git a/scripts/Prismatic Data Scraper/Infrastructure/JsonStore.cs b/scripts/Prismatic Data Scraper/Infrastructure/JsonStore.cs
new file mode 100644
index 0000000..5ee544b
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Infrastructure/JsonStore.cs
@@ -0,0 +1,105 @@
+using System.Text.Json;
+using PrismaticSync.Models;
+
+namespace PrismaticSync.Infrastructure;
+
+/// <summary>
+/// Loads/saves the scrape output and the URL list. Saves write a sibling <c>.tmp</c> file and then
+/// move it over the target, so a crash mid-save can't leave a half-written file behind.
+/// </summary>
+public static class JsonStore
+{
+    private static readonly JsonSerializerOptions WriteOptions = new() { WriteIndented = true };
+    private static readonly JsonSerializerOptions ReadOptions = new() { PropertyNameCaseInsensitive = true };
+
+    /// <summary>Reads the scrape output; a missing file yields an empty <see cref="ScrapeOutput"/>.</summary>
+    /// <exception cref="InvalidOperationException">
+    /// The file exists but could not be parsed. A timestamped <c>.invalid-*.bak</c> copy is attempted
+    /// first, and the parse failure is attached as the inner exception.
+    /// </exception>
+    public static ScrapeOutput LoadOutput(string path)
+    {
+        if (!File.Exists(path))
+            return new ScrapeOutput();
+
+        var json = File.ReadAllText(path);
+        try
+        {
+            // Tolerate a bare array (older output format) as well as { results, errors }.
+            if (json.TrimStart().StartsWith('['))
+            {
+                var results = JsonSerializer.Deserialize<List<ProductRecord>>(json, ReadOptions) ?? new();
+                return new ScrapeOutput { Results = results };
+            }
+
+            var output = JsonSerializer.Deserialize<ScrapeOutput>(json, ReadOptions) ?? new ScrapeOutput();
+            // An explicit "results": null / "errors": null would otherwise surface later as a
+            // NullReferenceException in callers that enumerate these lists.
+            output.Results ??= new List<ProductRecord>();
+            output.Errors ??= new List<ScrapeError>();
+            return output;
+        }
+        catch (Exception ex)
+        {
+            var backup = $"{path}.invalid-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}.bak";
+            string backupNote;
+            try
+            {
+                File.Copy(path, backup, overwrite: true);
+                backupNote = $"Backed it up to {backup}.";
+            }
+            catch (Exception copyEx)
+            {
+                // Don't let a failed backup hide the real problem (the unparseable file).
+                backupNote = $"Could not back it up to {backup} ({copyEx.Message}).";
+            }
+
+            throw new InvalidOperationException($"Could not parse {path}. {backupNote} {ex.Message}", ex);
+        }
+    }
+
+    /// <summary>Serializes the output as indented JSON to a <c>.tmp</c> file, then moves it over <paramref name="path"/>.</summary>
+    public static void SaveOutput(string path, ScrapeOutput data)
+    {
+        var tmp = path + ".tmp";
+        File.WriteAllText(tmp, JsonSerializer.Serialize(data, WriteOptions));
+        File.Move(tmp, path, overwrite: true);
+    }
+
+    /// <summary>
+    /// Reads the URL list: one URL per line, cleaned with <see cref="CleanUrl"/> and de-duplicated
+    /// case-insensitively. Blank lines and lines starting with <c>#</c> are skipped; a missing
+    /// file yields an empty list.
+    /// </summary>
+    public static List<string> LoadUrls(string path)
+    {
+        if (!File.Exists(path))
+            return new List<string>();
+
+        return File.ReadAllLines(path)
+            .Select(line => line.Trim())
+            // Drop comment lines before cleaning: CleanUrl strips everything from '#' onwards, so
+            // checking for a leading '#' after cleaning could never match.
+            .Where(line => !line.StartsWith('#'))
+            .Select(CleanUrl)
+            .Where(u => u.Length > 0)
+            .Distinct(StringComparer.OrdinalIgnoreCase)
+            .ToList();
+    }
+
+    /// <summary>
+    /// Writes the URLs one per line — de-duplicated and sorted, both case-insensitively — via a
+    /// <c>.tmp</c> file that is then moved over <paramref name="path"/>.
+    /// </summary>
+    public static void SaveUrls(string path, IEnumerable<string> urls)
+    {
+        var sorted = urls.Distinct(StringComparer.OrdinalIgnoreCase).OrderBy(u => u, StringComparer.OrdinalIgnoreCase);
+        var tmp = path + ".tmp";
+        File.WriteAllText(tmp, string.Join(Environment.NewLine, sorted) + Environment.NewLine);
+        File.Move(tmp, path, overwrite: true);
+    }
+
+    /// <summary>Strips the query string and fragment from a URL and trims whitespace; null becomes "".</summary>
+    public static string CleanUrl(string? url) =>
+        (url ?? string.Empty).Split('?')[0].Split('#')[0].Trim();
+}
diff --git a/scripts/Prismatic Data Scraper/Infrastructure/Log.cs b/scripts/Prismatic Data Scraper/Infrastructure/Log.cs
new file mode 100644
index 0000000..6a6df6b
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Infrastructure/Log.cs
@@ -0,0 +1,55 @@
+namespace PrismaticSync.Infrastructure;
+
+/// <summary>
+/// Minimal timestamped logger — writes to the console and appends to a log file so an
+/// unattended (Task Scheduler) run leaves an audit trail. Intentionally dependency-free.
+/// </summary>
+public static class Log
+{
+    private static string _logFile = "prismatic-sync.log";
+    private static readonly object Gate = new();
+
+    /// <summary>Sets the file that later log lines are appended to.</summary>
+    public static void Configure(string logFile) => _logFile = logFile;
+
+    public static void Info(string message) => Write("INFO", message);
+    public static void Warn(string message) => Write("WARN", message);
+    public static void Error(string message) => Write("ERROR", message);
+
+    private static void Write(string level, string message)
+    {
+        // Format with the invariant culture. Under the current culture ':' in a custom format is
+        // replaced by that culture's time separator and the year follows its calendar, so the
+        // ISO-style timestamp would differ between machines.
+        var timestamp = DateTime.UtcNow.ToString(
+            "yyyy-MM-ddTHH:mm:ssZ", System.Globalization.CultureInfo.InvariantCulture);
+        var line = $"[{timestamp}] {level,-5} {message}";
+
+        // Live console stream (visible on a manual run); color-code so warnings/errors stand out.
+        lock (Gate)
+        {
+            var color = level switch
+            {
+                "WARN" => ConsoleColor.Yellow,
+                "ERROR" => ConsoleColor.Red,
+                _ => (ConsoleColor?)null
+            };
+
+            if (color is { } c)
+            {
+                var previous = Console.ForegroundColor;
+                Console.ForegroundColor = c;
+                try { Console.WriteLine(line); }
+                finally { Console.ForegroundColor = previous; }
+            }
+            else
+            {
+                Console.WriteLine(line);
+            }
+
+            // File trail — never let logging break a run.
+            try { File.AppendAllText(_logFile, line + Environment.NewLine); }
+            catch { /* ignore */ }
+        }
+    }
+}
diff --git a/scripts/Prismatic Data Scraper/Infrastructure/SyncConfig.cs b/scripts/Prismatic Data Scraper/Infrastructure/SyncConfig.cs
new file mode 100644
index 0000000..04e3020
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Infrastructure/SyncConfig.cs
@@ -0,0 +1,69 @@
+namespace PrismaticSync.Infrastructure;
+
+/// <summary>Strongly-typed config bound from the "Sync" section of appsettings.json.</summary>
+public class SyncConfig
+{
+    public string BaseUrl { get; set; } = "https://www.prismaticpowders.com";
+    public string ColorsPath { get; set; } = "/shop/powder-coating-colors";
+
+    public string ProductUrlsFile { get; set; } = "product-urls.txt";
+    public string OutputJsonFile { get; set; } = "prismatic_powders.json";
+    public string LogFile { get; set; } = "prismatic-sync.log";
+
+    /// <summary>Politeness delay between product scrapes (randomized within the range).</summary>
+    public int MinDelaySeconds { get; set; } = 6;
+    public int MaxDelaySeconds { get; set; } = 14;
+
+    /// <summary>On a 403/block, cool down this many seconds × the consecutive-block count, then retry.</summary>
+    public int BlockedCooldownSeconds { get; set; } = 120;
+
+    /// <summary>Upper bound on a single cooldown so escalation can't run away.</summary>
+    public int BlockedCooldownMaxSeconds { get; set; } = 600;
+
+    /// <summary>How many times to cool-down-and-retry a blocked product before recording it as an error.</summary>
+    public int BlockedMaxRetries { get; set; } = 3;
+
+    /// <summary>Take a longer rest after this many products (0 disables). Eases load and looks less robotic.</summary>
+    public int LongRestEveryProducts { get; set; } = 150;
+
+    /// <summary>Length of the periodic long rest, in seconds.</summary>
+    public int LongRestSeconds { get; set; } = 45;
+
+    /// <summary>Extra settle time after a product page loads before reading it.</summary>
+    public int PageSettleSeconds { get; set; } = 4;
+
+    /// <summary>Pause after each scroll while a listing lazy-loads more items.</summary>
+    public int ScrollWaitMs { get; set; } = 1500;
+
+    /// <summary>Hard cap on scrolls per listing, as a safety stop.</summary>
+    public int MaxScrolls { get; set; } = 400;
+
+    /// <summary>Full discovery: stop a listing after this many scrolls add no new links.</summary>
+    public int StopAfterNoNewScrolls { get; set; } = 10;
+
+    /// <summary>
+    /// Incremental discovery: stop the newest-first listing after this many consecutive scrolls
+    /// that surfaced only already-known URLs — i.e. we've scrolled past the new products.
+    /// </summary>
+    public int StopAfterKnownScrolls { get; set; } = 8;
+
+    /// <summary>Color filter params used by full discovery.</summary>
+    public string[] ColorParams { get; set; } = Array.Empty<string>();
+
+    public ImportConfig Import { get; set; } = new();
+    /// <summary>BaseUrl joined to ColorsPath with exactly one slash, however either one is written.</summary>
+    public string ColorsUrl => $"{BaseUrl.TrimEnd('/')}/{ColorsPath.TrimStart('/')}";
+}
+
+/// <summary>Where and how to push the scraped catalog into the app.</summary>
+public class ImportConfig
+{
+    /// <summary>Full URL of the app's token-authenticated catalog import endpoint.</summary>
+    public string EndpointUrl { get; set; } = "";
+
+    /// <summary>Shared secret sent in the X-Import-Token header. Must match the app's config.</summary>
+    public string Token { get; set; } = "";
+
+    /// <summary>Vendor name applied to every record on import.</summary>
+    public string VendorName { get; set; } = "Prismatic Powders";
+}
diff --git a/scripts/Prismatic Data Scraper/Models/ScrapeModels.cs b/scripts/Prismatic Data Scraper/Models/ScrapeModels.cs
new file mode 100644
index 0000000..f30d5ef
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Models/ScrapeModels.cs
@@ -0,0 +1,45 @@
+using System.Text.Json.Serialization;
+
+namespace PrismaticSync.Models;
+
+/// <summary>
+/// On-disk scrape output. Shape matches the app's catalog import (a top-level "results" array of
+/// snake_case product records), so the JSON drops straight into the import endpoint. "errors" tracks
+/// failed URLs for resumable re-runs.
+/// </summary>
+public class ScrapeOutput
+{
+    [JsonPropertyName("results")] public List<ProductRecord> Results { get; set; } = new();
+    [JsonPropertyName("errors")] public List<ScrapeError> Errors { get; set; } = new();
+}
+
+/// <summary>One scraped product, in the import's expected field shape.</summary>
+public class ProductRecord
+{
+    [JsonPropertyName("sku")] public string Sku { get; set; } = "";
+    [JsonPropertyName("color_name")] public string ColorName { get; set; } = "";
+    [JsonPropertyName("description")] public string Description { get; set; } = "";
+    [JsonPropertyName("price_tiers")] public List<PriceTier> PriceTiers { get; set; } = new();
+    [JsonPropertyName("safety_data_sheet_url")] public string SafetyDataSheetUrl { get; set; } = "";
+    [JsonPropertyName("technical_data_sheet_url")] public string TechnicalDataSheetUrl { get; set; } = "";
+    [JsonPropertyName("application_guide_url")] public string ApplicationGuideUrl { get; set; } = "";
+    [JsonPropertyName("sample_image_url")] public string SampleImageUrl { get; set; } = "";
+    [JsonPropertyName("product_url")] public string ProductUrl { get; set; } = "";
+    [JsonPropertyName("scraped_at")] public DateTime ScrapedAt { get; set; }
+}
+
+/// <summary>A quantity-break price tier — {min, max, price}. max is null for an open-ended top tier.</summary>
+public class PriceTier
+{
+    [JsonPropertyName("min")] public int? Min { get; set; }
+    [JsonPropertyName("max")] public int? Max { get; set; }
+    [JsonPropertyName("price")] public decimal Price { get; set; }
+}
+
+/// <summary>A URL that failed to scrape, kept so resumable runs can skip or retry it.</summary>
+public class ScrapeError
+{
+    [JsonPropertyName("product_url")] public string ProductUrl { get; set; } = "";
+    [JsonPropertyName("error")] public string Error { get; set; } = "";
+    [JsonPropertyName("scraped_at")] public DateTime ScrapedAt { get; set; }
+}
diff --git a/scripts/Prismatic Data Scraper/PrismaticSync.csproj b/scripts/Prismatic Data Scraper/PrismaticSync.csproj
new file mode 100644
index 0000000..775228e
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/PrismaticSync.csproj
@@ -0,0 +1,36 @@
+
+
+
+
+    Exe
+    net8.0
+    enable
+    enable
+    PrismaticSync
+    PrismaticSync
+    true
+
+
+
+
+
+
+
+
+
+
+
+      PreserveNewest
+
+
+
+
diff --git a/scripts/Prismatic Data Scraper/Program.cs b/scripts/Prismatic Data Scraper/Program.cs
new file mode 100644
index 0000000..4c5b624
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Program.cs
@@ -0,0 +1,140 @@
+using Microsoft.Extensions.Configuration;
+using PrismaticSync.Infrastructure;
+using PrismaticSync.Services;
+
+// ── Load config ───────────────────────────────────────────────────────────────
+var configRoot = new ConfigurationBuilder()
+    .SetBasePath(AppContext.BaseDirectory)
+    .AddJsonFile("appsettings.json", optional: false)
+    .Build();
+
+var config = configRoot.GetSection("Sync").Get<SyncConfig>() ?? new SyncConfig();
+Log.Configure(config.LogFile);
+
+// ── Parse args ────────────────────────────────────────────────────────────────
+// The first argument is the command unless it is an option; with no command, default to "run".
+var command = args.Length > 0 && !args[0].StartsWith("--", StringComparison.Ordinal)
+    ? args[0].ToLowerInvariant()
+    : "run";
+var headed = HasFlag("--headed");
+var retryErrors = HasFlag("--retry-errors");
+var maxProducts = GetIntArg("--max-products", 0);
+// "run" refreshes products older than 30 days by default; explicit commands default to new-only.
+var refreshOlderThanDays = GetIntArg("--refresh-older-than", command == "run" ? 30 : 0);
+
+Log.Info($"PrismaticSync — command '{command}' (headed={headed}, refreshOlderThan={refreshOlderThanDays}d, maxProducts={maxProducts})");
+
+try
+{
+    switch (command)
+    {
+        case "discover-new":
+            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverNewAsync());
+            break;
+
+        case "discover-full":
+            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverFullAsync());
+            break;
+
+        case "scrape":
+            await WithBrowser(d => new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors));
+            break;
+
+        case "push":
+            if (!await PushCatalogAsync())
+                return 1;
+            break;
+
+        case "run":
+            // The scheduled default: find new colors, scrape new + stale, then push.
+            await WithBrowser(async d =>
+            {
+                await new PrismaticDiscoverer(d, config).DiscoverNewAsync();
+                await new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors);
+            });
+            if (!await PushCatalogAsync())
+                return 1;
+            break;
+
+        default:
+            PrintUsage();
+            return 1;
+    }
+
+    Log.Info("Done.");
+    return 0;
+}
+catch (Exception ex)
+{
+    Log.Error($"Fatal: {ex}");
+    return 1;
+}
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+// Runs an action against a fresh browser session and always disposes the session afterwards.
+async Task WithBrowser(Func<BrowserSession, Task> action)
+{
+    await using var session = await BrowserSession.CreateAsync(headed);
+    await action(session);
+}
+
+// Pushes the catalog and reports whether the process may exit with success. PushAsync also returns
+// false when no endpoint is configured; that is a documented skip (manual upload), not a failure,
+// so only a push that was actually expected to happen turns into a non-zero exit code.
+async Task<bool> PushCatalogAsync()
+{
+    if (await new CatalogPusher(config).PushAsync())
+        return true;
+
+    if (string.IsNullOrWhiteSpace(config.Import.EndpointUrl))
+        return true;
+
+    Log.Error("Push did not complete — exiting with a failure code so the scheduler flags this run.");
+    return false;
+}
+
+// True when the flag (e.g. "--headed") was passed; case-insensitive, like GetIntArg.
+bool HasFlag(string name) => args.Contains(name, StringComparer.OrdinalIgnoreCase);
+
+// Reads "--name=N". Returns the fallback when the option is absent or N is not an integer.
+int GetIntArg(string name, int fallback)
+{
+    var prefix = name + "=";
+    var found = args.FirstOrDefault(a => a.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));
+    if (found is null)
+        return fallback;
+
+    if (int.TryParse(found[prefix.Length..], out var value))
+        return value;
+
+    // Say so instead of silently running with the default (e.g. an uncapped --max-products).
+    Log.Warn($"Ignoring '{found}': expected an integer after '{prefix}'. Using {fallback}.");
+    return fallback;
+}
+
+void PrintUsage()
+{
+    Console.WriteLine(
+        """
+        PrismaticSync — scrape Prismatic Powders and push to the app catalog.
+
+        Usage: PrismaticSync [command] [options]
+
+        Commands:
+          run (default)   discover-new + scrape (new + stale) + push
+          discover-new    Incremental discovery via newest-first sort (cheap; finds new colors)
+          discover-full   Full discovery across all color filters (heavy; reconciles the whole set)
+          scrape          Scrape product pages from the URL list (resumable)
+          push            Push the scraped JSON to the import endpoint
+
+        Options:
+          --refresh-older-than=N   Re-scrape products whose data is older than N days (default 30 for 'run')
+          --max-products=N         Cap products scraped this run (0 = no cap)
+          --retry-errors           Retry URLs previously recorded as errors
+          --headed                 Show the browser window (debugging)
+
+        Config: appsettings.json (delays, file paths, import endpoint + token).
+        First run on a new machine: dotnet build, then `pwsh bin/Debug/net8.0/playwright.ps1 install chromium`.
+        """);
+}
diff --git a/scripts/Prismatic Data Scraper/README.md b/scripts/Prismatic Data Scraper/README.md
new file mode 100644
index 0000000..52d9d38
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/README.md
@@ -0,0 +1,82 @@
+# PrismaticSync
+
+A standalone .NET console tool that scrapes the Prismatic Powders catalog and pushes it into the
+Powder Coating Logix catalog import endpoint. It exists because Prismatic has **no API** (unlike
+Columbia Coatings) — so the data has to be scraped via browser automation.
+
+> **Runs on a workstation you control — never on the deployed app server.** Scraping from the cloud
+> app's IP would get blocked and isn't appropriate. This tool is deliberately *not* part of
+> `PowderCoating.sln`; build and run it independently.
+ +## First-time setup (per machine) + +```powershell +cd "scripts/Prismatic Data Scraper" +dotnet build +pwsh bin/Debug/net8.0/playwright.ps1 install chromium # one-time browser download +``` + +## Commands + +```powershell +dotnet run -- run # default: discover-new + scrape (new + stale >30d) + push +dotnet run -- discover-new # cheap: find newly-added colors (newest-first, stops at known) +dotnet run -- discover-full # heavy: crawl all color filters (reconcile whole set / removals) +dotnet run -- scrape # scrape product pages from product-urls.txt (resumable) +dotnet run -- scrape --refresh-older-than=30 # also re-scrape products older than 30 days (price changes) +dotnet run -- push # push prismatic_powders.json to the import endpoint +``` + +Options: `--max-products=N`, `--retry-errors`, `--headed` (show the browser for debugging). + +Everything streams to the console live (warnings/errors in color) **and** to `prismatic-sync.log`. + +## Operating model (suggested cadence) + +| Run | Command | Cadence | Why | +|-----|---------|---------|-----| +| Find new colors | `run --refresh-older-than=0` (discover-new + scrape-new + push; plain `run` also re-scrapes products older than 30 days) | Weekly | Cheap; Prismatic adds colors often | +| Price refresh | `scrape --refresh-older-than=30` then `push` | Monthly | Re-scrapes stale products to catch price changes (slow, ~hours) | +| Full reconcile | `discover-full` then `scrape` | Quarterly | Catches removed/discontinued colors | + +A full scrape of ~5,000 products takes hours (polite delays). It saves after every product and is +fully resumable, so stop/restart any time. + +## Politeness / anti-block + +Configurable in `appsettings.json`: randomized 6–14s base delay, an escalating **cooldown + retry on +403** (so a temporary block doesn't get you hard-banned mid-run), and a periodic long rest. Leave +these conservative — getting blocked is worse than being slow, and Prismatic is a partner. + +## Pushing into the app + +Set `Sync.Import.EndpointUrl` + `Sync.Import.Token` in `appsettings.json`. 
The tool POSTs the JSON
+with an `X-Import-Token` header to the app's token-authenticated import endpoint, which runs it
+through the same upsert as the Columbia sync. If the endpoint isn't configured, `push` is skipped and
+you upload `prismatic_powders.json` manually via the Powder Catalog admin page.
+
+> **App-side dependency:** the token-authenticated import endpoint must exist in the web app for
+> unattended push to work. Until then, use the manual upload.
+
+## Scheduling (Windows Task Scheduler)
+
+Point a scheduled task at the published exe (or `dotnet run`). Example weekly task command:
+
+```
+Program/script: C:\Tools\PrismaticSync\PrismaticSync.exe
+Arguments:      run
+Start in:       C:\Tools\PrismaticSync
+```
+
+Publish a framework-dependent build (the workstation needs the .NET 8 runtime installed) to drop there:
+
+```powershell
+dotnet publish -c Release -r win-x64 --self-contained false -o C:\Tools\PrismaticSync
+pwsh C:\Tools\PrismaticSync\playwright.ps1 install chromium
+```
+
+## The long game
+
+This is the interim path. The durable endgame is a real Prismatic **API** (the partnership), at which
+point this tool is replaced by a clean in-app sync like Columbia's — reusing the same upsert,
+propagation, and discontinued handling.
diff --git a/scripts/Prismatic Data Scraper/Services/CatalogPusher.cs b/scripts/Prismatic Data Scraper/Services/CatalogPusher.cs
new file mode 100644
index 0000000..d16b708
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Services/CatalogPusher.cs
@@ -0,0 +1,72 @@
+using System.Text;
+using PrismaticSync.Infrastructure;
+
+namespace PrismaticSync.Services;
+
+/// <summary>
+/// Pushes the scraped JSON to the app's token-authenticated catalog import endpoint. When no
+/// endpoint is configured it no-ops (the JSON is still on disk for a manual upload), so the tool is
+/// useful before the endpoint exists.
+/// </summary>
+public class CatalogPusher
+{
+    private readonly SyncConfig _config;
+
+    public CatalogPusher(SyncConfig config) => _config = config;
+
+    /// <summary>POSTs the scraped JSON file to the configured import endpoint.</summary>
+    /// <returns>
+    /// True when the endpoint answered with a success status. False when the push was skipped (no
+    /// endpoint configured, or no output file) or failed; failures are logged rather than thrown.
+    /// </returns>
+    public async Task<bool> PushAsync()
+    {
+        if (string.IsNullOrWhiteSpace(_config.Import.EndpointUrl))
+        {
+            Log.Warn($"No import endpoint configured (Sync.Import.EndpointUrl) — skipping push. " +
+                     $"Upload {_config.OutputJsonFile} manually via the Powder Catalog admin instead.");
+            return false;
+        }
+
+        if (!File.Exists(_config.OutputJsonFile))
+        {
+            Log.Warn($"Output file {_config.OutputJsonFile} not found — nothing to push.");
+            return false;
+        }
+
+        // Everything that can throw — reading the file, building the request (a malformed endpoint
+        // URL or header value throws before anything is sent) and the HTTP call — sits inside the
+        // try, so this method keeps its "log and return false" contract.
+        try
+        {
+            var json = await File.ReadAllTextAsync(_config.OutputJsonFile);
+            Log.Info($"Pushing {_config.OutputJsonFile} to {_config.Import.EndpointUrl} (vendor: {_config.Import.VendorName})...");
+
+            using var http = new HttpClient { Timeout = TimeSpan.FromMinutes(5) };
+            using var request = new HttpRequestMessage(HttpMethod.Post, _config.Import.EndpointUrl);
+            request.Headers.Add("X-Import-Token", _config.Import.Token);
+            request.Headers.Add("X-Vendor-Name", _config.Import.VendorName);
+            request.Content = new StringContent(json, Encoding.UTF8, "application/json");
+
+            using var response = await http.SendAsync(request);
+            var body = await response.Content.ReadAsStringAsync();
+
+            if (response.IsSuccessStatusCode)
+            {
+                Log.Info($"Push succeeded ({(int)response.StatusCode}): {Trim(body)}");
+                return true;
+            }
+
+            Log.Error($"Push failed ({(int)response.StatusCode}): {Trim(body)}");
+            return false;
+        }
+        catch (Exception ex)
+        {
+            Log.Error($"Push error: {ex.Message}");
+            return false;
+        }
+    }
+
+    /// <summary>Caps a response body at 500 characters (plus an ellipsis) for logging.</summary>
+    private static string Trim(string s) => s.Length > 500 ? s[..500] + "…" : s;
+}
diff --git a/scripts/Prismatic Data Scraper/Services/PrismaticDiscoverer.cs b/scripts/Prismatic Data Scraper/Services/PrismaticDiscoverer.cs
new file mode 100644
index 0000000..12e86e0
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Services/PrismaticDiscoverer.cs
@@ -0,0 +1,147 @@
+using System.Text.RegularExpressions;
+using Microsoft.Playwright;
+using PrismaticSync.Infrastructure;
+
+namespace PrismaticSync.Services;
+
+/// <summary>
+/// Discovers product URLs from the Prismatic color listing (infinite-scroll). Two modes:
+/// incremental (newest-first via ?category=created_at, stop once we reach already-known
+/// URLs) for cheap frequent runs, and full (every color filter to the bottom) for occasional
+/// reconciliation. Both append to the URL list file.
+/// </summary>
+public class PrismaticDiscoverer
+{
+    private static readonly Regex ProductUrlRegex =
+        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);
+
+    private readonly BrowserSession _session;
+    private readonly SyncConfig _config;
+
+    public PrismaticDiscoverer(BrowserSession session, SyncConfig config)
+    {
+        _session = session;
+        _config = config;
+    }
+
+    /// <summary>
+    /// Incremental discovery: crawl the newest-first listing and stop once a run of consecutive
+    /// scrolls surfaces only already-known URLs — meaning we've scrolled past the new products.
+    /// </summary>
+    /// <returns>The count of newly found URLs.</returns>
+    public async Task<int> DiscoverNewAsync()
+    {
+        var known = new HashSet<string>(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);
+        var startCount = known.Count;
+        Log.Info($"Incremental discovery (newest first). Known URLs: {startCount}");
+
+        await GotoAsync($"{_config.ColorsUrl}?category=created_at");
+
+        if (await CrawlListingAsync(known, _config.StopAfterKnownScrolls, logEachScroll: true))
+            Log.Info("Reached known territory — stopping incremental discovery.");
+
+        var newCount = known.Count - startCount;
+        Log.Info($"Incremental discovery done. New URLs: {newCount}; total {known.Count}");
+        return newCount;
+    }
+
+    /// <summary>
+    /// Full discovery: crawl every color filter to the bottom. Heavier — use occasionally to
+    /// reconcile the whole set (e.g. to notice colors that have been removed). A filter that
+    /// fails is logged and skipped so the remaining filters still run.
+    /// </summary>
+    /// <returns>The count of newly found URLs.</returns>
+    public async Task<int> DiscoverFullAsync()
+    {
+        var known = new HashSet<string>(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);
+        var startCount = known.Count;
+        Log.Info($"Full discovery across {_config.ColorParams.Length} color filters. Known URLs: {startCount}");
+
+        if (_config.ColorParams.Length == 0)
+            Log.Warn("No color filters configured (Sync.ColorParams) — full discovery has nothing to crawl.");
+
+        foreach (var color in _config.ColorParams)
+        {
+            Log.Info($"Color filter: {color}");
+            try
+            {
+                await GotoAsync($"{_config.ColorsUrl}?color={Uri.EscapeDataString(color)}");
+                await CrawlListingAsync(known, _config.StopAfterNoNewScrolls, logEachScroll: false);
+
+                Log.Info($"Color {color} done. Total {known.Count}");
+                await _session.Page.WaitForTimeoutAsync(3000);
+            }
+            catch (Exception ex)
+            {
+                Log.Warn($"Color {color} failed: {ex.Message}");
+            }
+        }
+
+        var newCount = known.Count - startCount;
+        Log.Info($"Full discovery done. New this run: {newCount}; total {known.Count}");
+        return newCount;
+    }
+
+    /// <summary>
+    /// Scrolls the currently loaded listing, adding every product link it sees to
+    /// <paramref name="known"/>, until <paramref name="stopAfterIdleScrolls"/> consecutive passes
+    /// add nothing new or the MaxScrolls safety cap is hit. The URL file is rewritten only
+    /// after a pass that actually found something new.
+    /// </summary>
+    /// <returns>True when it stopped on the idle streak; false when it ran into the scroll cap.</returns>
+    private async Task<bool> CrawlListingAsync(HashSet<string> known, int stopAfterIdleScrolls, bool logEachScroll)
+    {
+        var idleStreak = 0;
+        for (var i = 0; i < _config.MaxScrolls; i++)
+        {
+            var added = 0;
+            foreach (var link in await CollectProductLinksAsync())
+                if (known.Add(link)) added++;
+
+            if (added > 0)
+                JsonStore.SaveUrls(_config.ProductUrlsFile, known);
+
+            idleStreak = added == 0 ? idleStreak + 1 : 0;
+            if (logEachScroll)
+                Log.Info($"Scroll {i + 1}: +{added} new, total {known.Count}, known-streak {idleStreak}");
+
+            if (idleStreak >= stopAfterIdleScrolls)
+                return true;
+
+            await ScrollAsync();
+        }
+
+        return false;
+    }
+
+    /// <summary>Navigates the session page to the URL, then waits the configured settle time.</summary>
+    private async Task GotoAsync(string url)
+    {
+        await _session.Page.GotoAsync(url, new PageGotoOptions
+        {
+            WaitUntil = WaitUntilState.DOMContentLoaded,
+            Timeout = 60000
+        });
+        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
+    }
+
+    /// <summary>Scrolls down by one mouse-wheel step, then waits the configured scroll pause.</summary>
+    private async Task ScrollAsync()
+    {
+        await _session.Page.Mouse.WheelAsync(0, 2500);
+        await _session.Page.WaitForTimeoutAsync(_config.ScrollWaitMs);
+    }
+
+    /// <summary>Returns the cleaned href of every anchor on the page that matches the product URL pattern.</summary>
+    private async Task<List<string>> CollectProductLinksAsync()
+    {
+        var hrefs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
+            "a", "els => els.map(a => a.href).filter(Boolean)");
+
+        return hrefs
+            .Where(h => ProductUrlRegex.IsMatch(h))
+            .Select(JsonStore.CleanUrl)
+            .Where(u => u.Length > 0)
+            .ToList();
+    }
+}
diff --git a/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs
new file mode 100644
index 0000000..38b24d9
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs
@@ -0,0 +1,295 @@
+using System.Diagnostics;
+using System.Globalization;
+using System.Text.RegularExpressions;
+using Microsoft.Playwright;
+using PrismaticSync.Infrastructure;
+using PrismaticSync.Models;
+
+namespace PrismaticSync.Services;
+
+///
+/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s. Resumable (skips
+/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
+/// records get re-scraped to catch price changes. Saves after every product so a long run can be
+/// stopped and resumed safely, and logs continuously — including the delay between products — so a
+/// manual run always shows it's alive.
+/// +public class PrismaticScraper +{ + private static readonly Regex ProductUrlRegex = + new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled); + private static readonly Regex SkuRegex = + new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); + private static readonly Regex DescRegex = + new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); + private static readonly Regex PriceTierRegex = + new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); + private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled); + private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled); + private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled); + + private readonly BrowserSession _session; + private readonly SyncConfig _config; + private readonly Random _random = new(); + + public PrismaticScraper(BrowserSession session, SyncConfig config) + { + _session = session; + _config = config; + } + + /// + /// Scrapes products needing work: those not yet scraped, plus (when + /// > 0) any whose data is older than that window. Returns (scraped, errors). + /// + public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors) + { + var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile) + .Where(u => ProductUrlRegex.IsMatch(u)) + .ToList(); + + var data = JsonStore.LoadOutput(_config.OutputJsonFile); + + // Index existing results by URL (keep the most recent if the file has dupes). 
+        var resultByUrl = data.Results
+            .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
+            .ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);
+
+        var errorUrls = new HashSet<string>(
+            data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);
+
+        var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));
+
+        var toScrape = new List<string>();
+        foreach (var url in allUrls)
+        {
+            if (resultByUrl.TryGetValue(url, out var existing))
+            {
+                if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
+                    toScrape.Add(url); // stale → refresh for price changes
+            }
+            else
+            {
+                if (retryErrors || !errorUrls.Contains(url))
+                    toScrape.Add(url); // never scraped (skip known errors unless retrying)
+            }
+        }
+
+        if (maxProducts > 0)
+            toScrape = toScrape.Take(maxProducts).ToList();
+
+        var total = toScrape.Count;
+        Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
+        Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");
+
+        if (total == 0)
+        {
+            Log.Info("Nothing to scrape. Done.");
+            return (0, 0);
+        }
+
+        var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
+        var etaMinutes = total * (avgDelaySec + _config.PageSettleSeconds + 2) / 60.0;
+        Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
+                 $"(grab a coffee if that's a while — it saves after every product and is resumable).");
+
+        var stopwatch = Stopwatch.StartNew();
+        int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;
+
+        foreach (var url in toScrape)
+        {
+            index++;
+
+            for (var attempt = 1; ; attempt++)
+            {
+                try
+                {
+                    var row = await ParseProductAsync(url, index, total);
+
+                    if (resultByUrl.TryGetValue(url, out var existing))
+                        data.Results[data.Results.IndexOf(existing)] = row;
+                    else
+                        data.Results.Add(row);
+
+                    resultByUrl[url] = row;
+                    data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
+
+                    scraped++;
+                    consecutiveBlocks = 0;
+                    JsonStore.SaveOutput(_config.OutputJsonFile, data);
+
+                    var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
+                    Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
+                             $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
+                    break;
+                }
+                catch (Exception ex) when (IsBlocked(ex) && attempt <= _config.BlockedMaxRetries)
+                {
+                    // Site pushed back — back off (escalating) and retry the SAME product rather
+                    // than barreling on, which is how an unattended run gets hard-banned.
+                    consecutiveBlocks++;
+                    var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
+                    Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
+                    await Task.Delay(cooldown * 1000);
+                }
+                catch (Exception ex)
+                {
+                    data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
+                    JsonStore.SaveOutput(_config.OutputJsonFile, data);
+                    errors++;
+                    Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
+                    break;
+                }
+            }
+
+            // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
+            if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
+            {
+                Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
+                await Task.Delay(_config.LongRestSeconds * 1000);
+            }
+
+            if (index < total)
+            {
+                var delayMs = RandomDelayMs();
+                Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
+                await Task.Delay(delayMs);
+            }
+        }
+
+        Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
+                 $"Took {FormatDuration(stopwatch.Elapsed)}.");
+        return (scraped, errors);
+    }
+
+    /// <summary>
+    /// Navigates the session page to <paramref name="url"/>, waits for it to settle, and builds one
+    /// product record from the page's visible text, links and images.
+    /// </summary>
+    /// <param name="url">Product page URL; also stored on the returned record as ProductUrl.</param>
+    /// <param name="index">Position of this product in the current run; used only in the log prefix.</param>
+    /// <param name="total">Number of products in the current run; used only in the log prefix.</param>
+    /// <returns>The parsed record, stamped with the current UTC time.</returns>
+    /// <exception cref="Exception">
+    /// Thrown when the response status or page title indicates 403 or 404, or when neither a SKU nor
+    /// an h1 title could be read. The "403" wording is what <see cref="IsBlocked"/> matches on, so the
+    /// two must stay in sync.
+    /// </exception>
+    private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
+    {
+        Log.Info($"[{index}/{total}] Scraping {url}");
+
+        var response = await _session.Page.GotoAsync(url, new PageGotoOptions
+        {
+            WaitUntil = WaitUntilState.DOMContentLoaded,
+            Timeout = 60000
+        });
+        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
+
+        // A missing response is treated as status 0 so the title checks below still get a chance.
+        var status = response?.Status ?? 0;
+        var title = Clean(await SafeTextAsync(() => _session.Page.TitleAsync()));
+        var plainText = Clean(await SafeTextAsync(() => _session.Page.Locator("body").InnerTextAsync()));
+
+        if (status == 403 || Regex.IsMatch(title, @"^403 Forbidden$", RegexOptions.IgnoreCase))
+            throw new Exception("403 Forbidden returned by site.");
+
+        // \b404\b rather than a bare 404: a title that merely contains those digits inside a longer
+        // number (e.g. a SKU such as "...-4040") must not be mistaken for a not-found page.
+        if (status == 404 || Regex.IsMatch(title, @"\b404\b|Page Not Found", RegexOptions.IgnoreCase))
+            throw new Exception("404 Not Found returned by site.");
+
+        var colorName = Clean(await SafeTextAsync(() => _session.Page.Locator("h1").First.InnerTextAsync()));
+
+        var skuMatch = SkuRegex.Match(plainText);
+        var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";
+        if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
+            throw new Exception("Could not find SKU or title on product page.");
+
+        var descMatch = DescRegex.Match(plainText);
+        var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";
+
+        return new ProductRecord
+        {
+            Sku = sku,
+            ColorName = colorName,
+            Description = description,
+            PriceTiers = ParsePriceTiers(plainText),
+            SafetyDataSheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" }),
+            TechnicalDataSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" }),
+            ApplicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" }),
+            SampleImageUrl = await GetSampleImageUrlAsync(),
+            ProductUrl = url,
+            ScrapedAt = DateTime.UtcNow
+        };
+    }
+
+    /// <summary>
+    /// Extracts every price tier that PriceTierRegex finds in <paramref name="text"/>.
+    /// Group 2 of each match is the price and group 1 is the quantity text. Matches whose price
+    /// does not parse as a decimal are skipped.
+    /// </summary>
+    /// <param name="text">Whitespace-normalised page text.</param>
+    /// <returns>
+    /// One tier per accepted match, in match order. Min/Max stay null when the quantity text matches
+    /// neither RangeRegex nor PlusRegex; a PlusRegex match sets Min and leaves Max open-ended.
+    /// </returns>
+    private static List<PriceTier> ParsePriceTiers(string text)
+    {
+        var tiers = new List<PriceTier>();
+        foreach (Match m in PriceTierRegex.Matches(text))
+        {
+            if (!decimal.TryParse(m.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var price))
+                continue;
+
+            var rangeText = Clean(m.Groups[1].Value);
+            int? min = null, max = null;
+
+            // Scraped text is machine data: parse with the invariant culture, like the price above.
+            var range = RangeRegex.Match(rangeText);
+            if (range.Success)
+            {
+                min = int.Parse(range.Groups[1].Value, CultureInfo.InvariantCulture);
+                max = int.Parse(range.Groups[2].Value, CultureInfo.InvariantCulture);
+            }
+
+            // Checked second on purpose: when both patterns match, the open-ended form wins.
+            var plus = PlusRegex.Match(rangeText);
+            if (plus.Success)
+            {
+                min = int.Parse(plus.Groups[1].Value, CultureInfo.InvariantCulture);
+                max = null;
+            }
+
+            tiers.Add(new PriceTier { Min = min, Max = max, Price = price });
+        }
+        return tiers;
+    }
+
+    /// <summary>
+    /// Returns the href of the first link whose text matches any pattern. Uses a single eval
+    /// returning "text\u0001href" pairs to avoid object deserialization quirks.
+    /// </summary>
+    /// <param name="patterns">Regular expressions tested case-insensitively against each link's text.</param>
+    /// <returns>The first matching link's href, or an empty string when nothing matches.</returns>
+    private async Task<string> GetLinkByTextAsync(string[] patterns)
+    {
+        var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
+            "a",
+            "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
+            "+ String.fromCharCode(1) + (a.href || ''))");
+
+        foreach (var entry in combined)
+        {
+            // Must be the same character the script emits via String.fromCharCode(1). Written as an
+            // escape so the separator stays visible and cannot be lost by editors or tooling.
+            var parts = entry.Split('\u0001');
+            var text = parts.Length > 0 ? parts[0] : "";
+            var href = parts.Length > 1 ? parts[1] : "";
+            if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
+                return href;
+        }
+        return "";
+    }
+
+    /// <summary>
+    /// Picks the sample image URL from the page's img sources, in order of preference:
+    /// an images.nicindustries.com URL that is not a thumbnail, then any images.nicindustries.com
+    /// URL, then any URL containing "prismatic", "powder" or "color".
+    /// </summary>
+    /// <returns>The chosen URL, or an empty string when no source qualifies.</returns>
+    private async Task<string> GetSampleImageUrlAsync()
+    {
+        var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
+            "img",
+            "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
+            ".filter(Boolean)");
+
+        return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)
+                                        && !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
+            ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase))
+            ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase))
+            ?? "";
+    }
+
+    /// <summary>
+    /// Reports whether <paramref name="ex"/> signals a 403 push-back, judged by its message.
+    /// Matches "403" only as a standalone number, so a longer digit run in the message
+    /// (for example a product URL containing "4035") is not mistaken for a block.
+    /// </summary>
+    private static bool IsBlocked(Exception ex) =>
+        Regex.IsMatch(ex.Message, @"\b403\b");
+
+    /// <summary>
+    /// Awaits <paramref name="fn"/> and returns its text, or an empty string if it throws.
+    /// Deliberately best-effort: callers treat unreadable page text as missing.
+    /// </summary>
+    private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
+    {
+        try { return await fn(); } catch { return ""; }
+    }
+
+    /// <summary>
+    /// Replaces every WhitespaceRegex match in <paramref name="text"/> with a single space and trims
+    /// the result; null is treated as an empty string.
+    /// </summary>
+    private static string Clean(string? text) => WhitespaceRegex.Replace(text ?? "", " ").Trim();
+
+    /// <summary>
+    /// Picks the pause before the next product, in milliseconds, from the configured
+    /// MinDelaySeconds..MaxDelaySeconds window. A negative minimum is clamped to 0 and a maximum
+    /// below the minimum is raised to it.
+    /// </summary>
+    private int RandomDelayMs()
+    {
+        var min = Math.Max(0, _config.MinDelaySeconds * 1000);
+        var max = Math.Max(min, _config.MaxDelaySeconds * 1000);
+        // +1 so max itself can be drawn (assuming _random is a System.Random, whose upper bound is exclusive).
+        return _random.Next(min, max + 1);
+    }
+
+    /// <summary>
+    /// Formats a duration for log output: "Hh Mm" from one hour up, "Mm Ss" from one minute up,
+    /// otherwise "Ss".
+    /// </summary>
+    private static string FormatDuration(TimeSpan t) =>
+        t.TotalHours >= 1 ? $"{(int)t.TotalHours}h {t.Minutes}m" :
+        t.TotalMinutes >= 1 ? $"{(int)t.TotalMinutes}m {t.Seconds}s" :
+        $"{t.Seconds}s";
+}
diff --git a/scripts/Prismatic Data Scraper/appsettings.json b/scripts/Prismatic Data Scraper/appsettings.json
new file mode 100644
index 0000000..90062bf
--- /dev/null
+++ b/scripts/Prismatic Data Scraper/appsettings.json
@@ -0,0 +1,38 @@
+{
+  "Sync": {
+    "BaseUrl": "https://www.prismaticpowders.com",
+    "ColorsPath": "/shop/powder-coating-colors",
+
+    "ProductUrlsFile": "product-urls.txt",
+    "OutputJsonFile": "prismatic_powders.json",
+    "LogFile": "prismatic-sync.log",
+
+    "MinDelaySeconds": 6,
+    "MaxDelaySeconds": 14,
+    "PageSettleSeconds": 4,
+
+    "BlockedCooldownSeconds": 120,
+    "BlockedCooldownMaxSeconds": 600,
+    "BlockedMaxRetries": 3,
+    "LongRestEveryProducts": 150,
+    "LongRestSeconds": 45,
+
+    "ScrollWaitMs": 1500,
+    "MaxScrolls": 400,
+    "StopAfterNoNewScrolls": 10,
+    "StopAfterKnownScrolls": 8,
+
+    "ColorParams": [
+      "pris_black", "pris_blue", "pris_bronze", "pris_brown", "pris_clear",
+      "pris_copper", "pris_gold", "pris_gray", "pris_green", "pris_orange",
+      "pris_pink", "pris_purple", "pris_red", "pris_silver", "pris_tan",
+      "pris_white", "pris_yellow"
+    ],
+
+    "Import": {
+      "EndpointUrl": "",
+      "Token": "",
+      "VendorName": "Prismatic Powders"
+    }
+  }
+}