c59d55529f
Standalone .NET 8 console app (not part of the main solution) that scrapes the Prismatic Powders catalog via Playwright and pushes it into the app's catalog import. Prismatic has no API, so this runs on a workstation (Task Scheduler), never the deployed server. - Discovery: incremental newest-first via ?category=created_at (stops once it reaches already-known URLs — cheap, finds new colors) and a full all-colors crawl for occasional reconcile. - Scraper: resumable product-page scrape (sku/color/description/price tiers/ SDS/TDS/app-guide/image), with --refresh-older-than to re-scrape stale products and catch price changes. Output matches the app import format so it flows through the same shared upsert as the Columbia sync. - Resilience: brisk randomized base delay, escalating 403 cooldown-and-retry to avoid hard bans, periodic rest. All configurable. - Visibility: streams every product + the inter-product wait to the console (colored) and a log file, with an up-front ETA. - Push: token-authenticated POST to the app import endpoint (skips to manual upload when unconfigured). The app-side token import endpoint is a separate follow-up. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
46 lines
2.2 KiB
C#
46 lines
2.2 KiB
C#
using System.Text.Json.Serialization;
|
|
|
|
namespace PrismaticSync.Models;
|
|
|
|
/// <summary>
/// Root object of the scrape file written to disk. It mirrors the app's catalog-import payload:
/// a top-level "results" array of snake_case product records, so the file can be handed to the
/// import endpoint unchanged. URLs that failed are collected under "errors" so a resumable
/// re-run can account for them.
/// </summary>
public class ScrapeOutput
{
    /// <summary>Scraped products, serialized under the "results" key. Starts out empty.</summary>
    [JsonPropertyName("results")]
    public List<ProductRecord> Results { get; set; } = new List<ProductRecord>();

    /// <summary>Failed URLs, serialized under the "errors" key. Starts out empty.</summary>
    [JsonPropertyName("errors")]
    public List<ScrapeError> Errors { get; set; } = new List<ScrapeError>();
}
|
|
|
|
/// <summary>
/// A single scraped product, laid out with the field names the import expects.
/// Every string member defaults to the empty string rather than null.
/// </summary>
public class ProductRecord
{
    /// <summary>Product SKU; JSON key "sku".</summary>
    [JsonPropertyName("sku")]
    public string Sku { get; set; } = string.Empty;

    /// <summary>Color name; JSON key "color_name".</summary>
    [JsonPropertyName("color_name")]
    public string ColorName { get; set; } = string.Empty;

    /// <summary>Product description text; JSON key "description".</summary>
    [JsonPropertyName("description")]
    public string Description { get; set; } = string.Empty;

    /// <summary>Quantity-break price tiers; JSON key "price_tiers". Starts out empty.</summary>
    [JsonPropertyName("price_tiers")]
    public List<PriceTier> PriceTiers { get; set; } = new List<PriceTier>();

    /// <summary>Link to the safety data sheet (SDS); JSON key "safety_data_sheet_url".</summary>
    [JsonPropertyName("safety_data_sheet_url")]
    public string SafetyDataSheetUrl { get; set; } = string.Empty;

    /// <summary>Link to the technical data sheet (TDS); JSON key "technical_data_sheet_url".</summary>
    [JsonPropertyName("technical_data_sheet_url")]
    public string TechnicalDataSheetUrl { get; set; } = string.Empty;

    /// <summary>Link to the application guide; JSON key "application_guide_url".</summary>
    [JsonPropertyName("application_guide_url")]
    public string ApplicationGuideUrl { get; set; } = string.Empty;

    /// <summary>Link to the sample image; JSON key "sample_image_url".</summary>
    [JsonPropertyName("sample_image_url")]
    public string SampleImageUrl { get; set; } = string.Empty;

    /// <summary>URL of the product page; JSON key "product_url".</summary>
    [JsonPropertyName("product_url")]
    public string ProductUrl { get; set; } = string.Empty;

    /// <summary>
    /// Scrape timestamp; JSON key "scraped_at". Left at <c>default(DateTime)</c> until assigned.
    /// NOTE(review): nothing here fixes the DateTimeKind — confirm whether writers use UTC.
    /// </summary>
    [JsonPropertyName("scraped_at")]
    public DateTime ScrapedAt { get; set; }
}
|
|
|
|
/// <summary>
/// One quantity-break pricing band, serialized as {min, max, price}.
/// A null <see cref="Max"/> marks an open-ended top tier.
/// </summary>
public class PriceTier
{
    /// <summary>Lower quantity bound; JSON key "min". Nullable, null when unset.</summary>
    [JsonPropertyName("min")]
    public int? Min { get; set; }

    /// <summary>Upper quantity bound; JSON key "max". Null for the open-ended top tier.</summary>
    [JsonPropertyName("max")]
    public int? Max { get; set; }

    /// <summary>Price for this band; JSON key "price".</summary>
    [JsonPropertyName("price")]
    public decimal Price { get; set; }
}
|
|
|
|
/// <summary>
/// Records a URL whose scrape failed, retained so that a resumable run can either skip it
/// or try it again.
/// </summary>
public class ScrapeError
{
    /// <summary>URL of the product page that failed; JSON key "product_url". Defaults to empty.</summary>
    [JsonPropertyName("product_url")]
    public string ProductUrl { get; set; } = string.Empty;

    /// <summary>Error text for the failure; JSON key "error". Defaults to empty.</summary>
    [JsonPropertyName("error")]
    public string Error { get; set; } = string.Empty;

    /// <summary>
    /// Timestamp attached to the failure; JSON key "scraped_at". Left at <c>default(DateTime)</c>
    /// until assigned. NOTE(review): DateTimeKind is not fixed here — confirm whether writers use UTC.
    /// </summary>
    [JsonPropertyName("scraped_at")]
    public DateTime ScrapedAt { get; set; }
}
|