Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| da2bb46d5a | |||
| 843d1c3c51 | |||
| c59d55529f |
@@ -0,0 +1,8 @@
|
||||
# Build output
|
||||
bin/
|
||||
obj/
|
||||
|
||||
# Transient scrape artifacts
|
||||
*.tmp
|
||||
*.invalid-*.bak
|
||||
prismatic-sync.log
|
||||
@@ -0,0 +1,43 @@
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>
/// A Chromium session with a realistic desktop fingerprint (UA, viewport, locale,
/// timezone) — matching the original scraper's settings to look like a normal browser.
/// Owns the Playwright driver, the browser and the browser context, and releases all
/// three in <see cref="DisposeAsync"/>.
/// </summary>
public sealed class BrowserSession : IAsyncDisposable
{
    private IPlaywright? _pw;
    private IBrowser? _browser;
    private IBrowserContext? _context;

    /// <summary>The single page opened in this session's context. Set by <see cref="CreateAsync"/>.</summary>
    public IPage Page { get; private set; } = null!;

    /// <summary>
    /// Starts Playwright, launches Chromium and opens one page in a fresh context.
    /// </summary>
    /// <param name="headed">When true the browser window is shown; otherwise Chromium runs headless.</param>
    /// <returns>A ready-to-use session. The caller is responsible for disposing it.</returns>
    public static async Task<BrowserSession> CreateAsync(bool headed)
    {
        var session = new BrowserSession();
        try
        {
            session._pw = await Playwright.CreateAsync();
            session._browser = await session._pw.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
            {
                Headless = !headed
            });
            session._context = await session._browser.NewContextAsync(new BrowserNewContextOptions
            {
                UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
                ViewportSize = new ViewportSize { Width = 1365, Height = 900 },
                Locale = "en-US",
                TimezoneId = "America/New_York"
            });
            session.Page = await session._context.NewPageAsync();
            return session;
        }
        catch
        {
            // A later step failed after earlier ones succeeded: release whatever was
            // already started so a failed launch does not leave the driver/browser behind.
            try
            {
                await session.DisposeAsync();
            }
            catch
            {
                // Cleanup is best-effort here; the original startup failure is the one to surface.
            }

            throw;
        }
    }

    /// <summary>
    /// Closes the context, then the browser, then disposes the Playwright driver. Each
    /// step runs even if an earlier one throws, and calling this more than once is harmless.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        // Detach the fields first so a second call finds nothing left to release.
        var context = _context;
        var browser = _browser;
        var pw = _pw;
        _context = null;
        _browser = null;
        _pw = null;

        try
        {
            if (context is not null) await context.CloseAsync();
        }
        finally
        {
            try
            {
                if (browser is not null) await browser.CloseAsync();
            }
            finally
            {
                pw?.Dispose();
            }
        }
    }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
using System.Text.Json;
|
||||
using PrismaticSync.Models;
|
||||
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>
/// Loads/saves the scrape output and the URL list. Saves go through a temp file that is
/// then moved over the target, so a crash mid-save can't leave a half-written file.
/// </summary>
public static class JsonStore
{
    private static readonly JsonSerializerOptions WriteOptions = new() { WriteIndented = true };
    private static readonly JsonSerializerOptions ReadOptions = new() { PropertyNameCaseInsensitive = true };

    /// <summary>
    /// Reads the scrape output from <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Path of the output JSON file.</param>
    /// <returns>The parsed output, or an empty <see cref="ScrapeOutput"/> when the file does not exist.</returns>
    /// <exception cref="InvalidOperationException">
    /// The file exists but could not be parsed. A timestamped <c>.invalid-*.bak</c> copy is
    /// attempted first; the original parse failure is attached as the inner exception.
    /// </exception>
    public static ScrapeOutput LoadOutput(string path)
    {
        if (!File.Exists(path))
            return new ScrapeOutput();

        var json = File.ReadAllText(path);
        try
        {
            // Tolerate a bare array (older output format) as well as { results, errors }.
            if (json.TrimStart().StartsWith('['))
            {
                var results = JsonSerializer.Deserialize<List<ProductRecord>>(json, ReadOptions) ?? new();
                return new ScrapeOutput { Results = results };
            }

            return JsonSerializer.Deserialize<ScrapeOutput>(json, ReadOptions) ?? new ScrapeOutput();
        }
        catch (Exception ex)
        {
            var backup = $"{path}.invalid-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}.bak";

            // The backup is a courtesy; if it fails too, still report the parse error
            // (the real problem) instead of letting the copy failure replace it.
            string backupNote;
            try
            {
                File.Copy(path, backup, overwrite: true);
                backupNote = $"Backed it up to {backup}.";
            }
            catch (Exception copyEx) when (copyEx is IOException or UnauthorizedAccessException)
            {
                backupNote = $"Backup to {backup} also failed: {copyEx.Message}.";
            }

            throw new InvalidOperationException($"Could not parse {path}. {backupNote} {ex.Message}", ex);
        }
    }

    /// <summary>Serializes <paramref name="data"/> as indented JSON and writes it to <paramref name="path"/> via a temp file.</summary>
    public static void SaveOutput(string path, ScrapeOutput data) =>
        WriteViaTempFile(path, JsonSerializer.Serialize(data, WriteOptions));

    /// <summary>
    /// Reads the URL list: one URL per line, each passed through <see cref="CleanUrl"/>,
    /// empty results dropped, duplicates removed case-insensitively.
    /// </summary>
    /// <returns>The cleaned URLs in file order, or an empty list when the file does not exist.</returns>
    public static List<string> LoadUrls(string path)
    {
        if (!File.Exists(path))
            return new List<string>();

        return File.ReadAllLines(path)
            .Select(CleanUrl)
            // CleanUrl cuts everything from the first '#', so "# comment" lines are already
            // empty by this point; no separate comment check is needed.
            .Where(u => u.Length > 0)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Writes <paramref name="urls"/> to <paramref name="path"/>, de-duplicated and sorted
    /// case-insensitively, one per line with a trailing newline.
    /// </summary>
    public static void SaveUrls(string path, IEnumerable<string> urls)
    {
        var sorted = urls.Distinct(StringComparer.OrdinalIgnoreCase).OrderBy(u => u, StringComparer.OrdinalIgnoreCase);
        WriteViaTempFile(path, string.Join(Environment.NewLine, sorted) + Environment.NewLine);
    }

    /// <summary>Strips the query string and fragment from <paramref name="url"/> and trims whitespace; null becomes "".</summary>
    public static string CleanUrl(string? url) =>
        (url ?? string.Empty).Split('?')[0].Split('#')[0].Trim();

    /// <summary>Writes <paramref name="contents"/> to "<paramref name="path"/>.tmp", then moves it over <paramref name="path"/>.</summary>
    private static void WriteViaTempFile(string path, string contents)
    {
        var tmp = path + ".tmp";
        File.WriteAllText(tmp, contents);
        File.Move(tmp, path, overwrite: true);
    }
}
|
||||
@@ -0,0 +1,49 @@
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>
/// Minimal timestamped logger — writes to the console and appends to a log file so an
/// unattended (Task Scheduler) run leaves an audit trail. Intentionally dependency-free.
/// </summary>
public static class Log
{
    private static string _logFile = "prismatic-sync.log";
    private static readonly object Gate = new();

    /// <summary>Sets the file that subsequent log lines are appended to.</summary>
    public static void Configure(string logFile) => _logFile = logFile;

    /// <summary>Logs <paramref name="message"/> at INFO level.</summary>
    public static void Info(string message) => Write("INFO", message);

    /// <summary>Logs <paramref name="message"/> at WARN level (yellow on the console).</summary>
    public static void Warn(string message) => Write("WARN", message);

    /// <summary>Logs <paramref name="message"/> at ERROR level (red on the console).</summary>
    public static void Error(string message) => Write("ERROR", message);

    /// <summary>
    /// Formats one line as "[UTC timestamp] LEVEL message" and emits it to the console and
    /// the log file while holding <see cref="Gate"/>, so lines from concurrent callers don't interleave.
    /// </summary>
    private static void Write(string level, string message)
    {
        var line = $"[{DateTime.UtcNow:yyyy-MM-ddTHH:mm:ssZ}] {level,-5} {message}";

        // Live console stream (visible on a manual run); color-code so warnings/errors stand out.
        lock (Gate)
        {
            var color = level switch
            {
                "WARN" => ConsoleColor.Yellow,
                "ERROR" => ConsoleColor.Red,
                _ => (ConsoleColor?)null
            };

            if (color is { } c)
            {
                var previous = Console.ForegroundColor;
                Console.ForegroundColor = c;
                try
                {
                    Console.WriteLine(line);
                }
                finally
                {
                    // Restore even if the write throws, so a failed write can't leave the
                    // console stuck in the warning/error colour.
                    Console.ForegroundColor = previous;
                }
            }
            else
            {
                Console.WriteLine(line);
            }

            // File trail — never let logging break a run.
            try { File.AppendAllText(_logFile, line + Environment.NewLine); }
            catch { /* deliberate best-effort: a log-file failure must not abort the run */ }
        }
    }
}
|
||||
@@ -0,0 +1,69 @@
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>Strongly-typed config bound from the "Sync" section of appsettings.json.</summary>
public class SyncConfig
{
    /// <summary>Site root; combined with <see cref="ColorsPath"/> to form <see cref="ColorsUrl"/>.</summary>
    public string BaseUrl { get; set; } = "https://www.prismaticpowders.com";

    /// <summary>Path of the color listing under <see cref="BaseUrl"/>.</summary>
    public string ColorsPath { get; set; } = "/shop/powder-coating-colors";

    /// <summary>File holding the discovered product URLs, one per line.</summary>
    public string ProductUrlsFile { get; set; } = "product-urls.txt";

    /// <summary>File holding the scraped output JSON.</summary>
    public string OutputJsonFile { get; set; } = "prismatic_powders.json";

    /// <summary>File that log lines are appended to.</summary>
    public string LogFile { get; set; } = "prismatic-sync.log";

    /// <summary>Politeness delay between product scrapes (randomized within the range) — lower bound.</summary>
    public int MinDelaySeconds { get; set; } = 6;

    /// <summary>Politeness delay between product scrapes (randomized within the range) — upper bound.</summary>
    public int MaxDelaySeconds { get; set; } = 14;

    /// <summary>On a 403/block, cool down this many seconds × the consecutive-block count, then retry.</summary>
    public int BlockedCooldownSeconds { get; set; } = 120;

    /// <summary>Upper bound on a single cooldown so escalation can't run away.</summary>
    public int BlockedCooldownMaxSeconds { get; set; } = 600;

    /// <summary>How many times to cool-down-and-retry a blocked product before recording it as an error.</summary>
    public int BlockedMaxRetries { get; set; } = 3;

    /// <summary>Take a longer rest after this many products (0 disables). Eases load and looks less robotic.</summary>
    public int LongRestEveryProducts { get; set; } = 150;

    /// <summary>Length of the periodic long rest, in seconds.</summary>
    public int LongRestSeconds { get; set; } = 45;

    /// <summary>Extra settle time after a product page loads before reading it.</summary>
    public int PageSettleSeconds { get; set; } = 4;

    /// <summary>Pause after each scroll while a listing lazy-loads more items.</summary>
    public int ScrollWaitMs { get; set; } = 1500;

    /// <summary>Hard cap on scrolls per listing, as a safety stop.</summary>
    public int MaxScrolls { get; set; } = 400;

    /// <summary>Full discovery: stop a listing after this many scrolls add no new links.</summary>
    public int StopAfterNoNewScrolls { get; set; } = 10;

    /// <summary>
    /// Incremental discovery: stop the newest-first listing after this many consecutive scrolls
    /// that surfaced only already-known URLs — i.e. we've scrolled past the new products.
    /// </summary>
    public int StopAfterKnownScrolls { get; set; } = 8;

    /// <summary>Color filter params used by full discovery.</summary>
    public string[] ColorParams { get; set; } = Array.Empty<string>();

    /// <summary>Settings for pushing the scraped catalog into the app.</summary>
    public ImportConfig Import { get; set; } = new();

    /// <summary>
    /// <see cref="BaseUrl"/> joined with <see cref="ColorsPath"/> using exactly one slash,
    /// regardless of whether the base ends with '/' or the path starts with one.
    /// An empty path yields the base URL without a trailing slash.
    /// </summary>
    public string ColorsUrl
    {
        get
        {
            var root = BaseUrl.TrimEnd('/');
            var path = ColorsPath.TrimStart('/');
            // Without this normalisation a path configured as "shop/..." would be glued
            // straight onto the host name.
            return path.Length == 0 ? root : $"{root}/{path}";
        }
    }
}
|
||||
|
||||
/// <summary>Destination and credentials used when pushing the scraped catalog into the app.</summary>
public class ImportConfig
{
    /// <summary>Full URL of the app's token-authenticated catalog import endpoint. Empty by default.</summary>
    public string EndpointUrl { get; set; } = string.Empty;

    /// <summary>Shared secret sent in the X-Import-Token header. Must match the app's config. Empty by default.</summary>
    public string Token { get; set; } = string.Empty;

    /// <summary>Vendor name applied to every record on import.</summary>
    public string VendorName { get; set; } = "Prismatic Powders";
}
|
||||
@@ -0,0 +1,45 @@
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace PrismaticSync.Models;
|
||||
|
||||
/// <summary>
/// The scrape output as stored on disk. Its shape matches the app's catalog import — a
/// top-level "results" array of snake_case product records — so the JSON drops straight into
/// the import endpoint. The "errors" array tracks failed URLs for resumable re-runs.
/// </summary>
public class ScrapeOutput
{
    /// <summary>Scraped product records; serialized as "results".</summary>
    [JsonPropertyName("results")]
    public List<ProductRecord> Results { get; set; } = new List<ProductRecord>();

    /// <summary>URLs that failed to scrape; serialized as "errors".</summary>
    [JsonPropertyName("errors")]
    public List<ScrapeError> Errors { get; set; } = new List<ScrapeError>();
}
|
||||
|
||||
/// <summary>A single scraped product, laid out in the field shape the import expects.</summary>
public class ProductRecord
{
    /// <summary>Serialized as "sku".</summary>
    [JsonPropertyName("sku")]
    public string Sku { get; set; } = string.Empty;

    /// <summary>Serialized as "color_name".</summary>
    [JsonPropertyName("color_name")]
    public string ColorName { get; set; } = string.Empty;

    /// <summary>Serialized as "description".</summary>
    [JsonPropertyName("description")]
    public string Description { get; set; } = string.Empty;

    /// <summary>Quantity-break prices; serialized as "price_tiers".</summary>
    [JsonPropertyName("price_tiers")]
    public List<PriceTier> PriceTiers { get; set; } = new List<PriceTier>();

    /// <summary>Serialized as "safety_data_sheet_url".</summary>
    [JsonPropertyName("safety_data_sheet_url")]
    public string SafetyDataSheetUrl { get; set; } = string.Empty;

    /// <summary>Serialized as "technical_data_sheet_url".</summary>
    [JsonPropertyName("technical_data_sheet_url")]
    public string TechnicalDataSheetUrl { get; set; } = string.Empty;

    /// <summary>Serialized as "application_guide_url".</summary>
    [JsonPropertyName("application_guide_url")]
    public string ApplicationGuideUrl { get; set; } = string.Empty;

    /// <summary>Serialized as "sample_image_url".</summary>
    [JsonPropertyName("sample_image_url")]
    public string SampleImageUrl { get; set; } = string.Empty;

    /// <summary>Page the record was scraped from; serialized as "product_url".</summary>
    [JsonPropertyName("product_url")]
    public string ProductUrl { get; set; } = string.Empty;

    /// <summary>When the record was scraped; serialized as "scraped_at".</summary>
    [JsonPropertyName("scraped_at")]
    public DateTime ScrapedAt { get; set; }
}
|
||||
|
||||
/// <summary>
/// One quantity-break price tier, serialized as {min, max, price}. An open-ended top tier
/// has a null <see cref="Max"/>.
/// </summary>
public class PriceTier
{
    /// <summary>Lower quantity bound; serialized as "min".</summary>
    [JsonPropertyName("min")]
    public int? Min { get; set; }

    /// <summary>Upper quantity bound, or null when the tier is open-ended; serialized as "max".</summary>
    [JsonPropertyName("max")]
    public int? Max { get; set; }

    /// <summary>Price for this tier; serialized as "price".</summary>
    [JsonPropertyName("price")]
    public decimal Price { get; set; }
}
|
||||
|
||||
/// <summary>Records a URL whose scrape failed, so a resumable run can skip it or retry it.</summary>
public class ScrapeError
{
    /// <summary>The URL that failed; serialized as "product_url".</summary>
    [JsonPropertyName("product_url")]
    public string ProductUrl { get; set; } = string.Empty;

    /// <summary>Failure description; serialized as "error".</summary>
    [JsonPropertyName("error")]
    public string Error { get; set; } = string.Empty;

    /// <summary>When the failure was recorded; serialized as "scraped_at".</summary>
    [JsonPropertyName("scraped_at")]
    public DateTime ScrapedAt { get; set; }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<!--
|
||||
Standalone workstation tool — deliberately NOT part of PowderCoating.sln.
|
||||
Build/publish independently and run on a machine you control (Task Scheduler),
|
||||
never on the deployed app server. Scrapes Prismatic Powders and pushes the
|
||||
result into the app's catalog import endpoint.
|
||||
|
||||
First-time setup on a workstation:
|
||||
dotnet build
|
||||
pwsh bin/Debug/net8.0/playwright.ps1 install chromium
|
||||
-->
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<AssemblyName>PrismaticSync</AssemblyName>
|
||||
<RootNamespace>PrismaticSync</RootNamespace>
|
||||
<InvariantGlobalization>true</InvariantGlobalization>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Playwright" Version="1.49.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration" Version="8.0.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="8.0.1" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="8.0.2" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<None Update="appsettings.json">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,106 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using PrismaticSync.Infrastructure;
|
||||
using PrismaticSync.Services;
|
||||
|
||||
// ── Help ──────────────────────────────────────────────────────────────────────
// Handle help before anything else. Without this, "--help" starts with "--", is not
// treated as a command, and falls through to the default "run" — kicking off a full
// discover + scrape + push instead of printing usage.
if (args.Any(a => a is "--help" or "-h" or "-?" or "/?"))
{
    PrintUsage();
    return 0;
}

// ── Load config ───────────────────────────────────────────────────────────────
var configRoot = new ConfigurationBuilder()
    .SetBasePath(AppContext.BaseDirectory)
    .AddJsonFile("appsettings.json", optional: false)
    .Build();

var config = configRoot.GetSection("Sync").Get<SyncConfig>() ?? new SyncConfig();
Log.Configure(config.LogFile);

// ── Parse args ────────────────────────────────────────────────────────────────
// The first argument is the command unless it looks like an option; no command means "run".
var command = args.Length > 0 && !args[0].StartsWith("--", StringComparison.Ordinal)
    ? args[0].ToLowerInvariant()
    : "run";
var headed = args.Contains("--headed");
var retryErrors = args.Contains("--retry-errors");
var maxProducts = GetIntArg("--max-products", 0);
// "run" refreshes products older than 30 days by default; explicit commands default to new-only.
var refreshOlderThanDays = GetIntArg("--refresh-older-than", command == "run" ? 30 : 0);

Log.Info($"PrismaticSync — command '{command}' (headed={headed}, refreshOlderThan={refreshOlderThanDays}d, maxProducts={maxProducts})");

try
{
    switch (command)
    {
        case "discover-new":
            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverNewAsync());
            break;

        case "discover-full":
            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverFullAsync());
            break;

        case "scrape":
            await WithBrowser(d => new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors));
            break;

        case "push":
            await new CatalogPusher(config).PushAsync();
            break;

        case "run":
            // The scheduled default: find new colors, scrape new + stale, then push.
            await WithBrowser(async d =>
            {
                await new PrismaticDiscoverer(d, config).DiscoverNewAsync();
                await new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors);
            });
            // The browser session is already disposed by the time the push starts.
            await new CatalogPusher(config).PushAsync();
            break;

        default:
            // Unknown command: show usage and signal failure to the caller.
            PrintUsage();
            return 1;
    }

    Log.Info("Done.");
    return 0;
}
catch (Exception ex)
{
    // Last-resort handler: log the full exception (with stack) and exit non-zero.
    Log.Error($"Fatal: {ex}");
    return 1;
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
// Opens a browser session (headed or headless per the --headed flag), hands it to
// the supplied action, and disposes the session once the action finishes or throws.
async Task WithBrowser(Func<BrowserSession, Task> action)
{
    var browser = await BrowserSession.CreateAsync(headed);
    await using (browser)
    {
        await action(browser);
    }
}
|
||||
|
||||
// Reads an integer option of the form "name=value" from the command line.
// Only the first argument carrying the prefix (matched case-insensitively) is considered;
// if it is absent or its value is not a valid integer, the fallback is returned.
int GetIntArg(string name, int fallback)
{
    var marker = name + "=";
    foreach (var candidate in args)
    {
        if (!candidate.StartsWith(marker, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var valueText = candidate[marker.Length..];
        return int.TryParse(valueText, out var parsed) ? parsed : fallback;
    }

    return fallback;
}
|
||||
|
||||
// Writes the command-line help text (commands, options, config and first-run notes)
// to the console as a single raw string literal.
void PrintUsage()
|
||||
{
|
||||
Console.WriteLine(
|
||||
"""
|
||||
PrismaticSync — scrape Prismatic Powders and push to the app catalog.
|
||||
|
||||
Usage: PrismaticSync [command] [options]
|
||||
|
||||
Commands:
|
||||
run (default) discover-new + scrape (new + stale) + push
|
||||
discover-new Incremental discovery via newest-first sort (cheap; finds new colors)
|
||||
discover-full Full discovery across all color filters (heavy; reconciles the whole set)
|
||||
scrape Scrape product pages from the URL list (resumable)
|
||||
push Push the scraped JSON to the import endpoint
|
||||
|
||||
Options:
|
||||
--refresh-older-than=N Re-scrape products whose data is older than N days (default 30 for 'run')
|
||||
--max-products=N Cap products scraped this run (0 = no cap)
|
||||
--retry-errors Retry URLs previously recorded as errors
|
||||
--headed Show the browser window (debugging)
|
||||
|
||||
Config: appsettings.json (delays, file paths, import endpoint + token).
|
||||
First run on a new machine: dotnet build, then `pwsh bin/Debug/net8.0/playwright.ps1 install chromium`.
|
||||
""");
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
# PrismaticSync
|
||||
|
||||
A standalone .NET console tool that scrapes the Prismatic Powders catalog and pushes it into the
|
||||
Powder Coating Logix catalog import endpoint. It exists because Prismatic has **no API** (unlike
|
||||
Columbia Coatings) — so the data has to be scraped via browser automation.
|
||||
|
||||
> **Runs on a workstation you control — never on the deployed app server.** Scraping from the cloud
|
||||
> app's IP would get blocked and isn't appropriate. This tool is deliberately *not* part of
|
||||
> `PowderCoating.sln`; build and run it independently.
|
||||
|
||||
## First-time setup (per machine)
|
||||
|
||||
```powershell
|
||||
cd "scripts/Prismatic Data Scraper"
|
||||
dotnet build
|
||||
pwsh bin/Debug/net8.0/playwright.ps1 install chromium # one-time browser download
|
||||
```
|
||||
|
||||
## Commands
|
||||
|
||||
```powershell
|
||||
dotnet run -- run # default: discover-new + scrape (new + stale >30d) + push
|
||||
dotnet run -- discover-new # cheap: find newly-added colors (newest-first, stops at known)
|
||||
dotnet run -- discover-full # heavy: crawl all color filters (reconcile whole set / removals)
|
||||
dotnet run -- scrape # scrape product pages from product-urls.txt (resumable)
|
||||
dotnet run -- scrape --refresh-older-than=30 # also re-scrape products older than 30 days (price changes)
|
||||
dotnet run -- push # push prismatic_powders.json to the import endpoint
|
||||
```
|
||||
|
||||
Options: `--max-products=N`, `--retry-errors`, `--headed` (show the browser for debugging).
|
||||
|
||||
Everything streams to the console live (warnings/errors in color) **and** to `prismatic-sync.log`.
|
||||
|
||||
## Operating model (suggested cadence)
|
||||
|
||||
| Run | Command | Cadence | Why |
|
||||
|-----|---------|---------|-----|
|
||||
| Find new colors | `run` (discover-new + scrape new and stale >30d + push) | Weekly | Discovery is cheap; Prismatic adds colors often |
|
||||
| Price refresh | `scrape --refresh-older-than=30` then `push` | Monthly | Re-scrapes stale products to catch price changes (slow, ~hours) |
|
||||
| Full reconcile | `discover-full` then `scrape` | Quarterly | Catches removed/discontinued colors |
|
||||
|
||||
A full scrape of ~5,000 products takes hours (polite delays). It saves after every product and is
|
||||
fully resumable, so stop/restart any time.
|
||||
|
||||
## Politeness / anti-block
|
||||
|
||||
Configurable in `appsettings.json`: randomized 6–14s base delay, an escalating **cooldown + retry on
|
||||
403** (so a temporary block doesn't get you hard-banned mid-run), and a periodic long rest. Leave
|
||||
these conservative — getting blocked is worse than being slow, and Prismatic is a partner.
|
||||
|
||||
## Pushing into the app
|
||||
|
||||
Set in `appsettings.json`:
|
||||
- `Sync.Import.EndpointUrl` → `https://<your-app>/PowderCatalog/ImportApi`
|
||||
- `Sync.Import.Token` → the same secret as the app's `CatalogImport:Token` config
|
||||
|
||||
The tool POSTs the JSON with an `X-Import-Token` header (and `X-Vendor-Name: Prismatic Powders`) to
|
||||
that endpoint, which authenticates the token and runs the records through the same upsert as the
|
||||
Columbia sync. If the endpoint/token isn't configured here, `push` is skipped and you upload
|
||||
`prismatic_powders.json` manually via the Powder Catalog admin page instead.
|
||||
|
||||
> **App side:** set `CatalogImport:Token` in the web app's config (Azure App Setting in prod). The
|
||||
> endpoint returns 401 until a token is set, so it's inert by default.
|
||||
|
||||
## Scheduling (Windows Task Scheduler)
|
||||
|
||||
Point a scheduled task at the published exe (or `dotnet run`). Example weekly task command:
|
||||
|
||||
```
|
||||
Program/script: C:\Tools\PrismaticSync\PrismaticSync.exe
|
||||
Arguments: run
|
||||
Start in: C:\Tools\PrismaticSync
|
||||
```
|
||||
|
||||
Publish a framework-dependent build (`--self-contained false`, so the workstation needs the .NET 8 runtime installed) to drop on the workstation:
|
||||
|
||||
```powershell
|
||||
dotnet publish -c Release -r win-x64 --self-contained false -o C:\Tools\PrismaticSync
|
||||
pwsh C:\Tools\PrismaticSync\playwright.ps1 install chromium
|
||||
```
|
||||
|
||||
## The long game
|
||||
|
||||
This is the interim path. The durable endgame is a real Prismatic **API** (the partnership), at which
|
||||
point this tool is replaced by a clean in-app sync like Columbia's — reusing the same upsert,
|
||||
propagation, and discontinued handling.
|
||||
@@ -0,0 +1,63 @@
|
||||
using System.Text;
|
||||
using PrismaticSync.Infrastructure;
|
||||
|
||||
namespace PrismaticSync.Services;
|
||||
|
||||
/// <summary>
/// Pushes the scraped JSON to the app's token-authenticated catalog import endpoint. When the
/// endpoint or the token is not configured it no-ops (the JSON is still on disk for a manual
/// upload), so the tool is useful before the endpoint exists.
/// </summary>
public class CatalogPusher
{
    private readonly SyncConfig _config;

    /// <summary>Creates a pusher that reads the endpoint, token, vendor name and output file from <paramref name="config"/>.</summary>
    public CatalogPusher(SyncConfig config) => _config = config;

    /// <summary>
    /// POSTs the output JSON file to the import endpoint with the X-Import-Token and
    /// X-Vendor-Name headers.
    /// </summary>
    /// <returns>
    /// True when the endpoint answered with a success status. False when the push was skipped
    /// (no endpoint, no token, or no output file), the endpoint answered with a failure status,
    /// or sending the request threw.
    /// </returns>
    public async Task<bool> PushAsync()
    {
        if (string.IsNullOrWhiteSpace(_config.Import.EndpointUrl))
        {
            Log.Warn($"No import endpoint configured (Sync.Import.EndpointUrl) — skipping push. " +
                     $"Upload {_config.OutputJsonFile} manually via the Powder Catalog admin instead.");
            return false;
        }

        // The endpoint authenticates on the token, so a request without one can only be
        // rejected. Skip it the same way a missing endpoint is skipped.
        if (string.IsNullOrWhiteSpace(_config.Import.Token))
        {
            Log.Warn($"No import token configured (Sync.Import.Token) — skipping push. " +
                     $"Upload {_config.OutputJsonFile} manually via the Powder Catalog admin instead.");
            return false;
        }

        if (!File.Exists(_config.OutputJsonFile))
        {
            Log.Warn($"Output file {_config.OutputJsonFile} not found — nothing to push.");
            return false;
        }

        var json = await File.ReadAllTextAsync(_config.OutputJsonFile);
        Log.Info($"Pushing {_config.OutputJsonFile} to {_config.Import.EndpointUrl} (vendor: {_config.Import.VendorName})...");

        // One client per push is fine here: the tool makes a single request per run.
        using var http = new HttpClient { Timeout = TimeSpan.FromMinutes(5) };
        using var request = new HttpRequestMessage(HttpMethod.Post, _config.Import.EndpointUrl);
        request.Headers.Add("X-Import-Token", _config.Import.Token);
        request.Headers.Add("X-Vendor-Name", _config.Import.VendorName);
        request.Content = new StringContent(json, Encoding.UTF8, "application/json");

        try
        {
            using var response = await http.SendAsync(request);
            var body = await response.Content.ReadAsStringAsync();

            if (response.IsSuccessStatusCode)
            {
                Log.Info($"Push succeeded ({(int)response.StatusCode}): {Trim(body)}");
                return true;
            }

            Log.Error($"Push failed ({(int)response.StatusCode}): {Trim(body)}");
            return false;
        }
        catch (Exception ex)
        {
            // Network failure or timeout: report it and let the caller carry on; the
            // scraped JSON is still on disk.
            Log.Error($"Push error: {ex.Message}");
            return false;
        }
    }

    /// <summary>Caps a response body at 500 characters (plus an ellipsis) so log lines stay readable.</summary>
    private static string Trim(string s) => s.Length > 500 ? s[..500] + "…" : s;
}
|
||||
@@ -0,0 +1,138 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Playwright;
|
||||
using PrismaticSync.Infrastructure;
|
||||
|
||||
namespace PrismaticSync.Services;
|
||||
|
||||
/// <summary>
/// Discovers product URLs from the Prismatic color listing (infinite-scroll). Two modes:
/// incremental (newest-first via <c>?category=created_at</c>, stop once we reach already-known
/// URLs) for cheap frequent runs, and full (every color filter to the bottom) for occasional
/// reconciliation. Both add what they find to the URL list file.
/// </summary>
public class PrismaticDiscoverer
{
    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;

    /// <summary>Creates a discoverer that drives <paramref name="session"/>'s page using the limits in <paramref name="config"/>.</summary>
    public PrismaticDiscoverer(BrowserSession session, SyncConfig config)
    {
        _session = session;
        _config = config;
    }

    /// <summary>
    /// Incremental discovery: crawl the newest-first listing and stop once a run of consecutive
    /// scrolls surfaces only already-known URLs — meaning we've scrolled past the new products.
    /// </summary>
    /// <returns>The count of newly found URLs.</returns>
    public async Task<int> DiscoverNewAsync()
    {
        var known = new HashSet<string>(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);
        var startCount = known.Count;
        Log.Info($"Incremental discovery (newest first). Known URLs: {startCount}");

        await GotoAsync($"{_config.ColorsUrl}?category=created_at");

        var knownStreak = 0;
        for (var i = 0; i < _config.MaxScrolls; i++)
        {
            var addedNew = 0;
            foreach (var link in await CollectProductLinksAsync())
                if (known.Add(link)) addedNew++;

            // Persist as soon as something new shows up so an interrupted run keeps it;
            // rewriting the file when nothing changed would be wasted work on every scroll.
            if (addedNew > 0)
                JsonStore.SaveUrls(_config.ProductUrlsFile, known);

            knownStreak = addedNew == 0 ? knownStreak + 1 : 0;
            Log.Info($"Scroll {i + 1}: +{addedNew} new, total {known.Count}, known-streak {knownStreak}");

            if (knownStreak >= _config.StopAfterKnownScrolls)
            {
                Log.Info("Reached known territory — stopping incremental discovery.");
                break;
            }

            await ScrollAsync();
        }

        var newCount = known.Count - startCount;
        Log.Info($"Incremental discovery done. New URLs: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>
    /// Full discovery: crawl every color filter to the bottom. Heavier — use occasionally to
    /// reconcile the whole set (e.g. to notice colors that have been removed). A failure on one
    /// color filter is logged and the crawl moves on to the next.
    /// </summary>
    /// <returns>The count of newly found URLs.</returns>
    public async Task<int> DiscoverFullAsync()
    {
        var known = new HashSet<string>(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);
        var startCount = known.Count;
        Log.Info($"Full discovery across {_config.ColorParams.Length} color filters. Known URLs: {startCount}");

        // With no filters configured the loop below never runs; say so instead of
        // finishing silently as if the crawl had found nothing.
        if (_config.ColorParams.Length == 0)
            Log.Warn("No color filters configured (Sync.ColorParams) — full discovery has nothing to crawl.");

        foreach (var color in _config.ColorParams)
        {
            Log.Info($"Color filter: {color}");
            try
            {
                await GotoAsync($"{_config.ColorsUrl}?color={Uri.EscapeDataString(color)}");

                var noNew = 0;
                for (var i = 0; i < _config.MaxScrolls; i++)
                {
                    var added = 0;
                    foreach (var link in await CollectProductLinksAsync())
                        if (known.Add(link)) added++;

                    // Only touch the file when this scroll actually found something.
                    if (added > 0)
                        JsonStore.SaveUrls(_config.ProductUrlsFile, known);

                    noNew = added == 0 ? noNew + 1 : 0;
                    if (noNew >= _config.StopAfterNoNewScrolls)
                        break;

                    await ScrollAsync();
                }

                Log.Info($"Color {color} done. Total {known.Count}");
                // Short pause before moving on to the next filter.
                await _session.Page.WaitForTimeoutAsync(3000);
            }
            catch (Exception ex)
            {
                Log.Warn($"Color {color} failed: {ex.Message}");
            }
        }

        var newCount = known.Count - startCount;
        Log.Info($"Full discovery done. New this run: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>Navigates to <paramref name="url"/> (60 s timeout, waits for DOMContentLoaded), then waits the configured settle time.</summary>
    private async Task GotoAsync(string url)
    {
        await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = 60000
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
    }

    /// <summary>Scrolls the page down with the mouse wheel, then waits <c>ScrollWaitMs</c> for more items to load.</summary>
    private async Task ScrollAsync()
    {
        await _session.Page.Mouse.WheelAsync(0, 2500);
        await _session.Page.WaitForTimeoutAsync(_config.ScrollWaitMs);
    }

    /// <summary>
    /// Reads the href of every anchor currently on the page, keeps those matching the
    /// product-URL pattern, and returns them with query string and fragment stripped.
    /// </summary>
    private async Task<List<string>> CollectProductLinksAsync()
    {
        var hrefs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a", "els => els.map(a => a.href).filter(Boolean)");

        return hrefs
            .Where(h => ProductUrlRegex.IsMatch(h))
            .Select(JsonStore.CleanUrl)
            .Where(u => u.Length > 0)
            .ToList();
    }
}
|
||||
@@ -0,0 +1,308 @@
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Playwright;
|
||||
using PrismaticSync.Infrastructure;
|
||||
using PrismaticSync.Models;
|
||||
|
||||
namespace PrismaticSync.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s. Resumable (skips
|
||||
/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
|
||||
/// records get re-scraped to catch price changes. Saves after every product so a long run can be
|
||||
/// stopped and resumed safely, and logs continuously — including the delay between products — so a
|
||||
/// manual run always shows it's alive.
|
||||
/// </summary>
|
||||
public class PrismaticScraper
|
||||
{
|
||||
private static readonly Regex ProductUrlRegex =
|
||||
new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
private static readonly Regex SkuRegex =
|
||||
new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
private static readonly Regex DescRegex =
|
||||
new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
|
||||
private static readonly Regex PriceTierRegex =
|
||||
new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
|
||||
private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled);
|
||||
private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);
|
||||
|
||||
// Browser page used for every navigation and DOM query.
private readonly BrowserSession _session;

// File locations, delays and retry limits for the run.
private readonly SyncConfig _config;

// Source of the randomized pause between products.
private readonly Random _random = new Random();

/// <summary>Creates a scraper that works through the given browser session using the given settings.</summary>
/// <param name="session">Browser session whose page is navigated to each product URL.</param>
/// <param name="config">Sync settings (input/output files, delays, block handling).</param>
public PrismaticScraper(BrowserSession session, SyncConfig config)
{
    (_session, _config) = (session, config);
}
|
||||
|
||||
/// <summary>
/// Scrapes the products that need work: those with no saved result, plus (when
/// <paramref name="refreshOlderThanDays"/> is greater than 0) those whose saved result is older
/// than that many days. The output file is saved after every product, success or failure.
/// </summary>
/// <param name="refreshOlderThanDays">Age in days after which a saved result is re-scraped; 0 or less disables refreshing.</param>
/// <param name="maxProducts">Maximum number of products to process this run; 0 or less means no limit.</param>
/// <param name="retryErrors">When true, URLs that have a recorded error and no result are attempted again.</param>
/// <returns>The number of products saved and the number that failed during this run.</returns>
public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
{
    // De-duplicate the input so a URL listed twice is not scraped twice in the same run.
    var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
        .Where(u => ProductUrlRegex.IsMatch(u))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();

    var data = JsonStore.LoadOutput(_config.OutputJsonFile);

    // Index existing results by URL (keep the most recent if the file has dupes).
    var resultByUrl = data.Results
        .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
        .ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);

    var errorUrls = new HashSet<string>(
        data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);

    var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));

    var toScrape = new List<string>();
    foreach (var url in allUrls)
    {
        if (resultByUrl.TryGetValue(url, out var existing))
        {
            if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
                toScrape.Add(url); // stale → refresh for price changes
        }
        else if (retryErrors || !errorUrls.Contains(url))
        {
            toScrape.Add(url); // never scraped (skip known errors unless retrying)
        }
    }

    if (maxProducts > 0)
        toScrape = toScrape.Take(maxProducts).ToList();

    var total = toScrape.Count;
    Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
    Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");

    if (total == 0)
    {
        Log.Info("Nothing to scrape. Done.");
        return (0, 0);
    }

    // Rough estimate: average inter-product delay + page settle time + ~2s of overhead per product.
    var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
    var etaMinutes = total * (avgDelaySec + _config.PageSettleSeconds + 2) / 60.0;
    Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
             $"(grab a coffee if that's a while — it saves after every product and is resumable).");

    var stopwatch = Stopwatch.StartNew();
    int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;

    foreach (var url in toScrape)
    {
        index++;

        // Retry loop for a single product: exits on success or on a recorded failure.
        for (var attempt = 1; ; attempt++)
        {
            try
            {
                var row = await ParseProductAsync(url, index, total);

                // Replace a refreshed record in place; append a brand-new one.
                if (resultByUrl.TryGetValue(url, out var existing))
                    data.Results[data.Results.IndexOf(existing)] = row;
                else
                    data.Results.Add(row);

                resultByUrl[url] = row;
                data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));

                scraped++;
                consecutiveBlocks = 0;
                JsonStore.SaveOutput(_config.OutputJsonFile, data);

                var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
                Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
                         $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
                break;
            }
            catch (Exception ex) when (IsBlocked(ex) && attempt <= _config.BlockedMaxRetries)
            {
                // Site pushed back — back off (escalating) and retry the SAME product rather
                // than barreling on, which is how an unattended run gets hard-banned.
                consecutiveBlocks++;
                var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
                Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
                await Task.Delay(cooldown * 1000);
            }
            catch (Exception ex)
            {
                // Replace any earlier error for this URL instead of appending, so a product that
                // keeps failing across runs does not pile up duplicate rows in the output file.
                data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
                data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
                JsonStore.SaveOutput(_config.OutputJsonFile, data);
                errors++;
                Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
                break;
            }
        }

        // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
        if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
        {
            Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
            await Task.Delay(_config.LongRestSeconds * 1000);
        }

        // No trailing wait after the final product.
        if (index < total)
        {
            var delayMs = RandomDelayMs();
            Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
            await Task.Delay(delayMs);
        }
    }

    Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
             $"Took {FormatDuration(stopwatch.Elapsed)}.");
    return (scraped, errors);
}
|
||||
|
||||
/// <summary>
/// Navigates to a product page and extracts its fields into a <see cref="ProductRecord"/>.
/// </summary>
/// <param name="url">Product page URL to load; stored on the record as its product URL.</param>
/// <param name="index">1-based position of this product in the run (used for log prefixes only).</param>
/// <param name="total">Total products in the run (used for log prefixes only).</param>
/// <returns>The parsed record, stamped with the current UTC time.</returns>
/// <exception cref="InvalidOperationException">
/// The site answered 403 or 404 (by status or page title), or neither a SKU nor a heading was found.
/// The 403 message starts with "403 Forbidden".
/// </exception>
private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
{
    Log.Info($"[{index}/{total}] Scraping {url}");

    var page = _session.Page;
    var response = await page.GotoAsync(url, new PageGotoOptions
    {
        WaitUntil = WaitUntilState.DOMContentLoaded,
        Timeout = 60000
    });

    // The HTTP status is known as soon as navigation returns, so fail before paying for the
    // settle wait and DOM reads.
    var status = response?.Status ?? 0;
    if (status == 403)
        throw new InvalidOperationException("403 Forbidden returned by site.");
    if (status == 404)
        throw new InvalidOperationException("404 Not Found returned by site.");

    await page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);

    // Some error pages only reveal themselves through the title, so check that too.
    var title = Clean(await SafeTextAsync(() => page.TitleAsync()));
    var plainText = Clean(await SafeTextAsync(() => page.Locator("body").InnerTextAsync()));

    if (Regex.IsMatch(title, @"^403 Forbidden$", RegexOptions.IgnoreCase))
        throw new InvalidOperationException("403 Forbidden returned by site.");

    // \b keeps a longer number that merely contains "404" (e.g. a SKU such as "PMB-4043") from
    // being mistaken for a not-found page.
    if (Regex.IsMatch(title, @"\b404\b|Page Not Found", RegexOptions.IgnoreCase))
        throw new InvalidOperationException("404 Not Found returned by site.");

    var colorName = Clean(await SafeTextAsync(() => page.Locator("h1").First.InnerTextAsync()));

    var skuMatch = SkuRegex.Match(plainText);
    var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";
    if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
        throw new InvalidOperationException("Could not find SKU or title on product page.");

    var descMatch = DescRegex.Match(plainText);
    var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";

    // Link and image lookups each run a page evaluation; keep them sequential on the single page.
    var safetySheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" });
    var techSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" });
    var applicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" });
    var sampleImageUrl = await GetSampleImageUrlAsync();

    return new ProductRecord
    {
        Sku = sku,
        ColorName = colorName,
        Description = description,
        PriceTiers = ParsePriceTiers(plainText),
        SafetyDataSheetUrl = safetySheetUrl,
        TechnicalDataSheetUrl = techSheetUrl,
        ApplicationGuideUrl = applicationGuideUrl,
        SampleImageUrl = sampleImageUrl,
        ProductUrl = url,
        ScrapedAt = DateTime.UtcNow
    };
}
|
||||
|
||||
/// <summary>
/// Extracts every price tier found in the page text. A tier whose amount does not parse as a
/// decimal is skipped. "min - max" labels set both bounds; "min+" labels set only the lower bound.
/// </summary>
/// <param name="text">Whitespace-normalized page text to scan.</param>
/// <returns>The tiers in the order they appear; empty when none match.</returns>
private static List<PriceTier> ParsePriceTiers(string text)
{
    var tiers = new List<PriceTier>();
    var tierMatches = PriceTierRegex.Matches(text);

    for (var i = 0; i < tierMatches.Count; i++)
    {
        var tierMatch = tierMatches[i];

        var amountParsed = decimal.TryParse(
            tierMatch.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var price);
        if (!amountParsed)
        {
            continue;
        }

        var label = Clean(tierMatch.Groups[1].Value);
        int? lower = null;
        int? upper = null;

        // A tier label is either open-ended ("N+") or bounded ("N - M"), never both.
        var openEnded = PlusRegex.Match(label);
        if (openEnded.Success)
        {
            lower = int.Parse(openEnded.Groups[1].Value);
        }
        else
        {
            var bounded = RangeRegex.Match(label);
            if (bounded.Success)
            {
                lower = int.Parse(bounded.Groups[1].Value);
                upper = int.Parse(bounded.Groups[2].Value);
            }
        }

        tiers.Add(new PriceTier { Min = lower, Max = upper, Price = price });
    }

    return tiers;
}
|
||||
|
||||
/// <summary>
/// Returns the href of the first document link whose text matches any of the given patterns, or an
/// empty string when there is none. A single page evaluation returns one string per anchor in the
/// form "text" + U+0001 + "href", which avoids object deserialization quirks.
/// </summary>
/// <param name="patterns">Regular expressions tested (case-insensitively) against each link's text.</param>
private async Task<string> GetLinkByTextAsync(string[] patterns)
{
    // Must match String.fromCharCode(1) in the script below.
    const char Separator = '\u0001';

    var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
        "a",
        "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
        "+ String.fromCharCode(1) + (a.href || ''))");

    foreach (var entry in combined)
    {
        // The href is appended last, so split on the final separator.
        var cut = entry.LastIndexOf(Separator);
        var text = cut >= 0 ? entry.Substring(0, cut) : entry;
        var href = cut >= 0 ? entry.Substring(cut + 1) : "";

        // Require the link to point at an actual document, not a generic /documents nav page.
        if (href.Length > 0
            && IsDocumentUrl(href)
            && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
            return href;
    }

    return "";
}
|
||||
|
||||
/// <summary>
/// True when an href looks like a real document: it mentions nicindustries.com anywhere, or its
/// path (ignoring any query string or fragment) ends in ".pdf".
/// </summary>
/// <param name="href">Absolute link target to classify.</param>
private static bool IsDocumentUrl(string href)
{
    // Cut at the first '?' or '#' so "file.pdf?v=2" and "file.pdf#page=3" both count as PDFs.
    var cut = href.IndexOfAny(new[] { '?', '#' });
    var path = cut >= 0 ? href.Substring(0, cut) : href;

    return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
        || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Returns the product sample image URL from the current page, or an empty string when there is
/// none. The first non-thumbnail product image wins; otherwise the first product image of any kind.
/// </summary>
private async Task<string> GetSampleImageUrlAsync()
{
    const string ProductImagePattern = @"images\.nicindustries\.com/prismatic/products";

    var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
        "img",
        "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
        ".filter(Boolean)");

    // Only product images on the NIC CDN qualify. There is deliberately no looser fallback: a
    // generic "prismatic"-ish match would pick up the site logo on products with no image.
    string? firstProductImage = null;
    foreach (var src in srcs)
    {
        if (!Regex.IsMatch(src, ProductImagePattern, RegexOptions.IgnoreCase))
        {
            continue;
        }

        if (!Regex.IsMatch(src, "thumbnail", RegexOptions.IgnoreCase))
        {
            return src; // full-size image: best candidate, stop here
        }

        firstProductImage ??= src; // remember the earliest thumbnail as a fallback
    }

    return firstProductImage ?? "";
}
|
||||
|
||||
/// <summary>
/// True when the exception is the "403 Forbidden ..." failure raised while parsing a product page.
/// The message must start with that text: a bare substring search for "403" would also match
/// unrelated errors whose text happens to contain those digits (for example a URL with a SKU
/// like "PSS-4030" inside a timeout message).
/// </summary>
private static bool IsBlocked(Exception ex) =>
    ex.Message.StartsWith("403 Forbidden", StringComparison.OrdinalIgnoreCase);
|
||||
|
||||
/// <summary>
/// Runs a text-producing operation and returns its result, or an empty string if it throws.
/// Best-effort by design: any exception is swallowed.
/// </summary>
/// <param name="fn">Operation that yields the text.</param>
private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
{
    try
    {
        var text = await fn();
        return text;
    }
    catch
    {
        return "";
    }
}
|
||||
|
||||
/// <summary>Collapses every run of whitespace to a single space and trims the ends; null becomes "".</summary>
private static string Clean(string? text)
{
    var value = text ?? "";
    var collapsed = WhitespaceRegex.Replace(value, " ");
    return collapsed.Trim();
}
|
||||
|
||||
/// <summary>
/// Picks a random pause, in milliseconds, between the configured minimum and maximum delay
/// (both inclusive). A negative minimum is treated as 0, and a maximum below the minimum is
/// raised to the minimum.
/// </summary>
private int RandomDelayMs()
{
    var lowerMs = _config.MinDelaySeconds * 1000;
    if (lowerMs < 0)
    {
        lowerMs = 0;
    }

    var upperMs = _config.MaxDelaySeconds * 1000;
    if (upperMs < lowerMs)
    {
        upperMs = lowerMs;
    }

    // Random.Next's upper bound is exclusive, hence the +1.
    return _random.Next(lowerMs, upperMs + 1);
}
|
||||
|
||||
/// <summary>
/// Formats a duration for log output using its two most significant units: "Xh Ym" from one
/// hour up, "Xm Ys" from one minute up, otherwise "Xs".
/// </summary>
private static string FormatDuration(TimeSpan t)
{
    if (t.TotalHours >= 1)
    {
        var wholeHours = (int)t.TotalHours;
        return $"{wholeHours}h {t.Minutes}m";
    }

    if (t.TotalMinutes >= 1)
    {
        var wholeMinutes = (int)t.TotalMinutes;
        return $"{wholeMinutes}m {t.Seconds}s";
    }

    return $"{t.Seconds}s";
}
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"Sync": {
|
||||
"BaseUrl": "https://www.prismaticpowders.com",
|
||||
"ColorsPath": "/shop/powder-coating-colors",
|
||||
|
||||
"ProductUrlsFile": "product-urls.txt",
|
||||
"OutputJsonFile": "prismatic_powders.json",
|
||||
"LogFile": "prismatic-sync.log",
|
||||
|
||||
"MinDelaySeconds": 6,
|
||||
"MaxDelaySeconds": 14,
|
||||
"PageSettleSeconds": 4,
|
||||
|
||||
"BlockedCooldownSeconds": 120,
|
||||
"BlockedCooldownMaxSeconds": 600,
|
||||
"BlockedMaxRetries": 3,
|
||||
"LongRestEveryProducts": 150,
|
||||
"LongRestSeconds": 45,
|
||||
|
||||
"ScrollWaitMs": 1500,
|
||||
"MaxScrolls": 400,
|
||||
"StopAfterNoNewScrolls": 10,
|
||||
"StopAfterKnownScrolls": 8,
|
||||
|
||||
"ColorParams": [
|
||||
"pris_black", "pris_blue", "pris_bronze", "pris_brown", "pris_clear",
|
||||
"pris_copper", "pris_gold", "pris_gray", "pris_green", "pris_orange",
|
||||
"pris_pink", "pris_purple", "pris_red", "pris_silver", "pris_tan",
|
||||
"pris_white", "pris_yellow"
|
||||
],
|
||||
|
||||
"Import": {
|
||||
"EndpointUrl": "",
|
||||
"Token": "",
|
||||
"VendorName": "Prismatic Powders"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -25,6 +25,7 @@ public class PowderCatalogController : Controller
|
||||
private readonly IColumbiaCatalogSyncService _columbiaSyncService;
|
||||
private readonly IPowderCatalogUpsertService _upsertService;
|
||||
private readonly IPlatformSettingsService _platformSettings;
|
||||
private readonly IConfiguration _config;
|
||||
private readonly ILogger<PowderCatalogController> _logger;
|
||||
|
||||
public PowderCatalogController(
|
||||
@@ -33,6 +34,7 @@ public class PowderCatalogController : Controller
|
||||
IColumbiaCatalogSyncService columbiaSyncService,
|
||||
IPowderCatalogUpsertService upsertService,
|
||||
IPlatformSettingsService platformSettings,
|
||||
IConfiguration config,
|
||||
ILogger<PowderCatalogController> logger)
|
||||
{
|
||||
_unitOfWork = unitOfWork;
|
||||
@@ -40,6 +42,7 @@ public class PowderCatalogController : Controller
|
||||
_columbiaSyncService = columbiaSyncService;
|
||||
_upsertService = upsertService;
|
||||
_platformSettings = platformSettings;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
@@ -372,7 +375,8 @@ public class PowderCatalogController : Controller
|
||||
PowderCatalogImportResult result;
|
||||
try
|
||||
{
|
||||
result = await ImportJsonAsync(file, vendorName);
|
||||
using var stream = file.OpenReadStream();
|
||||
result = await ImportJsonAsync(stream, vendorName);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -393,6 +397,67 @@ public class PowderCatalogController : Controller
|
||||
return RedirectToAction(nameof(Index));
|
||||
}
|
||||
|
||||
/// <summary>
/// Unattended catalog import for the offline scraper tool (e.g. PrismaticSync). Accepts the same
/// JSON scrape format in the request body, authenticated by a shared secret in the
/// <c>X-Import-Token</c> header (matched against <c>CatalogImport:Token</c>). The vendor name
/// comes from the <c>X-Vendor-Name</c> header and defaults to "Prismatic Powders" when absent.
/// Runs through the same import routine as the manual upload. Inert (401) until a token is configured.
/// </summary>
/// <returns>
/// 401 when the API is disabled or the token is wrong; 400 when the body is not valid JSON;
/// 500 on any other failure; otherwise a JSON summary of the import counts.
/// </returns>
[HttpPost]
[AllowAnonymous]
[IgnoreAntiforgeryToken]
[RequestSizeLimit(50 * 1024 * 1024)] // 50 MB
public async Task<IActionResult> ImportApi()
{
    var configuredToken = _config["CatalogImport:Token"];
    if (string.IsNullOrWhiteSpace(configuredToken))
    {
        _logger.LogWarning("ImportApi called but no CatalogImport:Token is configured — rejecting.");
        return Unauthorized(new { success = false, errorMessage = "Import API is not enabled." });
    }

    var providedToken = Request.Headers["X-Import-Token"].ToString();
    if (!FixedTimeEquals(providedToken, configuredToken))
        return Unauthorized(new { success = false, errorMessage = "Invalid import token." });

    var vendorName = Request.Headers["X-Vendor-Name"].ToString();
    if (string.IsNullOrWhiteSpace(vendorName))
        vendorName = "Prismatic Powders";

    try
    {
        var result = await ImportJsonAsync(Request.Body, vendorName);
        _logger.LogInformation(
            "ImportApi ({Vendor}): {Inserted} inserted, {Updated} updated, {Skipped} skipped, {Errors} errors.",
            vendorName, result.Inserted, result.Updated, result.Skipped, result.Errors);

        return Json(new
        {
            success = result.Success,
            vendorName,
            result.Inserted,
            result.Updated,
            result.Skipped,
            result.Errors,
            result.ErrorMessage
        });
    }
    catch (System.Text.Json.JsonException ex)
    {
        // A body that is not parseable JSON is the caller's mistake, not a server fault.
        _logger.LogWarning(ex, "ImportApi rejected malformed JSON for vendor {Vendor}", vendorName);
        return BadRequest(new { success = false, errorMessage = "Request body is not valid JSON." });
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "ImportApi failed for vendor {Vendor}", vendorName);
        return StatusCode(500, new { success = false, errorMessage = "Import failed." });
    }
}
|
||||
|
||||
/// <summary>
/// Constant-time string comparison so token checks don't leak length or contents via timing.
/// Both inputs are hashed with SHA-256 first: <c>CryptographicOperations.FixedTimeEquals</c>
/// returns immediately when its inputs differ in length, so comparing the raw bytes would reveal
/// the secret's length. The digests are always the same size. Null is treated as an empty string.
/// </summary>
/// <returns>True when both strings have the same UTF-8 encoding.</returns>
private static bool FixedTimeEquals(string a, string b)
{
    var digestA = System.Security.Cryptography.SHA256.HashData(
        System.Text.Encoding.UTF8.GetBytes(a ?? string.Empty));
    var digestB = System.Security.Cryptography.SHA256.HashData(
        System.Text.Encoding.UTF8.GetBytes(b ?? string.Empty));

    return System.Security.Cryptography.CryptographicOperations.FixedTimeEquals(digestA, digestB);
}
|
||||
|
||||
/// <summary>
|
||||
/// AJAX endpoint used by the inventory form to search the catalog by SKU or color name.
|
||||
/// SKU exact matches are ranked first; color name substring matches follow.
|
||||
@@ -527,9 +592,8 @@ public class PowderCatalogController : Controller
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<PowderCatalogImportResult> ImportJsonAsync(IFormFile file, string vendorName)
|
||||
private async Task<PowderCatalogImportResult> ImportJsonAsync(Stream stream, string vendorName)
|
||||
{
|
||||
using var stream = file.OpenReadStream();
|
||||
using var doc = await JsonDocument.ParseAsync(stream);
|
||||
|
||||
if (!doc.RootElement.TryGetProperty("results", out var resultsEl) ||
|
||||
|
||||
@@ -47,6 +47,9 @@
|
||||
"BaseUrl": "https://columbiacoatings.com",
|
||||
"ApiBasePath": "/wp-json/cca/v1"
|
||||
},
|
||||
"CatalogImport": {
|
||||
"Token": ""
|
||||
},
|
||||
"SendGrid": {
|
||||
"ApiKey": "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",
|
||||
"FromEmail": "spouliot@scppowdercoating.com",
|
||||
|
||||
Reference in New Issue
Block a user