Add PrismaticSync console tool for unattended Prismatic catalog sync
Standalone .NET 8 console app (not part of the main solution) that scrapes the Prismatic Powders catalog via Playwright and pushes it into the app's catalog import. Prismatic has no API, so this runs on a workstation (Task Scheduler), never the deployed server. - Discovery: incremental newest-first via ?category=created_at (stops once it reaches already-known URLs — cheap, finds new colors) and a full all-colors crawl for occasional reconcile. - Scraper: resumable product-page scrape (sku/color/description/price tiers/ SDS/TDS/app-guide/image), with --refresh-older-than to re-scrape stale products and catch price changes. Output matches the app import format so it flows through the same shared upsert as the Columbia sync. - Resilience: brisk randomized base delay, escalating 403 cooldown-and-retry to avoid hard bans, periodic rest. All configurable. - Visibility: streams every product + the inter-product wait to the console (colored) and a log file, with an up-front ETA. - Push: token-authenticated POST to the app import endpoint (skips to manual upload when unconfigured). The app-side token import endpoint is a separate follow-up. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
# Build output
|
||||
bin/
|
||||
obj/
|
||||
|
||||
# Transient scrape artifacts
|
||||
*.tmp
|
||||
*.invalid-*.bak
|
||||
prismatic-sync.log
|
||||
@@ -0,0 +1,43 @@
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>
/// A Chromium browser session (headless unless a headed window is requested) with a fixed desktop
/// fingerprint: user agent, viewport, locale and timezone. Owns the Playwright driver, the browser,
/// one browser context and one page, and releases all of them on dispose.
/// </summary>
public sealed class BrowserSession : IAsyncDisposable
{
    private IPlaywright? _pw;
    private IBrowser? _browser;
    private IBrowserContext? _context;

    /// <summary>The single page opened in this session's context. Set by <see cref="CreateAsync"/>.</summary>
    public IPage Page { get; private set; } = null!;

    /// <summary>
    /// Starts Playwright, launches Chromium, and opens one context and one page.
    /// </summary>
    /// <param name="headed">True to show the browser window; false to run headless.</param>
    /// <returns>A ready-to-use session. The caller is responsible for disposing it.</returns>
    public static async Task<BrowserSession> CreateAsync(bool headed)
    {
        var session = new BrowserSession();
        try
        {
            session._pw = await Playwright.CreateAsync();

            session._browser = await session._pw.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
            {
                Headless = !headed
            });

            session._context = await session._browser.NewContextAsync(new BrowserNewContextOptions
            {
                UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
                ViewportSize = new ViewportSize { Width = 1365, Height = 900 },
                Locale = "en-US",
                TimezoneId = "America/New_York"
            });

            session.Page = await session._context.NewPageAsync();
            return session;
        }
        catch
        {
            // A step failed part-way (for example the browser could not be launched). Release whatever
            // was already started so no driver/browser is left behind, then surface the original error.
            try
            {
                await session.DisposeAsync();
            }
            catch
            {
                // Deliberately ignored: the setup failure below is the error the caller needs to see.
            }

            throw;
        }
    }

    /// <summary>
    /// Closes the context, then the browser, then disposes the Playwright driver. Each step runs even
    /// if an earlier one throws, and calling this more than once is harmless.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        // Detach the fields first so a second call finds nothing left to release.
        var context = _context;
        var browser = _browser;
        var pw = _pw;
        _context = null;
        _browser = null;
        _pw = null;

        try
        {
            if (context is not null) await context.CloseAsync();
        }
        finally
        {
            try
            {
                if (browser is not null) await browser.CloseAsync();
            }
            finally
            {
                pw?.Dispose();
            }
        }
    }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
using System.Text.Json;
|
||||
using PrismaticSync.Models;
|
||||
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>
/// File persistence for the scrape output (JSON) and the product URL list (plain text, one URL per
/// line). Saves write to a sibling ".tmp" file and then move it over the target, so the target is
/// swapped in one step instead of being rewritten in place.
/// </summary>
public static class JsonStore
{
    private static readonly JsonSerializerOptions WriteOptions = new() { WriteIndented = true };
    private static readonly JsonSerializerOptions ReadOptions = new() { PropertyNameCaseInsensitive = true };

    /// <summary>
    /// Loads the scrape output from <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Path of the JSON output file.</param>
    /// <returns>
    /// The parsed output; an empty <see cref="ScrapeOutput"/> when the file does not exist or
    /// deserializes to null.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The file exists but could not be parsed. A timestamped ".invalid-*.bak" copy is attempted first;
    /// the original parse error is attached as the inner exception.
    /// </exception>
    public static ScrapeOutput LoadOutput(string path)
    {
        if (!File.Exists(path))
            return new ScrapeOutput();

        var json = File.ReadAllText(path);
        try
        {
            // Tolerate a bare array (older output format) as well as { results, errors }.
            if (json.TrimStart().StartsWith('['))
            {
                var results = JsonSerializer.Deserialize<List<ProductRecord>>(json, ReadOptions) ?? new();
                return new ScrapeOutput { Results = results };
            }

            return JsonSerializer.Deserialize<ScrapeOutput>(json, ReadOptions) ?? new ScrapeOutput();
        }
        catch (Exception ex)
        {
            var backup = $"{path}.invalid-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}.bak";

            // Keep the unreadable file for inspection, but never let a failing copy hide the parse error.
            string backupNote;
            try
            {
                File.Copy(path, backup, overwrite: true);
                backupNote = $"Backed it up to {backup}.";
            }
            catch (Exception copyEx)
            {
                backupNote = $"Backing it up to {backup} also failed ({copyEx.Message}).";
            }

            throw new InvalidOperationException($"Could not parse {path}. {backupNote} {ex.Message}", ex);
        }
    }

    /// <summary>Serializes <paramref name="data"/> as indented JSON and replaces the file at <paramref name="path"/>.</summary>
    public static void SaveOutput(string path, ScrapeOutput data) =>
        WriteViaTempFile(path, JsonSerializer.Serialize(data, WriteOptions));

    /// <summary>
    /// Reads the URL list: one URL per line, each passed through <see cref="CleanUrl"/>, with empty
    /// results dropped and duplicates removed case-insensitively.
    /// </summary>
    /// <returns>The cleaned URLs in file order; an empty list when the file does not exist.</returns>
    public static List<string> LoadUrls(string path)
    {
        if (!File.Exists(path))
            return new List<string>();

        return File.ReadAllLines(path)
            .Select(CleanUrl)
            // CleanUrl cuts everything from the first '#', so "# comment" lines arrive here empty.
            .Where(u => u.Length > 0)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Writes the URLs de-duplicated and sorted (both case-insensitive), one per line, replacing the
    /// file at <paramref name="path"/>.
    /// </summary>
    public static void SaveUrls(string path, IEnumerable<string> urls)
    {
        var sorted = urls
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(u => u, StringComparer.OrdinalIgnoreCase);

        WriteViaTempFile(path, string.Join(Environment.NewLine, sorted) + Environment.NewLine);
    }

    /// <summary>
    /// Normalizes a URL by removing the query string and fragment and trimming surrounding whitespace.
    /// Null is treated as an empty string.
    /// </summary>
    public static string CleanUrl(string? url) =>
        (url ?? string.Empty).Split('?')[0].Split('#')[0].Trim();

    /// <summary>Writes <paramref name="contents"/> to "path.tmp", then moves that file over <paramref name="path"/>.</summary>
    private static void WriteViaTempFile(string path, string contents)
    {
        var tmp = path + ".tmp";
        File.WriteAllText(tmp, contents);
        File.Move(tmp, path, overwrite: true);
    }
}
|
||||
@@ -0,0 +1,49 @@
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>
/// Tiny dependency-free logger. Every message is stamped with the current UTC time and a level,
/// printed to the console, and appended to a log file so an unattended run leaves a trail behind.
/// </summary>
public static class Log
{
    private static string _logFile = "prismatic-sync.log";
    private static readonly object Gate = new();

    /// <summary>Sets the file that subsequent messages are appended to.</summary>
    public static void Configure(string logFile)
    {
        _logFile = logFile;
    }

    /// <summary>Logs an informational message.</summary>
    public static void Info(string message)
    {
        Write("INFO", message);
    }

    /// <summary>Logs a warning (shown in yellow on the console).</summary>
    public static void Warn(string message)
    {
        Write("WARN", message);
    }

    /// <summary>Logs an error (shown in red on the console).</summary>
    public static void Error(string message)
    {
        Write("ERROR", message);
    }

    /// <summary>Formats one line and emits it to the console and the log file under a single lock.</summary>
    private static void Write(string level, string message)
    {
        var line = $"[{DateTime.UtcNow:yyyy-MM-ddTHH:mm:ssZ}] {level,-5} {message}";

        lock (Gate)
        {
            // Only warnings and errors get a highlight; everything else uses the current color.
            ConsoleColor? highlight = null;
            if (level == "WARN")
            {
                highlight = ConsoleColor.Yellow;
            }
            else if (level == "ERROR")
            {
                highlight = ConsoleColor.Red;
            }

            if (highlight.HasValue)
            {
                var restore = Console.ForegroundColor;
                Console.ForegroundColor = highlight.Value;
                Console.WriteLine(line);
                Console.ForegroundColor = restore;
            }
            else
            {
                Console.WriteLine(line);
            }

            // Best effort: a failure to write the log file must not abort the run.
            try
            {
                File.AppendAllText(_logFile, line + Environment.NewLine);
            }
            catch
            {
                /* ignore */
            }
        }
    }
}
|
||||
@@ -0,0 +1,69 @@
|
||||
namespace PrismaticSync.Infrastructure;
|
||||
|
||||
/// <summary>Strongly-typed config bound from the "Sync" section of appsettings.json.</summary>
public class SyncConfig
{
    /// <summary>Site root. A trailing slash is tolerated (see <see cref="ColorsUrl"/>).</summary>
    public string BaseUrl { get; set; } = "https://www.prismaticpowders.com";

    /// <summary>Path of the color listing under <see cref="BaseUrl"/>. A leading slash is optional.</summary>
    public string ColorsPath { get; set; } = "/shop/powder-coating-colors";

    /// <summary>Text file holding the discovered product URLs, one per line.</summary>
    public string ProductUrlsFile { get; set; } = "product-urls.txt";

    /// <summary>JSON file the scrape results are written to.</summary>
    public string OutputJsonFile { get; set; } = "prismatic_powders.json";

    /// <summary>File the logger appends to.</summary>
    public string LogFile { get; set; } = "prismatic-sync.log";

    /// <summary>Politeness delay between product scrapes (randomized within the range): lower bound.</summary>
    public int MinDelaySeconds { get; set; } = 6;

    /// <summary>Politeness delay between product scrapes (randomized within the range): upper bound.</summary>
    public int MaxDelaySeconds { get; set; } = 14;

    /// <summary>On a 403/block, cool down this many seconds × the consecutive-block count, then retry.</summary>
    public int BlockedCooldownSeconds { get; set; } = 120;

    /// <summary>Upper bound on a single cooldown so escalation can't run away.</summary>
    public int BlockedCooldownMaxSeconds { get; set; } = 600;

    /// <summary>How many times to cool-down-and-retry a blocked product before recording it as an error.</summary>
    public int BlockedMaxRetries { get; set; } = 3;

    /// <summary>Take a longer rest after this many products (0 disables). Eases load and looks less robotic.</summary>
    public int LongRestEveryProducts { get; set; } = 150;

    /// <summary>Length of the periodic long rest, in seconds.</summary>
    public int LongRestSeconds { get; set; } = 45;

    /// <summary>Extra settle time after a page loads before reading it.</summary>
    public int PageSettleSeconds { get; set; } = 4;

    /// <summary>Pause after each scroll while a listing lazy-loads more items.</summary>
    public int ScrollWaitMs { get; set; } = 1500;

    /// <summary>Hard cap on scrolls per listing, as a safety stop.</summary>
    public int MaxScrolls { get; set; } = 400;

    /// <summary>Full discovery: stop a listing after this many scrolls add no new links.</summary>
    public int StopAfterNoNewScrolls { get; set; } = 10;

    /// <summary>
    /// Incremental discovery: stop the newest-first listing after this many consecutive scrolls
    /// that surfaced only already-known URLs — i.e. we've scrolled past the new products.
    /// </summary>
    public int StopAfterKnownScrolls { get; set; } = 8;

    /// <summary>Color filter params used by full discovery. Empty by default.</summary>
    public string[] ColorParams { get; set; } = Array.Empty<string>();

    /// <summary>Settings for pushing the scraped catalog into the app.</summary>
    public ImportConfig Import { get; set; } = new();

    /// <summary>
    /// Absolute URL of the color listing: <see cref="BaseUrl"/> and <see cref="ColorsPath"/> joined with
    /// exactly one slash, whether or not the base ends with one or the path starts with one. An empty
    /// path yields the base URL without a trailing slash.
    /// </summary>
    public string ColorsUrl
    {
        get
        {
            var root = BaseUrl.TrimEnd('/');
            var path = ColorsPath.TrimStart('/');
            return path.Length == 0 ? root : $"{root}/{path}";
        }
    }
}

/// <summary>Where and how to push the scraped catalog into the app.</summary>
public class ImportConfig
{
    /// <summary>Full URL of the app's token-authenticated catalog import endpoint. Empty means "not configured".</summary>
    public string EndpointUrl { get; set; } = "";

    /// <summary>Shared secret sent in the X-Import-Token header. Must match the app's config.</summary>
    public string Token { get; set; } = "";

    /// <summary>Vendor name applied to every record on import.</summary>
    public string VendorName { get; set; } = "Prismatic Powders";
}
|
||||
@@ -0,0 +1,45 @@
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace PrismaticSync.Models;
|
||||
|
||||
/// <summary>
|
||||
/// On-disk scrape output. Shape matches the app's catalog import (a top-level "results" array of
|
||||
/// snake_case product records), so the JSON drops straight into the import endpoint. "errors" tracks
|
||||
/// failed URLs for resumable re-runs.
|
||||
/// </summary>
|
||||
public class ScrapeOutput
|
||||
{
|
||||
[JsonPropertyName("results")] public List<ProductRecord> Results { get; set; } = new();
|
||||
[JsonPropertyName("errors")] public List<ScrapeError> Errors { get; set; } = new();
|
||||
}
|
||||
|
||||
/// <summary>One scraped product, in the import's expected field shape.</summary>
|
||||
public class ProductRecord
|
||||
{
|
||||
[JsonPropertyName("sku")] public string Sku { get; set; } = "";
|
||||
[JsonPropertyName("color_name")] public string ColorName { get; set; } = "";
|
||||
[JsonPropertyName("description")] public string Description { get; set; } = "";
|
||||
[JsonPropertyName("price_tiers")] public List<PriceTier> PriceTiers { get; set; } = new();
|
||||
[JsonPropertyName("safety_data_sheet_url")] public string SafetyDataSheetUrl { get; set; } = "";
|
||||
[JsonPropertyName("technical_data_sheet_url")] public string TechnicalDataSheetUrl { get; set; } = "";
|
||||
[JsonPropertyName("application_guide_url")] public string ApplicationGuideUrl { get; set; } = "";
|
||||
[JsonPropertyName("sample_image_url")] public string SampleImageUrl { get; set; } = "";
|
||||
[JsonPropertyName("product_url")] public string ProductUrl { get; set; } = "";
|
||||
[JsonPropertyName("scraped_at")] public DateTime ScrapedAt { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>A quantity-break price tier — {min, max, price}. max is null for an open-ended top tier.</summary>
|
||||
public class PriceTier
|
||||
{
|
||||
[JsonPropertyName("min")] public int? Min { get; set; }
|
||||
[JsonPropertyName("max")] public int? Max { get; set; }
|
||||
[JsonPropertyName("price")] public decimal Price { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>A URL that failed to scrape, kept so resumable runs can skip or retry it.</summary>
|
||||
public class ScrapeError
|
||||
{
|
||||
[JsonPropertyName("product_url")] public string ProductUrl { get; set; } = "";
|
||||
[JsonPropertyName("error")] public string Error { get; set; } = "";
|
||||
[JsonPropertyName("scraped_at")] public DateTime ScrapedAt { get; set; }
|
||||
}
|
||||
@@ -0,0 +1,36 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<!--
|
||||
Standalone workstation tool — deliberately NOT part of PowderCoating.sln.
|
||||
Build/publish independently and run on a machine you control (Task Scheduler),
|
||||
never on the deployed app server. Scrapes Prismatic Powders and pushes the
|
||||
result into the app's catalog import endpoint.
|
||||
|
||||
First-time setup on a workstation:
|
||||
dotnet build
|
||||
pwsh bin/Debug/net8.0/playwright.ps1 install chromium
|
||||
-->
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<AssemblyName>PrismaticSync</AssemblyName>
|
||||
<RootNamespace>PrismaticSync</RootNamespace>
|
||||
<InvariantGlobalization>true</InvariantGlobalization>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Playwright" Version="1.49.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration" Version="8.0.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="8.0.1" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="8.0.2" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<None Update="appsettings.json">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,106 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using PrismaticSync.Infrastructure;
|
||||
using PrismaticSync.Services;
|
||||
|
||||
// ── Load config ───────────────────────────────────────────────────────────────
// appsettings.json is required and is read from the directory the executable lives in.
var configRoot = new ConfigurationBuilder()
    .SetBasePath(AppContext.BaseDirectory)
    .AddJsonFile("appsettings.json", optional: false)
    .Build();

var config = configRoot.GetSection("Sync").Get<SyncConfig>() ?? new SyncConfig();
Log.Configure(config.LogFile);

// ── Parse args ────────────────────────────────────────────────────────────────
// The first argument is the command unless it looks like an option; no command means "run".
var command = args.Length > 0 && !args[0].StartsWith("--") ? args[0].ToLowerInvariant() : "run";
var headed = args.Contains("--headed");
var retryErrors = args.Contains("--retry-errors");
var maxProducts = GetIntArg("--max-products", 0);
// "run" refreshes products older than 30 days by default; explicit commands default to new-only.
var refreshOlderThanDays = GetIntArg("--refresh-older-than", command == "run" ? 30 : 0);

Log.Info($"PrismaticSync — command '{command}' (headed={headed}, refreshOlderThan={refreshOlderThanDays}d, maxProducts={maxProducts})");

// PushAsync returns false both when the push is skipped because no endpoint is configured and when
// a configured push does not go through. Only the latter should fail the run.
var pushConfigured = !string.IsNullOrWhiteSpace(config.Import.EndpointUrl);

try
{
    var pushOk = true;

    switch (command)
    {
        case "discover-new":
            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverNewAsync());
            break;

        case "discover-full":
            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverFullAsync());
            break;

        case "scrape":
            await WithBrowser(d => new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors));
            break;

        case "push":
            pushOk = await new CatalogPusher(config).PushAsync();
            break;

        case "run":
            // The scheduled default: find new colors, scrape new + stale, then push.
            await WithBrowser(async d =>
            {
                await new PrismaticDiscoverer(d, config).DiscoverNewAsync();
                await new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors);
            });
            pushOk = await new CatalogPusher(config).PushAsync();
            break;

        default:
            PrintUsage();
            return 1;
    }

    if (!pushOk && pushConfigured)
    {
        // Exit non-zero so a scheduler sees the failure instead of a misleading "Done.".
        Log.Error("Finished, but the catalog push did not succeed — see the messages above.");
        return 1;
    }

    Log.Info("Done.");
    return 0;
}
catch (Exception ex)
{
    // Last-resort handler: log the full exception (including stack trace) and exit non-zero.
    Log.Error($"Fatal: {ex}");
    return 1;
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
// Creates a browser session (honoring the --headed flag), hands it to `action`, and disposes the
// session when the action finishes or throws.
async Task WithBrowser(Func<BrowserSession, Task> action)
{
    await using (var browser = await BrowserSession.CreateAsync(headed))
    {
        await action(browser);
    }
}
|
||||
|
||||
// Reads an integer option given as `--name=value` (name matched case-insensitively; the first match
// wins). Returns `fallback` when the option is absent or unusable. An option that is present but
// unusable — no "=value" part, or a value that is not an integer — is reported with a warning, so a
// typo cannot silently turn into the default.
int GetIntArg(string name, int fallback)
{
    var prefix = name + "=";
    var found = args.FirstOrDefault(a => a.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));

    if (found is null)
    {
        // "--name" on its own (or "--name 5" with a space) is not parsed; say so rather than ignore it.
        if (args.Any(a => a.Equals(name, StringComparison.OrdinalIgnoreCase)))
            Log.Warn($"Option {name} must be given as {name}=N — ignoring it and using {fallback}.");

        return fallback;
    }

    var raw = found[prefix.Length..];
    if (int.TryParse(raw, out var value))
        return value;

    Log.Warn($"Option {name} has a non-integer value '{raw}' — ignoring it and using {fallback}.");
    return fallback;
}
|
||||
|
||||
// Writes the command-line help (commands, options, first-run note) to standard output.
void PrintUsage()
{
    const string usage =
        """
        PrismaticSync — scrape Prismatic Powders and push to the app catalog.

        Usage: PrismaticSync [command] [options]

        Commands:
        run (default) discover-new + scrape (new + stale) + push
        discover-new Incremental discovery via newest-first sort (cheap; finds new colors)
        discover-full Full discovery across all color filters (heavy; reconciles the whole set)
        scrape Scrape product pages from the URL list (resumable)
        push Push the scraped JSON to the import endpoint

        Options:
        --refresh-older-than=N Re-scrape products whose data is older than N days (default 30 for 'run')
        --max-products=N Cap products scraped this run (0 = no cap)
        --retry-errors Retry URLs previously recorded as errors
        --headed Show the browser window (debugging)

        Config: appsettings.json (delays, file paths, import endpoint + token).
        First run on a new machine: dotnet build, then `pwsh bin/Debug/net8.0/playwright.ps1 install chromium`.
        """;

    Console.WriteLine(usage);
}
|
||||
@@ -0,0 +1,82 @@
|
||||
# PrismaticSync
|
||||
|
||||
A standalone .NET console tool that scrapes the Prismatic Powders catalog and pushes it into the
|
||||
Powder Coating Logix catalog import endpoint. It exists because Prismatic has **no API** (unlike
|
||||
Columbia Coatings) — so the data has to be scraped via browser automation.
|
||||
|
||||
> **Runs on a workstation you control — never on the deployed app server.** Scraping from the cloud
|
||||
> app's IP would get blocked and isn't appropriate. This tool is deliberately *not* part of
|
||||
> `PowderCoating.sln`; build and run it independently.
|
||||
|
||||
## First-time setup (per machine)
|
||||
|
||||
```powershell
|
||||
cd "scripts/Prismatic Data Scraper"
|
||||
dotnet build
|
||||
pwsh bin/Debug/net8.0/playwright.ps1 install chromium # one-time browser download
|
||||
```
|
||||
|
||||
## Commands
|
||||
|
||||
```powershell
|
||||
dotnet run -- run # default: discover-new + scrape (new + stale >30d) + push
|
||||
dotnet run -- discover-new # cheap: find newly-added colors (newest-first, stops at known)
|
||||
dotnet run -- discover-full # heavy: crawl all color filters (reconcile whole set / removals)
|
||||
dotnet run -- scrape # scrape product pages from product-urls.txt (resumable)
|
||||
dotnet run -- scrape --refresh-older-than=30 # also re-scrape products older than 30 days (price changes)
|
||||
dotnet run -- push # push prismatic_powders.json to the import endpoint
|
||||
```
|
||||
|
||||
Options: `--max-products=N`, `--retry-errors`, `--headed` (show the browser for debugging).
|
||||
|
||||
Everything streams to the console live (warnings/errors in color) **and** to `prismatic-sync.log`.
|
||||
|
||||
## Operating model (suggested cadence)
|
||||
|
||||
| Run | Command | Cadence | Why |
|
||||
|-----|---------|---------|-----|
|
||||
| Find new colors | `run` (discover-new + scrape of new and >30-day-stale products + push) | Weekly | Discovery is cheap; Prismatic adds colors often |
|
||||
| Price refresh | `scrape --refresh-older-than=30` then `push` | Monthly | Re-scrapes stale products to catch price changes (slow, ~hours) |
|
||||
| Full reconcile | `discover-full` then `scrape` | Quarterly | Catches removed/discontinued colors |
|
||||
|
||||
A full scrape of ~5,000 products takes hours (polite delays). It saves after every product and is
|
||||
fully resumable, so stop/restart any time.
|
||||
|
||||
## Politeness / anti-block
|
||||
|
||||
Configurable in `appsettings.json`: randomized 6–14s base delay, an escalating **cooldown + retry on
|
||||
403** (so a temporary block doesn't get you hard-banned mid-run), and a periodic long rest. Leave
|
||||
these conservative — getting blocked is worse than being slow, and Prismatic is a partner.
|
||||
|
||||
## Pushing into the app
|
||||
|
||||
Set `Sync.Import.EndpointUrl` + `Sync.Import.Token` in `appsettings.json`. The tool POSTs the JSON
|
||||
with an `X-Import-Token` header to the app's token-authenticated import endpoint, which runs it
|
||||
through the same upsert as the Columbia sync. If the endpoint isn't configured, `push` is skipped and
|
||||
you upload `prismatic_powders.json` manually via the Powder Catalog admin page.
|
||||
|
||||
> **App-side dependency:** the token-authenticated import endpoint must exist in the web app for
|
||||
> unattended push to work. Until then, use the manual upload.
|
||||
|
||||
## Scheduling (Windows Task Scheduler)
|
||||
|
||||
Point a scheduled task at the published exe (or `dotnet run`). Example weekly task command:
|
||||
|
||||
```
|
||||
Program/script: C:\Tools\PrismaticSync\PrismaticSync.exe
|
||||
Arguments: run
|
||||
Start in: C:\Tools\PrismaticSync
|
||||
```
|
||||
|
||||
Publish a framework-dependent build to drop on the workstation (`--self-contained false` means the .NET 8 runtime must already be installed there):
|
||||
|
||||
```powershell
|
||||
dotnet publish -c Release -r win-x64 --self-contained false -o C:\Tools\PrismaticSync
|
||||
pwsh C:\Tools\PrismaticSync\playwright.ps1 install chromium
|
||||
```
|
||||
|
||||
## The long game
|
||||
|
||||
This is the interim path. The durable endgame is a real Prismatic **API** (the partnership), at which
|
||||
point this tool is replaced by a clean in-app sync like Columbia's — reusing the same upsert,
|
||||
propagation, and discontinued handling.
|
||||
@@ -0,0 +1,63 @@
|
||||
using System.Text;
|
||||
using PrismaticSync.Infrastructure;
|
||||
|
||||
namespace PrismaticSync.Services;
|
||||
|
||||
/// <summary>
/// Sends the scraped JSON file to the app's catalog import endpoint in a single POST, authenticated
/// by a token header. With no endpoint configured nothing is sent — the JSON stays on disk for a
/// manual upload — so the tool remains usable before the endpoint exists.
/// </summary>
public class CatalogPusher
{
    private readonly SyncConfig _config;

    /// <summary>Creates a pusher that reads the endpoint, token, vendor and file path from <paramref name="config"/>.</summary>
    public CatalogPusher(SyncConfig config)
    {
        _config = config;
    }

    /// <summary>
    /// POSTs the output JSON file to the configured import endpoint.
    /// </summary>
    /// <returns>
    /// True when the endpoint answered with a success status. False when no endpoint is configured,
    /// the output file is missing, the endpoint answered with a non-success status, or sending threw.
    /// </returns>
    public async Task<bool> PushAsync()
    {
        var endpoint = _config.Import.EndpointUrl;
        var outputFile = _config.OutputJsonFile;

        if (string.IsNullOrWhiteSpace(endpoint))
        {
            Log.Warn($"No import endpoint configured (Sync.Import.EndpointUrl) — skipping push. " +
                     $"Upload {_config.OutputJsonFile} manually via the Powder Catalog admin instead.");
            return false;
        }

        if (!File.Exists(outputFile))
        {
            Log.Warn($"Output file {_config.OutputJsonFile} not found — nothing to push.");
            return false;
        }

        var payload = await File.ReadAllTextAsync(outputFile);
        Log.Info($"Pushing {_config.OutputJsonFile} to {_config.Import.EndpointUrl} (vendor: {_config.Import.VendorName})...");

        // The whole file goes up in one request, hence the generous timeout.
        using var client = new HttpClient { Timeout = TimeSpan.FromMinutes(5) };
        using var post = new HttpRequestMessage(HttpMethod.Post, endpoint);
        post.Headers.Add("X-Import-Token", _config.Import.Token);
        post.Headers.Add("X-Vendor-Name", _config.Import.VendorName);
        post.Content = new StringContent(payload, Encoding.UTF8, "application/json");

        try
        {
            using var reply = await client.SendAsync(post);
            var replyText = await reply.Content.ReadAsStringAsync();
            var succeeded = reply.IsSuccessStatusCode;

            if (succeeded)
            {
                Log.Info($"Push succeeded ({(int)reply.StatusCode}): {Shorten(replyText)}");
            }
            else
            {
                Log.Error($"Push failed ({(int)reply.StatusCode}): {Shorten(replyText)}");
            }

            return succeeded;
        }
        catch (Exception ex)
        {
            Log.Error($"Push error: {ex.Message}");
            return false;
        }
    }

    /// <summary>Caps a response body at 500 characters for logging, appending an ellipsis when cut.</summary>
    private static string Shorten(string s)
    {
        if (s.Length > 500)
        {
            return s[..500] + "…";
        }

        return s;
    }
}
|
||||
@@ -0,0 +1,138 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Playwright;
|
||||
using PrismaticSync.Infrastructure;
|
||||
|
||||
namespace PrismaticSync.Services;
|
||||
|
||||
/// <summary>
/// Discovers product URLs from the Prismatic color listing (infinite-scroll). Two modes:
/// incremental (newest-first via <c>?category=created_at</c>, stop once we reach already-known
/// URLs) for cheap frequent runs, and full (every configured color filter, scrolled until nothing
/// new appears) for occasional reconciliation. Both only ever add URLs to the URL list file;
/// nothing in this class removes URLs that are no longer listed.
/// </summary>
public class PrismaticDiscoverer
{
    // A product link is the listing path followed by one slug segment and a trailing slash.
    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;

    /// <summary>Creates a discoverer that drives <paramref name="session"/>'s page using <paramref name="config"/>.</summary>
    public PrismaticDiscoverer(BrowserSession session, SyncConfig config)
    {
        _session = session;
        _config = config;
    }

    /// <summary>
    /// Incremental discovery: crawl the newest-first listing and stop once a run of consecutive
    /// scrolls surfaces only already-known URLs — meaning we've scrolled past the new products —
    /// or once the scroll cap is reached. The URL list file is rewritten after every scroll.
    /// </summary>
    /// <returns>The count of newly found URLs.</returns>
    public async Task<int> DiscoverNewAsync()
    {
        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Incremental discovery (newest first). Known URLs: {startCount}");

        await GotoAsync($"{_config.ColorsUrl}?category=created_at");

        var knownStreak = 0;
        for (var i = 0; i < _config.MaxScrolls; i++)
        {
            var addedNew = HarvestInto(known, await CollectProductLinksAsync());

            knownStreak = addedNew == 0 ? knownStreak + 1 : 0;
            Log.Info($"Scroll {i + 1}: +{addedNew} new, total {known.Count}, known-streak {knownStreak}");

            if (knownStreak >= _config.StopAfterKnownScrolls)
            {
                Log.Info("Reached known territory — stopping incremental discovery.");
                break;
            }

            await ScrollAsync();
        }

        var newCount = known.Count - startCount;
        Log.Info($"Incremental discovery done. New URLs: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>
    /// Full discovery: for every configured color filter, scroll the filtered listing until a run of
    /// scrolls adds no new links (or the scroll cap is reached). A filter that throws is logged as a
    /// warning and skipped so the remaining filters still run. With no color filters configured there
    /// is nothing to crawl; that is reported as a warning instead of a silent success.
    /// </summary>
    /// <returns>The count of newly found URLs.</returns>
    public async Task<int> DiscoverFullAsync()
    {
        if (_config.ColorParams is not { Length: > 0 })
        {
            Log.Warn("Full discovery skipped: no color filters configured (Sync.ColorParams is empty), so there is nothing to crawl.");
            return 0;
        }

        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Full discovery across {_config.ColorParams.Length} color filters. Known URLs: {startCount}");

        foreach (var color in _config.ColorParams)
        {
            Log.Info($"Color filter: {color}");
            try
            {
                await GotoAsync($"{_config.ColorsUrl}?color={Uri.EscapeDataString(color)}");

                var noNew = 0;
                for (var i = 0; i < _config.MaxScrolls; i++)
                {
                    var added = HarvestInto(known, await CollectProductLinksAsync());

                    noNew = added == 0 ? noNew + 1 : 0;
                    if (noNew >= _config.StopAfterNoNewScrolls)
                        break;

                    await ScrollAsync();
                }

                Log.Info($"Color {color} done. Total {known.Count}");

                // Short fixed pause before moving on to the next filter.
                await _session.Page.WaitForTimeoutAsync(3000);
            }
            catch (Exception ex)
            {
                Log.Warn($"Color {color} failed: {ex.Message}");
            }
        }

        var newCount = known.Count - startCount;
        Log.Info($"Full discovery done. New this run: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>Loads the URL list file into a case-insensitive set.</summary>
    private HashSet<string> LoadKnownUrls() =>
        new(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Adds <paramref name="links"/> to <paramref name="known"/>, rewrites the URL list file with the
    /// result, and returns how many of the links were not known before.
    /// </summary>
    private int HarvestInto(HashSet<string> known, IEnumerable<string> links)
    {
        var added = 0;
        foreach (var link in links)
        {
            if (known.Add(link)) added++;
        }

        // Saved on every pass so an interrupted crawl keeps what it has found so far.
        JsonStore.SaveUrls(_config.ProductUrlsFile, known);
        return added;
    }

    /// <summary>Navigates to <paramref name="url"/> (60 s timeout), then waits the configured settle time.</summary>
    private async Task GotoAsync(string url)
    {
        await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = 60000
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
    }

    /// <summary>Scrolls the page down with the mouse wheel, then waits for lazy-loaded items.</summary>
    private async Task ScrollAsync()
    {
        await _session.Page.Mouse.WheelAsync(0, 2500);
        await _session.Page.WaitForTimeoutAsync(_config.ScrollWaitMs);
    }

    /// <summary>
    /// Reads the href of every anchor currently on the page and returns those that look like product
    /// URLs, cleaned of query string and fragment. The result may contain duplicates.
    /// </summary>
    private async Task<List<string>> CollectProductLinksAsync()
    {
        var hrefs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a", "els => els.map(a => a.href).filter(Boolean)");

        return hrefs
            .Where(h => ProductUrlRegex.IsMatch(h))
            .Select(JsonStore.CleanUrl)
            .Where(u => u.Length > 0)
            .ToList();
    }
}
|
||||
@@ -0,0 +1,295 @@
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Playwright;
|
||||
using PrismaticSync.Infrastructure;
|
||||
using PrismaticSync.Models;
|
||||
|
||||
namespace PrismaticSync.Services;
|
||||
|
||||
/// <summary>
/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s. Resumable (skips
/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
/// records get re-scraped to catch price changes. Saves after every product so a long run can be
/// stopped and resumed safely, and logs continuously — including the delay between products — so a
/// manual run always shows it's alive.
/// </summary>
public class PrismaticScraper
{
    /// <summary>
    /// Separator placed between link text and href in <see cref="GetLinkByTextAsync"/>. The page
    /// script emits it with <c>String.fromCharCode(1)</c>, so both sides must agree on U+0001.
    /// </summary>
    private const char LinkFieldSeparator = '\u0001';

    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex SkuRegex =
        new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex DescRegex =
        new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);

    private static readonly Regex PriceTierRegex =
        new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);

    private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled);

    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    private static readonly Regex ForbiddenTitleRegex =
        new(@"^403 Forbidden$", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex NotFoundTitleRegex =
        new(@"404|Page Not Found", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;
    private readonly Random _random = new();

    /// <summary>Creates a scraper that drives <paramref name="session"/> using <paramref name="config"/>.</summary>
    /// <param name="session">Browser session whose page is navigated for every product.</param>
    /// <param name="config">File locations, delays, and block/rest tuning.</param>
    public PrismaticScraper(BrowserSession session, SyncConfig config)
    {
        _session = session;
        _config = config;
    }

    /// <summary>
    /// Scrapes products needing work: those not yet scraped, plus (when
    /// <paramref name="refreshOlderThanDays"/> is greater than 0) any whose data is older than that
    /// window. The output file is saved after every success and every failure.
    /// </summary>
    /// <param name="refreshOlderThanDays">Re-scrape results older than this many days; 0 or less disables refresh.</param>
    /// <param name="maxProducts">Cap on products handled this run; 0 or less means no cap.</param>
    /// <param name="retryErrors">When true, URLs that previously failed are attempted again.</param>
    /// <returns>The number of products scraped and the number that failed in this run.</returns>
    public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
    {
        var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
            .Where(u => ProductUrlRegex.IsMatch(u))
            .ToList();

        var data = JsonStore.LoadOutput(_config.OutputJsonFile);

        // Index existing results by URL (keep the most recent if the file has dupes).
        var resultByUrl = data.Results
            .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
            .ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);

        var errorUrls = new HashSet<string>(
            data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);

        var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));

        var toScrape = new List<string>();
        foreach (var url in allUrls)
        {
            if (resultByUrl.TryGetValue(url, out var existing))
            {
                if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
                    toScrape.Add(url); // stale → refresh for price changes
            }
            else if (retryErrors || !errorUrls.Contains(url))
            {
                toScrape.Add(url); // never scraped (skip known errors unless retrying)
            }
        }

        if (maxProducts > 0)
            toScrape = toScrape.Take(maxProducts).ToList();

        var total = toScrape.Count;
        Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
        Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");

        if (total == 0)
        {
            Log.Info("Nothing to scrape. Done.");
            return (0, 0);
        }

        // Rough ETA: average inter-product delay + settle time + ~2s of per-page overhead.
        var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
        var etaMinutes = total * (avgDelaySec + _config.PageSettleSeconds + 2) / 60.0;
        Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
                 $"(grab a coffee if that's a while — it saves after every product and is resumable).");

        var stopwatch = Stopwatch.StartNew();
        int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;

        foreach (var url in toScrape)
        {
            index++;

            for (var attempt = 1; ; attempt++)
            {
                try
                {
                    var row = await ParseProductAsync(url, index, total);

                    StoreResult(url, row);
                    scraped++;
                    consecutiveBlocks = 0;
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);

                    var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
                    Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
                             $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
                    break;
                }
                catch (BlockedException) when (attempt <= _config.BlockedMaxRetries)
                {
                    // Site pushed back — back off (escalating) and retry the SAME product rather
                    // than barreling on, which is how an unattended run gets hard-banned.
                    // Matching on the exception type (not on "403" appearing somewhere in a
                    // message) keeps timeouts on URLs that merely contain "403" out of this path.
                    consecutiveBlocks++;
                    var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
                    Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
                    await Task.Delay(cooldown * 1000);
                }
                catch (Exception ex)
                {
                    // Reached for ordinary failures and for blocks that exhausted their retries.
                    StoreError(url, ex);
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);
                    errors++;
                    Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
                    break;
                }
            }

            // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
            if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
            {
                Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
                await Task.Delay(_config.LongRestSeconds * 1000);
            }

            if (index < total)
            {
                var delayMs = RandomDelayMs();
                Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
                await Task.Delay(delayMs);
            }
        }

        Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
                 $"Took {FormatDuration(stopwatch.Elapsed)}.");
        return (scraped, errors);

        // Replaces the existing record for the URL (or appends a new one) and clears any error
        // previously recorded for it.
        void StoreResult(string productUrl, ProductRecord row)
        {
            var position = resultByUrl.TryGetValue(productUrl, out var previous)
                ? data.Results.IndexOf(previous)
                : -1;

            if (position >= 0)
                data.Results[position] = row;
            else
                data.Results.Add(row);

            resultByUrl[productUrl] = row;
            data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(productUrl, StringComparison.OrdinalIgnoreCase));
        }

        // Records the failure, replacing any earlier error for the same URL so repeated runs do
        // not pile up duplicate entries.
        void StoreError(string productUrl, Exception failure)
        {
            data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(productUrl, StringComparison.OrdinalIgnoreCase));
            data.Errors.Add(new ScrapeError { ProductUrl = productUrl, Error = failure.Message, ScrapedAt = DateTime.UtcNow });
        }
    }

    /// <summary>
    /// Navigates to <paramref name="url"/>, waits the configured settle time, and extracts the
    /// product fields from the rendered page.
    /// </summary>
    /// <param name="url">Product page to load.</param>
    /// <param name="index">1-based position in this run, used only for log prefixes.</param>
    /// <param name="total">Number of products in this run, used only for log prefixes.</param>
    /// <exception cref="BlockedException">The response status was 403 or the page title was "403 Forbidden".</exception>
    /// <exception cref="InvalidOperationException">The page was a 404, or neither a SKU nor a heading could be found.</exception>
    private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
    {
        Log.Info($"[{index}/{total}] Scraping {url}");

        var response = await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = 60000
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);

        var status = response?.Status ?? 0;
        var title = Clean(await SafeTextAsync(() => _session.Page.TitleAsync()));
        var plainText = Clean(await SafeTextAsync(() => _session.Page.Locator("body").InnerTextAsync()));

        if (status == 403 || ForbiddenTitleRegex.IsMatch(title))
            throw new BlockedException("403 Forbidden returned by site.");

        if (status == 404 || NotFoundTitleRegex.IsMatch(title))
            throw new InvalidOperationException("404 Not Found returned by site.");

        var colorName = Clean(await SafeTextAsync(() => _session.Page.Locator("h1").First.InnerTextAsync()));

        var skuMatch = SkuRegex.Match(plainText);
        var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";

        if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
            throw new InvalidOperationException("Could not find SKU or title on product page.");

        var descMatch = DescRegex.Match(plainText);
        var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";

        return new ProductRecord
        {
            Sku = sku,
            ColorName = colorName,
            Description = description,
            PriceTiers = ParsePriceTiers(plainText),
            SafetyDataSheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" }),
            TechnicalDataSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" }),
            ApplicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" }),
            SampleImageUrl = await GetSampleImageUrlAsync(),
            ProductUrl = url,
            ScrapedAt = DateTime.UtcNow
        };
    }

    /// <summary>
    /// Extracts price tiers from page text. Each match of <see cref="PriceTierRegex"/> is a
    /// quantity span ("N - M lbs" or "N + lbs") followed by a dollar amount; an open-ended
    /// "N +" tier has no upper bound. Matches whose price does not parse are skipped.
    /// </summary>
    private static List<PriceTier> ParsePriceTiers(string text)
    {
        var tiers = new List<PriceTier>();

        foreach (Match m in PriceTierRegex.Matches(text))
        {
            if (!decimal.TryParse(m.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var price))
                continue;

            var rangeText = Clean(m.Groups[1].Value);
            int? min = null, max = null;

            // The two shapes are mutually exclusive: a range contains '-', an open tier contains '+'.
            var range = RangeRegex.Match(rangeText);
            if (range.Success)
            {
                min = ParseQuantity(range.Groups[1].Value);
                max = ParseQuantity(range.Groups[2].Value);
            }
            else
            {
                var plus = PlusRegex.Match(rangeText);
                if (plus.Success)
                    min = ParseQuantity(plus.Groups[1].Value);
            }

            tiers.Add(new PriceTier { Min = min, Max = max, Price = price });
        }

        return tiers;
    }

    /// <summary>
    /// Parses a run of digits as a quantity, returning null instead of throwing when the value
    /// overflows <see cref="int"/> or uses digits <see cref="int"/> parsing does not accept.
    /// </summary>
    private static int? ParseQuantity(string digits) =>
        int.TryParse(digits, NumberStyles.None, CultureInfo.InvariantCulture, out var value) ? value : null;

    /// <summary>
    /// Returns the href of the first link whose text matches any pattern, or an empty string when
    /// none does. Uses a single eval returning "text&lt;U+0001&gt;href" strings to avoid object
    /// deserialization quirks.
    /// </summary>
    /// <param name="patterns">Regular expressions tested case-insensitively against each link's text.</param>
    private async Task<string> GetLinkByTextAsync(string[] patterns)
    {
        var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a",
            "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
            "+ String.fromCharCode(1) + (a.href || ''))");

        foreach (var entry in combined)
        {
            // Split on the first separator only, so the href is kept whole.
            var parts = entry.Split(LinkFieldSeparator, 2);
            if (parts.Length < 2)
                continue;

            var text = parts[0];
            var href = parts[1];

            if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
                return href;
        }

        return "";
    }

    /// <summary>
    /// Picks the product image URL from the page's <c>img</c> sources, in order of preference: a
    /// non-thumbnail image on images.nicindustries.com, any image on that host, then any source
    /// mentioning "prismatic", "powder", or "color". Returns an empty string when nothing matches.
    /// </summary>
    private async Task<string> GetSampleImageUrlAsync()
    {
        var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "img",
            "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
            ".filter(Boolean)");

        return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)
                                        && !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
               ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase))
               ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase))
               ?? "";
    }

    /// <summary>
    /// Runs a text lookup and returns an empty string if it throws. Deliberately best-effort: a
    /// missing element must not abort the scrape of an otherwise readable page.
    /// </summary>
    private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
    {
        try
        {
            return await fn();
        }
        catch
        {
            return "";
        }
    }

    /// <summary>Collapses every whitespace run to a single space and trims; null becomes empty.</summary>
    private static string Clean(string? text) => WhitespaceRegex.Replace(text ?? "", " ").Trim();

    /// <summary>
    /// Picks a random delay in milliseconds between the configured minimum and maximum (inclusive).
    /// A negative minimum is treated as 0 and a maximum below the minimum is raised to it.
    /// </summary>
    private int RandomDelayMs()
    {
        var min = Math.Max(0, _config.MinDelaySeconds * 1000);
        var max = Math.Max(min, _config.MaxDelaySeconds * 1000);
        return _random.Next(min, max + 1); // upper bound of Next is exclusive
    }

    /// <summary>Formats a duration as "Hh Mm", "Mm Ss", or "Ss" depending on its magnitude.</summary>
    private static string FormatDuration(TimeSpan t) =>
        t.TotalHours >= 1 ? $"{(int)t.TotalHours}h {t.Minutes}m" :
        t.TotalMinutes >= 1 ? $"{(int)t.TotalMinutes}m {t.Seconds}s" :
        $"{t.Seconds}s";

    /// <summary>
    /// Raised when the site answers a product request with 403 Forbidden. A distinct type lets the
    /// retry loop recognise a block without inspecting exception message text.
    /// </summary>
    private sealed class BlockedException : Exception
    {
        public BlockedException(string message) : base(message)
        {
        }
    }
}
|
||||
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"Sync": {
|
||||
"BaseUrl": "https://www.prismaticpowders.com",
|
||||
"ColorsPath": "/shop/powder-coating-colors",
|
||||
|
||||
"ProductUrlsFile": "product-urls.txt",
|
||||
"OutputJsonFile": "prismatic_powders.json",
|
||||
"LogFile": "prismatic-sync.log",
|
||||
|
||||
"MinDelaySeconds": 6,
|
||||
"MaxDelaySeconds": 14,
|
||||
"PageSettleSeconds": 4,
|
||||
|
||||
"BlockedCooldownSeconds": 120,
|
||||
"BlockedCooldownMaxSeconds": 600,
|
||||
"BlockedMaxRetries": 3,
|
||||
"LongRestEveryProducts": 150,
|
||||
"LongRestSeconds": 45,
|
||||
|
||||
"ScrollWaitMs": 1500,
|
||||
"MaxScrolls": 400,
|
||||
"StopAfterNoNewScrolls": 10,
|
||||
"StopAfterKnownScrolls": 8,
|
||||
|
||||
"ColorParams": [
|
||||
"pris_black", "pris_blue", "pris_bronze", "pris_brown", "pris_clear",
|
||||
"pris_copper", "pris_gold", "pris_gray", "pris_green", "pris_orange",
|
||||
"pris_pink", "pris_purple", "pris_red", "pris_silver", "pris_tan",
|
||||
"pris_white", "pris_yellow"
|
||||
],
|
||||
|
||||
"Import": {
|
||||
"EndpointUrl": "",
|
||||
"Token": "",
|
||||
"VendorName": "Prismatic Powders"
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user