Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| da2bb46d5a | |||
| 843d1c3c51 | |||
| c59d55529f |
@@ -0,0 +1,8 @@
|
|||||||
|
# Build output
|
||||||
|
bin/
|
||||||
|
obj/
|
||||||
|
|
||||||
|
# Transient scrape artifacts
|
||||||
|
*.tmp
|
||||||
|
*.invalid-*.bak
|
||||||
|
prismatic-sync.log
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
using Microsoft.Playwright;
|
||||||
|
|
||||||
|
namespace PrismaticSync.Infrastructure;
|
||||||
|
|
||||||
|
/// <summary>
/// A Chromium session with a realistic desktop fingerprint (UA, viewport, locale,
/// timezone) — matching the original scraper's settings to look like a normal browser.
/// Owns the Playwright driver, the browser, one context and one page; disposing the session
/// releases all of them.
/// </summary>
public sealed class BrowserSession : IAsyncDisposable
{
    private IPlaywright? _pw;
    private IBrowser? _browser;
    private IBrowserContext? _context;

    /// <summary>The single page opened in this session's context. Assigned by <see cref="CreateAsync"/>.</summary>
    public IPage Page { get; private set; } = null!;

    /// <summary>
    /// Starts Playwright, launches Chromium, and opens a context plus one page using the fixed
    /// desktop fingerprint.
    /// </summary>
    /// <param name="headed">True to show the browser window; false launches headless.</param>
    /// <returns>A session whose <see cref="Page"/> is ready to use.</returns>
    /// <remarks>
    /// If any step fails, whatever was already started is torn down before the exception is
    /// rethrown, so a failed start does not leave a driver or browser process behind.
    /// </remarks>
    public static async Task<BrowserSession> CreateAsync(bool headed)
    {
        var session = new BrowserSession();
        try
        {
            session._pw = await Playwright.CreateAsync();
            session._browser = await session._pw.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
            {
                Headless = !headed
            });
            session._context = await session._browser.NewContextAsync(new BrowserNewContextOptions
            {
                UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
                ViewportSize = new ViewportSize { Width = 1365, Height = 900 },
                Locale = "en-US",
                TimezoneId = "America/New_York"
            });
            session.Page = await session._context.NewPageAsync();
            return session;
        }
        catch
        {
            // Clean up the partially-built session; a cleanup failure must not hide the
            // original startup error, so it is deliberately ignored here.
            try { await session.DisposeAsync(); }
            catch { /* keep the original exception */ }
            throw;
        }
    }

    /// <summary>
    /// Closes the context, then the browser, then disposes the Playwright driver. Each step runs
    /// even if an earlier one throws, and calling this more than once is harmless.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        // Detach the fields first so a second call finds nothing left to release.
        var context = _context;
        var browser = _browser;
        var pw = _pw;
        _context = null;
        _browser = null;
        _pw = null;

        try
        {
            try
            {
                if (context is not null) await context.CloseAsync();
            }
            finally
            {
                if (browser is not null) await browser.CloseAsync();
            }
        }
        finally
        {
            pw?.Dispose();
        }
    }
}
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
using System.Text.Json;
|
||||||
|
using PrismaticSync.Models;
|
||||||
|
|
||||||
|
namespace PrismaticSync.Infrastructure;
|
||||||
|
|
||||||
|
/// <summary>
/// Loads/saves the scrape output and the URL list. Saves write to a sibling ".tmp" file and then
/// move it over the destination, so a crash mid-save can't leave a half-written file in place.
/// </summary>
public static class JsonStore
{
    private static readonly JsonSerializerOptions WriteOptions = new() { WriteIndented = true };
    private static readonly JsonSerializerOptions ReadOptions = new() { PropertyNameCaseInsensitive = true };

    /// <summary>
    /// Reads the scrape output from <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Location of the output JSON file.</param>
    /// <returns>
    /// The parsed output, or an empty <see cref="ScrapeOutput"/> when the file does not exist.
    /// The result lists are never null, even if the file spells them as <c>null</c>.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The file exists but could not be parsed. A copy is saved next to it as
    /// <c>{path}.invalid-{unix-seconds}.bak</c> first, and the parser's exception is attached as
    /// the inner exception.
    /// </exception>
    public static ScrapeOutput LoadOutput(string path)
    {
        if (!File.Exists(path))
            return new ScrapeOutput();

        var json = File.ReadAllText(path);
        try
        {
            // Tolerate a bare array (older output format) as well as { results, errors }.
            if (json.TrimStart().StartsWith('['))
            {
                var results = JsonSerializer.Deserialize<List<ProductRecord>>(json, ReadOptions) ?? new();
                return new ScrapeOutput { Results = results };
            }

            var output = JsonSerializer.Deserialize<ScrapeOutput>(json, ReadOptions) ?? new ScrapeOutput();
            // An explicit JSON null overwrites the property initializers; restore empty lists so
            // callers can keep treating them as non-null.
            output.Results ??= new();
            output.Errors ??= new();
            return output;
        }
        catch (Exception ex)
        {
            var backup = $"{path}.invalid-{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}.bak";
            File.Copy(path, backup, overwrite: true);
            // Pass the original exception along so its type and stack trace are not lost.
            throw new InvalidOperationException($"Could not parse {path}. Backed it up to {backup}. {ex.Message}", ex);
        }
    }

    /// <summary>Serializes <paramref name="data"/> as indented JSON and saves it to <paramref name="path"/>.</summary>
    /// <param name="path">Destination file; replaced if it exists.</param>
    /// <param name="data">The output to persist.</param>
    public static void SaveOutput(string path, ScrapeOutput data) =>
        WriteViaTempFile(path, JsonSerializer.Serialize(data, WriteOptions));

    /// <summary>
    /// Reads the URL list: one URL per line, cleaned with <see cref="CleanUrl"/>, with empty
    /// lines dropped and duplicates removed case-insensitively.
    /// </summary>
    /// <param name="path">Location of the URL list file.</param>
    /// <returns>The cleaned URLs in file order, or an empty list when the file does not exist.</returns>
    public static List<string> LoadUrls(string path)
    {
        if (!File.Exists(path))
            return new List<string>();

        return File.ReadAllLines(path)
            .Select(CleanUrl)
            .Where(u => u.Length > 0 && !u.StartsWith('#'))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Saves the URL list, de-duplicated and sorted case-insensitively, one URL per line with a
    /// trailing newline.
    /// </summary>
    /// <param name="path">Destination file; replaced if it exists.</param>
    /// <param name="urls">URLs to persist.</param>
    public static void SaveUrls(string path, IEnumerable<string> urls)
    {
        var sorted = urls.Distinct(StringComparer.OrdinalIgnoreCase).OrderBy(u => u, StringComparer.OrdinalIgnoreCase);
        WriteViaTempFile(path, string.Join(Environment.NewLine, sorted) + Environment.NewLine);
    }

    /// <summary>
    /// Strips the query string and fragment from a URL and trims surrounding whitespace.
    /// </summary>
    /// <param name="url">The raw URL; null is treated as empty.</param>
    /// <returns>Everything before the first '?' or '#', trimmed; possibly empty.</returns>
    public static string CleanUrl(string? url) =>
        (url ?? string.Empty).Split('?')[0].Split('#')[0].Trim();

    /// <summary>Writes <paramref name="contents"/> to "<paramref name="path"/>.tmp", then moves it over <paramref name="path"/>.</summary>
    private static void WriteViaTempFile(string path, string contents)
    {
        var tmp = path + ".tmp";
        File.WriteAllText(tmp, contents);
        File.Move(tmp, path, overwrite: true);
    }
}
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
namespace PrismaticSync.Infrastructure;
|
||||||
|
|
||||||
|
/// <summary>
/// Tiny timestamped logger with no dependencies. Every entry goes to the console and is appended
/// to a log file, so an unattended (Task Scheduler) run leaves a trail behind.
/// </summary>
public static class Log
{
    private static string _logFile = "prismatic-sync.log";
    private static readonly object Gate = new();

    /// <summary>Sets the file that later entries are appended to.</summary>
    public static void Configure(string logFile)
    {
        _logFile = logFile;
    }

    /// <summary>Logs an informational entry (default console color).</summary>
    public static void Info(string message)
    {
        Write("INFO", message);
    }

    /// <summary>Logs a warning (yellow on the console).</summary>
    public static void Warn(string message)
    {
        Write("WARN", message);
    }

    /// <summary>Logs an error (red on the console).</summary>
    public static void Error(string message)
    {
        Write("ERROR", message);
    }

    /// <summary>
    /// Formats "[UTC timestamp] LEVEL message" and emits it to the console and the log file
    /// while holding the gate, so concurrent entries don't interleave.
    /// </summary>
    private static void Write(string level, string message)
    {
        var stamp = DateTime.UtcNow.ToString("yyyy-MM-ddTHH:mm:ssZ");
        var line = string.Concat("[", stamp, "] ", level.PadRight(5), " ", message);

        lock (Gate)
        {
            // Pick the highlight color; plain entries keep whatever color the console already has.
            ConsoleColor? highlight = null;
            if (level == "WARN")
            {
                highlight = ConsoleColor.Yellow;
            }
            else if (level == "ERROR")
            {
                highlight = ConsoleColor.Red;
            }

            if (highlight is null)
            {
                Console.WriteLine(line);
            }
            else
            {
                var saved = Console.ForegroundColor;
                Console.ForegroundColor = highlight.Value;
                Console.WriteLine(line);
                Console.ForegroundColor = saved;
            }

            // The file trail is best-effort: a logging failure must never break a run.
            try
            {
                File.AppendAllText(_logFile, line + Environment.NewLine);
            }
            catch
            {
                /* ignore */
            }
        }
    }
}
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
namespace PrismaticSync.Infrastructure;
|
||||||
|
|
||||||
|
/// <summary>Strongly-typed config bound from the "Sync" section of appsettings.json.</summary>
public class SyncConfig
{
    /// <summary>Site root. Joined with <see cref="ColorsPath"/> to build <see cref="ColorsUrl"/>.</summary>
    public string BaseUrl { get; set; } = "https://www.prismaticpowders.com";

    /// <summary>Path of the color listing under <see cref="BaseUrl"/>; a leading slash is optional.</summary>
    public string ColorsPath { get; set; } = "/shop/powder-coating-colors";

    /// <summary>Text file holding the discovered product URLs, one per line.</summary>
    public string ProductUrlsFile { get; set; } = "product-urls.txt";

    /// <summary>JSON file holding the scrape output that gets pushed to the import endpoint.</summary>
    public string OutputJsonFile { get; set; } = "prismatic_powders.json";

    /// <summary>File the logger appends to.</summary>
    public string LogFile { get; set; } = "prismatic-sync.log";

    /// <summary>Politeness delay between product scrapes (randomized within the range).</summary>
    public int MinDelaySeconds { get; set; } = 6;

    /// <summary>Upper end of the politeness delay range; see <see cref="MinDelaySeconds"/>.</summary>
    public int MaxDelaySeconds { get; set; } = 14;

    /// <summary>On a 403/block, cool down this many seconds × the consecutive-block count, then retry.</summary>
    public int BlockedCooldownSeconds { get; set; } = 120;

    /// <summary>Upper bound on a single cooldown so escalation can't run away.</summary>
    public int BlockedCooldownMaxSeconds { get; set; } = 600;

    /// <summary>How many times to cool-down-and-retry a blocked product before recording it as an error.</summary>
    public int BlockedMaxRetries { get; set; } = 3;

    /// <summary>Take a longer rest after this many products (0 disables). Eases load and looks less robotic.</summary>
    public int LongRestEveryProducts { get; set; } = 150;

    /// <summary>Length of the periodic long rest, in seconds.</summary>
    public int LongRestSeconds { get; set; } = 45;

    /// <summary>Extra settle time after a product page loads before reading it.</summary>
    public int PageSettleSeconds { get; set; } = 4;

    /// <summary>Pause after each scroll while a listing lazy-loads more items.</summary>
    public int ScrollWaitMs { get; set; } = 1500;

    /// <summary>Hard cap on scrolls per listing, as a safety stop.</summary>
    public int MaxScrolls { get; set; } = 400;

    /// <summary>Full discovery: stop a listing after this many scrolls add no new links.</summary>
    public int StopAfterNoNewScrolls { get; set; } = 10;

    /// <summary>
    /// Incremental discovery: stop the newest-first listing after this many consecutive scrolls
    /// that surfaced only already-known URLs — i.e. we've scrolled past the new products.
    /// </summary>
    public int StopAfterKnownScrolls { get; set; } = 8;

    /// <summary>Color filter params used by full discovery.</summary>
    public string[] ColorParams { get; set; } = Array.Empty<string>();

    /// <summary>Settings for pushing the scraped catalog into the app.</summary>
    public ImportConfig Import { get; set; } = new();

    /// <summary>
    /// Absolute URL of the color listing: <see cref="BaseUrl"/> and <see cref="ColorsPath"/>
    /// joined with exactly one slash, whether or not either side already carries one.
    /// </summary>
    public string ColorsUrl => $"{BaseUrl.TrimEnd('/')}/{ColorsPath.TrimStart('/')}";
}

/// <summary>Where and how to push the scraped catalog into the app.</summary>
public class ImportConfig
{
    /// <summary>Full URL of the app's token-authenticated catalog import endpoint.</summary>
    public string EndpointUrl { get; set; } = "";

    /// <summary>Shared secret sent in the X-Import-Token header. Must match the app's config.</summary>
    public string Token { get; set; } = "";

    /// <summary>Vendor name applied to every record on import.</summary>
    public string VendorName { get; set; } = "Prismatic Powders";
}
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
using System.Text.Json.Serialization;
|
||||||
|
|
||||||
|
namespace PrismaticSync.Models;
|
||||||
|
|
||||||
|
/// <summary>
/// The scrape output as stored on disk: a top-level "results" array of snake_case product
/// records plus an "errors" array of URLs that failed, which resumable re-runs consult.
/// </summary>
public class ScrapeOutput
{
    /// <summary>Successfully scraped products.</summary>
    [JsonPropertyName("results")]
    public List<ProductRecord> Results { get; set; } = new List<ProductRecord>();

    /// <summary>URLs that failed to scrape.</summary>
    [JsonPropertyName("errors")]
    public List<ScrapeError> Errors { get; set; } = new List<ScrapeError>();
}

/// <summary>A single scraped product, using the field names the import expects.</summary>
public class ProductRecord
{
    [JsonPropertyName("sku")]
    public string Sku { get; set; } = string.Empty;

    [JsonPropertyName("color_name")]
    public string ColorName { get; set; } = string.Empty;

    [JsonPropertyName("description")]
    public string Description { get; set; } = string.Empty;

    /// <summary>Quantity-break prices for this product.</summary>
    [JsonPropertyName("price_tiers")]
    public List<PriceTier> PriceTiers { get; set; } = new List<PriceTier>();

    [JsonPropertyName("safety_data_sheet_url")]
    public string SafetyDataSheetUrl { get; set; } = string.Empty;

    [JsonPropertyName("technical_data_sheet_url")]
    public string TechnicalDataSheetUrl { get; set; } = string.Empty;

    [JsonPropertyName("application_guide_url")]
    public string ApplicationGuideUrl { get; set; } = string.Empty;

    [JsonPropertyName("sample_image_url")]
    public string SampleImageUrl { get; set; } = string.Empty;

    [JsonPropertyName("product_url")]
    public string ProductUrl { get; set; } = string.Empty;

    [JsonPropertyName("scraped_at")]
    public DateTime ScrapedAt { get; set; }
}

/// <summary>One quantity-break price tier — {min, max, price}. A null max marks an open-ended top tier.</summary>
public class PriceTier
{
    [JsonPropertyName("min")]
    public int? Min { get; set; }

    [JsonPropertyName("max")]
    public int? Max { get; set; }

    [JsonPropertyName("price")]
    public decimal Price { get; set; }
}

/// <summary>A URL whose scrape failed, retained so a resumable run can skip or retry it.</summary>
public class ScrapeError
{
    [JsonPropertyName("product_url")]
    public string ProductUrl { get; set; } = string.Empty;

    [JsonPropertyName("error")]
    public string Error { get; set; } = string.Empty;

    [JsonPropertyName("scraped_at")]
    public DateTime ScrapedAt { get; set; }
}
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<!--
|
||||||
|
Standalone workstation tool — deliberately NOT part of PowderCoating.sln.
|
||||||
|
Build/publish independently and run on a machine you control (Task Scheduler),
|
||||||
|
never on the deployed app server. Scrapes Prismatic Powders and pushes the
|
||||||
|
result into the app's catalog import endpoint.
|
||||||
|
|
||||||
|
First-time setup on a workstation:
|
||||||
|
dotnet build
|
||||||
|
pwsh bin/Debug/net8.0/playwright.ps1 install chromium
|
||||||
|
-->
|
||||||
|
<PropertyGroup>
|
||||||
|
<OutputType>Exe</OutputType>
|
||||||
|
<TargetFramework>net8.0</TargetFramework>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
<ImplicitUsings>enable</ImplicitUsings>
|
||||||
|
<AssemblyName>PrismaticSync</AssemblyName>
|
||||||
|
<RootNamespace>PrismaticSync</RootNamespace>
|
||||||
|
<InvariantGlobalization>true</InvariantGlobalization>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<PackageReference Include="Microsoft.Playwright" Version="1.49.0" />
|
||||||
|
<PackageReference Include="Microsoft.Extensions.Configuration" Version="8.0.0" />
|
||||||
|
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="8.0.1" />
|
||||||
|
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="8.0.2" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<None Update="appsettings.json">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</None>
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
</Project>
|
||||||
@@ -0,0 +1,106 @@
|
|||||||
|
using Microsoft.Extensions.Configuration;
|
||||||
|
using PrismaticSync.Infrastructure;
|
||||||
|
using PrismaticSync.Services;
|
||||||
|
|
||||||
|
// ── Parse args ────────────────────────────────────────────────────────────────
// Pure argument parsing with no I/O, done first so everything that can fail (including config
// loading) runs inside the fatal handler below.
var command = args.Length > 0 && !args[0].StartsWith("--", StringComparison.Ordinal) ? args[0].ToLowerInvariant() : "run";
var headed = args.Contains("--headed");
var retryErrors = args.Contains("--retry-errors");
var maxProducts = GetIntArg("--max-products", 0);
// "run" refreshes products older than 30 days by default; explicit commands default to new-only.
var refreshOlderThanDays = GetIntArg("--refresh-older-than", command == "run" ? 30 : 0);

try
{
    // ── Load config ───────────────────────────────────────────────────────────
    // Inside the try so a missing or malformed appsettings.json is logged as "Fatal" and
    // produces exit code 1, instead of escaping as an unhandled exception.
    var configRoot = new ConfigurationBuilder()
        .SetBasePath(AppContext.BaseDirectory)
        .AddJsonFile("appsettings.json", optional: false)
        .Build();

    var config = configRoot.GetSection("Sync").Get<SyncConfig>() ?? new SyncConfig();
    Log.Configure(config.LogFile);

    Log.Info($"PrismaticSync — command '{command}' (headed={headed}, refreshOlderThan={refreshOlderThanDays}d, maxProducts={maxProducts})");

    switch (command)
    {
        case "discover-new":
            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverNewAsync());
            break;

        case "discover-full":
            await WithBrowser(d => new PrismaticDiscoverer(d, config).DiscoverFullAsync());
            break;

        case "scrape":
            await WithBrowser(d => new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors));
            break;

        case "push":
            await new CatalogPusher(config).PushAsync();
            break;

        case "run":
            // The scheduled default: find new colors, scrape new + stale, then push.
            // The browser is closed before the push starts.
            await WithBrowser(async d =>
            {
                await new PrismaticDiscoverer(d, config).DiscoverNewAsync();
                await new PrismaticScraper(d, config).ScrapeAsync(refreshOlderThanDays, maxProducts, retryErrors);
            });
            await new CatalogPusher(config).PushAsync();
            break;

        default:
            // Unknown command: show help and signal failure to the caller.
            PrintUsage();
            return 1;
    }

    Log.Info("Done.");
    return 0;
}
catch (Exception ex)
{
    // Log the full exception (type, message, stack) and report failure through the exit code.
    Log.Error($"Fatal: {ex}");
    return 1;
}
|
||||||
|
|
||||||
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
// Opens a browser session (headed or headless per the --headed flag), runs the given work
// against it, and always disposes the session afterwards — even when the work throws.
async Task WithBrowser(Func<BrowserSession, Task> action)
{
    var browser = await BrowserSession.CreateAsync(headed);
    try
    {
        await action(browser);
    }
    finally
    {
        await browser.DisposeAsync();
    }
}
|
||||||
|
|
||||||
|
// Reads an integer option written as "<name>=<value>" from the command line. The option name is
// matched case-insensitively. Only the first matching argument is looked at; if it is absent or
// its value is not a valid integer, the fallback is returned.
int GetIntArg(string name, int fallback)
{
    var prefix = name + "=";

    foreach (var candidate in args)
    {
        if (!candidate.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var rawValue = candidate[prefix.Length..];
        if (int.TryParse(rawValue, out var parsed))
        {
            return parsed;
        }

        return fallback;
    }

    return fallback;
}
|
||||||
|
|
||||||
|
void PrintUsage()
|
||||||
|
{
|
||||||
|
Console.WriteLine(
|
||||||
|
"""
|
||||||
|
PrismaticSync — scrape Prismatic Powders and push to the app catalog.
|
||||||
|
|
||||||
|
Usage: PrismaticSync [command] [options]
|
||||||
|
|
||||||
|
Commands:
|
||||||
|
run (default) discover-new + scrape (new + stale) + push
|
||||||
|
discover-new Incremental discovery via newest-first sort (cheap; finds new colors)
|
||||||
|
discover-full Full discovery across all color filters (heavy; reconciles the whole set)
|
||||||
|
scrape Scrape product pages from the URL list (resumable)
|
||||||
|
push Push the scraped JSON to the import endpoint
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--refresh-older-than=N Re-scrape products whose data is older than N days (default 30 for 'run')
|
||||||
|
--max-products=N Cap products scraped this run (0 = no cap)
|
||||||
|
--retry-errors Retry URLs previously recorded as errors
|
||||||
|
--headed Show the browser window (debugging)
|
||||||
|
|
||||||
|
Config: appsettings.json (delays, file paths, import endpoint + token).
|
||||||
|
First run on a new machine: dotnet build, then `pwsh bin/Debug/net8.0/playwright.ps1 install chromium`.
|
||||||
|
""");
|
||||||
|
}
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
# PrismaticSync
|
||||||
|
|
||||||
|
A standalone .NET console tool that scrapes the Prismatic Powders catalog and pushes it into the
|
||||||
|
Powder Coating Logix catalog import endpoint. It exists because Prismatic has **no API** (unlike
|
||||||
|
Columbia Coatings) — so the data has to be scraped via browser automation.
|
||||||
|
|
||||||
|
> **Runs on a workstation you control — never on the deployed app server.** Scraping from the cloud
|
||||||
|
> app's IP would get blocked and isn't appropriate. This tool is deliberately *not* part of
|
||||||
|
> `PowderCoating.sln`; build and run it independently.
|
||||||
|
|
||||||
|
## First-time setup (per machine)
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
cd "scripts/Prismatic Data Scraper"
|
||||||
|
dotnet build
|
||||||
|
pwsh bin/Debug/net8.0/playwright.ps1 install chromium # one-time browser download
|
||||||
|
```
|
||||||
|
|
||||||
|
## Commands
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
dotnet run -- run # default: discover-new + scrape (new + stale >30d) + push
|
||||||
|
dotnet run -- discover-new # cheap: find newly-added colors (newest-first, stops at known)
|
||||||
|
dotnet run -- discover-full # heavy: crawl all color filters (reconcile whole set / removals)
|
||||||
|
dotnet run -- scrape # scrape product pages from product-urls.txt (resumable)
|
||||||
|
dotnet run -- scrape --refresh-older-than=30 # also re-scrape products older than 30 days (price changes)
|
||||||
|
dotnet run -- push # push prismatic_powders.json to the import endpoint
|
||||||
|
```
|
||||||
|
|
||||||
|
Options: `--max-products=N`, `--retry-errors`, `--headed` (show the browser for debugging).
|
||||||
|
|
||||||
|
Everything streams to the console live (warnings/errors in color) **and** to `prismatic-sync.log`.
|
||||||
|
|
||||||
|
## Operating model (suggested cadence)
|
||||||
|
|
||||||
|
| Run | Command | Cadence | Why |
|
||||||
|
|-----|---------|---------|-----|
|
||||||
|
| Find new colors | `run` (discover-new + scrape new/stale + push) | Weekly | Cheap; Prismatic adds colors often |
|
||||||
|
| Price refresh | `scrape --refresh-older-than=30` then `push` | Monthly | Re-scrapes stale products to catch price changes (slow, ~hours) |
|
||||||
|
| Full reconcile | `discover-full` then `scrape` | Quarterly | Catches removed/discontinued colors |
|
||||||
|
|
||||||
|
A full scrape of ~5,000 products takes hours (polite delays). It saves after every product and is
|
||||||
|
fully resumable, so stop/restart any time.
|
||||||
|
|
||||||
|
## Politeness / anti-block
|
||||||
|
|
||||||
|
Configurable in `appsettings.json`: randomized 6–14s base delay, an escalating **cooldown + retry on
|
||||||
|
403** (so a temporary block doesn't get you hard-banned mid-run), and a periodic long rest. Leave
|
||||||
|
these conservative — getting blocked is worse than being slow, and Prismatic is a partner.
|
||||||
|
|
||||||
|
## Pushing into the app
|
||||||
|
|
||||||
|
Set in `appsettings.json`:
|
||||||
|
- `Sync.Import.EndpointUrl` → `https://<your-app>/PowderCatalog/ImportApi`
|
||||||
|
- `Sync.Import.Token` → the same secret as the app's `CatalogImport:Token` config
|
||||||
|
|
||||||
|
The tool POSTs the JSON with an `X-Import-Token` header (and `X-Vendor-Name: Prismatic Powders`) to
|
||||||
|
that endpoint, which authenticates the token and runs the records through the same upsert as the
|
||||||
|
Columbia sync. If the endpoint/token isn't configured here, `push` is skipped and you upload
|
||||||
|
`prismatic_powders.json` manually via the Powder Catalog admin page instead.
|
||||||
|
|
||||||
|
> **App side:** set `CatalogImport:Token` in the web app's config (Azure App Setting in prod). The
|
||||||
|
> endpoint returns 401 until a token is set, so it's inert by default.
|
||||||
|
|
||||||
|
## Scheduling (Windows Task Scheduler)
|
||||||
|
|
||||||
|
Point a scheduled task at the published exe (or `dotnet run`). Example weekly task command:
|
||||||
|
|
||||||
|
```
|
||||||
|
Program/script: C:\Tools\PrismaticSync\PrismaticSync.exe
|
||||||
|
Arguments: run
|
||||||
|
Start in: C:\Tools\PrismaticSync
|
||||||
|
```
|
||||||
|
|
||||||
|
Publish a framework-dependent build (the workstation needs the .NET 8 runtime installed) to drop on the workstation:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
dotnet publish -c Release -r win-x64 --self-contained false -o C:\Tools\PrismaticSync
|
||||||
|
pwsh C:\Tools\PrismaticSync\playwright.ps1 install chromium
|
||||||
|
```
|
||||||
|
|
||||||
|
## The long game
|
||||||
|
|
||||||
|
This is the interim path. The durable endgame is a real Prismatic **API** (the partnership), at which
|
||||||
|
point this tool is replaced by a clean in-app sync like Columbia's — reusing the same upsert,
|
||||||
|
propagation, and discontinued handling.
|
||||||
@@ -0,0 +1,63 @@
|
|||||||
|
using System.Text;
|
||||||
|
using PrismaticSync.Infrastructure;
|
||||||
|
|
||||||
|
namespace PrismaticSync.Services;
|
||||||
|
|
||||||
|
/// <summary>
/// Pushes the scraped JSON to the app's token-authenticated catalog import endpoint. When the
/// endpoint or the token is not configured it no-ops (the JSON is still on disk for a manual
/// upload), so the tool is useful before the endpoint exists.
/// </summary>
public class CatalogPusher
{
    private readonly SyncConfig _config;

    /// <summary>Creates a pusher that reads the output file path and import settings from <paramref name="config"/>.</summary>
    public CatalogPusher(SyncConfig config) => _config = config;

    /// <summary>
    /// POSTs the output JSON file to the import endpoint with the X-Import-Token and
    /// X-Vendor-Name headers.
    /// </summary>
    /// <returns>
    /// True when the endpoint answered with a success status. False when the push was skipped
    /// (no endpoint, no token, or no output file), the endpoint answered with a failure status,
    /// or sending threw. Each case is logged.
    /// </returns>
    public async Task<bool> PushAsync()
    {
        if (string.IsNullOrWhiteSpace(_config.Import.EndpointUrl))
        {
            Log.Warn($"No import endpoint configured (Sync.Import.EndpointUrl) — skipping push. " +
                     $"Upload {_config.OutputJsonFile} manually via the Powder Catalog admin instead.");
            return false;
        }

        // A blank token can only be rejected by a token-authenticated endpoint, so skip the
        // request the same way as for a missing endpoint.
        if (string.IsNullOrWhiteSpace(_config.Import.Token))
        {
            Log.Warn($"No import token configured (Sync.Import.Token) — skipping push. " +
                     $"Upload {_config.OutputJsonFile} manually via the Powder Catalog admin instead.");
            return false;
        }

        if (!File.Exists(_config.OutputJsonFile))
        {
            Log.Warn($"Output file {_config.OutputJsonFile} not found — nothing to push.");
            return false;
        }

        var json = await File.ReadAllTextAsync(_config.OutputJsonFile);
        Log.Info($"Pushing {_config.OutputJsonFile} to {_config.Import.EndpointUrl} (vendor: {_config.Import.VendorName})...");

        // Generous timeout: the whole catalog goes up in a single request.
        using var http = new HttpClient { Timeout = TimeSpan.FromMinutes(5) };
        using var request = new HttpRequestMessage(HttpMethod.Post, _config.Import.EndpointUrl);
        request.Headers.Add("X-Import-Token", _config.Import.Token);
        request.Headers.Add("X-Vendor-Name", _config.Import.VendorName);
        request.Content = new StringContent(json, Encoding.UTF8, "application/json");

        try
        {
            using var response = await http.SendAsync(request);
            var body = await response.Content.ReadAsStringAsync();

            if (response.IsSuccessStatusCode)
            {
                Log.Info($"Push succeeded ({(int)response.StatusCode}): {Trim(body)}");
                return true;
            }

            Log.Error($"Push failed ({(int)response.StatusCode}): {Trim(body)}");
            return false;
        }
        catch (Exception ex)
        {
            // Transport-level failures (DNS, timeout, connection reset, ...) are reported, not thrown.
            Log.Error($"Push error: {ex.Message}");
            return false;
        }
    }

    /// <summary>Caps a response body at 500 characters (plus an ellipsis) so log lines stay readable.</summary>
    private static string Trim(string s) => s.Length > 500 ? s[..500] + "…" : s;
}
|
||||||
@@ -0,0 +1,138 @@
|
|||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using Microsoft.Playwright;
|
||||||
|
using PrismaticSync.Infrastructure;
|
||||||
|
|
||||||
|
namespace PrismaticSync.Services;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Discovers product URLs from the Prismatic color listing (infinite-scroll). Two modes:
|
||||||
|
/// incremental (newest-first via <c>?category=created_at</c>, stop once we reach already-known
|
||||||
|
/// URLs) for cheap frequent runs, and full (every color filter to the bottom) for occasional
|
||||||
|
/// reconciliation. Both append to the URL list file.
|
||||||
|
/// </summary>
|
||||||
|
public class PrismaticDiscoverer
|
||||||
|
{
|
||||||
|
private static readonly Regex ProductUrlRegex =
|
||||||
|
new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||||
|
|
||||||
|
private readonly BrowserSession _session;
|
||||||
|
private readonly SyncConfig _config;
|
||||||
|
|
||||||
|
public PrismaticDiscoverer(BrowserSession session, SyncConfig config)
|
||||||
|
{
|
||||||
|
_session = session;
|
||||||
|
_config = config;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Incremental discovery: crawl the newest-first listing and stop once a run of consecutive
|
||||||
|
/// scrolls surfaces only already-known URLs — meaning we've scrolled past the new products.
|
||||||
|
/// Returns the count of newly found URLs.
|
||||||
|
/// </summary>
|
||||||
|
public async Task<int> DiscoverNewAsync()
|
||||||
|
{
|
||||||
|
var known = new HashSet<string>(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);
|
||||||
|
var startCount = known.Count;
|
||||||
|
Log.Info($"Incremental discovery (newest first). Known URLs: {startCount}");
|
||||||
|
|
||||||
|
await GotoAsync($"{_config.ColorsUrl}?category=created_at");
|
||||||
|
|
||||||
|
var knownStreak = 0;
|
||||||
|
for (var i = 0; i < _config.MaxScrolls; i++)
|
||||||
|
{
|
||||||
|
var addedNew = 0;
|
||||||
|
foreach (var link in await CollectProductLinksAsync())
|
||||||
|
if (known.Add(link)) addedNew++;
|
||||||
|
|
||||||
|
JsonStore.SaveUrls(_config.ProductUrlsFile, known);
|
||||||
|
knownStreak = addedNew == 0 ? knownStreak + 1 : 0;
|
||||||
|
Log.Info($"Scroll {i + 1}: +{addedNew} new, total {known.Count}, known-streak {knownStreak}");
|
||||||
|
|
||||||
|
if (knownStreak >= _config.StopAfterKnownScrolls)
|
||||||
|
{
|
||||||
|
Log.Info("Reached known territory — stopping incremental discovery.");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
await ScrollAsync();
|
||||||
|
}
|
||||||
|
|
||||||
|
var newCount = known.Count - startCount;
|
||||||
|
Log.Info($"Incremental discovery done. New URLs: {newCount}; total {known.Count}");
|
||||||
|
return newCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Full discovery: crawl every color filter to the bottom. Heavier — use occasionally to
|
||||||
|
/// reconcile the whole set (e.g. to notice colors that have been removed). Returns new URL count.
|
||||||
|
/// </summary>
|
||||||
|
public async Task<int> DiscoverFullAsync()
|
||||||
|
{
|
||||||
|
var known = new HashSet<string>(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);
|
||||||
|
var startCount = known.Count;
|
||||||
|
Log.Info($"Full discovery across {_config.ColorParams.Length} color filters. Known URLs: {startCount}");
|
||||||
|
|
||||||
|
foreach (var color in _config.ColorParams)
|
||||||
|
{
|
||||||
|
Log.Info($"Color filter: {color}");
|
||||||
|
try
|
||||||
|
{
|
||||||
|
await GotoAsync($"{_config.ColorsUrl}?color={Uri.EscapeDataString(color)}");
|
||||||
|
|
||||||
|
var noNew = 0;
|
||||||
|
for (var i = 0; i < _config.MaxScrolls; i++)
|
||||||
|
{
|
||||||
|
var added = 0;
|
||||||
|
foreach (var link in await CollectProductLinksAsync())
|
||||||
|
if (known.Add(link)) added++;
|
||||||
|
|
||||||
|
JsonStore.SaveUrls(_config.ProductUrlsFile, known);
|
||||||
|
noNew = added == 0 ? noNew + 1 : 0;
|
||||||
|
if (noNew >= _config.StopAfterNoNewScrolls)
|
||||||
|
break;
|
||||||
|
|
||||||
|
await ScrollAsync();
|
||||||
|
}
|
||||||
|
|
||||||
|
Log.Info($"Color {color} done. Total {known.Count}");
|
||||||
|
await _session.Page.WaitForTimeoutAsync(3000);
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
Log.Warn($"Color {color} failed: {ex.Message}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var newCount = known.Count - startCount;
|
||||||
|
Log.Info($"Full discovery done. New this run: {newCount}; total {known.Count}");
|
||||||
|
return newCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Navigates the session page to <paramref name="url"/>, waiting for DOMContentLoaded with a
/// 60000 ms timeout, then pauses for <c>PageSettleSeconds</c> before returning.
/// </summary>
/// <param name="url">Address to open in the session page.</param>
private async Task GotoAsync(string url)
{
    var page = _session.Page;
    var navigation = new PageGotoOptions
    {
        WaitUntil = WaitUntilState.DOMContentLoaded,
        Timeout = 60000
    };

    await page.GotoAsync(url, navigation);

    var settleMs = _config.PageSettleSeconds * 1000;
    await page.WaitForTimeoutAsync(settleMs);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Sends one mouse-wheel scroll of 2500 vertical units to the session page, then waits
/// <c>ScrollWaitMs</c> milliseconds.
/// </summary>
private async Task ScrollAsync()
{
    var page = _session.Page;
    await page.Mouse.WheelAsync(0, 2500);

    var pauseMs = _config.ScrollWaitMs;
    await page.WaitForTimeoutAsync(pauseMs);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reads the <c>href</c> of every anchor currently on the page and returns those that match
/// <c>ProductUrlRegex</c>, each passed through <c>JsonStore.CleanUrl</c>. Entries that clean to
/// an empty string are dropped. Order follows the page; duplicates are not removed here.
/// </summary>
private async Task<List<string>> CollectProductLinksAsync()
{
    var anchors = await _session.Page.EvalOnSelectorAllAsync<string[]>(
        "a", "els => els.map(a => a.href).filter(Boolean)");

    var productLinks = new List<string>();
    foreach (var anchor in anchors)
    {
        if (!ProductUrlRegex.IsMatch(anchor))
        {
            continue;
        }

        var cleaned = JsonStore.CleanUrl(anchor);
        if (cleaned.Length > 0)
        {
            productLinks.Add(cleaned);
        }
    }

    return productLinks;
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,308 @@
|
|||||||
|
using System.Diagnostics;
|
||||||
|
using System.Globalization;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using Microsoft.Playwright;
|
||||||
|
using PrismaticSync.Infrastructure;
|
||||||
|
using PrismaticSync.Models;
|
||||||
|
|
||||||
|
namespace PrismaticSync.Services;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s. Resumable (skips
|
||||||
|
/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
|
||||||
|
/// records get re-scraped to catch price changes. Saves after every product so a long run can be
|
||||||
|
/// stopped and resumed safely, and logs continuously — including the delay between products — so a
|
||||||
|
/// manual run always shows it's alive.
|
||||||
|
/// </summary>
|
||||||
|
public class PrismaticScraper
|
||||||
|
{
|
||||||
|
private static readonly Regex ProductUrlRegex =
|
||||||
|
new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||||
|
private static readonly Regex SkuRegex =
|
||||||
|
new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||||
|
private static readonly Regex DescRegex =
|
||||||
|
new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
|
||||||
|
RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
|
||||||
|
private static readonly Regex PriceTierRegex =
|
||||||
|
new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||||
|
private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
|
||||||
|
private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled);
|
||||||
|
private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);
|
||||||
|
|
||||||
|
private readonly BrowserSession _session;
|
||||||
|
private readonly SyncConfig _config;
|
||||||
|
private readonly Random _random = new();
|
||||||
|
|
||||||
|
/// <summary>Creates a scraper that drives <paramref name="session"/> using the settings in <paramref name="config"/>.</summary>
/// <param name="session">Browser session whose page is used for all navigation.</param>
/// <param name="config">Sync settings (file paths, delays, retry limits) read by this scraper.</param>
public PrismaticScraper(BrowserSession session, SyncConfig config)
{
    (_session, _config) = (session, config);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Scrapes products needing work: those not yet scraped, plus (when <paramref name="refreshOlderThanDays"/>
/// is greater than 0) any whose data is older than that window. Returns (scraped, errors).
/// </summary>
/// <remarks>
/// The output file is saved after every product (success or failure), so a run can be stopped
/// and resumed. A "blocked" failure is retried on the same product after an escalating cooldown,
/// up to <c>BlockedMaxRetries</c> times; any other failure is recorded in the error list and the
/// run moves on. At most one error entry is kept per URL: an earlier entry for the same URL is
/// replaced rather than duplicated.
/// </remarks>
/// <param name="refreshOlderThanDays">Re-scrape records whose <c>ScrapedAt</c> is older than this many days; 0 or less disables refreshing.</param>
/// <param name="maxProducts">Upper bound on products handled this run; 0 or less means no limit.</param>
/// <param name="retryErrors">When true, never-scraped URLs that are on the error list are attempted again.</param>
/// <returns>The number of products saved and the number that ended in a recorded error.</returns>
public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
{
    var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
        .Where(u => ProductUrlRegex.IsMatch(u))
        .ToList();

    var data = JsonStore.LoadOutput(_config.OutputJsonFile);

    // Index existing results by URL (keep the most recent if the file has dupes).
    var resultByUrl = data.Results
        .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
        .ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);

    var errorUrls = new HashSet<string>(
        data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);

    var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));

    var toScrape = new List<string>();
    foreach (var url in allUrls)
    {
        if (resultByUrl.TryGetValue(url, out var existing))
        {
            if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
            {
                toScrape.Add(url); // stale → refresh for price changes
            }
        }
        else if (retryErrors || !errorUrls.Contains(url))
        {
            toScrape.Add(url); // never scraped (skip known errors unless retrying)
        }
    }

    if (maxProducts > 0)
    {
        toScrape = toScrape.Take(maxProducts).ToList();
    }

    var total = toScrape.Count;
    Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
    Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");

    if (total == 0)
    {
        Log.Info("Nothing to scrape. Done.");
        return (0, 0);
    }

    // Rough estimate: average inter-product delay + settle time + 2 s of overhead per product.
    var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
    var etaMinutes = total * (avgDelaySec + _config.PageSettleSeconds + 2) / 60.0;
    Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
             $"(grab a coffee if that's a while — it saves after every product and is resumable).");

    var stopwatch = Stopwatch.StartNew();
    int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;

    foreach (var url in toScrape)
    {
        index++;

        // Retry loop for a single product; every path except the "blocked" catch breaks out.
        for (var attempt = 1; ; attempt++)
        {
            try
            {
                var row = await ParseProductAsync(url, index, total);

                // Replace the indexed record in place when it is still in the list, else append.
                var position = resultByUrl.TryGetValue(url, out var existing)
                    ? data.Results.IndexOf(existing)
                    : -1;
                if (position >= 0)
                {
                    data.Results[position] = row;
                }
                else
                {
                    data.Results.Add(row);
                }

                resultByUrl[url] = row;
                data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));

                scraped++;
                consecutiveBlocks = 0;
                JsonStore.SaveOutput(_config.OutputJsonFile, data);

                var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
                Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
                         $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
                break;
            }
            catch (Exception ex) when (IsBlocked(ex) && attempt <= _config.BlockedMaxRetries)
            {
                // Site pushed back — back off (escalating) and retry the SAME product rather
                // than barreling on, which is how an unattended run gets hard-banned.
                consecutiveBlocks++;
                var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
                Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
                await Task.Delay(cooldown * 1000);
            }
            catch (Exception ex)
            {
                // Keep a single, latest error entry per URL so repeated retry runs do not
                // accumulate duplicates in the output file.
                data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
                data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
                JsonStore.SaveOutput(_config.OutputJsonFile, data);
                errors++;
                Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
                break;
            }
        }

        // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
        if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
        {
            Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
            await Task.Delay(_config.LongRestSeconds * 1000);
        }

        // No delay after the final product.
        if (index < total)
        {
            var delayMs = RandomDelayMs();
            Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
            await Task.Delay(delayMs);
        }
    }

    Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
             $"Took {FormatDuration(stopwatch.Elapsed)}.");
    return (scraped, errors);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Opens one product page and extracts its fields into a <see cref="ProductRecord"/>.
/// </summary>
/// <remarks>
/// A 403/404 HTTP status is rejected immediately after navigation, before the settle wait.
/// Because the site may also serve an error page with a different status, the page title is
/// checked as well: exactly "403 Forbidden", or a title containing "Page Not Found" or a
/// stand-alone "404" (not part of a longer number, so titles that merely contain digits such as
/// "4040" are not mistaken for a missing page). Exception messages are unchanged from the
/// previous implementation because callers classify failures by message text.
/// </remarks>
/// <param name="url">Product page address; stored as <c>ProductUrl</c> on the record.</param>
/// <param name="index">1-based position of this product in the run, used for log prefixes only.</param>
/// <param name="total">Number of products in the run, used for log prefixes only.</param>
/// <returns>The populated record, stamped with the current UTC time.</returns>
/// <exception cref="InvalidOperationException">
/// The site answered 403 or 404 (by status or title), or neither a SKU nor an <c>h1</c> title was found.
/// </exception>
private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
{
    Log.Info($"[{index}/{total}] Scraping {url}");

    var response = await _session.Page.GotoAsync(url, new PageGotoOptions
    {
        WaitUntil = WaitUntilState.DOMContentLoaded,
        Timeout = 60000
    });

    // The status is known as soon as navigation returns; no point settling on an error page.
    var status = response?.Status ?? 0;
    if (status == 403)
    {
        throw new InvalidOperationException("403 Forbidden returned by site.");
    }
    if (status == 404)
    {
        throw new InvalidOperationException("404 Not Found returned by site.");
    }

    await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);

    var title = Clean(await SafeTextAsync(() => _session.Page.TitleAsync()));
    var plainText = Clean(await SafeTextAsync(() => _session.Page.Locator("body").InnerTextAsync()));

    if (Regex.IsMatch(title, @"^403 Forbidden$", RegexOptions.IgnoreCase))
    {
        throw new InvalidOperationException("403 Forbidden returned by site.");
    }
    // (?<!\d)404(?!\d): "404" only when it is not embedded in a longer run of digits.
    if (Regex.IsMatch(title, @"(?<!\d)404(?!\d)|Page Not Found", RegexOptions.IgnoreCase))
    {
        throw new InvalidOperationException("404 Not Found returned by site.");
    }

    var colorName = Clean(await SafeTextAsync(() => _session.Page.Locator("h1").First.InnerTextAsync()));

    var skuMatch = SkuRegex.Match(plainText);
    var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";
    if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
    {
        throw new InvalidOperationException("Could not find SKU or title on product page.");
    }

    var descMatch = DescRegex.Match(plainText);
    var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";

    return new ProductRecord
    {
        Sku = sku,
        ColorName = colorName,
        Description = description,
        PriceTiers = ParsePriceTiers(plainText),
        SafetyDataSheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" }),
        TechnicalDataSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" }),
        ApplicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" }),
        SampleImageUrl = await GetSampleImageUrlAsync(),
        ProductUrl = url,
        ScrapedAt = DateTime.UtcNow
    };
}
|
||||||
|
|
||||||
|
/// <summary>
/// Extracts weight-based price tiers from page text using <c>PriceTierRegex</c>. Each match is
/// either a closed range ("A - B lbs $P", giving Min = A and Max = B) or an open-ended tier
/// ("A + lbs $P", giving Min = A and Max = null).
/// </summary>
/// <remarks>
/// A match whose price or weight bound cannot be parsed (for example a digit run too large for
/// <see cref="int"/>) is skipped instead of throwing, so one odd fragment of page text does not
/// fail the whole product.
/// </remarks>
/// <param name="text">Plain text of the product page.</param>
/// <returns>Tiers in the order they appear in <paramref name="text"/>; empty when none match.</returns>
private static List<PriceTier> ParsePriceTiers(string text)
{
    // Digits only, culture-independent; false on overflow or non-ASCII digits.
    static bool TryParseBound(string digits, out int value) =>
        int.TryParse(digits, NumberStyles.None, CultureInfo.InvariantCulture, out value);

    var tiers = new List<PriceTier>();
    foreach (Match m in PriceTierRegex.Matches(text))
    {
        if (!decimal.TryParse(m.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var price))
        {
            continue;
        }

        var rangeText = Clean(m.Groups[1].Value);
        int? min = null, max = null;

        var range = RangeRegex.Match(rangeText);
        if (range.Success)
        {
            if (!TryParseBound(range.Groups[1].Value, out var lower)
                || !TryParseBound(range.Groups[2].Value, out var upper))
            {
                continue;
            }

            min = lower;
            max = upper;
        }

        var plus = PlusRegex.Match(rangeText);
        if (plus.Success)
        {
            if (!TryParseBound(plus.Groups[1].Value, out var from))
            {
                continue;
            }

            min = from;
            max = null;
        }

        tiers.Add(new PriceTier { Min = min, Max = max, Price = price });
    }

    return tiers;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns the href of the first link whose text matches any pattern, or an empty string when
/// no link qualifies. Uses a single page eval that returns one "text\u0001href" string per
/// anchor (text and href joined by the U+0001 control character) to avoid object
/// deserialization quirks.
/// </summary>
/// <param name="patterns">Regular-expression patterns tested case-insensitively against each link's whitespace-collapsed text.</param>
/// <returns>The href of the first matching link that also passes <c>IsDocumentUrl</c>; otherwise "".</returns>
private async Task<string> GetLinkByTextAsync(string[] patterns)
{
    // Written as an escape so the separator survives editors and tools that strip raw control
    // characters; it must match String.fromCharCode(1) in the script below.
    const char Separator = '\u0001';

    var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
        "a",
        "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
        "+ String.fromCharCode(1) + (a.href || ''))");

    foreach (var entry in combined)
    {
        // Split into at most two parts: everything after the first separator is the href.
        var parts = entry.Split(Separator, 2);
        var text = parts.Length > 0 ? parts[0] : "";
        var href = parts.Length > 1 ? parts[1] : "";

        // Require the link to point at an actual document, not a generic /documents nav page.
        if (href.Length > 0
            && IsDocumentUrl(href)
            && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
        {
            return href;
        }
    }

    return "";
}
|
||||||
|
|
||||||
|
/// <summary>True when an href looks like a real document (hosted on the NIC CDN or a direct PDF).</summary>
/// <remarks>
/// For http(s) URLs the decision is made on the parsed URL: the host must be
/// <c>nicindustries.com</c> or one of its subdomains, or the path (ignoring query string and
/// fragment) must end in <c>.pdf</c>. A mention of the domain elsewhere in the URL, such as in a
/// query string, does not count. Anything that is not an absolute http(s) URL is accepted only
/// when its path ends in <c>.pdf</c>.
/// </remarks>
/// <param name="href">Link target to classify; null, empty or whitespace yields false.</param>
private static bool IsDocumentUrl(string href)
{
    if (string.IsNullOrWhiteSpace(href))
    {
        return false;
    }

    // Protocol-relative links ("//host/path") need a scheme before they can be parsed.
    var candidate = href.StartsWith("//", StringComparison.Ordinal) ? "https:" + href : href;

    if (Uri.TryCreate(candidate, UriKind.Absolute, out var uri)
        && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
    {
        var host = uri.Host;
        var onNicHost = host.Equals("nicindustries.com", StringComparison.OrdinalIgnoreCase)
                        || host.EndsWith(".nicindustries.com", StringComparison.OrdinalIgnoreCase);

        return onNicHost || uri.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
    }

    // Relative or non-http(s) href: only the ".pdf" test applies. Cut at the first '?' or '#'.
    var cut = href.IndexOfAny(new[] { '?', '#' });
    var path = cut >= 0 ? href.Substring(0, cut) : href;
    return path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Picks the product sample image from the <c>img</c> elements on the page. Only sources
/// matching <c>images.nicindustries.com/prismatic/products</c> are considered; the first one
/// without "thumbnail" in it wins, otherwise the first matching one, otherwise "".
/// </summary>
private async Task<string> GetSampleImageUrlAsync()
{
    var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
        "img",
        "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
        ".filter(Boolean)");

    // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
    // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
    string? firstProductImage = null;
    foreach (var src in srcs)
    {
        if (!Regex.IsMatch(src, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase))
        {
            continue;
        }

        if (!Regex.IsMatch(src, "thumbnail", RegexOptions.IgnoreCase))
        {
            return src; // first full-size product image
        }

        firstProductImage ??= src; // remember the first thumbnail as the fallback
    }

    return firstProductImage ?? "";
}
|
||||||
|
|
||||||
|
/// <summary>
/// True when <paramref name="ex"/> reports that the site refused the request, recognised by the
/// phrase "403 Forbidden" in its message (case-insensitive).
/// </summary>
/// <remarks>
/// The full phrase is required rather than a bare "403" so that unrelated failures whose message
/// merely contains those digits (for example an error text quoting a URL or SKU with "4030" in
/// it) are not treated as a block and do not trigger the long cooldown.
/// </remarks>
private static bool IsBlocked(Exception ex) =>
    ex.Message.Contains("403 Forbidden", StringComparison.OrdinalIgnoreCase);
|
||||||
|
|
||||||
|
/// <summary>
/// Awaits <paramref name="fn"/> and returns its text; if it throws for any reason, returns an
/// empty string instead. Deliberately best-effort: callers treat "" as "not available".
/// </summary>
private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
{
    try
    {
        var text = await fn();
        return text;
    }
    catch (Exception)
    {
        return "";
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Normalises text: null becomes "", every match of <c>WhitespaceRegex</c> is replaced by a
/// single space, and the result is trimmed.
/// </summary>
private static string Clean(string? text)
{
    var source = text ?? "";
    var collapsed = WhitespaceRegex.Replace(source, " ");
    return collapsed.Trim();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns a random delay in milliseconds between <c>MinDelaySeconds</c> and
/// <c>MaxDelaySeconds</c> (both inclusive). A negative minimum is treated as 0 and a maximum
/// below the minimum is raised to the minimum.
/// </summary>
private int RandomDelayMs()
{
    var lowerMs = _config.MinDelaySeconds * 1000;
    if (lowerMs < 0)
    {
        lowerMs = 0;
    }

    var upperMs = _config.MaxDelaySeconds * 1000;
    if (upperMs < lowerMs)
    {
        upperMs = lowerMs;
    }

    // Random.Next's upper bound is exclusive, hence the + 1.
    return _random.Next(lowerMs, upperMs + 1);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Formats a duration for log output: "Hh Mm" from one hour up, "Mm Ss" from one minute up,
/// otherwise "Ss". The leading unit is the truncated total; the trailing unit is the component.
/// </summary>
private static string FormatDuration(TimeSpan t)
{
    if (t.TotalHours >= 1)
    {
        var wholeHours = (int)t.TotalHours;
        return $"{wholeHours}h {t.Minutes}m";
    }

    if (t.TotalMinutes >= 1)
    {
        var wholeMinutes = (int)t.TotalMinutes;
        return $"{wholeMinutes}m {t.Seconds}s";
    }

    return $"{t.Seconds}s";
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
{
|
||||||
|
"Sync": {
|
||||||
|
"BaseUrl": "https://www.prismaticpowders.com",
|
||||||
|
"ColorsPath": "/shop/powder-coating-colors",
|
||||||
|
|
||||||
|
"ProductUrlsFile": "product-urls.txt",
|
||||||
|
"OutputJsonFile": "prismatic_powders.json",
|
||||||
|
"LogFile": "prismatic-sync.log",
|
||||||
|
|
||||||
|
"MinDelaySeconds": 6,
|
||||||
|
"MaxDelaySeconds": 14,
|
||||||
|
"PageSettleSeconds": 4,
|
||||||
|
|
||||||
|
"BlockedCooldownSeconds": 120,
|
||||||
|
"BlockedCooldownMaxSeconds": 600,
|
||||||
|
"BlockedMaxRetries": 3,
|
||||||
|
"LongRestEveryProducts": 150,
|
||||||
|
"LongRestSeconds": 45,
|
||||||
|
|
||||||
|
"ScrollWaitMs": 1500,
|
||||||
|
"MaxScrolls": 400,
|
||||||
|
"StopAfterNoNewScrolls": 10,
|
||||||
|
"StopAfterKnownScrolls": 8,
|
||||||
|
|
||||||
|
"ColorParams": [
|
||||||
|
"pris_black", "pris_blue", "pris_bronze", "pris_brown", "pris_clear",
|
||||||
|
"pris_copper", "pris_gold", "pris_gray", "pris_green", "pris_orange",
|
||||||
|
"pris_pink", "pris_purple", "pris_red", "pris_silver", "pris_tan",
|
||||||
|
"pris_white", "pris_yellow"
|
||||||
|
],
|
||||||
|
|
||||||
|
"Import": {
|
||||||
|
"EndpointUrl": "",
|
||||||
|
"Token": "",
|
||||||
|
"VendorName": "Prismatic Powders"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -25,6 +25,7 @@ public class PowderCatalogController : Controller
|
|||||||
private readonly IColumbiaCatalogSyncService _columbiaSyncService;
|
private readonly IColumbiaCatalogSyncService _columbiaSyncService;
|
||||||
private readonly IPowderCatalogUpsertService _upsertService;
|
private readonly IPowderCatalogUpsertService _upsertService;
|
||||||
private readonly IPlatformSettingsService _platformSettings;
|
private readonly IPlatformSettingsService _platformSettings;
|
||||||
|
private readonly IConfiguration _config;
|
||||||
private readonly ILogger<PowderCatalogController> _logger;
|
private readonly ILogger<PowderCatalogController> _logger;
|
||||||
|
|
||||||
public PowderCatalogController(
|
public PowderCatalogController(
|
||||||
@@ -33,6 +34,7 @@ public class PowderCatalogController : Controller
|
|||||||
IColumbiaCatalogSyncService columbiaSyncService,
|
IColumbiaCatalogSyncService columbiaSyncService,
|
||||||
IPowderCatalogUpsertService upsertService,
|
IPowderCatalogUpsertService upsertService,
|
||||||
IPlatformSettingsService platformSettings,
|
IPlatformSettingsService platformSettings,
|
||||||
|
IConfiguration config,
|
||||||
ILogger<PowderCatalogController> logger)
|
ILogger<PowderCatalogController> logger)
|
||||||
{
|
{
|
||||||
_unitOfWork = unitOfWork;
|
_unitOfWork = unitOfWork;
|
||||||
@@ -40,6 +42,7 @@ public class PowderCatalogController : Controller
|
|||||||
_columbiaSyncService = columbiaSyncService;
|
_columbiaSyncService = columbiaSyncService;
|
||||||
_upsertService = upsertService;
|
_upsertService = upsertService;
|
||||||
_platformSettings = platformSettings;
|
_platformSettings = platformSettings;
|
||||||
|
_config = config;
|
||||||
_logger = logger;
|
_logger = logger;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -372,7 +375,8 @@ public class PowderCatalogController : Controller
|
|||||||
PowderCatalogImportResult result;
|
PowderCatalogImportResult result;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
result = await ImportJsonAsync(file, vendorName);
|
using var stream = file.OpenReadStream();
|
||||||
|
result = await ImportJsonAsync(stream, vendorName);
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
@@ -393,6 +397,67 @@ public class PowderCatalogController : Controller
|
|||||||
return RedirectToAction(nameof(Index));
|
return RedirectToAction(nameof(Index));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Unattended catalog import for the offline scraper tool (e.g. PrismaticSync). Accepts the same
/// JSON scrape format in the request body, authenticated by a shared secret in the
/// <c>X-Import-Token</c> header (matched against <c>CatalogImport:Token</c>). The vendor name
/// comes from the <c>X-Vendor-Name</c> header. Runs through the same upsert as the manual upload.
/// Inert (401) until a token is configured.
/// </summary>
/// <returns>
/// 401 when no token is configured or the supplied token does not match; 400 when the body is
/// not valid JSON; 500 on any other failure; otherwise a JSON summary of the import counts.
/// </returns>
[HttpPost]
[AllowAnonymous]
[IgnoreAntiforgeryToken]
[RequestSizeLimit(50 * 1024 * 1024)] // 50 MB
public async Task<IActionResult> ImportApi()
{
    var configuredToken = _config["CatalogImport:Token"];
    if (string.IsNullOrWhiteSpace(configuredToken))
    {
        _logger.LogWarning("ImportApi called but no CatalogImport:Token is configured — rejecting.");
        return Unauthorized(new { success = false, errorMessage = "Import API is not enabled." });
    }

    var providedToken = Request.Headers["X-Import-Token"].ToString();
    if (!FixedTimeEquals(providedToken, configuredToken))
    {
        return Unauthorized(new { success = false, errorMessage = "Invalid import token." });
    }

    // Vendor falls back to the default when the header is missing or blank.
    var vendorName = Request.Headers["X-Vendor-Name"].ToString();
    if (string.IsNullOrWhiteSpace(vendorName))
    {
        vendorName = "Prismatic Powders";
    }

    try
    {
        var result = await ImportJsonAsync(Request.Body, vendorName);
        _logger.LogInformation(
            "ImportApi ({Vendor}): {Inserted} inserted, {Updated} updated, {Skipped} skipped, {Errors} errors.",
            vendorName, result.Inserted, result.Updated, result.Skipped, result.Errors);

        return Json(new
        {
            success = result.Success,
            vendorName,
            result.Inserted,
            result.Updated,
            result.Skipped,
            result.Errors,
            result.ErrorMessage
        });
    }
    catch (System.Text.Json.JsonException ex)
    {
        // A body that is not parseable JSON is the caller's mistake, not a server fault.
        _logger.LogWarning(ex, "ImportApi received malformed JSON for vendor {Vendor}", vendorName);
        return BadRequest(new { success = false, errorMessage = "Request body is not valid JSON." });
    }
    catch (Exception ex)
    {
        // Details go to the log only; the response stays generic.
        _logger.LogError(ex, "ImportApi failed for vendor {Vendor}", vendorName);
        return StatusCode(500, new { success = false, errorMessage = "Import failed." });
    }
}
|
||||||
|
|
||||||
|
/// <summary>Constant-time string comparison so token checks don't leak length/contents via timing.</summary>
/// <remarks>
/// Both inputs are first reduced to SHA-256 digests of their UTF-8 bytes and the digests are
/// compared. <c>CryptographicOperations.FixedTimeEquals</c> returns immediately when the two
/// buffers differ in length, so comparing the raw bytes would reveal whether the supplied token
/// has the right length; equal-length digests avoid that. Null is treated as the empty string.
/// </remarks>
/// <param name="a">First value (e.g. the token supplied by the caller); may be null.</param>
/// <param name="b">Second value (e.g. the configured token); may be null.</param>
/// <returns>True when both strings have identical UTF-8 content.</returns>
private static bool FixedTimeEquals(string a, string b)
{
    var digestA = System.Security.Cryptography.SHA256.HashData(
        System.Text.Encoding.UTF8.GetBytes(a ?? string.Empty));
    var digestB = System.Security.Cryptography.SHA256.HashData(
        System.Text.Encoding.UTF8.GetBytes(b ?? string.Empty));

    return System.Security.Cryptography.CryptographicOperations.FixedTimeEquals(digestA, digestB);
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// AJAX endpoint used by the inventory form to search the catalog by SKU or color name.
|
/// AJAX endpoint used by the inventory form to search the catalog by SKU or color name.
|
||||||
/// SKU exact matches are ranked first; color name substring matches follow.
|
/// SKU exact matches are ranked first; color name substring matches follow.
|
||||||
@@ -527,9 +592,8 @@ public class PowderCatalogController : Controller
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private async Task<PowderCatalogImportResult> ImportJsonAsync(IFormFile file, string vendorName)
|
private async Task<PowderCatalogImportResult> ImportJsonAsync(Stream stream, string vendorName)
|
||||||
{
|
{
|
||||||
using var stream = file.OpenReadStream();
|
|
||||||
using var doc = await JsonDocument.ParseAsync(stream);
|
using var doc = await JsonDocument.ParseAsync(stream);
|
||||||
|
|
||||||
if (!doc.RootElement.TryGetProperty("results", out var resultsEl) ||
|
if (!doc.RootElement.TryGetProperty("results", out var resultsEl) ||
|
||||||
|
|||||||
@@ -47,6 +47,9 @@
|
|||||||
"BaseUrl": "https://columbiacoatings.com",
|
"BaseUrl": "https://columbiacoatings.com",
|
||||||
"ApiBasePath": "/wp-json/cca/v1"
|
"ApiBasePath": "/wp-json/cca/v1"
|
||||||
},
|
},
|
||||||
|
"CatalogImport": {
|
||||||
|
"Token": ""
|
||||||
|
},
|
||||||
"SendGrid": {
|
"SendGrid": {
|
||||||
"ApiKey": "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",
|
"ApiKey": "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",
|
||||||
"FromEmail": "spouliot@scppowdercoating.com",
|
"FromEmail": "spouliot@scppowdercoating.com",
|
||||||
|
|||||||
Reference in New Issue
Block a user