using System.Diagnostics;
using System.Globalization;
using System.Text.RegularExpressions;
using Microsoft.Playwright;
using PrismaticSync.Infrastructure;
using PrismaticSync.Models;
namespace PrismaticSync.Services;
///
/// Scrapes individual Prismatic product pages into s. Resumable (skips
/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
/// records get re-scraped to catch price changes. Saves after every product so a long run can be
/// stopped and resumed safely, and logs continuously — including the delay between products — so a
/// manual run always shows it's alive.
///
public class PrismaticScraper
{
private static readonly Regex ProductUrlRegex =
new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex SkuRegex =
new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex DescRegex =
new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
private static readonly Regex PriceTierRegex =
new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled);
private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);
private readonly BrowserSession _session;
private readonly SyncConfig _config;
private readonly Random _random = new();
public PrismaticScraper(BrowserSession session, SyncConfig config)
{
_session = session;
_config = config;
}
///
/// Scrapes products needing work: those not yet scraped, plus (when
/// > 0) any whose data is older than that window. Returns (scraped, errors).
///
public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
{
var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
.Where(u => ProductUrlRegex.IsMatch(u))
.ToList();
var data = JsonStore.LoadOutput(_config.OutputJsonFile);
// Index existing results by URL (keep the most recent if the file has dupes).
var resultByUrl = data.Results
.GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
.ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);
var errorUrls = new HashSet(
data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);
var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));
var toScrape = new List();
foreach (var url in allUrls)
{
if (resultByUrl.TryGetValue(url, out var existing))
{
if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
toScrape.Add(url); // stale → refresh for price changes
}
else
{
if (retryErrors || !errorUrls.Contains(url))
toScrape.Add(url); // never scraped (skip known errors unless retrying)
}
}
if (maxProducts > 0)
toScrape = toScrape.Take(maxProducts).ToList();
var total = toScrape.Count;
Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");
if (total == 0)
{
Log.Info("Nothing to scrape. Done.");
return (0, 0);
}
var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
var etaMinutes = total * (avgDelaySec + _config.PageSettleSeconds + 2) / 60.0;
Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
$"(grab a coffee if that's a while — it saves after every product and is resumable).");
var stopwatch = Stopwatch.StartNew();
int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;
foreach (var url in toScrape)
{
index++;
for (var attempt = 1; ; attempt++)
{
try
{
var row = await ParseProductAsync(url, index, total);
if (resultByUrl.TryGetValue(url, out var existing))
data.Results[data.Results.IndexOf(existing)] = row;
else
data.Results.Add(row);
resultByUrl[url] = row;
data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
scraped++;
consecutiveBlocks = 0;
JsonStore.SaveOutput(_config.OutputJsonFile, data);
var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
$"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
break;
}
catch (Exception ex) when (IsBlocked(ex) && attempt <= _config.BlockedMaxRetries)
{
// Site pushed back — back off (escalating) and retry the SAME product rather
// than barreling on, which is how an unattended run gets hard-banned.
consecutiveBlocks++;
var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
await Task.Delay(cooldown * 1000);
}
catch (Exception ex)
{
data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
JsonStore.SaveOutput(_config.OutputJsonFile, data);
errors++;
Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
break;
}
}
// Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
{
Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
await Task.Delay(_config.LongRestSeconds * 1000);
}
if (index < total)
{
var delayMs = RandomDelayMs();
Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
await Task.Delay(delayMs);
}
}
Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
$"Took {FormatDuration(stopwatch.Elapsed)}.");
return (scraped, errors);
}
private async Task ParseProductAsync(string url, int index, int total)
{
Log.Info($"[{index}/{total}] Scraping {url}");
var response = await _session.Page.GotoAsync(url, new PageGotoOptions
{
WaitUntil = WaitUntilState.DOMContentLoaded,
Timeout = 60000
});
await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
var status = response?.Status ?? 0;
var title = Clean(await SafeTextAsync(() => _session.Page.TitleAsync()));
var plainText = Clean(await SafeTextAsync(() => _session.Page.Locator("body").InnerTextAsync()));
if (status == 403 || Regex.IsMatch(title, @"^403 Forbidden$", RegexOptions.IgnoreCase))
throw new Exception("403 Forbidden returned by site.");
if (status == 404 || Regex.IsMatch(title, @"404|Page Not Found", RegexOptions.IgnoreCase))
throw new Exception("404 Not Found returned by site.");
var colorName = Clean(await SafeTextAsync(() => _session.Page.Locator("h1").First.InnerTextAsync()));
var skuMatch = SkuRegex.Match(plainText);
var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";
if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
throw new Exception("Could not find SKU or title on product page.");
var descMatch = DescRegex.Match(plainText);
var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";
return new ProductRecord
{
Sku = sku,
ColorName = colorName,
Description = description,
PriceTiers = ParsePriceTiers(plainText),
SafetyDataSheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" }),
TechnicalDataSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" }),
ApplicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" }),
SampleImageUrl = await GetSampleImageUrlAsync(),
ProductUrl = url,
ScrapedAt = DateTime.UtcNow
};
}
private static List ParsePriceTiers(string text)
{
var tiers = new List();
foreach (Match m in PriceTierRegex.Matches(text))
{
if (!decimal.TryParse(m.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var price))
continue;
var rangeText = Clean(m.Groups[1].Value);
int? min = null, max = null;
var range = RangeRegex.Match(rangeText);
if (range.Success)
{
min = int.Parse(range.Groups[1].Value);
max = int.Parse(range.Groups[2].Value);
}
var plus = PlusRegex.Match(rangeText);
if (plus.Success)
{
min = int.Parse(plus.Groups[1].Value);
max = null;
}
tiers.Add(new PriceTier { Min = min, Max = max, Price = price });
}
return tiers;
}
/// Returns the href of the first link whose text matches any pattern. Uses a single eval
/// returning "texthref" pairs to avoid object deserialization quirks.
private async Task GetLinkByTextAsync(string[] patterns)
{
var combined = await _session.Page.EvalOnSelectorAllAsync(
"a",
"els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
"+ String.fromCharCode(1) + (a.href || ''))");
foreach (var entry in combined)
{
var parts = entry.Split('');
var text = parts.Length > 0 ? parts[0] : "";
var href = parts.Length > 1 ? parts[1] : "";
// Require the link to point at an actual document, not a generic /documents nav page.
if (href.Length > 0
&& IsDocumentUrl(href)
&& patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
return href;
}
return "";
}
/// True when an href looks like a real document (hosted on the NIC CDN or a direct PDF).
private static bool IsDocumentUrl(string href)
{
var path = href.Split('?')[0];
return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
|| path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
}
private async Task GetSampleImageUrlAsync()
{
var srcs = await _session.Page.EvalOnSelectorAllAsync(
"img",
"els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
".filter(Boolean)");
// Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
// fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase)
&& !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase))
?? "";
}
private static bool IsBlocked(Exception ex) =>
ex.Message.Contains("403", StringComparison.OrdinalIgnoreCase);
private static async Task SafeTextAsync(Func> fn)
{
try { return await fn(); } catch { return ""; }
}
private static string Clean(string? text) => WhitespaceRegex.Replace(text ?? "", " ").Trim();
private int RandomDelayMs()
{
var min = Math.Max(0, _config.MinDelaySeconds * 1000);
var max = Math.Max(min, _config.MaxDelaySeconds * 1000);
return _random.Next(min, max + 1);
}
private static string FormatDuration(TimeSpan t) =>
t.TotalHours >= 1 ? $"{(int)t.TotalHours}h {t.Minutes}m" :
t.TotalMinutes >= 1 ? $"{(int)t.TotalMinutes}m {t.Seconds}s" :
$"{t.Seconds}s";
}