using System.Diagnostics;
using System.Globalization;
using System.Text.RegularExpressions;
using Microsoft.Playwright;
using PrismaticSync.Infrastructure;
using PrismaticSync.Models;

namespace PrismaticSync.Services;

/// <summary>
/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s. Resumable (skips
/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
/// records get re-scraped to catch price changes. Saves after every product so a long run can be
/// stopped and resumed safely, and logs continuously — including the delay between products — so a
/// manual run always shows it's alive.
/// </summary>
public class PrismaticScraper
{
    /// <summary>
    /// Separator between link text and href in the strings built by the page-side script in
    /// <see cref="GetLinkByTextAsync"/>; must stay in sync with <c>String.fromCharCode(1)</c> there.
    /// </summary>
    private const char LinkFieldSeparator = '\u0001';

    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex SkuRegex =
        new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex DescRegex =
        new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
            RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);

    private static readonly Regex PriceTierRegex =
        new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
    private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled);
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;
    private readonly Random _random = new();

    /// <summary>Creates a scraper that drives <paramref name="session"/>'s page using <paramref name="config"/>.</summary>
    public PrismaticScraper(BrowserSession session, SyncConfig config)
    {
        _session = session;
        _config = config;
    }

    /// <summary>
    /// Scrapes products needing work: those not yet scraped, plus (when
    /// <paramref name="refreshOlderThanDays"/> &gt; 0) any whose data is older than that window.
    /// </summary>
    /// <param name="refreshOlderThanDays">Re-scrape existing records older than this many days; 0 or less disables refreshing.</param>
    /// <param name="maxProducts">When greater than 0, caps how many products are scraped in this run.</param>
    /// <param name="retryErrors">When true, URLs with an error on file (and no result) are attempted again.</param>
    /// <returns>The number of products scraped and the number of products that failed in this run.</returns>
    public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
    {
        var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
            .Where(u => ProductUrlRegex.IsMatch(u))
            .ToList();

        var data = JsonStore.LoadOutput(_config.OutputJsonFile);

        // Index existing results by URL (keep the most recent if the file has dupes).
        var resultByUrl = data.Results
            .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
            .ToDictionary(
                g => g.Key,
                g => g.OrderByDescending(r => r.ScrapedAt).First(),
                StringComparer.OrdinalIgnoreCase);

        var errorUrls = new HashSet<string>(
            data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)),
            StringComparer.OrdinalIgnoreCase);

        var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));

        var toScrape = new List<string>();
        foreach (var url in allUrls)
        {
            if (resultByUrl.TryGetValue(url, out var existing))
            {
                // Stale → refresh for price changes.
                if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
                {
                    toScrape.Add(url);
                }
            }
            else if (retryErrors || !errorUrls.Contains(url))
            {
                // Never scraped (skip known errors unless retrying).
                toScrape.Add(url);
            }
        }

        if (maxProducts > 0)
        {
            toScrape = toScrape.Take(maxProducts).ToList();
        }

        var total = toScrape.Count;
        Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
        Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");

        if (total == 0)
        {
            Log.Info("Nothing to scrape. Done.");
            return (0, 0);
        }

        // Rough estimate: average inter-product delay + page settle time + 2s allowance per product.
        var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
        var etaMinutes = total * (avgDelaySec + _config.PageSettleSeconds + 2) / 60.0;
        Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
                 $"(grab a coffee if that's a while — it saves after every product and is resumable).");

        var stopwatch = Stopwatch.StartNew();
        int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;

        foreach (var url in toScrape)
        {
            index++;

            // Retry loop for this one product; exits via break on success or on a recorded error.
            for (var attempt = 1; ; attempt++)
            {
                try
                {
                    var row = await ParseProductAsync(url, index, total);

                    // Replace the indexed record in place when refreshing, otherwise append.
                    if (resultByUrl.TryGetValue(url, out var existing))
                    {
                        data.Results[data.Results.IndexOf(existing)] = row;
                    }
                    else
                    {
                        data.Results.Add(row);
                    }

                    resultByUrl[url] = row;

                    // A success clears any earlier error entries for this URL.
                    data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));

                    scraped++;
                    consecutiveBlocks = 0;
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);

                    var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
                    Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
                             $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
                    break;
                }
                catch (SiteBlockedException) when (attempt <= _config.BlockedMaxRetries)
                {
                    // Site pushed back — back off (escalating) and retry the SAME product rather
                    // than barreling on, which is how an unattended run gets hard-banned.
                    // Matching on the exception type (not on "403" appearing in the message) keeps
                    // unrelated failures whose text happens to contain "403" — e.g. a timeout that
                    // echoes a URL — out of the cooldown path.
                    consecutiveBlocks++;
                    var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
                    Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
                    await Task.Delay(cooldown * 1000);
                }
                catch (Exception ex)
                {
                    // Keep a single, current error entry per URL so repeated failing runs
                    // don't pile up duplicates in the output file.
                    data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
                    data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);

                    errors++;
                    Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
                    break;
                }
            }

            // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
            if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
            {
                Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
                await Task.Delay(_config.LongRestSeconds * 1000);
            }

            if (index < total)
            {
                var delayMs = RandomDelayMs();
                Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
                await Task.Delay(delayMs);
            }
        }

        Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
                 $"Took {FormatDuration(stopwatch.Elapsed)}.");
        return (scraped, errors);
    }

    /// <summary>
    /// Navigates to <paramref name="url"/>, waits for the page to settle and extracts one product record.
    /// </summary>
    /// <param name="url">Product page URL to load.</param>
    /// <param name="index">1-based position of this product in the current run (used for logging only).</param>
    /// <param name="total">Number of products in the current run (used for logging only).</param>
    /// <exception cref="SiteBlockedException">The response status or page title indicates 403 Forbidden.</exception>
    /// <exception cref="InvalidOperationException">The page is a 404, or neither a SKU nor an h1 title could be found.</exception>
    private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
    {
        Log.Info($"[{index}/{total}] Scraping {url}");

        var response = await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = 60000
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);

        var status = response?.Status ?? 0;
        var title = Clean(await SafeTextAsync(() => _session.Page.TitleAsync()));
        var plainText = Clean(await SafeTextAsync(() => _session.Page.Locator("body").InnerTextAsync()));

        if (status == 403 || Regex.IsMatch(title, @"^403 Forbidden$", RegexOptions.IgnoreCase))
        {
            throw new SiteBlockedException("403 Forbidden returned by site.");
        }

        // NOTE(review): the bare "404" alternative also matches any title that merely contains
        // those digits — confirm real product titles never do.
        if (status == 404 || Regex.IsMatch(title, @"404|Page Not Found", RegexOptions.IgnoreCase))
        {
            throw new InvalidOperationException("404 Not Found returned by site.");
        }

        var colorName = Clean(await SafeTextAsync(() => _session.Page.Locator("h1").First.InnerTextAsync()));

        var skuMatch = SkuRegex.Match(plainText);
        var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";

        if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
        {
            throw new InvalidOperationException("Could not find SKU or title on product page.");
        }

        var descMatch = DescRegex.Match(plainText);
        var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";

        return new ProductRecord
        {
            Sku = sku,
            ColorName = colorName,
            Description = description,
            PriceTiers = ParsePriceTiers(plainText),
            SafetyDataSheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" }),
            TechnicalDataSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" }),
            ApplicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" }),
            SampleImageUrl = await GetSampleImageUrlAsync(),
            ProductUrl = url,
            ScrapedAt = DateTime.UtcNow
        };
    }

    /// <summary>
    /// Extracts every "N - M lbs $P" / "N + lbs $P" price tier found in <paramref name="text"/>.
    /// Matches whose price cannot be parsed are skipped; a bound that cannot be parsed as an
    /// <see cref="int"/> is left null instead of failing the whole product.
    /// </summary>
    private static List<PriceTier> ParsePriceTiers(string text)
    {
        var tiers = new List<PriceTier>();

        foreach (Match m in PriceTierRegex.Matches(text))
        {
            if (!decimal.TryParse(m.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var price))
            {
                continue;
            }

            var rangeText = Clean(m.Groups[1].Value);
            int? min = null, max = null;

            var range = RangeRegex.Match(rangeText);
            if (range.Success)
            {
                min = ParseBound(range.Groups[1].Value);
                max = ParseBound(range.Groups[2].Value);
            }

            // An open-ended "N + lbs" tier has a lower bound only.
            var plus = PlusRegex.Match(rangeText);
            if (plus.Success)
            {
                min = ParseBound(plus.Groups[1].Value);
                max = null;
            }

            tiers.Add(new PriceTier { Min = min, Max = max, Price = price });
        }

        return tiers;
    }

    /// <summary>Parses a run of digits as an int, returning null when it does not fit or is not plain ASCII digits.</summary>
    private static int? ParseBound(string digits) =>
        int.TryParse(digits, NumberStyles.None, CultureInfo.InvariantCulture, out var value) ? value : null;

    /// <summary>
    /// Returns the href of the first link whose text matches any pattern, or an empty string when
    /// none does. Uses a single eval returning "text\u0001href" strings to avoid object
    /// deserialization quirks.
    /// </summary>
    /// <param name="patterns">Regular expressions tested case-insensitively against each link's text.</param>
    private async Task<string> GetLinkByTextAsync(string[] patterns)
    {
        var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a",
            "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
            "+ String.fromCharCode(1) + (a.href || ''))");

        foreach (var entry in combined)
        {
            // Split on the LAST separator so stray control characters in the link text
            // cannot shift where the href starts.
            var cut = entry.LastIndexOf(LinkFieldSeparator);
            if (cut < 0)
            {
                continue;
            }

            var text = entry[..cut];
            var href = entry[(cut + 1)..];

            // Require the link to point at an actual document, not a generic /documents nav page.
            if (href.Length > 0 && IsDocumentUrl(href) &&
                patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
            {
                return href;
            }
        }

        return "";
    }

    /// <summary>True when an href looks like a real document (hosted on the NIC CDN or a direct PDF).</summary>
    private static bool IsDocumentUrl(string href)
    {
        // Ignore the query string when checking for a .pdf extension.
        var path = href.Split('?')[0];
        return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
            || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Returns the first product image URL on the page, preferring one without "thumbnail" in it,
    /// or an empty string when the page has no product image.
    /// </summary>
    private async Task<string> GetSampleImageUrlAsync()
    {
        var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "img",
            "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
            ".filter(Boolean)");

        // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
        // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
        var productImages = srcs
            .Where(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase))
            .ToList();

        return productImages.FirstOrDefault(s => !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
            ?? productImages.FirstOrDefault()
            ?? "";
    }

    /// <summary>Awaits <paramref name="fn"/> and returns its text, or an empty string if it throws (deliberately best-effort).</summary>
    private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
    {
        try
        {
            return await fn();
        }
        catch
        {
            return "";
        }
    }

    /// <summary>Collapses every whitespace run to a single space and trims; null becomes an empty string.</summary>
    private static string Clean(string? text) => WhitespaceRegex.Replace(text ?? "", " ").Trim();

    /// <summary>Picks a random delay in milliseconds between the configured min and max seconds (inclusive).</summary>
    private int RandomDelayMs()
    {
        var min = Math.Max(0, _config.MinDelaySeconds * 1000);
        var max = Math.Max(min, _config.MaxDelaySeconds * 1000);
        return _random.Next(min, max + 1); // upper bound of Next is exclusive
    }

    /// <summary>Formats a duration as "Hh Mm", "Mm Ss" or "Ss" depending on its magnitude.</summary>
    private static string FormatDuration(TimeSpan t) =>
        t.TotalHours >= 1 ? $"{(int)t.TotalHours}h {t.Minutes}m"
        : t.TotalMinutes >= 1 ? $"{(int)t.TotalMinutes}m {t.Seconds}s"
        : $"{t.Seconds}s";

    /// <summary>
    /// Thrown when the site answers with 403 Forbidden, so the scrape loop can tell a block
    /// (cool down and retry) apart from every other failure (record and move on).
    /// </summary>
    private sealed class SiteBlockedException : Exception
    {
        public SiteBlockedException(string message) : base(message)
        {
        }
    }
}