// PowderCoatingLogix/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs

using System.Diagnostics;
using System.Globalization;
using System.Text.RegularExpressions;
using Microsoft.Playwright;
using PrismaticSync.Infrastructure;
using PrismaticSync.Models;

namespace PrismaticSync.Services;

/// <summary>
/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s.
/// <para>
/// A run is resumable: URLs that already have a result are skipped (unless they are older than the
/// refresh window), URLs with a recorded error are skipped unless a retry is requested, and the
/// output file is saved after every product so the process can be stopped at any point.
/// </para>
/// <para>
/// Progress is logged continuously — including the delay between products — so a manual run always
/// shows it is alive.
/// </para>
/// </summary>
public class PrismaticScraper
{
    /// <summary>
    /// Control character (U+0001) the in-page script places between a link's text and its href.
    /// It must stay in sync with <c>String.fromCharCode(1)</c> in <see cref="GetDocumentLinksAsync"/>.
    /// Written as an escape so editors and copy/paste cannot silently drop it.
    /// </summary>
    private const char LinkFieldSeparator = '\u0001';

    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex SkuRegex =
        new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex DescRegex =
        new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
            RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex PriceTierRegex =
        new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
    private static readonly Regex PlusRegex = new(@"(\d+)\s*\+", RegexOptions.Compiled);
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);
    private static readonly Regex ForbiddenTitleRegex =
        new(@"^403 Forbidden$", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Word boundaries stop a title that merely contains the digits (a SKU or colour code such as
    // "PSS-4041") from being mistaken for a missing page.
    private static readonly Regex NotFoundTitleRegex =
        new(@"\b404\b|Page Not Found", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex ProductImageRegex =
        new(@"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex ThumbnailRegex =
        new("thumbnail", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;
    private readonly Random _random = new();

    /// <summary>Creates a scraper that drives <paramref name="session"/>'s page using the given settings.</summary>
    /// <param name="session">Browser session whose page is navigated to each product URL.</param>
    /// <param name="config">File locations plus delay, cooldown and retry settings.</param>
    public PrismaticScraper(BrowserSession session, SyncConfig config)
    {
        _session = session;
        _config = config;
    }

    /// <summary>
    /// Scrapes products needing work: those not yet scraped, plus (when <paramref name="refreshOlderThanDays"/>
    /// &gt; 0) any whose data is older than that window. The output file is saved after every product.
    /// </summary>
    /// <param name="refreshOlderThanDays">Re-scrape results older than this many days; 0 or less disables refreshing.</param>
    /// <param name="maxProducts">Upper bound on products handled this run; 0 or less means no limit.</param>
    /// <param name="retryErrors">When true, URLs that previously failed are attempted again.</param>
    /// <returns>The number of products saved and the number that failed during this run.</returns>
    public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
    {
        // NOTE(review): the lookups below use these URLs as-is against keys produced by
        // JsonStore.CleanUrl — presumably LoadUrls already returns cleaned URLs; confirm.
        var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
            .Where(u => ProductUrlRegex.IsMatch(u))
            .ToList();

        var data = JsonStore.LoadOutput(_config.OutputJsonFile);

        // Index existing results by URL (keep the most recent if the file has dupes).
        var resultByUrl = data.Results
            .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
            .ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);

        var errorUrls = new HashSet<string>(
            data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);

        var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));

        var toScrape = new List<string>();
        var queued = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        foreach (var url in allUrls)
        {
            bool needsWork;
            if (resultByUrl.TryGetValue(url, out var existing))
                needsWork = refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff; // stale → refresh for price changes
            else
                needsWork = retryErrors || !errorUrls.Contains(url); // never scraped (skip known errors unless retrying)

            // The HashSet guards against the same URL being listed twice, which would otherwise
            // cost a second request for the same product within one run.
            if (needsWork && queued.Add(url))
                toScrape.Add(url);
        }

        if (maxProducts > 0)
            toScrape = toScrape.Take(maxProducts).ToList();

        var total = toScrape.Count;
        Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
        Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");

        if (total == 0)
        {
            Log.Info("Nothing to scrape. Done.");
            return (0, 0);
        }

        // Rough estimate: per-product delay + settle time + ~2s of page work, plus the periodic long
        // rests (one after every LongRestEveryProducts products, except after the last product).
        var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
        var longRestCount = _config.LongRestEveryProducts > 0 ? (total - 1) / _config.LongRestEveryProducts : 0;
        var longRestSec = (double)longRestCount * Math.Max(0, _config.LongRestSeconds);
        var etaMinutes = (total * (avgDelaySec + _config.PageSettleSeconds + 2) + longRestSec) / 60.0;
        Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
                 $"(grab a coffee if that's a while — it saves after every product and is resumable).");

        var stopwatch = Stopwatch.StartNew();
        int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;

        foreach (var url in toScrape)
        {
            index++;

            // Retry loop: only a "blocked" response is retried; success or any other failure exits it.
            for (var attempt = 1; ; attempt++)
            {
                try
                {
                    var row = await ParseProductAsync(url, index, total);

                    // Replace the previous record in place so refreshed products keep their position
                    // in the file; append when there is no previous record to replace.
                    var existingIndex = resultByUrl.TryGetValue(url, out var existing)
                        ? data.Results.IndexOf(existing)
                        : -1;
                    if (existingIndex >= 0)
                        data.Results[existingIndex] = row;
                    else
                        data.Results.Add(row);

                    resultByUrl[url] = row;
                    data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));

                    scraped++;
                    consecutiveBlocks = 0;
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);

                    var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
                    Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
                             $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00})  |  elapsed {FormatDuration(stopwatch.Elapsed)}");
                    break;
                }
                catch (SiteBlockedException) when (attempt <= _config.BlockedMaxRetries)
                {
                    // Site pushed back — back off (escalating) and retry the SAME product rather
                    // than barreling on, which is how an unattended run gets hard-banned.
                    consecutiveBlocks++;
                    var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
                    Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
                    await DelaySecondsAsync(cooldown);
                }
                catch (Exception ex)
                {
                    // Keep one current error entry per URL; without this, every failed retry of the
                    // same product would append another entry to the file.
                    data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
                    data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
                    JsonStore.SaveOutput(_config.OutputJsonFile, data);
                    errors++;
                    Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
                    break;
                }
            }

            // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
            if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
            {
                Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
                await DelaySecondsAsync(_config.LongRestSeconds);
            }

            if (index < total)
            {
                var delayMs = RandomDelayMs();
                Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
                await Task.Delay(delayMs);
            }
        }

        Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
                 $"Took {FormatDuration(stopwatch.Elapsed)}.");
        return (scraped, errors);
    }

    /// <summary>
    /// Navigates to <paramref name="url"/>, waits for the configured settle time and extracts one
    /// product record from the page text, links and images.
    /// </summary>
    /// <param name="url">Product page to load.</param>
    /// <param name="index">1-based position of this product in the run (log prefix only).</param>
    /// <param name="total">Number of products in the run (log prefix only).</param>
    /// <exception cref="SiteBlockedException">The response status or page title indicates 403 Forbidden.</exception>
    /// <exception cref="InvalidOperationException">The page is a 404, or has neither a SKU nor a heading.</exception>
    private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
    {
        Log.Info($"[{index}/{total}] Scraping {url}");

        var response = await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = 60000
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);

        var status = response?.Status ?? 0;
        var title = Clean(await SafeTextAsync(() => _session.Page.TitleAsync()));
        var plainText = Clean(await SafeTextAsync(() => _session.Page.Locator("body").InnerTextAsync()));

        // A dedicated exception type marks "blocked" so the retry logic never has to guess from
        // message text (a timeout message that echoes a URL like ".../PSS-4030/" contains "403").
        if (status == 403 || ForbiddenTitleRegex.IsMatch(title))
            throw new SiteBlockedException("403 Forbidden returned by site.");
        if (status == 404 || NotFoundTitleRegex.IsMatch(title))
            throw new InvalidOperationException("404 Not Found returned by site.");

        var colorName = Clean(await SafeTextAsync(() => _session.Page.Locator("h1").First.InnerTextAsync()));

        var skuMatch = SkuRegex.Match(plainText);
        var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";
        if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
            throw new InvalidOperationException("Could not find SKU or title on product page.");

        var descMatch = DescRegex.Match(plainText);
        var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";

        // Collect the page's document links once; the three lookups below then run in memory
        // instead of each evaluating every anchor in the browser.
        var documentLinks = await GetDocumentLinksAsync();

        return new ProductRecord
        {
            Sku = sku,
            ColorName = colorName,
            Description = description,
            PriceTiers = ParsePriceTiers(plainText),
            SafetyDataSheetUrl = FindLinkByText(documentLinks, "Safety Data Sheet", @"\bSDS\b"),
            TechnicalDataSheetUrl = FindLinkByText(documentLinks, "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b"),
            ApplicationGuideUrl = FindLinkByText(documentLinks, "Application Guide"),
            SampleImageUrl = await GetSampleImageUrlAsync(),
            ProductUrl = url,
            ScrapedAt = DateTime.UtcNow
        };
    }

    /// <summary>
    /// Extracts every "N - M lbs $price" / "N + lbs $price" tier found in <paramref name="text"/>.
    /// A tier whose price or weight bounds cannot be parsed is skipped rather than failing the product.
    /// </summary>
    /// <param name="text">Whitespace-normalized page text.</param>
    /// <returns>Tiers in the order they appear; an open-ended "N +" tier has a null <c>Max</c>.</returns>
    private static List<PriceTier> ParsePriceTiers(string text)
    {
        var tiers = new List<PriceTier>();
        foreach (Match m in PriceTierRegex.Matches(text))
        {
            if (!decimal.TryParse(m.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var price))
                continue;

            var rangeText = Clean(m.Groups[1].Value);
            int? min = null, max = null;

            var plus = PlusRegex.Match(rangeText);
            var range = RangeRegex.Match(rangeText);
            if (plus.Success)
            {
                // "N + lbs": open-ended tier, no upper bound.
                if (!TryParseWeight(plus.Groups[1].Value, out var from))
                    continue;
                min = from;
            }
            else if (range.Success)
            {
                if (!TryParseWeight(range.Groups[1].Value, out var low) || !TryParseWeight(range.Groups[2].Value, out var high))
                    continue;
                min = low;
                max = high;
            }

            tiers.Add(new PriceTier { Min = min, Max = max, Price = price });
        }
        return tiers;
    }

    /// <summary>Parses a run of digits as a weight bound; false when it is not a valid <see cref="int"/> (e.g. too long).</summary>
    private static bool TryParseWeight(string digits, out int value) =>
        int.TryParse(digits, NumberStyles.None, CultureInfo.InvariantCulture, out value);

    /// <summary>
    /// Reads every anchor on the current page in a single evaluation and returns the ones that point
    /// at an actual document, in page order. Each anchor comes back from the page as one string of
    /// the form text + U+0001 + href, which avoids object deserialization quirks.
    /// </summary>
    private async Task<List<(string Text, string Href)>> GetDocumentLinksAsync()
    {
        var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a",
            "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
            "+ String.fromCharCode(1) + (a.href || ''))");

        var links = new List<(string Text, string Href)>();
        foreach (var entry in combined ?? Array.Empty<string>())
        {
            var parts = entry.Split(LinkFieldSeparator, 2);
            var text = parts.Length > 0 ? parts[0] : "";
            var href = parts.Length > 1 ? parts[1] : "";

            // Require the link to point at an actual document, not a generic /documents nav page.
            if (href.Length > 0 && IsDocumentUrl(href))
                links.Add((text, href));
        }
        return links;
    }

    /// <summary>
    /// Returns the href of the first link whose text matches any of <paramref name="patterns"/>
    /// (case-insensitive regular expressions), or an empty string when none match.
    /// </summary>
    private static string FindLinkByText(IEnumerable<(string Text, string Href)> links, params string[] patterns)
    {
        foreach (var (text, href) in links)
        {
            if (patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
                return href;
        }
        return "";
    }

    /// <summary>True when an href looks like a real document (hosted on the NIC CDN or a direct PDF).</summary>
    private static bool IsDocumentUrl(string href)
    {
        // Ignore the query string so "file.pdf?v=2" still counts as a PDF.
        var path = href.Split('?')[0];
        return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
               || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Returns the URL of the product's sample image, or an empty string when the page has none.
    /// </summary>
    private async Task<string> GetSampleImageUrlAsync()
    {
        var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "img",
            "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
            ".filter(Boolean)") ?? Array.Empty<string>();

        // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
        // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
        return srcs.FirstOrDefault(s => ProductImageRegex.IsMatch(s) && !ThumbnailRegex.IsMatch(s))
               ?? srcs.FirstOrDefault(s => ProductImageRegex.IsMatch(s))
               ?? "";
    }

    /// <summary>Runs a text lookup and returns an empty string if it throws (best-effort extraction).</summary>
    private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
    {
        try { return await fn(); } catch { return ""; }
    }

    /// <summary>Collapses every run of whitespace to a single space and trims; null becomes an empty string.</summary>
    private static string Clean(string? text) => WhitespaceRegex.Replace(text ?? "", " ").Trim();

    /// <summary>
    /// Waits for the given number of seconds. Negative values are treated as zero, matching how
    /// <see cref="RandomDelayMs"/> clamps, so a bad config value cannot abort the run mid-loop.
    /// </summary>
    private static Task DelaySecondsAsync(int seconds) =>
        Task.Delay(TimeSpan.FromSeconds((double)Math.Max(0, seconds)));

    /// <summary>Picks a random delay in milliseconds between the configured minimum and maximum (inclusive).</summary>
    private int RandomDelayMs()
    {
        var min = Math.Max(0, _config.MinDelaySeconds * 1000);
        var max = Math.Max(min, _config.MaxDelaySeconds * 1000);
        return _random.Next(min, max + 1);
    }

    /// <summary>Formats a duration as "Xh Ym", "Xm Ys" or "Xs" depending on its size.</summary>
    private static string FormatDuration(TimeSpan t) =>
        t.TotalHours >= 1 ? $"{(int)t.TotalHours}h {t.Minutes}m" :
        t.TotalMinutes >= 1 ? $"{(int)t.TotalMinutes}m {t.Seconds}s" :
        $"{t.Seconds}s";

    /// <summary>Raised when the site answers a product request with 403 Forbidden.</summary>
    private sealed class SiteBlockedException : Exception
    {
        public SiteBlockedException(string message) : base(message) { }
    }
}