using System.Text.RegularExpressions;
using Microsoft.Playwright;
using PrismaticSync.Infrastructure;

namespace PrismaticSync.Services;

/// <summary>
/// Discovers product URLs from the Prismatic color listing (infinite-scroll). Two modes:
/// incremental (newest-first via <c>?category=created_at</c>, stop once we reach already-known
/// URLs) for cheap frequent runs, and full (every color filter to the bottom) for occasional
/// reconciliation. Both append to the URL list file.
/// </summary>
public class PrismaticDiscoverer
{
    /// <summary>Navigation timeout passed to the page's <c>GotoAsync</c>, in milliseconds.</summary>
    private const int NavigationTimeoutMs = 60000;

    /// <summary>Vertical mouse-wheel delta applied per scroll step.</summary>
    private const int ScrollDeltaY = 2500;

    /// <summary>Pause after a color filter has been crawled successfully, in milliseconds.</summary>
    private const int FilterPauseMs = 3000;

    /// <summary>Matches hrefs that contain a product path under the powder-coating-colors shop section.</summary>
    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;

    /// <summary>
    /// Creates a discoverer bound to a browser session and a sync configuration.
    /// </summary>
    /// <param name="session">Browser session whose <c>Page</c> is used for navigation, scrolling and link collection.</param>
    /// <param name="config">Configuration supplying the listing URL, scroll limits, wait times and the URL list file.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="session"/> or <paramref name="config"/> is <c>null</c>.
    /// </exception>
    public PrismaticDiscoverer(BrowserSession session, SyncConfig config)
    {
        ArgumentNullException.ThrowIfNull(session);
        ArgumentNullException.ThrowIfNull(config);

        _session = session;
        _config = config;
    }

    /// <summary>
    /// Incremental discovery: crawl the newest-first listing and stop once a run of consecutive
    /// scrolls surfaces only already-known URLs — meaning we've scrolled past the new products.
    /// The crawl also stops after <c>MaxScrolls</c> iterations. The URL list file is rewritten
    /// after every scroll, so progress survives an interrupted run.
    /// </summary>
    /// <returns>The count of newly found URLs.</returns>
    public async Task<int> DiscoverNewAsync()
    {
        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Incremental discovery (newest first). Known URLs: {startCount}");

        await GotoAsync($"{_config.ColorsUrl}?category=created_at");

        var knownStreak = 0;
        for (var i = 0; i < _config.MaxScrolls; i++)
        {
            var addedNew = await HarvestVisibleLinksAsync(known);

            // Any new URL resets the streak; only consecutive all-known scrolls count.
            knownStreak = addedNew == 0 ? knownStreak + 1 : 0;
            Log.Info($"Scroll {i + 1}: +{addedNew} new, total {known.Count}, known-streak {knownStreak}");

            if (knownStreak >= _config.StopAfterKnownScrolls)
            {
                Log.Info("Reached known territory — stopping incremental discovery.");
                break;
            }

            await ScrollAsync();
        }

        var newCount = known.Count - startCount;
        Log.Info($"Incremental discovery done. New URLs: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>
    /// Full discovery: crawl every color filter to the bottom. Heavier — use occasionally to
    /// reconcile the whole set (e.g. to notice colors that have been removed). Note that this
    /// method itself only ever adds URLs to the stored set; it never removes any.
    /// A failure while crawling one filter is logged as a warning and the crawl moves on to the
    /// next filter.
    /// </summary>
    /// <returns>The count of newly found URLs.</returns>
    public async Task<int> DiscoverFullAsync()
    {
        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Full discovery across {_config.ColorParams.Length} color filters. Known URLs: {startCount}");

        foreach (var color in _config.ColorParams)
        {
            Log.Info($"Color filter: {color}");
            try
            {
                await GotoAsync($"{_config.ColorsUrl}?color={Uri.EscapeDataString(color)}");

                var noNew = 0;
                for (var i = 0; i < _config.MaxScrolls; i++)
                {
                    var added = await HarvestVisibleLinksAsync(known);

                    // Stop this filter once enough consecutive scrolls produced nothing new.
                    noNew = added == 0 ? noNew + 1 : 0;
                    if (noNew >= _config.StopAfterNoNewScrolls)
                    {
                        break;
                    }

                    await ScrollAsync();
                }

                Log.Info($"Color {color} done. Total {known.Count}");
                await _session.Page.WaitForTimeoutAsync(FilterPauseMs);
            }
            catch (Exception ex)
            {
                // Deliberate best-effort: one failing filter must not abort the remaining ones.
                Log.Warn($"Color {color} failed: {ex.Message}");
            }
        }

        var newCount = known.Count - startCount;
        Log.Info($"Full discovery done. New this run: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>
    /// Loads the stored URL list into a case-insensitive set.
    /// </summary>
    private HashSet<string> LoadKnownUrls() =>
        new(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Adds the product links currently present on the page to <paramref name="known"/> and
    /// writes the set back to the URL list file (whether or not anything was added).
    /// </summary>
    /// <param name="known">Set of URLs discovered so far; mutated in place.</param>
    /// <returns>How many links were not already in <paramref name="known"/>.</returns>
    private async Task<int> HarvestVisibleLinksAsync(HashSet<string> known)
    {
        var added = 0;
        foreach (var link in await CollectProductLinksAsync())
        {
            if (known.Add(link))
            {
                added++;
            }
        }

        JsonStore.SaveUrls(_config.ProductUrlsFile, known);
        return added;
    }

    /// <summary>
    /// Navigates to <paramref name="url"/>, waiting for DOMContentLoaded, then waits a further
    /// <c>PageSettleSeconds</c> (converted to milliseconds) before returning.
    /// </summary>
    /// <param name="url">Absolute URL of the listing page to open.</param>
    private async Task GotoAsync(string url)
    {
        await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = NavigationTimeoutMs,
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
    }

    /// <summary>
    /// Scrolls the page down by one wheel step and then waits <c>ScrollWaitMs</c> milliseconds.
    /// </summary>
    private async Task ScrollAsync()
    {
        await _session.Page.Mouse.WheelAsync(0, ScrollDeltaY);
        await _session.Page.WaitForTimeoutAsync(_config.ScrollWaitMs);
    }

    /// <summary>
    /// Reads the <c>href</c> of every anchor on the page, keeps those matching the product URL
    /// pattern, passes each through <c>JsonStore.CleanUrl</c> and drops results that come back empty.
    /// </summary>
    /// <returns>Cleaned product URLs; the list is not de-duplicated.</returns>
    private async Task<List<string>> CollectProductLinksAsync()
    {
        var hrefs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a",
            "els => els.map(a => a.href).filter(Boolean)");

        return hrefs
            .Where(h => ProductUrlRegex.IsMatch(h))
            .Select(JsonStore.CleanUrl)
            .Where(u => u.Length > 0)
            .ToList();
    }
}