using System.Text.RegularExpressions;
using Microsoft.Playwright;
using PrismaticSync.Infrastructure;
namespace PrismaticSync.Services;
/// <summary>
/// Discovers product URLs from the Prismatic color listing (infinite-scroll). Two modes:
/// incremental (newest-first via ?category=created_at, stop once we reach already-known
/// URLs) for cheap frequent runs, and full (every color filter to the bottom) for occasional
/// reconciliation. Both append to the URL list file.
/// </summary>
public class PrismaticDiscoverer
{
    /// <summary>Navigation timeout handed to Playwright, in milliseconds.</summary>
    private const int NavigationTimeoutMs = 60000;

    /// <summary>Pause after a color filter has been crawled, in milliseconds.</summary>
    private const int PauseAfterColorMs = 3000;

    /// <summary>Vertical mouse-wheel delta used for one scroll step.</summary>
    private const int ScrollStepPixels = 2500;

    /// <summary>
    /// Matches hrefs that contain a product path of the form
    /// <c>/shop/powder-coating-colors/{slug}/</c> (case-insensitive).
    /// </summary>
    private static readonly Regex ProductUrlRegex =
        new(@"/shop/powder-coating-colors/[A-Z0-9-]+/", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly BrowserSession _session;
    private readonly SyncConfig _config;

    /// <summary>
    /// Creates a discoverer that drives the page of <paramref name="session"/> using the
    /// URLs, file path and limits found in <paramref name="config"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="session"/> or <paramref name="config"/> is null.
    /// </exception>
    public PrismaticDiscoverer(BrowserSession session, SyncConfig config)
    {
        _session = session ?? throw new ArgumentNullException(nameof(session));
        _config = config ?? throw new ArgumentNullException(nameof(config));
    }

    /// <summary>
    /// Incremental discovery: crawl the newest-first listing and stop once a run of consecutive
    /// scrolls surfaces only already-known URLs — meaning we've scrolled past the new products.
    /// </summary>
    /// <returns>The number of URLs added to the known set during this run.</returns>
    public async Task<int> DiscoverNewAsync()
    {
        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Incremental discovery (newest first). Known URLs: {startCount}");

        await GotoAsync($"{_config.ColorsUrl}?category=created_at");

        var reachedKnown = await CrawlUntilStaleAsync(
            known, _config.StopAfterKnownScrolls, logEachPass: true);
        if (reachedKnown)
        {
            Log.Info("Reached known territory — stopping incremental discovery.");
        }

        var newCount = known.Count - startCount;
        Log.Info($"Incremental discovery done. New URLs: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>
    /// Full discovery: crawl every color filter to the bottom. Heavier — use occasionally to
    /// reconcile the whole set (e.g. to notice colors that have been removed).
    /// </summary>
    /// <remarks>
    /// A failure while crawling one color filter is logged as a warning and the remaining
    /// filters are still processed.
    /// </remarks>
    /// <returns>The number of URLs added to the known set during this run.</returns>
    public async Task<int> DiscoverFullAsync()
    {
        var known = LoadKnownUrls();
        var startCount = known.Count;
        Log.Info($"Full discovery across {_config.ColorParams.Length} color filters. Known URLs: {startCount}");

        foreach (var color in _config.ColorParams)
        {
            Log.Info($"Color filter: {color}");
            try
            {
                await GotoAsync($"{_config.ColorsUrl}?color={Uri.EscapeDataString(color)}");
                await CrawlUntilStaleAsync(known, _config.StopAfterNoNewScrolls, logEachPass: false);
                Log.Info($"Color {color} done. Total {known.Count}");
                await _session.Page.WaitForTimeoutAsync(PauseAfterColorMs);
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: one broken filter must not abort the other colors.
                Log.Warn($"Color {color} failed: {ex.Message}");
            }
        }

        var newCount = known.Count - startCount;
        Log.Info($"Full discovery done. New this run: {newCount}; total {known.Count}");
        return newCount;
    }

    /// <summary>
    /// Loads the stored URL list into a case-insensitive set, so a URL differing only in
    /// letter case from a stored one is not counted as new.
    /// </summary>
    private HashSet<string> LoadKnownUrls() =>
        new HashSet<string>(JsonStore.LoadUrls(_config.ProductUrlsFile), StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Shared crawl loop for the page that is currently open: collect product links, add the
    /// unseen ones to <paramref name="known"/>, save, then scroll — for at most
    /// <c>MaxScrolls</c> passes.
    /// </summary>
    /// <param name="known">Set of known URLs; newly discovered URLs are added to it.</param>
    /// <param name="staleLimit">
    /// Number of consecutive passes without any new URL after which the crawl stops.
    /// </param>
    /// <param name="logEachPass">Whether to write a progress line after every pass.</param>
    /// <returns>
    /// <c>true</c> if the crawl stopped because <paramref name="staleLimit"/> was reached;
    /// <c>false</c> if it ran out of passes first.
    /// </returns>
    private async Task<bool> CrawlUntilStaleAsync(HashSet<string> known, int staleLimit, bool logEachPass)
    {
        var maxPasses = _config.MaxScrolls;
        var staleStreak = 0;

        for (var pass = 1; pass <= maxPasses; pass++)
        {
            var added = 0;
            foreach (var link in await CollectProductLinksAsync())
            {
                if (known.Add(link))
                {
                    added++;
                }
            }

            // Saved after every pass, so what has been collected so far has already been
            // handed to JsonStore if a later scroll or navigation throws.
            JsonStore.SaveUrls(_config.ProductUrlsFile, known);

            staleStreak = added == 0 ? staleStreak + 1 : 0;
            if (logEachPass)
            {
                Log.Info($"Scroll {pass}: +{added} new, total {known.Count}, known-streak {staleStreak}");
            }

            if (staleStreak >= staleLimit)
            {
                return true;
            }

            // Nothing is collected after the last pass, so scrolling (and waiting) once more
            // would only cost time.
            if (pass < maxPasses)
            {
                await ScrollAsync();
            }
        }

        return false;
    }

    /// <summary>
    /// Navigates to <paramref name="url"/>, waiting for DOMContentLoaded, then waits the
    /// configured settle time before returning.
    /// </summary>
    private async Task GotoAsync(string url)
    {
        await _session.Page.GotoAsync(url, new PageGotoOptions
        {
            WaitUntil = WaitUntilState.DOMContentLoaded,
            Timeout = NavigationTimeoutMs
        });
        await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);
    }

    /// <summary>
    /// Scrolls down by one wheel step and waits the configured scroll delay.
    /// </summary>
    private async Task ScrollAsync()
    {
        await _session.Page.Mouse.WheelAsync(0, ScrollStepPixels);
        await _session.Page.WaitForTimeoutAsync(_config.ScrollWaitMs);
    }

    /// <summary>
    /// Reads the <c>href</c> of every anchor on the current page, keeps those matching
    /// <see cref="ProductUrlRegex"/>, passes them through <c>JsonStore.CleanUrl</c> and drops
    /// results that come back empty.
    /// </summary>
    private async Task<List<string>> CollectProductLinksAsync()
    {
        var hrefs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
            "a", "els => els.map(a => a.href).filter(Boolean)");

        return hrefs
            .Where(h => ProductUrlRegex.IsMatch(h))
            .Select(JsonStore.CleanUrl)
            .Where(u => u.Length > 0)
            .ToList();
    }
}