da2bb46d5a
Validated against live product pages; fixed three edge cases (also present in the original JS scraper) surfaced by specialty AkzoNobel products: - Sample image: only accept real product images on the NIC CDN (images.nicindustries.com/prismatic/products), preferring full-size over thumbnail. Dropped the loose "prismatic|powder|color" fallback that grabbed the site logo on products with no image. - SDS/TDS/app-guide links: require the href to be an actual document (NIC CDN or a .pdf) so a generic /documents nav link isn't captured as the SDS. - Description: also stop at PRODUCT SUPPORT / PRODUCT COLLECTIONS / CUSTOMER SERVICE so less page footer is captured (app-side StripBoilerplate cleans the rest). Structural fields (sku, color, price tiers) verified correct on live data. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
309 lines
14 KiB
C#
309 lines
14 KiB
C#
using System.Diagnostics;
|
|
using System.Globalization;
|
|
using System.Text.RegularExpressions;
|
|
using Microsoft.Playwright;
|
|
using PrismaticSync.Infrastructure;
|
|
using PrismaticSync.Models;
|
|
|
|
namespace PrismaticSync.Services;
|
|
|
|
/// <summary>
|
|
/// Scrapes individual Prismatic product pages into <see cref="ProductRecord"/>s. Resumable (skips
|
|
/// already-scraped URLs, optionally retries past errors) and supports a refresh window so stale
|
|
/// records get re-scraped to catch price changes. Saves after every product so a long run can be
|
|
/// stopped and resumed safely, and logs continuously — including the delay between products — so a
|
|
/// manual run always shows it's alive.
|
|
/// </summary>
|
|
public class PrismaticScraper
|
|
{
|
|
/// <summary>Matches URLs that contain a powder-coating product path segment.</summary>
private static readonly Regex ProductUrlRegex = new Regex(
    @"/shop/powder-coating-colors/[A-Z0-9-]+/",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>Captures the SKU that follows the "Item:" label in the page text (group 1).</summary>
private static readonly Regex SkuRegex = new Regex(
    @"Item:\s*([A-Z0-9-]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Captures the text after "Description:" (group 1), stopping at the first known section heading or
/// at the end of the input. Singleline lets the description span line breaks.
/// </summary>
private static readonly Regex DescRegex = new Regex(
    @"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
    RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);

/// <summary>Captures a weight tier label such as "1 - 4 lbs" or "55+ lbs" (group 1) and its dollar price (group 2).</summary>
private static readonly Regex PriceTierRegex = new Regex(
    @"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>Captures the lower (group 1) and upper (group 2) bound of a "min - max" tier label.</summary>
private static readonly Regex RangeRegex = new Regex(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);

/// <summary>Captures the lower bound (group 1) of an open-ended "min+" tier label.</summary>
private static readonly Regex PlusRegex = new Regex(@"(\d+)\s*\+", RegexOptions.Compiled);

/// <summary>Matches any run of whitespace; used to collapse scraped text.</summary>
private static readonly Regex WhitespaceRegex = new Regex(@"\s+", RegexOptions.Compiled);
|
|
|
|
// Browser session whose Page is used for every navigation and DOM query in this class.
private readonly BrowserSession _session;

// Run settings read by this class: input/output file paths, delays, settle time, block back-off.
private readonly SyncConfig _config;

// Source of the randomized pause between products.
private readonly Random _random = new Random();

/// <summary>Creates a scraper that drives the given browser session using the given settings.</summary>
/// <param name="session">Browser session that supplies the page to navigate.</param>
/// <param name="config">Settings for file locations, pacing and retry behaviour.</param>
public PrismaticScraper(BrowserSession session, SyncConfig config)
    => (_session, _config) = (session, config);
|
|
|
|
/// <summary>
/// Scrapes products needing work: those not yet scraped, plus (when <paramref name="refreshOlderThanDays"/>
/// &gt; 0) any whose data is older than that window. The output file is saved after every product
/// (success or failure) so a run can be stopped and resumed.
/// </summary>
/// <param name="refreshOlderThanDays">Age in days beyond which an existing record is re-scraped; 0 or less disables refreshing.</param>
/// <param name="maxProducts">When greater than 0, only the first <paramref name="maxProducts"/> candidate URLs are processed.</param>
/// <param name="retryErrors">When true, URLs that have a recorded error and no result are attempted again.</param>
/// <returns>The number of products scraped and the number of errors recorded during this run.</returns>
public async Task<(int Scraped, int Errors)> ScrapeAsync(int refreshOlderThanDays, int maxProducts, bool retryErrors)
{
    var allUrls = JsonStore.LoadUrls(_config.ProductUrlsFile)
        .Where(u => ProductUrlRegex.IsMatch(u))
        .ToList();

    var data = JsonStore.LoadOutput(_config.OutputJsonFile);

    // Index existing results by URL (keep the most recent if the file has dupes).
    // NOTE(review): keys are CleanUrl-normalized but lookups below use the URL exactly as loaded —
    // this assumes LoadUrls already returns cleaned URLs; confirm against JsonStore.
    var resultByUrl = data.Results
        .GroupBy(r => JsonStore.CleanUrl(r.ProductUrl), StringComparer.OrdinalIgnoreCase)
        .ToDictionary(g => g.Key, g => g.OrderByDescending(r => r.ScrapedAt).First(), StringComparer.OrdinalIgnoreCase);

    var errorUrls = new HashSet<string>(
        data.Errors.Select(e => JsonStore.CleanUrl(e.ProductUrl)), StringComparer.OrdinalIgnoreCase);

    var staleCutoff = DateTime.UtcNow.AddDays(-Math.Max(0, refreshOlderThanDays));

    var toScrape = new List<string>();
    foreach (var url in allUrls)
    {
        if (resultByUrl.TryGetValue(url, out var existing))
        {
            if (refreshOlderThanDays > 0 && existing.ScrapedAt < staleCutoff)
                toScrape.Add(url); // stale → refresh for price changes
        }
        else
        {
            if (retryErrors || !errorUrls.Contains(url))
                toScrape.Add(url); // never scraped (skip known errors unless retrying)
        }
    }

    if (maxProducts > 0)
        toScrape = toScrape.Take(maxProducts).ToList();

    var total = toScrape.Count;
    Log.Info($"URLs: {allUrls.Count}; already scraped: {resultByUrl.Count}; errors on file: {errorUrls.Count}");
    Log.Info($"To scrape this run: {total} (refresh older than {refreshOlderThanDays}d, retry errors: {retryErrors})");

    if (total == 0)
    {
        Log.Info("Nothing to scrape. Done.");
        return (0, 0);
    }

    // Rough estimate: average inter-product delay + page settle time + ~2s of overhead per product.
    var avgDelaySec = (_config.MinDelaySeconds + _config.MaxDelaySeconds) / 2.0;
    var etaMinutes = total * (avgDelaySec + _config.PageSettleSeconds + 2) / 60.0;
    Log.Info($"Estimated run time: ~{FormatDuration(TimeSpan.FromMinutes(etaMinutes))} " +
             $"(grab a coffee if that's a while — it saves after every product and is resumable).");

    var stopwatch = Stopwatch.StartNew();
    int scraped = 0, errors = 0, index = 0, consecutiveBlocks = 0;

    foreach (var url in toScrape)
    {
        index++;

        // Retry loop for the current product; exits via break on success or on a recorded error.
        for (var attempt = 1; ; attempt++)
        {
            try
            {
                var row = await ParseProductAsync(url, index, total);

                // Replace the existing record in place when refreshing, otherwise append.
                if (resultByUrl.TryGetValue(url, out var existing))
                    data.Results[data.Results.IndexOf(existing)] = row;
                else
                    data.Results.Add(row);

                resultByUrl[url] = row;
                data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));

                scraped++;
                consecutiveBlocks = 0;
                JsonStore.SaveOutput(_config.OutputJsonFile, data);

                var basePrice = row.PriceTiers.Count > 0 ? row.PriceTiers.Min(t => t.Price) : 0m;
                Log.Info($"[{index}/{total}] Saved {row.Sku} \"{row.ColorName}\" " +
                         $"({row.PriceTiers.Count} tier(s), base ${basePrice:0.00}) | elapsed {FormatDuration(stopwatch.Elapsed)}");
                break;
            }
            catch (Exception ex) when (IsBlocked(ex) && attempt <= _config.BlockedMaxRetries)
            {
                // Site pushed back — back off (escalating) and retry the SAME product rather
                // than barreling on, which is how an unattended run gets hard-banned.
                consecutiveBlocks++;
                var cooldown = Math.Min(_config.BlockedCooldownSeconds * consecutiveBlocks, _config.BlockedCooldownMaxSeconds);
                Log.Warn($"[{index}/{total}] Blocked (403), attempt {attempt}. Cooling down {cooldown}s, then retrying this product...");
                await Task.Delay(cooldown * 1000);
            }
            catch (Exception ex)
            {
                // Keep at most one error entry per URL: drop any earlier failure for this product
                // before recording the new one, so repeated retry runs do not pile up duplicates.
                data.Errors.RemoveAll(e => JsonStore.CleanUrl(e.ProductUrl).Equals(url, StringComparison.OrdinalIgnoreCase));
                data.Errors.Add(new ScrapeError { ProductUrl = url, Error = ex.Message, ScrapedAt = DateTime.UtcNow });
                JsonStore.SaveOutput(_config.OutputJsonFile, data);
                errors++;
                Log.Error($"[{index}/{total}] {url} -> {ex.Message}");
                break;
            }
        }

        // Periodic longer rest — eases server load and avoids a robotic, evenly-spaced cadence.
        if (_config.LongRestEveryProducts > 0 && index % _config.LongRestEveryProducts == 0 && index < total)
        {
            Log.Info($"Resting {_config.LongRestSeconds}s after {index} products...");
            await Task.Delay(_config.LongRestSeconds * 1000);
        }

        // Randomized pause between products; skipped after the last one.
        if (index < total)
        {
            var delayMs = RandomDelayMs();
            Log.Info($"[{index}/{total}] Waiting {delayMs / 1000.0:0.0}s before next product...");
            await Task.Delay(delayMs);
        }
    }

    Log.Info($"Scrape complete. Scraped {scraped}, errors {errors}. Total results on file: {data.Results.Count}. " +
             $"Took {FormatDuration(stopwatch.Elapsed)}.");
    return (scraped, errors);
}
|
|
|
|
/// <summary>
/// Navigates to one product page and extracts its fields into a <see cref="ProductRecord"/>.
/// </summary>
/// <param name="url">Product page to load; also stored on the returned record.</param>
/// <param name="index">1-based position of this product in the current run (used for logging only).</param>
/// <param name="total">Number of products in the current run (used for logging only).</param>
/// <returns>The scraped record, stamped with the current UTC time.</returns>
/// <exception cref="Exception">
/// Thrown when the site answers 403 or 404 (by status code or page title), or when neither a SKU
/// nor an h1 title could be found on the page.
/// </exception>
private async Task<ProductRecord> ParseProductAsync(string url, int index, int total)
{
    Log.Info($"[{index}/{total}] Scraping {url}");

    var response = await _session.Page.GotoAsync(url, new PageGotoOptions
    {
        WaitUntil = WaitUntilState.DOMContentLoaded,
        Timeout = 60000
    });
    // Give the page a fixed settle period after DOMContentLoaded before reading from it.
    await _session.Page.WaitForTimeoutAsync(_config.PageSettleSeconds * 1000);

    var status = response?.Status ?? 0;
    var title = Clean(await SafeTextAsync(() => _session.Page.TitleAsync()));
    var plainText = Clean(await SafeTextAsync(() => _session.Page.Locator("body").InnerTextAsync()));

    // The "403 Forbidden" wording is what the retry logic keys on to detect a block.
    if (status == 403 || Regex.IsMatch(title, @"^403 Forbidden$", RegexOptions.IgnoreCase))
        throw new Exception("403 Forbidden returned by site.");

    // Require a standalone 404 (not part of a longer number) so a product title that merely
    // contains those digits, e.g. "4040", is not mistaken for a missing page.
    if (status == 404 || Regex.IsMatch(title, @"(?<!\d)404(?!\d)|Page Not Found", RegexOptions.IgnoreCase))
        throw new Exception("404 Not Found returned by site.");

    var colorName = Clean(await SafeTextAsync(() => _session.Page.Locator("h1").First.InnerTextAsync()));

    var skuMatch = SkuRegex.Match(plainText);
    var sku = skuMatch.Success ? skuMatch.Groups[1].Value : "";
    if (string.IsNullOrEmpty(sku) && string.IsNullOrEmpty(colorName))
        throw new Exception("Could not find SKU or title on product page.");

    var descMatch = DescRegex.Match(plainText);
    var description = descMatch.Success ? Clean(descMatch.Groups[1].Value) : "";

    return new ProductRecord
    {
        Sku = sku,
        ColorName = colorName,
        Description = description,
        PriceTiers = ParsePriceTiers(plainText),
        SafetyDataSheetUrl = await GetLinkByTextAsync(new[] { "Safety Data Sheet", @"\bSDS\b" }),
        TechnicalDataSheetUrl = await GetLinkByTextAsync(new[] { "Tech Data Sheet", "Technical Data Sheet", @"\bTDS\b" }),
        ApplicationGuideUrl = await GetLinkByTextAsync(new[] { "Application Guide" }),
        SampleImageUrl = await GetSampleImageUrlAsync(),
        ProductUrl = url,
        ScrapedAt = DateTime.UtcNow
    };
}
|
|
|
|
/// <summary>
/// Pulls every weight-based price tier out of the page text. A "min - max lbs" label yields both
/// bounds; a "min+ lbs" label yields only a lower bound. Entries whose price does not parse as a
/// decimal are skipped.
/// </summary>
/// <param name="text">Whitespace-normalized page text to search.</param>
/// <returns>The tiers in the order they appear in <paramref name="text"/>.</returns>
private static List<PriceTier> ParsePriceTiers(string text)
{
    var result = new List<PriceTier>();
    var matches = PriceTierRegex.Matches(text);

    for (var i = 0; i < matches.Count; i++)
    {
        var tierMatch = matches[i];
        var priceParsed = decimal.TryParse(
            tierMatch.Groups[2].Value, NumberStyles.Any, CultureInfo.InvariantCulture, out var amount);

        if (priceParsed)
        {
            var label = Clean(tierMatch.Groups[1].Value);
            int? lower = null;
            int? upper = null;

            // Bounded tier, e.g. "5 - 24 lbs".
            var bounded = RangeRegex.Match(label);
            if (bounded.Success)
            {
                lower = int.Parse(bounded.Groups[1].Value);
                upper = int.Parse(bounded.Groups[2].Value);
            }

            // Open-ended tier, e.g. "55+ lbs": no upper bound.
            var openEnded = PlusRegex.Match(label);
            if (openEnded.Success)
            {
                lower = int.Parse(openEnded.Groups[1].Value);
                upper = null;
            }

            result.Add(new PriceTier { Min = lower, Max = upper, Price = amount });
        }
    }

    return result;
}
|
|
|
|
/// <summary>
/// Returns the href of the first link whose text matches any pattern and whose target looks like a
/// real document. Uses a single eval returning "text\u0001href" pairs (link text and href joined by
/// the U+0001 control character) to avoid object deserialization quirks.
/// </summary>
/// <param name="patterns">Regular expressions tested, case-insensitively, against each link's text.</param>
/// <returns>The matching link's href, or an empty string when no link qualifies.</returns>
private async Task<string> GetLinkByTextAsync(string[] patterns)
{
    // Must stay in sync with String.fromCharCode(1) in the script below. Written as an escape
    // so the separator is visible in source and cannot be lost by editors or copy/paste.
    const char PairSeparator = '\u0001';

    var combined = await _session.Page.EvalOnSelectorAllAsync<string[]>(
        "a",
        "els => els.map(a => ((a.innerText || a.textContent || '').replace(/\\s+/g, ' ').trim()) " +
        "+ String.fromCharCode(1) + (a.href || ''))");

    foreach (var entry in combined)
    {
        var parts = entry.Split(PairSeparator);
        var text = parts.Length > 0 ? parts[0] : "";
        var href = parts.Length > 1 ? parts[1] : "";
        // Require the link to point at an actual document, not a generic /documents nav page.
        if (href.Length > 0
            && IsDocumentUrl(href)
            && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
            return href;
    }
    return "";
}
|
|
|
|
/// <summary>
/// True when an href looks like a real document: it mentions the NIC domain anywhere in the URL,
/// or its path (ignoring any query string and fragment) ends in ".pdf".
/// </summary>
/// <param name="href">Absolute or relative link target to classify.</param>
/// <returns>True for document-like links, false otherwise (including the empty string).</returns>
private static bool IsDocumentUrl(string href)
{
    // Strip both "?query" and "#fragment" so links such as "sheet.pdf#page=2" still count as PDFs.
    var cut = href.IndexOfAny(new[] { '?', '#' });
    var path = cut >= 0 ? href.Substring(0, cut) : href;

    return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
        || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
}
|
|
|
|
/// <summary>
/// Picks the product sample image from the page's img elements. Only URLs under the NIC product
/// image path qualify; the first one without "thumbnail" in it wins, otherwise the first
/// qualifying URL of any kind.
/// </summary>
/// <returns>The chosen image URL, or an empty string when the page has no qualifying image.</returns>
private async Task<string> GetSampleImageUrlAsync()
{
    var sources = await _session.Page.EvalOnSelectorAllAsync<string[]>(
        "img",
        "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
        ".filter(Boolean)");

    // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
    // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
    string? firstProductImage = null;
    foreach (var src in sources)
    {
        var isProductImage = Regex.IsMatch(src, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase);
        if (!isProductImage)
        {
            continue;
        }

        var isThumbnail = Regex.IsMatch(src, "thumbnail", RegexOptions.IgnoreCase);
        if (!isThumbnail)
        {
            return src; // first full-size product image
        }

        // Remember the earliest thumbnail in case no full-size image turns up.
        firstProductImage ??= src;
    }

    return firstProductImage ?? "";
}
|
|
|
|
/// <summary>
/// True when an exception represents the site refusing the request with 403 Forbidden. Matches the
/// full "403 Forbidden" phrase rather than the bare digits, so unrelated failures whose message
/// merely contains "403" (for example a timeout that quotes a URL like ".../PSS-4030/") are not
/// treated as a block and do not trigger cooldown retries.
/// </summary>
/// <param name="ex">The exception raised while scraping a product.</param>
private static bool IsBlocked(Exception ex) =>
    ex.Message.Contains("403 Forbidden", StringComparison.OrdinalIgnoreCase);
|
|
|
|
/// <summary>
/// Best-effort text read: awaits <paramref name="fn"/> and returns its result, or an empty string
/// if it throws for any reason.
/// </summary>
/// <param name="fn">Asynchronous text lookup to attempt.</param>
private static async Task<string> SafeTextAsync(Func<Task<string>> fn)
{
    string result;
    try
    {
        result = await fn();
    }
    catch
    {
        // Deliberately swallowed: a missing element or title just means "no text".
        result = "";
    }

    return result;
}
|
|
|
|
/// <summary>
/// Normalizes scraped text: every run of whitespace becomes a single space and leading/trailing
/// whitespace is removed. A null input is treated as an empty string.
/// </summary>
private static string Clean(string? text)
{
    var source = text ?? "";
    var collapsed = WhitespaceRegex.Replace(source, " ");
    return collapsed.Trim();
}
|
|
|
|
/// <summary>
/// Picks a random pause, in milliseconds, between the configured minimum and maximum delay
/// (both inclusive). A negative minimum is treated as 0 and a maximum below the minimum is
/// raised to the minimum.
/// </summary>
private int RandomDelayMs()
{
    var lowerMs = _config.MinDelaySeconds * 1000;
    if (lowerMs < 0)
    {
        lowerMs = 0;
    }

    var upperMs = _config.MaxDelaySeconds * 1000;
    if (upperMs < lowerMs)
    {
        upperMs = lowerMs;
    }

    // Random.Next's upper bound is exclusive, hence the +1 to make the maximum reachable.
    return _random.Next(lowerMs, upperMs + 1);
}
|
|
|
|
/// <summary>
/// Renders a duration compactly for log output: "Xh Ym" from one hour up, "Xm Ys" from one minute
/// up, otherwise "Xs".
/// </summary>
/// <param name="t">Duration to format.</param>
private static string FormatDuration(TimeSpan t)
{
    if (t.TotalHours >= 1)
    {
        // Whole hours are taken from TotalHours so durations past 24h keep counting hours.
        return $"{(int)t.TotalHours}h {t.Minutes}m";
    }

    if (t.TotalMinutes >= 1)
    {
        return $"{(int)t.TotalMinutes}m {t.Seconds}s";
    }

    return $"{t.Seconds}s";
}
|
|
}
|