diff --git a/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs index 38b24d9..3952afe 100644 --- a/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs +++ b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs @@ -21,7 +21,8 @@ public class PrismaticScraper private static readonly Regex SkuRegex = new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex DescRegex = - new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); + new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)", + RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); private static readonly Regex PriceTierRegex = new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled); @@ -251,12 +252,23 @@ public class PrismaticScraper var parts = entry.Split(''); var text = parts.Length > 0 ? parts[0] : ""; var href = parts.Length > 1 ? parts[1] : ""; - if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase))) + // Require the link to point at an actual document, not a generic /documents nav page. + if (href.Length > 0 + && IsDocumentUrl(href) + && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase))) return href; } return ""; } + /// True when an href looks like a real document (hosted on the NIC CDN or a direct PDF). + private static bool IsDocumentUrl(string href) + { + var path = href.Split('?')[0]; + return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase) + || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase); + } + private async Task GetSampleImageUrlAsync() { var srcs = await _session.Page.EvalOnSelectorAllAsync( @@ -264,10 +276,11 @@ public class PrismaticScraper "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" + ".filter(Boolean)"); - return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase) + // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT + // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image. + return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase) && !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase)) - ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)) - ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase)) + ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase)) ?? ""; }