From da2bb46d5aec18d87ce796ffdf2d507ab228d278 Mon Sep 17 00:00:00 2001 From: Scott Pouliot Date: Thu, 18 Jun 2026 12:41:47 -0400 Subject: [PATCH] Tighten Prismatic scrape parsing after live smoke test Validated against live product pages; fixed three edge cases (also present in the original JS scraper) surfaced by specialty AkzoNobel products: - Sample image: only accept real product images on the NIC CDN (images.nicindustries.com/prismatic/products), preferring full-size over thumbnail. Dropped the loose "prismatic|powder|color" fallback that grabbed the site logo on products with no image. - SDS/TDS/app-guide links: require the href to be an actual document (NIC CDN or a .pdf) so a generic /documents nav link isn't captured as the SDS. - Description: also stop at PRODUCT SUPPORT / PRODUCT COLLECTIONS / CUSTOMER SERVICE so less page footer is captured (app-side StripBoilerplate cleans the rest). Structural fields (sku, color, price tiers) verified correct on live data. Co-Authored-By: Claude Opus 4.8 --- .../Services/PrismaticScraper.cs | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs index 38b24d9..3952afe 100644 --- a/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs +++ b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs @@ -21,7 +21,8 @@ public class PrismaticScraper private static readonly Regex SkuRegex = new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex DescRegex = - new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); + new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)", + RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); private static readonly Regex PriceTierRegex = new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled); @@ -251,12 +252,23 @@ public class PrismaticScraper var parts = entry.Split(''); var text = parts.Length > 0 ? parts[0] : ""; var href = parts.Length > 1 ? parts[1] : ""; - if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase))) + // Require the link to point at an actual document, not a generic /documents nav page. + if (href.Length > 0 + && IsDocumentUrl(href) + && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase))) return href; } return ""; } + /// True when an href looks like a real document (hosted on the NIC CDN or a direct PDF). + private static bool IsDocumentUrl(string href) + { + var path = href.Split('?')[0]; + return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase) + || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase); + } + private async Task GetSampleImageUrlAsync() { var srcs = await _session.Page.EvalOnSelectorAllAsync( @@ -264,10 +276,11 @@ public class PrismaticScraper "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" + ".filter(Boolean)"); - return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase) + // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT + // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image. + return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase) && !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase)) - ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)) - ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase)) + ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase)) ?? ""; }