Tighten Prismatic scrape parsing after live smoke test

Validated against live product pages; fixed three edge cases (also present in
the original JS scraper) surfaced by specialty AkzoNobel products:

- Sample image: only accept real product images on the NIC CDN
  (images.nicindustries.com/prismatic/products), preferring full-size over
  thumbnail. Dropped the loose "prismatic|powder|color" fallback that grabbed
  the site logo on products with no image.
- SDS/TDS/app-guide links: require the href to be an actual document (NIC CDN
  or a .pdf) so a generic /documents nav link isn't captured as the SDS.
- Description: also stop at PRODUCT SUPPORT / PRODUCT COLLECTIONS / CUSTOMER
  SERVICE so less page footer is captured (app-side StripBoilerplate cleans the
  rest).

Structural fields (sku, color, price tiers) verified correct on live data.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-18 12:41:47 -04:00
parent 843d1c3c51
commit da2bb46d5a
@@ -21,7 +21,8 @@ public class PrismaticScraper
private static readonly Regex SkuRegex = private static readonly Regex SkuRegex =
new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex DescRegex = private static readonly Regex DescRegex =
new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled); new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
private static readonly Regex PriceTierRegex = private static readonly Regex PriceTierRegex =
new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled); new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled); private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
@@ -251,12 +252,23 @@ public class PrismaticScraper
var parts = entry.Split(''); var parts = entry.Split('');
var text = parts.Length > 0 ? parts[0] : ""; var text = parts.Length > 0 ? parts[0] : "";
var href = parts.Length > 1 ? parts[1] : ""; var href = parts.Length > 1 ? parts[1] : "";
if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase))) // Require the link to point at an actual document, not a generic /documents nav page.
if (href.Length > 0
&& IsDocumentUrl(href)
&& patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
return href; return href;
} }
return ""; return "";
} }
/// <summary>
/// True when an href looks like a real document: either it mentions the NIC domain
/// (<c>nicindustries.com</c>) anywhere in the URL, or its path ends in <c>.pdf</c>.
/// </summary>
/// <param name="href">The raw link target; may be relative, and may carry a query string and/or fragment.</param>
/// <returns>
/// <c>true</c> for NIC-hosted links and direct PDF links; <c>false</c> otherwise,
/// including for a null or empty <paramref name="href"/>.
/// </returns>
private static bool IsDocumentUrl(string href)
{
    if (string.IsNullOrEmpty(href))
    {
        return false;
    }

    // NOTE(review): this is a substring match over the whole href, so it also accepts
    // non-CDN pages on nicindustries.com — confirm that is the intended scope.
    if (href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    // Cut at whichever of '?' (query) or '#' (fragment) comes first, so links such as
    // "sheet.pdf#page=2" or "sheet.pdf?v=3#top" are still recognised as PDFs.
    var suffixStart = href.IndexOfAny(new[] { '?', '#' });
    var path = suffixStart >= 0 ? href.Substring(0, suffixStart) : href;

    return path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
}
private async Task<string> GetSampleImageUrlAsync() private async Task<string> GetSampleImageUrlAsync()
{ {
var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>( var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
@@ -264,10 +276,11 @@ public class PrismaticScraper
"els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" + "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
".filter(Boolean)"); ".filter(Boolean)");
return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase) // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
// fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase)
&& !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase)) && !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)) ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase))
?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase))
?? ""; ?? "";
} }