Tighten Prismatic scrape parsing after live smoke test

Validated against live product pages; fixed three edge cases (also present in
the original JS scraper) surfaced by specialty AkzoNobel products:

- Sample image: only accept real product images on the NIC CDN
  (images.nicindustries.com/prismatic/products), preferring full-size over
  thumbnail. Dropped the loose "prismatic|powder|color" fallback that grabbed
  the site logo on products with no image.
- SDS/TDS/app-guide links: require the href to be an actual document (NIC CDN
  or a .pdf) so a generic /documents nav link isn't captured as the SDS.
- Description: also stop at PRODUCT SUPPORT / PRODUCT COLLECTIONS / CUSTOMER
  SERVICE so less page footer is captured (app-side StripBoilerplate cleans
  the rest).

Structural fields (sku, color, price tiers) verified correct on live data.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 12:41:47 -04:00
parent 843d1c3c51
commit da2bb46d5a
1 changed files with 18 additions and 5 deletions
@@ -21,7 +21,8 @@ public class PrismaticScraper
    private static readonly Regex SkuRegex =
        new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex DescRegex =
-        new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
+        new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
+            RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex PriceTierRegex =
        new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
@@ -251,12 +252,23 @@ public class PrismaticScraper
            var parts = entry.Split('');
            var text = parts.Length > 0 ? parts[0] : "";
            var href = parts.Length > 1 ? parts[1] : "";
-            if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
+            // Require the link to point at an actual document, not a generic /documents nav page.
+            if (href.Length > 0
+                && IsDocumentUrl(href)
+                && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
                return href;
        }
        return "";
    }

+    /// <summary>True when an href looks like a real document (hosted on the NIC CDN or a direct PDF).</summary>
+    private static bool IsDocumentUrl(string href)
+    {
+        var path = href.Split('?')[0];
+        return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
+               || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
+    }
+
    private async Task<string> GetSampleImageUrlAsync()
    {
        var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
@@ -264,10 +276,11 @@ public class PrismaticScraper
            "els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
            ".filter(Boolean)");

-        return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)
+        // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
+        // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
+        return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase)
                                        && !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
-               ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase))
-               ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase))
+               ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase))
               ?? "";
    }