diff --git a/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs
index 38b24d9..3952afe 100644
--- a/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs
+++ b/scripts/Prismatic Data Scraper/Services/PrismaticScraper.cs
@@ -21,7 +21,8 @@ public class PrismaticScraper
private static readonly Regex SkuRegex =
new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex DescRegex =
- new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
+ new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
+ RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
private static readonly Regex PriceTierRegex =
new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
@@ -251,12 +252,23 @@ public class PrismaticScraper
var parts = entry.Split('');
var text = parts.Length > 0 ? parts[0] : "";
var href = parts.Length > 1 ? parts[1] : "";
- if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
+ // Require the link to point at an actual document, not a generic /documents nav page.
+ if (href.Length > 0
+ && IsDocumentUrl(href)
+ && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
return href;
}
return "";
}
+ /// <summary>True when an href looks like a real document: it contains "nicindustries.com", or its path (query string and fragment removed) ends in ".pdf".</summary>
+ private static bool IsDocumentUrl(string href)
+ {
+ var path = href.Split('?', '#')[0]; // drop both ?query and #fragment so "spec.pdf#page=2" still counts as a PDF
+ return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
+ || path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
+ }
+
private async Task GetSampleImageUrlAsync()
{
var srcs = await _session.Page.EvalOnSelectorAllAsync(
@@ -264,10 +276,11 @@ public class PrismaticScraper
"els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
".filter(Boolean)");
- return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)
+ // Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
+ // fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
+ return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase)
&& !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
- ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase))
- ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase))
+ ?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase))
?? "";
}