Tighten Prismatic scrape parsing after live smoke test
Validated against live product pages; fixed three edge cases (also present in the original JS scraper) surfaced by specialty AkzoNobel products: - Sample image: only accept real product images on the NIC CDN (images.nicindustries.com/prismatic/products), preferring full-size over thumbnail. Dropped the loose "prismatic|powder|color" fallback that grabbed the site logo on products with no image. - SDS/TDS/app-guide links: require the href to be an actual document (NIC CDN or a .pdf) so a generic /documents nav link isn't captured as the SDS. - Description: also stop at PRODUCT SUPPORT / PRODUCT COLLECTIONS / CUSTOMER SERVICE so less page footer is captured (app-side StripBoilerplate cleans the rest). Structural fields (sku, color, price tiers) verified correct on live data. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -21,7 +21,8 @@ public class PrismaticScraper
|
||||
private static readonly Regex SkuRegex =
|
||||
new(@"Item:\s*([A-Z0-9-]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
private static readonly Regex DescRegex =
|
||||
new(@"Description:\s*(.*?)(WARNING:|What does this match\?|$)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
|
||||
new(@"Description:\s*(.*?)(WARNING:|What does this match\?|PRODUCT SUPPORT|PRODUCT COLLECTIONS|CUSTOMER SERVICE|$)",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
|
||||
private static readonly Regex PriceTierRegex =
|
||||
new(@"(\d+\s*-\s*\d+\s*lbs|\d+\s*\+\s*lbs)\s*\$([\d.]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
private static readonly Regex RangeRegex = new(@"(\d+)\s*-\s*(\d+)", RegexOptions.Compiled);
|
||||
@@ -251,12 +252,23 @@ public class PrismaticScraper
|
||||
var parts = entry.Split('');
|
||||
var text = parts.Length > 0 ? parts[0] : "";
|
||||
var href = parts.Length > 1 ? parts[1] : "";
|
||||
if (href.Length > 0 && patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
|
||||
// Require the link to point at an actual document, not a generic /documents nav page.
|
||||
if (href.Length > 0
|
||||
&& IsDocumentUrl(href)
|
||||
&& patterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)))
|
||||
return href;
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/// <summary>True when an href looks like a real document (hosted on the NIC CDN or a direct PDF).</summary>
|
||||
private static bool IsDocumentUrl(string href)
|
||||
{
|
||||
var path = href.Split('?')[0];
|
||||
return href.Contains("nicindustries.com", StringComparison.OrdinalIgnoreCase)
|
||||
|| path.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
private async Task<string> GetSampleImageUrlAsync()
|
||||
{
|
||||
var srcs = await _session.Page.EvalOnSelectorAllAsync<string[]>(
|
||||
@@ -264,10 +276,11 @@ public class PrismaticScraper
|
||||
"els => els.map(i => i.currentSrc || i.src || i.getAttribute('src') || i.getAttribute('data-src') || '')" +
|
||||
".filter(Boolean)");
|
||||
|
||||
return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase)
|
||||
// Only accept real product images on the NIC CDN (prefer full-size over thumbnail). Do NOT
|
||||
// fall back to any "prismatic"-ish URL — that catches the site logo on products with no image.
|
||||
return srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase)
|
||||
&& !Regex.IsMatch(s, "thumbnail", RegexOptions.IgnoreCase))
|
||||
?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com", RegexOptions.IgnoreCase))
|
||||
?? srcs.FirstOrDefault(s => Regex.IsMatch(s, "prismatic|powder|color", RegexOptions.IgnoreCase))
|
||||
?? srcs.FirstOrDefault(s => Regex.IsMatch(s, @"images\.nicindustries\.com/prismatic/products", RegexOptions.IgnoreCase))
|
||||
?? "";
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user