diff --git a/lib/crawler/data/crawl_result/html.rb b/lib/crawler/data/crawl_result/html.rb index a441e497..0fe062c8 100644 --- a/lib/crawler/data/crawl_result/html.rb +++ b/lib/crawler/data/crawl_result/html.rb @@ -219,7 +219,10 @@ def headings(limit: 10) def get_body_tag(exclude_tags) exclude_tags ||= {} - tags_to_exclude_for_domain = exclude_tags.fetch(url, []) + # Config stores exclude_tags keyed by domain URL (e.g., "https://example.com") + # Try site first (scheme + host), which is the standard format + tags_to_exclude_for_domain = exclude_tags.fetch(url.site, nil) || + exclude_tags.fetch(url.to_s, []) if tags_to_exclude_for_domain.empty? parsed_content.body diff --git a/spec/lib/crawler/data/crawl_result/html_spec.rb b/spec/lib/crawler/data/crawl_result/html_spec.rb index 693c143f..12b5d943 100644 --- a/spec/lib/crawler/data/crawl_result/html_spec.rb +++ b/spec/lib/crawler/data/crawl_result/html_spec.rb @@ -329,13 +329,42 @@ end context 'when given a list of tags to exclude' do - let(:body_text) { crawl_result.document_body(exclude_tags: { url => ['h1'] }) } + # Keys must be site strings (scheme + host) to match how config stores them + let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => ['h1'] }) } it 'should remove content associated with those tags, even if there is a data-elastic-include' do expect(body_text).to_not match('Page header') end end + context 'when given a list of tags to exclude with string keys (as from config)' do + let(:html) do + <<~HTML + + +
+            <html>
+              <header>HEADER TEXT Should not be indexed</header>
+              <head>
+                <title>title</title>
+              </head>
+              <body>
+                <p>BODY content</p>
+                <address>main street 123 to be ignored too</address>
+                <footer>FOOTER TEXT</footer>
+              </body>
+            </html>
+ + + + HTML + end + + # This reproduces the bug from https://github.com/elastic/crawler/issues/416 + # The config stores exclude_tags with string keys (site URLs like "https://example.com"), + # so the lookup must use url.site to match the config format + let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => %w[header address] }) } + + it 'should remove content associated with those tags when keys are strings' do + expect(body_text).to_not match('HEADER TEXT') + expect(body_text).to_not match('main street') + expect(body_text).to match('BODY content') + expect(body_text).to match('FOOTER TEXT') + end + end + it 'should remove empty spaces from the content' do expect(body_text).to match('Something something else') end