Merged
lib/crawler/data/crawl_result/html.rb (4 additions, 1 deletion)
@@ -219,7 +219,10 @@ def headings(limit: 10)
 
   def get_body_tag(exclude_tags)
     exclude_tags ||= {}
-    tags_to_exclude_for_domain = exclude_tags.fetch(url, [])
+    # Config stores exclude_tags keyed by domain URL (e.g., "https://example.com")
+    # Try site first (scheme + host), which is the standard format
+    tags_to_exclude_for_domain = exclude_tags.fetch(url.site, nil) ||
+                                 exclude_tags.fetch(url.to_s, [])
 
     if tags_to_exclude_for_domain.empty?
      parsed_content.body
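For context, here is a minimal sketch of the lookup this change introduces, assuming the crawler's `url` behaves like an `Addressable::URI` (whose `#site` method returns the scheme plus authority, with no path); the hash contents are illustrative:

```ruby
require 'addressable/uri'

# Illustrative exclude_tags hash, keyed by site string as the comment above describes
exclude_tags = { 'https://example.com' => %w[header footer] }

url = Addressable::URI.parse('https://example.com/blog/post-1')
url.site # => "https://example.com"

# Try the site key first; fall back to the full URL string, then to no tags at all
tags = exclude_tags.fetch(url.site, nil) || exclude_tags.fetch(url.to_s, [])
tags # => ["header", "footer"]
```

One nuance of this pattern: `fetch(url.site, nil)` is falsy only on a true miss. An explicitly configured empty list (`[]`) is truthy in Ruby, so it short-circuits the fallback and correctly excludes nothing.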
spec/lib/crawler/data/crawl_result/html_spec.rb (30 additions, 1 deletion)
@@ -329,13 +329,42 @@
     end
 
     context 'when given a list of tags to exclude' do
-      let(:body_text) { crawl_result.document_body(exclude_tags: { url => ['h1'] }) }
+      # Keys must be site strings (scheme + host) to match how config stores them
+      let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => ['h1'] }) }
 
       it 'should remove content associated with those tags, even if there is a data-elastic-include' do
         expect(body_text).to_not match('Page header')
       end
     end
 
+    context 'when given a list of tags to exclude with string keys (as from config)' do
+      let(:html) do
+        <<~HTML
+          <html>
+            <body>
+              <header>HEADER TEXT Should not be indexed</header>
+              <h2>title</h2>
+              <p>BODY content</p>
+              <address>main street 123 to be ignored too</address>
+              <footer>FOOTER TEXT</footer>
+            </body>
+          </html>
+        HTML
+      end
+
+      # This reproduces the bug from https://github.com/elastic/crawler/issues/416
+      # The config stores exclude_tags with string keys (site URLs like "https://example.com"),
+      # so the lookup must use url.site to match the config format
+      let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => %w[header address] }) }
+
+      it 'should remove content associated with those tags when keys are strings' do
+        expect(body_text).to_not match('HEADER TEXT')
+        expect(body_text).to_not match('main street')
+        expect(body_text).to match('BODY content')
+        expect(body_text).to match('FOOTER TEXT')
+      end
+    end
+
     it 'should remove empty spaces from the content' do
       expect(body_text).to match('Something something else')
     end
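For reference, a small sketch of why the original spec kept passing while the config-driven path failed. The YAML snippet is an assumption about how the string keys arise; the crawler's actual config loader may differ:

```ruby
require 'addressable/uri'
require 'yaml'

url = Addressable::URI.parse('https://example.com/blog/post-1')

# Keys loaded from a YAML config arrive as plain strings
exclude_tags = YAML.safe_load(<<~YML)
  https://example.com:
    - header
    - address
YML

# The old spec keyed the hash with the URL object itself, so the fetch matched
# in tests even though the string-keyed config path always missed:
{ url => ['h1'] }.fetch(url, []) # => ["h1"] (test-only situation)
exclude_tags.fetch(url, [])      # => []     (production miss, issue #416)

# The fixed lookup resolves against the string keys:
exclude_tags.fetch(url.site, nil) || exclude_tags.fetch(url.to_s, []) # => ["header", "address"]
```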