Add AI-specific disallows to robots.txt and meta tags. (#4607)

tulibraries · Aug 26, 2024 · c29890c · c29890c
1 parent 4aaad80
commit c29890c
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 0 deletions.
diff --git a/app/views/layouts/blacklight.html.erb b/app/views/layouts/blacklight.html.erb
@@ -7,6 +7,7 @@
     <!-- Mobile viewport optimization h5bp.com/ad -->
     <meta name="HandheldFriendly" content="True">
     <meta name="viewport" content="width=device-width,initial-scale=1.0">
+    <meta name="robots" content="noai, noimageai">
 
     <!-- Internet Explorer use the highest version available -->
     <meta http-equiv="X-UA-Compatible" content="IE=edge">

diff --git a/public/robots.txt b/public/robots.txt
@@ -31,3 +31,74 @@ Disallow: /query_list
  Allow: /articles/advanced$
  Allow: /databases/advanced$
  Allow: /journals/advanced$
+
+
+ # Common Crawl's bot - Common Crawl is one of the largest public datasets used to train AI systems, including ChatGPT, Bard and other large language models.
+User-agent: CCBot
+Disallow: /
+
+# ChatGPT Bot - bot used when a ChatGPT user instructs it to reference your website.
+User-agent: ChatGPT-User
+Disallow: /
+
+# OpenAI GPTBot - crawler that OpenAI specifically uses to collect bulk training data from your website for ChatGPT.
+User-agent: GPTBot
+Disallow: /
+
+# Google Bard and VertexAI. This will not have an impact on Google Search indexing. This will not affect GoogleBot crawling.
+User-agent: Google-Extended
+Disallow: /
+
+# Anthropic AI Bot
+User-agent: anthropic-ai
+Disallow: /
+
+# Claude Bot run by Anthropic
+User-agent: Claude-Web
+Disallow: /
+
+# Cohere AI Bot - unconfirmed bot believed to be associated with Cohere’s chatbot.
+User-agent: cohere-ai
+Disallow: /
+
+# OMGilibot - They sell data for training LLMs (large language models)
+User-agent: omgilibot
+Disallow: /
+
+# Omgili (Oh My God I Love It)
+User-agent: omgili
+Disallow: /
+
+# Perplexity AI
+User-agent: PerplexityBot
+Disallow: /
+
+# YouBot - crawler for You.com's AI-powered search engine (not related to KUKA's youBot robot).
+User-agent: YouBot
+Disallow: /
+
+# Diffbot - somewhat dishonest scraping bot used to collect data to train LLMs.
+User-agent: Diffbot
+Disallow: /
+
+# Bytespider is a web crawler operated by ByteDance, the Chinese owner of TikTok
+User-agent: Bytespider
+Disallow: /
+
+# ImagesiftBot is billed as a reverse image search tool, but it's associated with The Hive, a company that produces models for image generation.
+User-agent: ImagesiftBot
+Disallow: /
+
+## Big Tech Assistant and Social Media Bots
+
+# Amazon Bot - enabling Alexa to answer even more questions for customers.
+User-agent: Amazonbot
+Disallow: /
+
+# Apple Bot - collects website data for its Siri and Spotlight services.
+User-agent: Applebot
+Disallow: /
+
+# Meta’s bot that crawls public web pages to improve language models for their speech recognition technology.
+User-agent: FacebookBot
+Disallow: /