diff --git a/app/views/layouts/blacklight.html.erb b/app/views/layouts/blacklight.html.erb index 302d5ab26..af186eb7b 100644 --- a/app/views/layouts/blacklight.html.erb +++ b/app/views/layouts/blacklight.html.erb @@ -7,6 +7,7 @@ + diff --git a/public/robots.txt b/public/robots.txt index 399e38d3d..17d661fe5 100644 --- a/public/robots.txt +++ b/public/robots.txt @@ -31,3 +31,74 @@ Disallow: /query_list Allow: /articles/advanced$ Allow: /databases/advanced$ Allow: /journals/advanced$ + + + # Common Crawl's bot - Common Crawl is one of the largest public datasets used to train AI, including ChatGPT, Bard and other large language models. +User-agent: CCBot +Disallow: / + +# ChatGPT Bot - bot used when a ChatGPT user instructs it to reference your website. +User-agent: ChatGPT-User +Disallow: / + +# OpenAI GPTBot - crawler that OpenAI specifically uses to collect bulk training data from your website for ChatGPT. +User-agent: GPTBot +Disallow: / + +# Google Bard and Vertex AI. This will not have an impact on Google Search indexing. This will not affect GoogleBot crawling. +User-agent: Google-Extended +Disallow: / + +# Anthropic AI Bot +User-agent: anthropic-ai +Disallow: / + +# Claude Bot run by Anthropic +User-agent: Claude-Web +Disallow: / + +# Cohere AI Bot - unconfirmed bot believed to be associated with Cohere’s chatbot. +User-agent: cohere-ai +Disallow: / + +# OMGilibot - They sell data for training LLMs (large language models) +User-agent: omgilibot +Disallow: / + +# Omgili (Oh My God I Love It) +User-agent: omgili +Disallow: / + +# Perplexity AI +User-agent: PerplexityBot +Disallow: / + +# YouBot - crawler for You.com's AI search assistant (not KUKA's youBot robot). +User-agent: YouBot +Disallow: / + +# Diffbot - somewhat dishonest scraping bot used to collect data to train LLMs. 
+User-agent: Diffbot +Disallow: / + +# Bytespider is a web crawler operated by ByteDance, the Chinese owner of TikTok +User-agent: Bytespider +Disallow: / + +# ImagesiftBot is billed as a reverse image search tool, but it's associated with The Hive, a company that produces models for image generation. +User-agent: ImagesiftBot +Disallow: / + +## Big Tech Assistant and Social Media Bots + +# Amazon Bot - crawls web content so that Alexa can answer more questions for customers. +User-agent: Amazonbot +Disallow: / + +# Apple Bot - collects website data for its Siri and Spotlight services. +User-agent: Applebot +Disallow: / + +# Meta’s bot that crawls public web pages to improve language models for their speech recognition technology. +User-agent: FacebookBot +Disallow: /