diff --git a/.env.test b/.env.test
index 8e4a15ce..e39230dd 100644
--- a/.env.test
+++ b/.env.test
@@ -1,4 +1,6 @@
ALMA_OPENURL=https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?
+TURNSTILE_SITEKEY=test-sitekey
+TURNSTILE_SECRET=test-secret
FEATURE_TIMDEX_FULLTEXT=true
FEATURE_GEODATA=false
MIT_PRIMO_URL=https://mit.primo.exlibrisgroup.com
diff --git a/Gemfile b/Gemfile
index bb840a7b..7f80e13c 100644
--- a/Gemfile
+++ b/Gemfile
@@ -14,6 +14,7 @@ gem 'openssl'
gem 'puma'
gem 'rack-attack'
gem 'rack-timeout'
+gem 'crawler_detect'
gem 'rails', '~> 7.2.0'
gem 'redis'
gem 'scout_apm'
diff --git a/Gemfile.lock b/Gemfile.lock
index 0b455b06..46f26fa1 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -117,6 +117,8 @@ GEM
bigdecimal
rexml
crass (1.0.6)
+ crawler_detect (1.2.9)
+ qonfig (>= 0.24)
date (3.4.1)
debug (1.11.0)
irb (~> 1.10)
@@ -235,6 +237,8 @@ GEM
public_suffix (6.0.2)
puma (7.0.4)
nio4r (~> 2.0)
+ qonfig (0.30.0)
+ base64 (>= 0.2)
racc (1.8.1)
rack (3.1.18)
rack-attack (6.7.0)
@@ -413,6 +417,7 @@ DEPENDENCIES
bootsnap
capybara
climate_control
+ crawler_detect
debug
dotenv-rails
graphql
diff --git a/README.md b/README.md
index 1e9ff60c..40ffd8e9 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,7 @@ See `Optional Environment Variables` for more information.
- `BOOLEAN_OPTIONS`: comma separated list of values to present to testers on instances where `BOOLEAN_PICKER` feature is enabled.
- `FEATURE_BOOLEAN_PICKER`: feature to allow users to select their preferred boolean type. If set to `true`, feature is enabled. This feature is only intended for internal team
testing and should never be enabled in production (mostly because the UI is a mess more than it would cause harm).
+- `FEATURE_BOT_DETECTION`: When set to `true`, enables bot detection using crawler_detect and Cloudflare Turnstile challenges for suspected bots on search result pages. Requires `TURNSTILE_SITEKEY` and `TURNSTILE_SECRET` to be set. If disabled, bots may crawl search results freely.
- `FEATURE_GEODATA`: Enables features related to geospatial data discovery. Setting this variable to `true` will trigger geodata
mode. Note that this is currently intended _only_ for the geodata app and
may have unexpected consequences if applied to other TIMDEX UI apps.
@@ -146,6 +147,8 @@ instance is sending what search traffic. Defaults to "unset" if not defined.
- `TIMDEX_INDEX`: Name of the index, or alias, to provide to the GraphQL endpoint. Defaults to `nil` which will let TIMDEX determine the best index to use. Wildcard values can be set, for example `rdi*` would search any indexes that begin with `rdi` in the underlying OpenSearch instance behind TIMDEX.
- `TIMDEX_SOURCES`: Comma-separated list of sources to display in the advanced-search source selection element. This
overrides the default which is set in ApplicationHelper.
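+- `TURNSTILE_SECRET`: The Cloudflare Turnstile secret key used to verify challenge responses. Required when `FEATURE_BOT_DETECTION` is enabled; if it is not set, every verification attempt fails and challenged visitors cannot get past the challenge page.
+- `TURNSTILE_SITEKEY`: The Cloudflare Turnstile site key used to render the challenge widget. Required when `FEATURE_BOT_DETECTION` is enabled; if it is not set, the challenge page is still rendered but with an empty site key.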
#### Test Environment-only Variables
diff --git a/app/controllers/search_controller.rb b/app/controllers/search_controller.rb
index 1d1d3ba2..e0fb3e6a 100644
--- a/app/controllers/search_controller.rb
+++ b/app/controllers/search_controller.rb
@@ -2,6 +2,7 @@ class SearchController < ApplicationController
before_action :validate_q!, only: %i[results]
before_action :validate_format_token, only: %i[results]
before_action :set_active_tab, only: %i[results]
+ before_action :challenge_bots!, only: %i[results]
around_action :sleep_if_too_fast, only: %i[results]
before_action :validate_geobox_presence!, only: %i[results]
@@ -271,6 +272,18 @@ def validate_q!
redirect_to root_url
end
+ # If a request is flagged as a bot and violates crawling rules, redirect to
+ # a Cloudflare Turnstile challenge so a human can prove they're legitimate.
+ # Only applicable if the bot_detection feature is enabled.
+ def challenge_bots!
+ return if session[:passed_turnstile]
+ return unless Feature.enabled?(:bot_detection)
+
+ if BotDetector.should_challenge?(request, params)
+ redirect_to turnstile_path(return_to: request.fullpath)
+ end
+ end
+
def validate_geodistance_presence!
return unless Feature.enabled?(:geodata)
diff --git a/app/controllers/turnstile_controller.rb b/app/controllers/turnstile_controller.rb
new file mode 100644
index 00000000..d9011f22
--- /dev/null
+++ b/app/controllers/turnstile_controller.rb
@@ -0,0 +1,51 @@
+# Serves the Cloudflare Turnstile challenge page and verifies the token the
+# widget posts back. A successful verification marks the session with
+# `session[:passed_turnstile]` and sends the visitor on to `return_to`.
+class TurnstileController < ApplicationController
+  # Rails CSRF protection stays enabled here. The Turnstile challenge only adds an
+  # additional bot validation layer rather than replacing Rails' forgery defenses.
+
+  # Accepts only local absolute paths: a single leading slash, not followed by a
+  # second slash or a backslash (browsers treat `//host` and `/\host` as another
+  # origin), and no whitespace or control characters anywhere in the value.
+  LOCAL_PATH = %r{\A/(?![/\\])[^[:cntrl:][:space:]]*\z}
+
+  # Render a page with the Cloudflare Turnstile widget. Expects `TURNSTILE_SITEKEY`
+  # to be present in the environment. `return_to` is preserved (after being
+  # restricted to a local path) so we can redirect after success.
+  def new
+    @sitekey = ENV.fetch('TURNSTILE_SITEKEY', nil)
+    @return_to = safe_return_to(params[:return_to])
+  end
+
+  # Verify Turnstile token posted by the widget. Expects param `cf-turnstile-response`.
+  # On success the session is flagged and the visitor is redirected to `return_to`;
+  # on any failure they are sent back to the challenge page with a flash error.
+  def verify
+    token = params['cf-turnstile-response']
+    return_to = safe_return_to(params[:return_to])
+
+    if token.blank?
+      flash[:error] = 'Turnstile validation failed. Please try again.'
+      redirect_to turnstile_path(return_to: return_to)
+      return
+    end
+
+    verification = verify_turnstile_token(ENV.fetch('TURNSTILE_SECRET', nil), token)
+
+    # Guard against a JSON body that is not an object before reading `success`.
+    if verification.is_a?(Hash) && verification['success']
+      session[:passed_turnstile] = true
+      redirect_to return_to
+    else
+      flash[:error] = 'Turnstile verification failed. Please try again.'
+      redirect_to turnstile_path(return_to: return_to)
+    end
+  end
+
+  private
+
+  # `return_to` arrives from the query string / form, so it is untrusted. Returns
+  # the value unchanged when it is a local path, otherwise falls back to
+  # `root_path`. This prevents the post-challenge redirect from being used as an
+  # open redirect to another site.
+  def safe_return_to(candidate)
+    path = candidate.to_s
+    path.match?(LOCAL_PATH) ? path : root_path
+  end
+
+  # Posts the token to Cloudflare's siteverify endpoint.
+  #
+  # Returns the parsed JSON response, or nil when the secret is missing or the
+  # request / JSON parsing raises (the error is logged, not re-raised).
+  #
+  # NOTE(review): no explicit timeout is set on this outbound call — consider
+  # adding one once the tests no longer stub `HTTP.post` directly.
+  def verify_turnstile_token(secret, token)
+    return nil if secret.blank?
+
+    response = HTTP.post('https://challenges.cloudflare.com/turnstile/v0/siteverify',
+                         form: { secret: secret, response: token })
+    JSON.parse(response.to_s)
+  rescue StandardError => e
+    Rails.logger.error "Turnstile verification error: #{e.message}"
+    nil
+  end
+end
diff --git a/app/models/bot_detector.rb b/app/models/bot_detector.rb
new file mode 100644
index 00000000..049a5227
--- /dev/null
+++ b/app/models/bot_detector.rb
@@ -0,0 +1,26 @@
+class BotDetector
+ # Returns true if the request appears to be a bot according to crawler_detect.
+ def self.bot?(request)
+ ua = request.user_agent.to_s
+ detector = CrawlerDetect.new(ua)
+ detector.crawler?
+ rescue StandardError => e
+ Rails.logger.warn("BotDetector: crawler_detect failed for UA '#{ua}': #{e.message}")
+ false
+ end
+
+ # Returns true when the request appears to be performing crawling behavior that we
+ # want to challenge. For our initial approach, treat requests to the search results
+ # endpoint as subject to challenge if they're flagged as bots.
+ def self.should_challenge?(request, params = {})
+ return false unless bot?(request)
+
+ # Basic rule: crawling any results page triggers a challenge. We consider the
+ # SearchController `results` action (path `/results`) and other search-related
+ # paths to be search result pages. This keeps the rule simple and conservative.
+ path = request.path.to_s
+ return true if path.start_with?('/search') || path.start_with?('/results') || path.include?('/search')
+
+ false
+ end
+end
diff --git a/app/models/feature.rb b/app/models/feature.rb
index 8ee5c61c..391db847 100644
--- a/app/models/feature.rb
+++ b/app/models/feature.rb
@@ -33,7 +33,7 @@
#
class Feature
# List of all valid features in the application
- VALID_FEATURES = %i[geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all
+ VALID_FEATURES = %i[bot_detection geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all
tab_timdex_alma record_link timdex_fulltext].freeze
# Check if a feature is enabled by name
diff --git a/app/views/turnstile/new.html.erb b/app/views/turnstile/new.html.erb
new file mode 100644
index 00000000..31f649f0
--- /dev/null
+++ b/app/views/turnstile/new.html.erb
@@ -0,0 +1,12 @@
+<%# Simple Turnstile challenge page. Requires TURNSTILE_SITEKEY env var. %>
+
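+<h1>Prove you're not a bot</h1>
+<p>To continue, please complete the challenge below.</p>
+
+<script src="https://challenges.cloudflare.com/turnstile/v0/api.js" async defer></script>
+
+<%= form_with url: turnstile_verify_path, method: :post, local: true do |form| %>
+ <%= form.hidden_field :return_to, value: @return_to %>
+
+ <div class="cf-turnstile" data-sitekey="<%= @sitekey %>"></div>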
+ <%= form.submit 'Submit', class: 'btn button-primary' %>
+<% end %>
diff --git a/config/routes.rb b/config/routes.rb
index 440ea41e..f8e49544 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -12,6 +12,8 @@
as: 'record',
:constraints => { :id => /[0-z\.\-\_~\(\)]+/ }
get 'results', to: 'search#results'
+ get 'turnstile', to: 'turnstile#new', as: 'turnstile'
+ post 'turnstile/verify', to: 'turnstile#verify', as: 'turnstile_verify'
get 'style-guide', to: 'static#style_guide'
get 'boolpref', to: 'static#boolpref'
diff --git a/test/controllers/search_controller_test.rb b/test/controllers/search_controller_test.rb
index 865ce0ba..59dc197a 100644
--- a/test/controllers/search_controller_test.rb
+++ b/test/controllers/search_controller_test.rb
@@ -1165,4 +1165,41 @@ def source_filter_count(controller)
get '/results?q=test&format=foo'
assert_response :not_acceptable
end
+
+ # Bot detection tests
+ test 'bots are redirected to Turnstile challenge' do
+ ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do
+ bot_ua = 'Mozilla/5.0 (compatible; Googlebot/2.1)'
+
+ # Mock BotDetector to identify this as a bot
+ BotDetector.stubs(:should_challenge?).returns(true)
+
+ get '/results?q=test', headers: { 'HTTP_USER_AGENT' => bot_ua }
+
+ assert_redirected_to turnstile_path(return_to: '/results?q=test')
+ end
+ end
+
+ test 'human users bypass Turnstile challenge' do
+ ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do
+ human_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36'
+ mock_primo_search_success
+
+ # Ensure BotDetector doesn't flag this as bot
+ BotDetector.stubs(:should_challenge?).returns(false)
+
+ get '/results?q=test&tab=primo', headers: { 'HTTP_USER_AGENT' => human_ua }
+
+ assert_response :success
+ end
+ end
+
+ test 'bots on non-search paths are not challenged' do
+ bot_ua = 'Googlebot/2.1'
+
+ get '/', headers: { 'HTTP_USER_AGENT' => bot_ua }
+
+ # Should not be redirected to Turnstile (doesn't hit SearchController)
+ assert_response :success
+ end
end
diff --git a/test/controllers/turnstile_controller_test.rb b/test/controllers/turnstile_controller_test.rb
new file mode 100644
index 00000000..d79a2820
--- /dev/null
+++ b/test/controllers/turnstile_controller_test.rb
@@ -0,0 +1,127 @@
+require 'test_helper'
+
+# Integration tests for the Turnstile challenge page and the token verification
+# endpoint. Calls to Cloudflare are stubbed at `HTTP.post`.
+class TurnstileControllerTest < ActionDispatch::IntegrationTest
+  TOKEN_FIELD = 'cf-turnstile-response'.freeze
+  VERIFICATION_FAILED = 'Turnstile verification failed. Please try again.'.freeze
+
+  test 'GET /turnstile renders challenge page' do
+    ClimateControl.modify(TURNSTILE_SITEKEY: 'test-sitekey', TURNSTILE_SECRET: 'test-secret') do
+      get turnstile_path
+
+      assert_response :success
+      assert_select '.cf-turnstile'
+      assert_select 'h1', text: "Prove you're not a bot"
+    end
+  end
+
+  test 'GET /turnstile passes sitekey to view' do
+    with_sitekey('another-test-sitekey') do
+      get turnstile_path
+
+      assert_response :success
+      assert_select 'div.cf-turnstile[data-sitekey="another-test-sitekey"]'
+    end
+  end
+
+  test 'GET /turnstile uses empty sitekey if env var missing' do
+    with_sitekey(nil) do
+      get turnstile_path
+
+      assert_response :success
+      assert_select '.cf-turnstile'
+    end
+  end
+
+  test 'GET /turnstile preserves return_to parameter' do
+    destination = '/results?q=test'
+
+    with_sitekey('test-sitekey') do
+      get turnstile_path(return_to: destination)
+
+      assert_response :success
+      assert_select "input[name='return_to'][value='#{destination}']"
+    end
+  end
+
+  test 'GET /turnstile defaults return_to to root_path' do
+    with_sitekey('test-sitekey') do
+      get turnstile_path
+
+      assert_response :success
+      assert_select "input[name='return_to'][value='/']"
+    end
+  end
+
+  test 'POST /turnstile/verify with missing token redirects with error' do
+    post_verify(return_to: '/results')
+
+    assert_redirected_to turnstile_path(return_to: '/results')
+    assert_equal 'Turnstile validation failed. Please try again.', flash[:error]
+  end
+
+  test 'POST /turnstile/verify with valid token sets session and redirects' do
+    stub_siteverify('success' => true, 'challenge_ts' => '2024-02-24T10:00:00Z')
+
+    post_verify(TOKEN_FIELD => 'success_token', return_to: '/results?q=test')
+
+    assert_redirected_to '/results?q=test'
+    assert session[:passed_turnstile]
+  end
+
+  test 'POST /turnstile/verify with invalid token redirects with error' do
+    stub_siteverify('success' => false, 'error-codes' => ['invalid-input-response'])
+
+    post_verify(TOKEN_FIELD => 'invalid_token', return_to: '/results')
+
+    assert_redirected_to turnstile_path(return_to: '/results')
+    assert_equal VERIFICATION_FAILED, flash[:error]
+    assert_nil session[:passed_turnstile]
+  end
+
+  test 'POST /turnstile/verify with missing secret returns error' do
+    ClimateControl.modify(TURNSTILE_SECRET: nil) do
+      post_verify(TOKEN_FIELD => 'token', return_to: '/results')
+
+      assert_redirected_to turnstile_path(return_to: '/results')
+      assert_equal VERIFICATION_FAILED, flash[:error]
+    end
+  end
+
+  test 'POST /turnstile/verify defaults return_to to root_path' do
+    stub_siteverify('success' => true, 'challenge_ts' => '2024-02-24T10:00:00Z')
+
+    post_verify(TOKEN_FIELD => 'success_token')
+
+    assert_redirected_to root_path
+    assert session[:passed_turnstile]
+  end
+
+  test 'POST /turnstile/verify handles verification API errors gracefully' do
+    # Simulate the outbound call to Cloudflare blowing up.
+    HTTP.stubs(:post).raises(StandardError.new('Connection timeout'))
+
+    post_verify(TOKEN_FIELD => 'token', return_to: '/results')
+
+    assert_redirected_to turnstile_path(return_to: '/results')
+    assert_equal VERIFICATION_FAILED, flash[:error]
+  end
+
+  private
+
+  # Runs the block with TURNSTILE_SITEKEY temporarily set to `value` (nil unsets it).
+  def with_sitekey(value, &block)
+    ClimateControl.modify(TURNSTILE_SITEKEY: value, &block)
+  end
+
+  # Makes `HTTP.post` return an object whose `to_s` is `payload` serialized as JSON.
+  def stub_siteverify(payload)
+    HTTP.stubs(:post).returns(mock(to_s: payload.to_json))
+  end
+
+  # Submits the verification form with the given fields.
+  def post_verify(fields)
+    post turnstile_verify_path, params: fields
+  end
+end
diff --git a/test/models/bot_detector_test.rb b/test/models/bot_detector_test.rb
new file mode 100644
index 00000000..fe7c0a72
--- /dev/null
+++ b/test/models/bot_detector_test.rb
@@ -0,0 +1,92 @@
+require 'test_helper'
+require 'ostruct'
+
+class BotDetectorTest < ActiveSupport::TestCase
+ test 'bot? detects bots when crawler_detect returns true' do
+ request = mock(user_agent: 'Googlebot/2.1')
+
+ # Mock CrawlerDetect to return a detector that reports a bot
+ mock_detector = mock(crawler?: true)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ assert BotDetector.bot?(request)
+ end
+
+ test 'bot? allows non-bots when crawler_detect returns false' do
+ request = mock(user_agent: 'Mozilla/5.0 (X11; Linux x86_64)')
+
+ mock_detector = mock(crawler?: false)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ refute BotDetector.bot?(request)
+ end
+
+ test 'bot? handles nil user agent gracefully' do
+ request = mock(user_agent: nil)
+
+ mock_detector = mock(crawler?: false)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ refute BotDetector.bot?(request)
+ end
+
+ test 'bot? logs and returns false on detector failure' do
+ request = mock(user_agent: 'Test UA')
+
+ # Mock crawler_detect to raise an error
+ CrawlerDetect.stubs(:new).raises(StandardError.new('Detector failure'))
+
+ Rails.logger.expects(:warn).with(includes('BotDetector: crawler_detect failed'))
+
+ refute BotDetector.bot?(request)
+ end
+
+ test 'should_challenge? returns false for non-bots' do
+ request = OpenStruct.new(user_agent: 'Mozilla/5.0 (X11; Linux)', path: '/search')
+
+ mock_detector = mock(crawler?: false)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ refute BotDetector.should_challenge?(request)
+ end
+
+ test 'should_challenge? returns false for bots not on search paths' do
+ bot_ua = 'Googlebot/2.1'
+ request = OpenStruct.new(user_agent: bot_ua, path: '/static/style-guide')
+
+ mock_detector = mock(crawler?: true)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ refute BotDetector.should_challenge?(request)
+ end
+
+ test 'should_challenge? returns true for bots on /search paths' do
+ bot_ua = 'Googlebot/2.1'
+ request = OpenStruct.new(user_agent: bot_ua, path: '/search')
+
+ mock_detector = mock(crawler?: true)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ assert BotDetector.should_challenge?(request)
+ end
+
+ test 'should_challenge? returns true for bots on results endpoint' do
+ bot_ua = 'Mozilla/5.0 (compatible; bingbot/2.0)'
+ request = OpenStruct.new(user_agent: bot_ua, path: '/results?q=test')
+
+ mock_detector = mock(crawler?: true)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ assert BotDetector.should_challenge?(request)
+ end
+
+ test 'should_challenge? handles nil path gracefully' do
+ bot_ua = 'Googlebot/2.1'
+ request = OpenStruct.new(user_agent: bot_ua, path: nil)
+
+ mock_detector = mock(crawler?: true)
+ CrawlerDetect.stubs(:new).returns(mock_detector)
+
+ refute BotDetector.should_challenge?(request)
+ end
+end
diff --git a/test/vcr_cassettes/turnstile_verify_api_error.yml b/test/vcr_cassettes/turnstile_verify_api_error.yml
new file mode 100644
index 00000000..f5373497
--- /dev/null
+++ b/test/vcr_cassettes/turnstile_verify_api_error.yml
@@ -0,0 +1,22 @@
+---
+http_interactions:
+- request:
+ body:
+ string: 'secret=test_secret&response=token'
+ encoding: UTF-8
+ headers:
+ User-Agent:
+ - http/5.0
+ Accept-Encoding:
+ - gzip;q=1.0,deflate;q=0.6,*;q=0.3
+ response:
+ status:
+ code: 500
+ message: Internal Server Error
+ headers:
+ Content-Type:
+ - application/json
+ body:
+ string: '{"success":false,"error-codes":["internal-error"]}'
+ recorded_at: 2024-02-24 10:00:00 GMT
+version: 1
diff --git a/test/vcr_cassettes/turnstile_verify_failure.yml b/test/vcr_cassettes/turnstile_verify_failure.yml
new file mode 100644
index 00000000..1cb2e977
--- /dev/null
+++ b/test/vcr_cassettes/turnstile_verify_failure.yml
@@ -0,0 +1,22 @@
+---
+http_interactions:
+- request:
+ body:
+ string: 'secret=test_secret&response=invalid_token'
+ encoding: UTF-8
+ headers:
+ User-Agent:
+ - http/5.0
+ Accept-Encoding:
+ - gzip;q=1.0,deflate;q=0.6,*;q=0.3
+ response:
+ status:
+ code: 200
+ message: OK
+ headers:
+ Content-Type:
+ - application/json
+ body:
+ string: '{"success":false,"error-codes":["invalid-input-response"]}'
+ recorded_at: 2024-02-24 10:00:00 GMT
+version: 1
diff --git a/test/vcr_cassettes/turnstile_verify_success.yml b/test/vcr_cassettes/turnstile_verify_success.yml
new file mode 100644
index 00000000..646784b2
--- /dev/null
+++ b/test/vcr_cassettes/turnstile_verify_success.yml
@@ -0,0 +1,22 @@
+---
+http_interactions:
+- request:
+ body:
+ string: 'secret=test_secret&response=success_token'
+ encoding: UTF-8
+ headers:
+ User-Agent:
+ - http/5.0
+ Accept-Encoding:
+ - gzip;q=1.0,deflate;q=0.6,*;q=0.3
+ response:
+ status:
+ code: 200
+ message: OK
+ headers:
+ Content-Type:
+ - application/json
+ body:
+ string: '{"success":true,"challenge_ts":"2024-02-24T10:00:00Z","hostname":"localhost"}'
+ recorded_at: 2024-02-24 10:00:00 GMT
+version: 1