diff --git a/.env.test b/.env.test index 8e4a15ce..e39230dd 100644 --- a/.env.test +++ b/.env.test @@ -1,4 +1,6 @@ ALMA_OPENURL=https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl? +TURNSTILE_SITEKEY=test-sitekey +TURNSTILE_SECRET=test-secret FEATURE_TIMDEX_FULLTEXT=true FEATURE_GEODATA=false MIT_PRIMO_URL=https://mit.primo.exlibrisgroup.com diff --git a/Gemfile b/Gemfile index bb840a7b..7f80e13c 100644 --- a/Gemfile +++ b/Gemfile @@ -14,6 +14,7 @@ gem 'openssl' gem 'puma' gem 'rack-attack' gem 'rack-timeout' +gem 'crawler_detect' gem 'rails', '~> 7.2.0' gem 'redis' gem 'scout_apm' diff --git a/Gemfile.lock b/Gemfile.lock index 0b455b06..46f26fa1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -117,6 +117,8 @@ GEM bigdecimal rexml crass (1.0.6) + crawler_detect (1.2.9) + qonfig (>= 0.24) date (3.4.1) debug (1.11.0) irb (~> 1.10) @@ -235,6 +237,8 @@ GEM public_suffix (6.0.2) puma (7.0.4) nio4r (~> 2.0) + qonfig (0.30.0) + base64 (>= 0.2) racc (1.8.1) rack (3.1.18) rack-attack (6.7.0) @@ -413,6 +417,7 @@ DEPENDENCIES bootsnap capybara climate_control + crawler_detect debug dotenv-rails graphql diff --git a/README.md b/README.md index 1e9ff60c..40ffd8e9 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ See `Optional Environment Variables` for more information. - `BOOLEAN_OPTIONS`: comma separated list of values to present to testers on instances where `BOOLEAN_PICKER` feature is enabled. - `FEATURE_BOOLEAN_PICKER`: feature to allow users to select their preferred boolean type. If set to `true`, feature is enabled. This feature is only intended for internal team testing and should never be enabled in production (mostly because the UI is a mess more than it would cause harm). +- `FEATURE_BOT_DETECTION`: When set to `true`, enables bot detection using crawler_detect and Cloudflare Turnstile challenges for suspected bots on search result pages. Requires `TURNSTILE_SITEKEY` and `TURNSTILE_SECRET` to be set. If disabled, bots may crawl search results freely. - `FEATURE_GEODATA`: Enables features related to geospatial data discovery. Setting this variable to `true` will trigger geodata mode. Note that this is currently intended _only_ for the geodata app and may have unexpected consequences if applied to other TIMDEX UI apps. @@ -146,6 +147,8 @@ instance is sending what search traffic. Defaults to "unset" if not defined. - `TIMDEX_INDEX`: Name of the index, or alias, to provide to the GraphQL endpoint. Defaults to `nil` which will let TIMDEX determine the best index to use. Wildcard values can be set, for example `rdi*` would search any indexes that begin with `rdi` in the underlying OpenSearch instance behind TIMDEX. - `TIMDEX_SOURCES`: Comma-separated list of sources to display in the advanced-search source selection element. This overrides the default which is set in ApplicationHelper. +- `TURNSTILE_SECRET`: The Cloudflare Turnstile secret key used to verify challenge responses. If not set, bot challenge protection is disabled. +- `TURNSTILE_SITEKEY`: The Cloudflare Turnstile site key used to render the challenge widget. If not set, bot challenge protection is disabled. #### Test Environment-only Variables diff --git a/app/controllers/search_controller.rb b/app/controllers/search_controller.rb index 1d1d3ba2..e0fb3e6a 100644 --- a/app/controllers/search_controller.rb +++ b/app/controllers/search_controller.rb @@ -2,6 +2,7 @@ class SearchController < ApplicationController before_action :validate_q!, only: %i[results] before_action :validate_format_token, only: %i[results] before_action :set_active_tab, only: %i[results] + before_action :challenge_bots!, only: %i[results] around_action :sleep_if_too_fast, only: %i[results] before_action :validate_geobox_presence!, only: %i[results] @@ -271,6 +272,18 @@ def validate_q! redirect_to root_url end + # If a request is flagged as a bot and violates crawling rules, redirect to + # a Cloudflare Turnstile challenge so a human can prove they're legitimate. + # Only applicable if the bot_detection feature is enabled. + def challenge_bots! + return if session[:passed_turnstile] + return unless Feature.enabled?(:bot_detection) + + if BotDetector.should_challenge?(request, params) + redirect_to turnstile_path(return_to: request.fullpath) + end + end + def validate_geodistance_presence! return unless Feature.enabled?(:geodata) diff --git a/app/controllers/turnstile_controller.rb b/app/controllers/turnstile_controller.rb new file mode 100644 index 00000000..d9011f22 --- /dev/null +++ b/app/controllers/turnstile_controller.rb @@ -0,0 +1,51 @@ +class TurnstileController < ApplicationController + # Rails CSRF protection stays enabled here. The Turnstile challenge only adds an + # additional bot validation layer rather than replacing Rails' forgery defenses. + + # Render a page with the Cloudflare Turnstile widget. Expects `TURNSTILE_SITEKEY` + # to be present in the environment. `return_to` is preserved so we can redirect after success. + def new + @sitekey = ENV.fetch('TURNSTILE_SITEKEY', nil) + @return_to = params[:return_to] || root_path + end + + # Verify Turnstile token posted by the widget. Expects param `cf-turnstile-response`. + def verify + token = params['cf-turnstile-response'] + return_to = params[:return_to].presence || root_path + + if token.blank? + flash[:error] = 'Turnstile validation failed. Please try again.' + redirect_to turnstile_path(return_to: return_to) + return + end + + secret = ENV.fetch('TURNSTILE_SECRET', nil) + verification = verify_turnstile_token(secret, token) + + if verification && verification['success'] + session[:passed_turnstile] = true + redirect_to return_to + else + flash[:error] = 'Turnstile verification failed. Please try again.' + redirect_to turnstile_path(return_to: return_to) + end + end + + private + + def verify_turnstile_token(secret, token) + return nil if secret.blank? + + begin + response = HTTP.post('https://challenges.cloudflare.com/turnstile/v0/siteverify', form: { + secret: secret, + response: token, + }) + JSON.parse(response.to_s) + rescue StandardError => e + Rails.logger.error "Turnstile verification error: #{e.message}" + nil + end + end +end diff --git a/app/models/bot_detector.rb b/app/models/bot_detector.rb new file mode 100644 index 00000000..049a5227 --- /dev/null +++ b/app/models/bot_detector.rb @@ -0,0 +1,26 @@ +class BotDetector + # Returns true if the request appears to be a bot according to crawler_detect. + def self.bot?(request) + ua = request.user_agent.to_s + detector = CrawlerDetect.new(ua) + detector.crawler? + rescue StandardError => e + Rails.logger.warn("BotDetector: crawler_detect failed for UA '#{ua}': #{e.message}") + false + end + + # Returns true when the request appears to be performing crawling behavior that we + # want to challenge. For our initial approach, treat requests to the search results + # endpoint as subject to challenge if they're flagged as bots. + def self.should_challenge?(request, params = {}) + return false unless bot?(request) + + # Basic rule: crawling any results page triggers a challenge. We consider the + # SearchController `results` action (path `/results`) and other search-related + # paths to be search result pages. This keeps the rule simple and conservative. + path = request.path.to_s + return true if path.start_with?('/search') || path.start_with?('/results') || path.include?('/search') + + false + end +end diff --git a/app/models/feature.rb b/app/models/feature.rb index 8ee5c61c..391db847 100644 --- a/app/models/feature.rb +++ b/app/models/feature.rb @@ -33,7 +33,7 @@ # class Feature # List of all valid features in the application - VALID_FEATURES = %i[geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all + VALID_FEATURES = %i[bot_detection geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all tab_timdex_alma record_link timdex_fulltext].freeze # Check if a feature is enabled by name diff --git a/app/views/turnstile/new.html.erb b/app/views/turnstile/new.html.erb new file mode 100644 index 00000000..31f649f0 --- /dev/null +++ b/app/views/turnstile/new.html.erb @@ -0,0 +1,12 @@ +<%# Simple Turnstile challenge page. Requires TURNSTILE_SITEKEY env var. %> +

Prove you're not a bot

+

To continue, please complete the challenge below.

+ + + +<%= form_with url: turnstile_verify_path, method: :post, local: true do |form| %> + <%= form.hidden_field :return_to, value: @return_to %> +
+
+ <%= form.submit 'Submit', class: 'btn button-primary' %> +<% end %> diff --git a/config/routes.rb b/config/routes.rb index 440ea41e..f8e49544 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -12,6 +12,8 @@ as: 'record', :constraints => { :id => /[0-z\.\-\_~\(\)]+/ } get 'results', to: 'search#results' + get 'turnstile', to: 'turnstile#new', as: 'turnstile' + post 'turnstile/verify', to: 'turnstile#verify', as: 'turnstile_verify' get 'style-guide', to: 'static#style_guide' get 'boolpref', to: 'static#boolpref' diff --git a/test/controllers/search_controller_test.rb b/test/controllers/search_controller_test.rb index 865ce0ba..59dc197a 100644 --- a/test/controllers/search_controller_test.rb +++ b/test/controllers/search_controller_test.rb @@ -1165,4 +1165,41 @@ def source_filter_count(controller) get '/results?q=test&format=foo' assert_response :not_acceptable end + + # Bot detection tests + test 'bots are redirected to Turnstile challenge' do + ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do + bot_ua = 'Mozilla/5.0 (compatible; Googlebot/2.1)' + + # Mock BotDetector to identify this as a bot + BotDetector.stubs(:should_challenge?).returns(true) + + get '/results?q=test', headers: { 'HTTP_USER_AGENT' => bot_ua } + + assert_redirected_to turnstile_path(return_to: '/results?q=test') + end + end + + test 'human users bypass Turnstile challenge' do + ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do + human_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36' + mock_primo_search_success + + # Ensure BotDetector doesn't flag this as bot + BotDetector.stubs(:should_challenge?).returns(false) + + get '/results?q=test&tab=primo', headers: { 'HTTP_USER_AGENT' => human_ua } + + assert_response :success + end + end + + test 'bots on non-search paths are not challenged' do + bot_ua = 'Googlebot/2.1' + + get '/', headers: { 'HTTP_USER_AGENT' => bot_ua } + + # Should not be redirected to Turnstile (doesn't hit SearchController) + assert_response :success + end end diff --git a/test/controllers/turnstile_controller_test.rb b/test/controllers/turnstile_controller_test.rb new file mode 100644 index 00000000..d79a2820 --- /dev/null +++ b/test/controllers/turnstile_controller_test.rb @@ -0,0 +1,127 @@ +require 'test_helper' + +class TurnstileControllerTest < ActionDispatch::IntegrationTest + test 'GET /turnstile renders challenge page' do + ClimateControl.modify(TURNSTILE_SITEKEY: 'test-sitekey', TURNSTILE_SECRET: 'test-secret') do + get turnstile_path + + assert_response :success + assert_select '.cf-turnstile' + assert_select 'h1', text: "Prove you're not a bot" + end + end + + test 'GET /turnstile passes sitekey to view' do + ClimateControl.modify(TURNSTILE_SITEKEY: 'another-test-sitekey') do + get turnstile_path + + assert_response :success + assert_select 'div.cf-turnstile[data-sitekey="another-test-sitekey"]' + end + end + + test 'GET /turnstile uses empty sitekey if env var missing' do + ClimateControl.modify(TURNSTILE_SITEKEY: nil) do + get turnstile_path + + assert_response :success + assert_select '.cf-turnstile' + end + end + + test 'GET /turnstile preserves return_to parameter' do + ClimateControl.modify(TURNSTILE_SITEKEY: 'test-sitekey') do + return_to = '/results?q=test' + get turnstile_path(return_to: return_to) + + assert_response :success + assert_select "input[name='return_to'][value='#{return_to}']" + end + end + + test 'GET /turnstile defaults return_to to root_path' do + ClimateControl.modify(TURNSTILE_SITEKEY: 'test-sitekey') do + get turnstile_path + + assert_response :success + assert_select "input[name='return_to'][value='/']" + end + end + + test 'POST /turnstile/verify with missing token redirects with error' do + post turnstile_verify_path, params: { return_to: '/results' } + + assert_redirected_to turnstile_path(return_to: '/results') + assert_equal 'Turnstile validation failed. Please try again.', flash[:error] + end + + test 'POST /turnstile/verify with valid token sets session and redirects' do + # Stub the HTTP post to Cloudflare to return success + stub_response = { 'success' => true, 'challenge_ts' => '2024-02-24T10:00:00Z' } + response_mock = mock(to_s: stub_response.to_json) + HTTP.stubs(:post).returns(response_mock) + + post turnstile_verify_path, params: { + 'cf-turnstile-response' => 'success_token', + return_to: '/results?q=test' + } + + assert_redirected_to '/results?q=test' + assert session[:passed_turnstile] + end + + test 'POST /turnstile/verify with invalid token redirects with error' do + # Stub the HTTP post to Cloudflare to return failure + stub_response = { 'success' => false, 'error-codes' => ['invalid-input-response'] } + response_mock = mock(to_s: stub_response.to_json) + HTTP.stubs(:post).returns(response_mock) + + post turnstile_verify_path, params: { + 'cf-turnstile-response' => 'invalid_token', + return_to: '/results' + } + + assert_redirected_to turnstile_path(return_to: '/results') + assert_equal 'Turnstile verification failed. Please try again.', flash[:error] + assert_nil session[:passed_turnstile] + end + + test 'POST /turnstile/verify with missing secret returns error' do + ClimateControl.modify(TURNSTILE_SECRET: nil) do + post turnstile_verify_path, params: { + 'cf-turnstile-response' => 'token', + return_to: '/results' + } + + assert_redirected_to turnstile_path(return_to: '/results') + assert_equal 'Turnstile verification failed. Please try again.', flash[:error] + end + end + + test 'POST /turnstile/verify defaults return_to to root_path' do + stub_response = { 'success' => true, 'challenge_ts' => '2024-02-24T10:00:00Z' } + response_mock = mock(to_s: stub_response.to_json) + HTTP.stubs(:post).returns(response_mock) + + post turnstile_verify_path, params: { + 'cf-turnstile-response' => 'success_token' + } + + assert_redirected_to root_path + assert session[:passed_turnstile] + end + + test 'POST /turnstile/verify handles verification API errors gracefully' do + # Mock the HTTP call to raise an error + HTTP.stubs(:post).raises(StandardError.new('Connection timeout')) + + post turnstile_verify_path, params: { + 'cf-turnstile-response' => 'token', + return_to: '/results' + } + + assert_redirected_to turnstile_path(return_to: '/results') + assert_equal 'Turnstile verification failed. Please try again.', flash[:error] + end + +end diff --git a/test/models/bot_detector_test.rb b/test/models/bot_detector_test.rb new file mode 100644 index 00000000..fe7c0a72 --- /dev/null +++ b/test/models/bot_detector_test.rb @@ -0,0 +1,92 @@ +require 'test_helper' +require 'ostruct' + +class BotDetectorTest < ActiveSupport::TestCase + test 'bot? detects bots when crawler_detect returns true' do + request = mock(user_agent: 'Googlebot/2.1') + + # Mock CrawlerDetect to return a detector that reports a bot + mock_detector = mock(crawler?: true) + CrawlerDetect.stubs(:new).returns(mock_detector) + + assert BotDetector.bot?(request) + end + + test 'bot? allows non-bots when crawler_detect returns false' do + request = mock(user_agent: 'Mozilla/5.0 (X11; Linux x86_64)') + + mock_detector = mock(crawler?: false) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.bot?(request) + end + + test 'bot? handles nil user agent gracefully' do + request = mock(user_agent: nil) + + mock_detector = mock(crawler?: false) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.bot?(request) + end + + test 'bot? logs and returns false on detector failure' do + request = mock(user_agent: 'Test UA') + + # Mock crawler_detect to raise an error + CrawlerDetect.stubs(:new).raises(StandardError.new('Detector failure')) + + Rails.logger.expects(:warn).with(includes('BotDetector: crawler_detect failed')) + + refute BotDetector.bot?(request) + end + + test 'should_challenge? returns false for non-bots' do + request = OpenStruct.new(user_agent: 'Mozilla/5.0 (X11; Linux)', path: '/search') + + mock_detector = mock(crawler?: false) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.should_challenge?(request) + end + + test 'should_challenge? returns false for bots not on search paths' do + bot_ua = 'Googlebot/2.1' + request = OpenStruct.new(user_agent: bot_ua, path: '/static/style-guide') + + mock_detector = mock(crawler?: true) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.should_challenge?(request) + end + + test 'should_challenge? returns true for bots on /search paths' do + bot_ua = 'Googlebot/2.1' + request = OpenStruct.new(user_agent: bot_ua, path: '/search') + + mock_detector = mock(crawler?: true) + CrawlerDetect.stubs(:new).returns(mock_detector) + + assert BotDetector.should_challenge?(request) + end + + test 'should_challenge? returns true for bots on results endpoint' do + bot_ua = 'Mozilla/5.0 (compatible; bingbot/2.0)' + request = OpenStruct.new(user_agent: bot_ua, path: '/results?q=test') + + mock_detector = mock(crawler?: true) + CrawlerDetect.stubs(:new).returns(mock_detector) + + assert BotDetector.should_challenge?(request) + end + + test 'should_challenge? handles nil path gracefully' do + bot_ua = 'Googlebot/2.1' + request = OpenStruct.new(user_agent: bot_ua, path: nil) + + mock_detector = mock(crawler?: true) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.should_challenge?(request) + end +end diff --git a/test/vcr_cassettes/turnstile_verify_api_error.yml b/test/vcr_cassettes/turnstile_verify_api_error.yml new file mode 100644 index 00000000..f5373497 --- /dev/null +++ b/test/vcr_cassettes/turnstile_verify_api_error.yml @@ -0,0 +1,22 @@ +--- +http_interactions: +- request: + body: + string: 'secret=test_secret&response=token' + encoding: UTF-8 + headers: + User-Agent: + - http/5.0 + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,*;q=0.3 + response: + status: + code: 500 + message: Internal Server Error + headers: + Content-Type: + - application/json + body: + string: '{"success":false,"error-codes":["internal-error"]}' + recorded_at: 2024-02-24 10:00:00 GMT +version: 1 diff --git a/test/vcr_cassettes/turnstile_verify_failure.yml b/test/vcr_cassettes/turnstile_verify_failure.yml new file mode 100644 index 00000000..1cb2e977 --- /dev/null +++ b/test/vcr_cassettes/turnstile_verify_failure.yml @@ -0,0 +1,22 @@ +--- +http_interactions: +- request: + body: + string: 'secret=test_secret&response=invalid_token' + encoding: UTF-8 + headers: + User-Agent: + - http/5.0 + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,*;q=0.3 + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json + body: + string: '{"success":false,"error-codes":["invalid-input-response"]}' + recorded_at: 2024-02-24 10:00:00 GMT +version: 1 diff --git a/test/vcr_cassettes/turnstile_verify_success.yml b/test/vcr_cassettes/turnstile_verify_success.yml new file mode 100644 index 00000000..646784b2 --- /dev/null +++ b/test/vcr_cassettes/turnstile_verify_success.yml @@ -0,0 +1,22 @@ +--- +http_interactions: +- request: + body: + string: 'secret=test_secret&response=success_token' + encoding: UTF-8 + headers: + User-Agent: + - http/5.0 + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,*;q=0.3 + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json + body: + string: '{"success":true,"challenge_ts":"2024-02-24T10:00:00Z","hostname":"localhost"}' + recorded_at: 2024-02-24 10:00:00 GMT +version: 1