Skip to content

Commit

Permalink
Merge pull request #243 from DataDog/anmarchenko/http_client_retries
Browse files Browse the repository at this point in the history
[SDTEST-292] Retry HTTP requests on 429 and 5xx responses
  • Loading branch information
anmarchenko authored Sep 26, 2024
2 parents 570d019 + 82207ac commit 7d381d5
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 23 deletions.
1 change: 1 addition & 0 deletions lib/datadog/ci/ext/transport.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module Transport
HEADER_CONTENT_ENCODING = "Content-Encoding"
HEADER_EVP_SUBDOMAIN = "X-Datadog-EVP-Subdomain"
HEADER_CONTAINER_ID = "Datadog-Container-ID"
HEADER_RATELIMIT_RESET = "X-RateLimit-Reset"

EVP_PROXY_V2_PATH_PREFIX = "/evp_proxy/v2/"
EVP_PROXY_V4_PATH_PREFIX = "/evp_proxy/v4/"
Expand Down
46 changes: 37 additions & 9 deletions lib/datadog/ci/transport/http.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class HTTP
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3
INITIAL_BACKOFF = 1
MAX_BACKOFF = 30

def initialize(host:, port:, timeout: DEFAULT_TIMEOUT, ssl: true, compress: false)
@host = host
Expand Down Expand Up @@ -78,21 +79,48 @@ def request(
private

def perform_http_call(path:, payload:, headers:, verb:, retries: MAX_RETRIES, backoff: INITIAL_BACKOFF)
adapter.call(
path: path, payload: payload, headers: headers, verb: verb
)
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, SocketError, Net::HTTPBadResponse => e
Datadog.logger.debug("Failed to send request with #{e} (#{e.message})")
response = nil

begin
response = adapter.call(
path: path, payload: payload, headers: headers, verb: verb
)
return response if response.ok?

if response.code == 429
backoff = (response.header(Ext::Transport::HEADER_RATELIMIT_RESET) || 1).to_i

Datadog.logger.debug do
"Received rate limit response, retrying in #{backoff} seconds from X-RateLimit-Reset header"
end
elsif response.server_error?
Datadog.logger.debug { "Received server error response, retrying in #{backoff} seconds" }
else
return response
end
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, SocketError, Net::HTTPBadResponse => e
Datadog.logger.debug { "Failed to send request with #{e} (#{e.message})" }

if retries.positive?
response = ErrorResponse.new(e)
end

if retries.positive? && backoff <= MAX_BACKOFF
sleep(backoff)

perform_http_call(
path: path, payload: payload, headers: headers, verb: verb, retries: retries - 1, backoff: backoff * 2
path: path,
payload: payload,
headers: headers,
verb: verb,
retries: retries - 1,
backoff: backoff * 2
)
else
Datadog.logger.error("Failed to send request after #{MAX_RETRIES} retries")
ErrorResponse.new(e)
Datadog.logger.error(
"Failed to send request after #{MAX_RETRIES - retries} retries (current backoff value #{backoff})"
)

response
end
end

Expand Down
2 changes: 2 additions & 0 deletions sig/datadog/ci/ext/transport.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ module Datadog

HEADER_CONTAINER_ID: "Datadog-Container-ID"

HEADER_RATELIMIT_RESET: "X-RateLimit-Reset"

EVP_PROXY_V2_PATH_PREFIX: "/evp_proxy/v2/"

EVP_PROXY_V4_PATH_PREFIX: "/evp_proxy/v4/"
Expand Down
1 change: 1 addition & 0 deletions sig/datadog/ci/transport/http.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ module Datadog
DEFAULT_TIMEOUT: 30
MAX_RETRIES: 3
INITIAL_BACKOFF: 1
MAX_BACKOFF: 30

def initialize: (host: String, port: Integer, ?ssl: bool, ?timeout: Integer, ?compress: bool) -> void

Expand Down
107 changes: 93 additions & 14 deletions spec/datadog/ci/transport/http_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@
context "when request fails" do
let(:request_options) { {backoff: 0} }

context "when server returns error status code" do
context "when server returns 400" do
let(:net_http_response) { double("Net::HTTP::Response", code: 400, body: "error", "[]": nil) }

before do
Expand All @@ -178,27 +178,106 @@
end
end

context "when succeeds after retries" do
before do
expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES).times
expect(adapter).to receive(:call).and_return(http_response)
context "when server returns 429" do
let(:no_backoff) { "0" }
let(:response_429_no_backoff) do
Datadog::CI::Transport::Adapters::Net::Response.new(
double("Net::HTTP::Response", code: 429, body: "error", "[]": no_backoff)
)
end

it "produces a response" do
is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response)
context "when backoff increases" do
let(:high_backoff) { "31" }
let(:response_429_high_backoff) do
Datadog::CI::Transport::Adapters::Net::Response.new(
double("Net::HTTP::Response", code: 429, body: "error", "[]": high_backoff)
)
end

before do
expect(adapter).to receive(:call).and_return(response_429_no_backoff).once
expect(adapter).to receive(:call).and_return(response_429_high_backoff).once
end

it "retries until backoff is over 30, afterwards returns failed response" do
is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response)

expect(response.code).to eq(429)
end
end

context "when the call eventually succeeds" do
before do
expect(adapter).to(
receive(:call).and_return(response_429_no_backoff).exactly(described_class::MAX_RETRIES).times
)
expect(adapter).to receive(:call).and_return(http_response).once
end

it "produces a response" do
is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response)

expect(response.code).to eq(200)
expect(response.code).to eq(200)
end
end
end

context "when retries are exhausted" do
before do
expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES + 1).times
context "when server returns 503" do
let(:response_503) do
Datadog::CI::Transport::Adapters::Net::Response.new(
double("Net::HTTP::Response", code: 503, body: "error", "[]": nil)
)
end

context "when succeeds after retries" do
before do
expect(adapter).to receive(:call).and_return(response_503).exactly(described_class::MAX_RETRIES).times
expect(adapter).to receive(:call).and_return(http_response)
end

it "produces a response" do
is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response)

expect(response.code).to eq(200)
end
end

context "when retries are exhausted" do
before do
expect(adapter).to receive(:call).and_return(response_503).exactly(described_class::MAX_RETRIES + 1).times
end

it "returns failed response" do
is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response)

expect(response.code).to eq(503)
end
end
end

context "when network connection fails" do
context "when succeeds after retries" do
before do
expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES).times
expect(adapter).to receive(:call).and_return(http_response)
end

it "produces a response" do
is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response)

expect(response.code).to eq(200)
end
end

context "when retries are exhausted" do
before do
expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES + 1).times
end

it "returns ErrorRsponse" do
expect(response.error).to be_kind_of(Errno::ECONNRESET)
expect(response.telemetry_error_type).to eq(Datadog::CI::Ext::Telemetry::ErrorType::NETWORK)
it "returns ErrorRsponse" do
expect(response.error).to be_kind_of(Errno::ECONNRESET)
expect(response.telemetry_error_type).to eq(Datadog::CI::Ext::Telemetry::ErrorType::NETWORK)
end
end
end
end
Expand Down

0 comments on commit 7d381d5

Please sign in to comment.