Skip to content

Commit c1b9cf9

Browse files
MrSerthDome-GER
authored andcommitted
Reserve runners for the whole duration of their use
This commit changes how runners are reserved and released. Previously, a runner was reserved only for the actual time it was actively used and the aim was to reserve it for a time as short as possible. The previous mechanism worked, but had some drawbacks: For example, a learner could start a submission scoring, which would perform (1) a file copy, (2) an execution of the first test file, (3) an execution of the second test file, etc. For each of these actions, the runner would be reserved and released. Now, it could happen that another run was triggered between the execution of both test files, causing issues with the remaining scoring. In the refactored version, a runner is reserved once before initially copying files and only released after the last command finished. This ensures that no one can interrupt an execution started once, hopefully making it more robust (and faster, since less locks are required).
1 parent e3471cb commit c1b9cf9

File tree

2 files changed

+50
-22
lines changed

2 files changed

+50
-22
lines changed

app/models/runner.rb

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ class Runner < ApplicationRecord
44
include ContributorCreation
55
belongs_to :execution_environment
66

7+
# Each reservation is extended by the specified duration.
8+
# This allows us to extend the reservation in case of a timeout and a subsequent runner swap.
9+
# By observation, we know that a runner swap usually takes 500ms on average, sometimes 700ms.
10+
# This time includes requesting a new runner and copying the submission files to the new runner.
11+
RESERVATION_BUFFER = 1.second
12+
713
before_validation :request_id
814
validates :runner_id, presence: true
915

@@ -43,10 +49,10 @@ def self.for(contributor, execution_environment)
4349
runner
4450
end
4551

46-
def copy_files(files)
47-
reserve!
52+
def copy_files(files, exclusive: true)
53+
reserve! if exclusive
4854
@strategy.copy_files(files)
49-
release!
55+
release! if exclusive
5056
rescue Runner::Error::RunnerInUse => e
5157
Rails.logger.debug { "Copying files failed because the runner was already in use: #{e.message}" }
5258
raise e
@@ -55,7 +61,7 @@ def copy_files(files)
5561
request_new_id
5662
save
5763
@strategy.copy_files(files)
58-
release!
64+
release! if exclusive
5965
end
6066

6167
def download_file(desired_file, privileged_execution:, exclusive: true, &)
@@ -192,12 +198,17 @@ def reserve!
192198
@error = Runner::Error::RunnerInUse.new("The desired Runner #{id} is already in use until #{reserved_until.iso8601}.")
193199
raise @error
194200
else
195-
update!(reserved_until: Time.zone.now + execution_environment.permitted_execution_time.seconds)
196-
@error = nil
201+
lock_runner_for_permitted_execution_time!
197202
end
198203
end
199204
end
200205

206+
def extend!
207+
with_lock do
208+
lock_runner_for_permitted_execution_time!
209+
end
210+
end
211+
201212
def release!
202213
return if @error.present?
203214

@@ -237,4 +248,13 @@ def request_new_id
237248
end
238249
end
239250
end
251+
252+
def lock_runner_for_permitted_execution_time!
253+
# We reserve the runner for the permitted execution time plus the specified buffer time to ensure that the runner is
254+
# not released before the execution is finished. This is especially important for the case that a command execution
255+
# timed out, but subsequent commands are scheduled (i.e., when calculating a score and swapping a runner).
256+
reservation = execution_environment.permitted_execution_time.seconds + RESERVATION_BUFFER
257+
update!(reserved_until: Time.zone.now + reservation)
258+
@error = nil
259+
end
240260
end

app/models/submission.rb

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def calculate_score(requesting_user)
154154
file_scores = assessments.sort_by {|file| file.teacher_defined_linter? ? 0 : 1 }.map.with_index(1) do |file, index|
155155
output = run_test_file file, runner, waiting_duration
156156
# If the previous execution failed and there is at least one more test, we request a new runner.
157-
runner, waiting_duration = swap_runner(runner) if output[:status] == :timeout && index < assessment_number
157+
swap_runner(runner) if output[:status] == :timeout && index < assessment_number
158158
score_file(output, file, requesting_user)
159159
end
160160
end
@@ -169,7 +169,7 @@ def run(file, &block)
169169
run_command = command_for execution_environment.run_command, file.filepath
170170
durations = {}
171171
prepared_runner do |runner, waiting_duration|
172-
durations[:execution_duration] = runner.attach_to_execution(run_command, &block)
172+
durations[:execution_duration] = runner.attach_to_execution(run_command, exclusive: false, &block)
173173
durations[:waiting_duration] = waiting_duration
174174
rescue Runner::Error => e
175175
e.waiting_duration = waiting_duration
@@ -193,7 +193,7 @@ def test(file, requesting_user)
193193
def run_test_file(file, runner, waiting_duration)
194194
test_command = command_for execution_environment.test_command, file.filepath
195195
result = {file_role: file.role, waiting_for_container_time: waiting_duration}
196-
output = runner.execute_command(test_command, raise_exception: false)
196+
output = runner.execute_command(test_command, raise_exception: false, exclusive: false)
197197
result.merge(output)
198198
end
199199

@@ -211,10 +211,11 @@ def build_files_hash(files, attribute)
211211
files&.map(&attribute.to_proc)&.zip(files).to_h
212212
end
213213

214-
def prepared_runner
214+
def prepared_runner(existing_runner: nil, exclusive: true)
215215
request_time = Time.zone.now
216216
begin
217-
runner = Runner.for(contributor, exercise.execution_environment)
217+
runner = existing_runner || Runner.for(contributor, exercise.execution_environment)
218+
runner.reserve! if exclusive
218219
files = collect_files
219220

220221
case cause
@@ -233,25 +234,32 @@ def prepared_runner
233234
end
234235

235236
Rails.logger.debug { "#{Time.zone.now.getutc.inspect}: Copying files to Runner #{runner.id} for #{contributor_type} #{contributor_id} and Submission #{id}." }
236-
runner.copy_files(files)
237+
# We don't want `copy_files` to be exclusive since we reserve runners for the whole `prepared_runner` block.
238+
runner.copy_files(files, exclusive: false)
237239
rescue Runner::Error => e
238240
e.waiting_duration = Time.zone.now - request_time
239241
raise
240242
end
241243
waiting_duration = Time.zone.now - request_time
242-
yield(runner, waiting_duration)
244+
yield(runner, waiting_duration) if block_given?
245+
ensure
246+
runner&.release! if exclusive
243247
end
244248

245-
def swap_runner(old_runner)
246-
old_runner.update(runner_id: nil)
247-
new_runner = nil
248-
new_waiting_duration = nil
249-
# We request a new runner that will also include all files of the current submission
250-
prepared_runner do |runner, waiting_duration|
251-
new_runner = runner
252-
new_waiting_duration = waiting_duration
249+
def swap_runner(runner, exclusive: true)
250+
# We use a transaction to ensure that the runner is swapped atomically.
251+
transaction do
252+
# Due to the `before_validation` callback in the `Runner` model,
253+
# the following line will immediately request a new runner.
254+
runner.update(runner_id: nil)
255+
256+
# With the new runner being ready, we only need to prepare it (by copying the files).
257+
# Since no actual execution is performed, we don't need to reserve the runner.
258+
prepared_runner(existing_runner: runner, exclusive: false)
259+
260+
# Now, we update the locks if desired. This is only necessary when a runner is used exclusively.
261+
runner.extend! if exclusive
253262
end
254-
[new_runner, new_waiting_duration]
255263
end
256264

257265
def command_for(template, filepath)

0 commit comments

Comments
 (0)