From d64864bfd6c57d3abf62fbf4071a66027323a50b Mon Sep 17 00:00:00 2001 From: Felix Dreissig Date: Sat, 25 May 2024 12:50:25 +0200 Subject: [PATCH] (Internally) distinguish between timeout termination and Checker Script errors Closes: https://github.com/fausecteam/ctf-gameserver/issues/80 Co-authored-by: Simon Ruderich --- src/ctf_gameserver/checker/database.py | 15 ++++++------ src/ctf_gameserver/checker/master.py | 16 ++++++++++--- src/ctf_gameserver/checker/metrics.py | 4 +++- src/ctf_gameserver/checker/supervisor.py | 8 +++++-- src/ctf_gameserver/lib/checkresult.py | 4 ++++ .../web/scoring/calculations.py | 4 +++- src/ctf_gameserver/web/scoring/models.py | 9 +++++--- .../scoring/templatetags/status_css_class.py | 3 ++- src/ctf_gameserver/web/scoring/views.py | 23 +++++++++++++++---- .../web/static/missing_checks.js | 6 ++++- src/ctf_gameserver/web/static/service_util.js | 3 ++- tests/checker/test_integration.py | 10 +++++--- 12 files changed, 77 insertions(+), 28 deletions(-) diff --git a/src/ctf_gameserver/checker/database.py b/src/ctf_gameserver/checker/database.py index 7d5d9c07..6cd6d120 100644 --- a/src/ctf_gameserver/checker/database.py +++ b/src/ctf_gameserver/checker/database.py @@ -1,5 +1,6 @@ import logging +from ctf_gameserver.lib.checkresult import STATUS_TIMEOUT from ctf_gameserver.lib.database import transaction_cursor from ctf_gameserver.lib.exceptions import DBDataError @@ -181,13 +182,13 @@ def commit_result(db_conn, service_id, team_net_no, tick, result, prohibit_chang cursor.execute('INSERT INTO scoring_statuscheck' ' (service_id, team_id, tick, status, timestamp)' ' VALUES (%s, %s, %s, %s, NOW())', (service_id, team_id, tick, result)) - # (In case of `prohibit_changes`,) PostgreSQL checks the database grants even if nothing is matched - # by `WHERE` - cursor.execute('UPDATE scoring_flag' - ' SET placement_end = NOW()' - ' WHERE service_id = %s AND protecting_team_id = %s AND tick = %s', (service_id, - team_id, - tick)) + if result != STATUS_TIMEOUT: + # (In case of `prohibit_changes`,) PostgreSQL checks the database grants even if nothing is + # matched by `WHERE` + cursor.execute('UPDATE scoring_flag' + ' SET placement_end = NOW()' + ' WHERE service_id = %s AND protecting_team_id = %s AND tick = %s', + (service_id, team_id, tick)) def set_flagid(db_conn, service_id, team_net_no, tick, flagid, prohibit_changes=False, fake_team_id=None): diff --git a/src/ctf_gameserver/checker/master.py b/src/ctf_gameserver/checker/master.py index a564870b..ae47fca8 100644 --- a/src/ctf_gameserver/checker/master.py +++ b/src/ctf_gameserver/checker/master.py @@ -12,8 +12,8 @@ from ctf_gameserver.lib.args import get_arg_parser_with_db, parse_host_port from ctf_gameserver.lib import daemon +from ctf_gameserver.lib.checkresult import CheckResult, STATUS_TIMEOUT from ctf_gameserver.lib.database import transaction_cursor -from ctf_gameserver.lib.checkresult import CheckResult from ctf_gameserver.lib.exceptions import DBDataError import ctf_gameserver.lib.flag as flag_lib @@ -264,11 +264,13 @@ def step(self): # We can't signal an error to the Checker Script (which might be waiting for a response), # so our only option is to kill it self.supervisor.terminate_runner(req['runner_id']) + metrics.inc(self.metrics_queue, 'killed_tasks') send_resp = False except: # noqa, pylint: disable=bare-except logging.exception('Checker Script communication error for team %d (net number %d) in tick ' '%d:', req['info']['_team_id'], req['info']['team'], req['info']['tick']) self.supervisor.terminate_runner(req['runner_id']) + metrics.inc(self.metrics_queue, 'killed_tasks') else: if send_resp: req['send'].send(resp) @@ -332,8 +334,16 @@ def handle_result_request(self, task_info, param): result) def launch_tasks(self): + def timeout_runners(): + for task_info in self.supervisor.terminate_runners(): + logging.info('Forcefully terminated Checker Script for team %d (net number %d) in tick %d', + task_info['_team_id'], task_info['team'], task_info['tick']) + metrics.inc(self.metrics_queue, 'timeout_tasks') + database.commit_result(self.db_conn, self.service['id'], task_info['team'], + task_info['tick'], STATUS_TIMEOUT) + def change_tick(new_tick): - self.supervisor.terminate_runners() + timeout_runners() self.update_launch_params(new_tick) self.known_tick = new_tick @@ -345,7 +355,7 @@ def change_tick(new_tick): change_tick(current_tick) elif cancel_checks: # Competition over - self.supervisor.terminate_runners() + timeout_runners() return tasks = database.get_new_tasks(self.db_conn, self.service['id'], self.tasks_per_launch) diff --git a/src/ctf_gameserver/checker/metrics.py b/src/ctf_gameserver/checker/metrics.py index 6751988a..1c1c4e74 100644 --- a/src/ctf_gameserver/checker/metrics.py +++ b/src/ctf_gameserver/checker/metrics.py @@ -58,7 +58,8 @@ def checker_metrics_factory(registry, service): counters = [ ('started_tasks', 'Number of started Checker Script instances'), - ('terminated_tasks', 'Number of Checker Script instances forcibly terminated') + ('timeout_tasks', 'Number of Checker Script instances forcibly terminated at end of tick'), + ('killed_tasks', 'Number of Checker Script instances forcibly terminated because of misbehavior') ] for name, doc in counters: metrics[name] = prometheus_client.Counter(metric_prefix+name, doc, ['service'], registry=registry) @@ -66,6 +67,7 @@ def checker_metrics_factory(registry, service): metrics[name].labels(service) metrics['completed_tasks'] = prometheus_client.Counter( + # Timeouts do not count as successfully completed checks metric_prefix+'completed_tasks', 'Number of successfully completed checks', ['result', 'service'], registry=registry ) diff --git a/src/ctf_gameserver/checker/supervisor.py b/src/ctf_gameserver/checker/supervisor.py index 6d9c852c..7c84c073 100644 --- a/src/ctf_gameserver/checker/supervisor.py +++ b/src/ctf_gameserver/checker/supervisor.py @@ -86,17 +86,21 @@ def terminate_runner(self, runner_id): logging.info('Terminating Runner process, info: %s', self.processes[runner_id][2]) self.processes[runner_id][0].terminate() # Afterwards, get_request() will join the child and remove it from `self.processes` - metrics.inc(self.metrics_queue, 'terminated_tasks') def terminate_runners(self): + terminated_infos = [] + if len(self.processes) > 0: logging.warning('Terminating all %d Runner processes', len(self.processes)) - for runner_id in self.processes: + for runner_id, process in self.processes.items(): self.terminate_runner(runner_id) + terminated_infos.append(process[2]) # Prevent memory leaks self._reset() + return terminated_infos + def get_request(self): # Use a loop to not leak our implementation detail for ACTION_RUNNER_EXIT: Only return None when the # queue is really empty (barring non-critical race conditions) diff --git a/src/ctf_gameserver/lib/checkresult.py b/src/ctf_gameserver/lib/checkresult.py index e6323419..4c737a38 100644 --- a/src/ctf_gameserver/lib/checkresult.py +++ b/src/ctf_gameserver/lib/checkresult.py @@ -1,6 +1,9 @@ import enum +STATUS_TIMEOUT = 5 + + class CheckResult(enum.Enum): """ Maps possible Checker results to their integer values. @@ -12,6 +15,7 @@ class CheckResult(enum.Enum): FAULTY = 2 # Service is available, but not behaving as expected FLAG_NOT_FOUND = 3 RECOVERING = 4 + # TIMEOUT (5) is only used internally and not exposed here, especially not to Checker Scripts def __str__(self): return self.name diff --git a/src/ctf_gameserver/web/scoring/calculations.py b/src/ctf_gameserver/web/scoring/calculations.py index cc1307ac..5a40a1fa 100644 --- a/src/ctf_gameserver/web/scoring/calculations.py +++ b/src/ctf_gameserver/web/scoring/calculations.py @@ -1,6 +1,7 @@ from collections import defaultdict, OrderedDict from django.core.cache import cache +from django.utils.translation import gettext_lazy as _ from ctf_gameserver.web.registration.models import Team from . import models @@ -89,7 +90,8 @@ def team_statuses(from_tick, to_tick, select_related_team_fields=None, only_team statuses[team] = defaultdict(lambda: {}) teams[team.pk] = team - status_checks = models.StatusCheck.objects.filter(tick__gte=from_tick, tick__lte=to_tick) + status_checks = models.StatusCheck.objects.filter(tick__gte=from_tick, tick__lte=to_tick) \ + .exclude(status=models.StatusCheck.STATUSES[_('timeout')]) for check in status_checks: statuses[teams[check.team_id]][check.tick][check.service_id] = check.status diff --git a/src/ctf_gameserver/web/scoring/models.py b/src/ctf_gameserver/web/scoring/models.py index da6d3197..166ed711 100644 --- a/src/ctf_gameserver/web/scoring/models.py +++ b/src/ctf_gameserver/web/scoring/models.py @@ -78,18 +78,21 @@ class StatusCheck(models.Model): # Mapping from human-readable status texts to their integer values in the database # For a discussion on the plural form of "status", refer to https://english.stackexchange.com/q/877 STATUSES = { - _('up'): 0, # Maps to "OK" from the checkers' perspective + # "up" maps to "OK" from the checkers' perspective + _('up'): 0, _('down'): 1, _('faulty'): 2, _('flag not found'): 3, - _('recovering'): 4 + _('recovering'): 4, + # Script has been forcefully terminated at end of tick, publicly displayed as "Not checked" + _('timeout'): 5 } service = models.ForeignKey(Service, on_delete=models.CASCADE) team = models.ForeignKey(Team, on_delete=models.CASCADE) tick = models.PositiveIntegerField(db_index=True) # REVISIT: Add check constraint for the values as soon as we have Django >= 2.2 - status = models.PositiveSmallIntegerField(choices=[(i, t) for t, i in STATUSES.items()]) + status = models.SmallIntegerField(choices=[(i, t) for t, i in STATUSES.items()]) timestamp = models.DateTimeField(auto_now_add=True) class Meta: diff --git a/src/ctf_gameserver/web/scoring/templatetags/status_css_class.py b/src/ctf_gameserver/web/scoring/templatetags/status_css_class.py index 37419fa0..156d31ad 100644 --- a/src/ctf_gameserver/web/scoring/templatetags/status_css_class.py +++ b/src/ctf_gameserver/web/scoring/templatetags/status_css_class.py @@ -8,7 +8,8 @@ _('down'): 'danger', _('faulty'): 'danger', _('flag not found'): 'warning', - _('recovering'): 'info' + _('recovering'): 'info', + _('timeout'): 'active' } diff --git a/src/ctf_gameserver/web/scoring/views.py b/src/ctf_gameserver/web/scoring/views.py index 1dc18176..f1a9a1f6 100644 --- a/src/ctf_gameserver/web/scoring/views.py +++ b/src/ctf_gameserver/web/scoring/views.py @@ -5,6 +5,7 @@ from django.db.models import Max from django.http import JsonResponse from django.shortcuts import render +from django.utils.translation import gettext_lazy as _ from django.views.decorators.cache import cache_page import ctf_gameserver.web.registration.models as registration_models @@ -331,8 +332,8 @@ def missing_checks(request): @staff_member_required def missing_checks_json(request): """ - View which returns the teams with status "not checked" per tick for a specific service. The result is in - JSON format as required by the JavaScript code in def missing_checks(). + View which returns the teams with status "not checked" or "timeout" per tick for a specific service. The + result is in JSON format as required by the JavaScript code in missing_checks(). This can help to find unhandled exceptions in checker scripts, as "not checked" normally shouldn't occur. """ @@ -352,14 +353,24 @@ def missing_checks_json(request): except ValueError: return JsonResponse({'error': 'Ticks must be integers'}) + status_timeout = models.StatusCheck.STATUSES[_('timeout')] + all_flags = models.Flag.objects.filter(service=service) \ .filter(tick__gte=from_tick, tick__lt=to_tick) \ .values_list('tick', 'protecting_team') all_status_checks = models.StatusCheck.objects.filter(service=service) \ .filter(tick__gte=from_tick, tick__lt=to_tick) \ + .exclude(status=status_timeout) \ .values_list('tick', 'team') checks_missing = all_flags.difference(all_status_checks).order_by('-tick', 'protecting_team') + checks_timeout = defaultdict(set) + for tick, team in models.StatusCheck.objects.filter(service=service) \ + .filter(tick__gte=from_tick, tick__lt=to_tick) \ + .filter(status=status_timeout) \ + .values_list('tick', 'team'): + checks_timeout[tick].add(team) + result = [] current_tick = {'tick': -1} @@ -371,12 +382,14 @@ def append_result(): result.append(current_tick) for check in checks_missing: + check_tick, check_team = check + # Status checks are ordered by tick, finalize result for a tick when it changes - if current_tick['tick'] != check[0]: + if current_tick['tick'] != check_tick: append_result() - current_tick = {'tick': check[0], 'teams': []} + current_tick = {'tick': check_tick, 'teams': []} - current_tick['teams'].append(check[1]) + current_tick['teams'].append((check_team, check_team in checks_timeout[check_tick])) # Add result from last iteration append_result() diff --git a/src/ctf_gameserver/web/static/missing_checks.js b/src/ctf_gameserver/web/static/missing_checks.js index 26d86779..427595a6 100644 --- a/src/ctf_gameserver/web/static/missing_checks.js +++ b/src/ctf_gameserver/web/static/missing_checks.js @@ -31,7 +31,8 @@ function buildList(data) { tickEntry.appendChild(prefix) for (let i = 0; i < check['teams'].length; i++) { - const teamID = check['teams'][i] + const teamID = check['teams'][i][0] + const isTimeout = check['teams'][i][1] const teamName = data['all-teams'][teamID]['name'] const teamNetNo = data['all-teams'][teamID]['net-number'] @@ -45,6 +46,9 @@ function buildList(data) { 'q=service:' + data['service-slug'] + ' AND team:' + teamNetNo + ' AND tick:' + check['tick']) teamEntry.target = '_blank' + if (isTimeout) { + teamEntry.classList.add('text-muted') + } } teamEntry.textContent = teamName + ' (' + teamNetNo + ')' tickEntry.appendChild(teamEntry) diff --git a/src/ctf_gameserver/web/static/service_util.js b/src/ctf_gameserver/web/static/service_util.js index c231eeb6..3c87dae1 100644 --- a/src/ctf_gameserver/web/static/service_util.js +++ b/src/ctf_gameserver/web/static/service_util.js @@ -8,7 +8,8 @@ const statusClasses = { 1: 'danger', 2: 'danger', 3: 'warning', - 4: 'info' + 4: 'info', + 5: 'active' } diff --git a/tests/checker/test_integration.py b/tests/checker/test_integration.py index 79d5af4c..87afbfae 100644 --- a/tests/checker/test_integration.py +++ b/tests/checker/test_integration.py @@ -230,7 +230,10 @@ def test_unfinished(self, monotonic_mock, warning_mock): ' WHERE placement_start IS NOT NULL AND placement_end IS NULL') self.assertEqual(cursor.fetchone()[0], 1) cursor.execute('SELECT COUNT(*) FROM scoring_statuscheck') - self.assertEqual(cursor.fetchone()[0], 0) + self.assertEqual(cursor.fetchone()[0], 1) + cursor.execute('SELECT status FROM scoring_statuscheck' + ' WHERE service_id=1 AND team_id=2 AND tick=0') + self.assertEqual(cursor.fetchone()[0], 5) warning_mock.assert_called_with('Terminating all %d Runner processes', 1) @@ -596,8 +599,9 @@ def signal_script(): cursor.execute('SELECT COUNT(*) FROM scoring_flag' ' WHERE placement_start IS NOT NULL AND placement_end IS NULL') self.assertEqual(cursor.fetchone()[0], 1) - cursor.execute('SELECT COUNT(*) FROM scoring_statuscheck') - self.assertEqual(cursor.fetchone()[0], 0) + cursor.execute('SELECT status FROM scoring_statuscheck' + ' WHERE service_id=1 AND team_id=2 AND tick=0') + self.assertEqual(cursor.fetchone()[0], 5) warning_mock.assert_called_with('Terminating all %d Runner processes', 1)