Skip to content

Commit

Permalink
improve postpone checks outside of check period
Browse files Browse the repository at this point in the history
With #490 we rescheduled checks found outside their check period to the
next time slot in their check period, with a random delay of up to
60 seconds (or whatever retained_scheduling_randomize_window was set to).

There are 2 scenarios when this leads to scheduling issues:

1) Lots of checks with a medium check interval (e.g. 2h) and an office-hours
   timeperiod. Those checks should be scheduled evenly over the 2h interval,
   but currently they all start when the office hours begin and from then on
   they all run at once every 2 hours, which results in load peaks.
   The solution here is to take the check period into account when postponing
   the next check.
2) Consider checks with a long check interval (ex.: 24h) and small timeperiods,
   ex.: only 08:00 till 08:05. In this case we need to take the actual time slot
   into account to find a valid next check time slot.

While at it, I merged the code into a generic function which is then used for hosts and services.
  • Loading branch information
sni committed Feb 21, 2025
1 parent 08e3302 commit dac3cec
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 44 deletions.
50 changes: 50 additions & 0 deletions src/naemon/checks.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "flapping.h"
#include "sehandlers.h"
#include "notifications.h"
#include "objects_timeperiod.h"
#include "logging.h"
#include "globals.h"
#include "nm_alloc.h"
Expand Down Expand Up @@ -544,3 +545,52 @@ int free_check_result(check_result *info)

return OK;
}

/*
 * Render a timestamp for debug logging into a caller-supplied buffer.
 *
 * ctime() returns a pointer to one shared static buffer, so two ctime()
 * results must never be handed to the same printf-style call: both
 * arguments would point at whichever conversion ran last. Copying each
 * result out immediately avoids that. The "%.24s" precision keeps the 24
 * visible characters of ctime()'s "Www Mmm dd hh:mm:ss yyyy\n" layout and
 * drops the trailing newline so the log line is not broken in the middle.
 */
static const char *format_slot_time(time_t when, char *buf, size_t buf_size)
{
	const char *text = ctime(&when);

	snprintf(buf, buf_size, "%.24s", text != NULL ? text : "(invalid time)");
	return buf;
}

/*
 * Pick a randomized start time inside the next block of a check period.
 *
 * check_interval   - check interval in seconds; values <= 0 do not limit
 *                    the random delay
 * check_period_ptr - timeperiod the check has to run in
 *
 * Returns the chosen timestamp, or 0 if no upcoming valid time or no end
 * of that block could be determined, or if the usable window is one second
 * or shorter.
 */
time_t get_random_next_timeperiod_slot(time_t check_interval, const timeperiod *check_period_ptr)
{
	/* sample the clock once so all three values start from the same instant */
	time_t now = time(NULL);
	time_t timeperiod_start = now;
	time_t timeperiod_end = now;
	time_t check_slot_duration;
	time_t delay_max;
	time_t next_check;
	char start_buf[32];
	char end_buf[32];

	/* get start of next check_period block */
	get_next_valid_time(now, &timeperiod_start, check_period_ptr);
	if (timeperiod_start == 0) {
		return 0;
	}

	/* get end of that check_period block */
	get_next_invalid_time(timeperiod_start, &timeperiod_end, check_period_ptr);
	if (timeperiod_end == 0) {
		return 0;
	}

	/* Add random delay, so not all checks start at the same second.
	 * The delay is a random number of seconds between 0 and
	 * whatever is smaller, either the duration of the next block in the
	 * timeperiod or the check_interval.
	 * However, it should not be less than the configured randomize window
	 * (but still never longer than the block itself).
	 */
	check_slot_duration = timeperiod_end - timeperiod_start;
	delay_max = check_slot_duration;
	if (check_interval > 0 && delay_max > check_interval) {
		delay_max = check_interval;
	}
	if (delay_max < retained_scheduling_randomize_window) {
		delay_max = retained_scheduling_randomize_window;
	}
	if (delay_max > check_slot_duration) {
		delay_max = check_slot_duration;
	}

	if (delay_max <= 1) {
		return 0;
	}

	/* reduce by one second because the last second is the start of the invalid time slot already */
	delay_max = delay_max - 1;

	/* keep timeperiod_start untouched so the log below reports the real block boundaries */
	next_check = timeperiod_start + ranged_urand(0, delay_max);

	log_debug_info(DEBUGL_CHECKS, 1, "delay next check within next check timeperiod block: from %s till %s, check_interval: %ld, max delay: %ld\n",
	               format_slot_time(timeperiod_start, start_buf, sizeof(start_buf)),
	               format_slot_time(timeperiod_end, end_buf, sizeof(end_buf)),
	               (long)check_interval, (long)delay_max);

	return next_check;
}
2 changes: 2 additions & 0 deletions src/naemon/checks.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#endif

#include "lib/lnae-utils.h"
#include "objects_timeperiod.h"
#include <stdio.h>
#include <sys/resource.h>

Expand Down Expand Up @@ -88,6 +89,7 @@ int process_check_result(check_result *);
int delete_check_result_file(char *);
int init_check_result(check_result *);
int free_check_result(check_result *); /* frees memory associated with a host/service check result */
time_t get_random_next_timeperiod_slot(time_t, const timeperiod *);

NAGIOS_END_DECL

Expand Down
21 changes: 4 additions & 17 deletions src/naemon/checks_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ static int run_async_host_check(host *hst, int check_options, double latency)

/* make sure this is a valid time to check the host */
if (check_time_against_period(time(NULL), hst->check_period_ptr) != OK) {
delay_host_if_next_check_is_outside_timeperiod(hst);
delay_host_check_till_next_timeperiod_slot(hst);
return ERROR;
}

Expand Down Expand Up @@ -1397,26 +1397,13 @@ static int determine_host_reachability(host *hst)
}

/* ensure next check falls into check period */
void delay_host_if_next_check_is_outside_timeperiod(host *hst)
void delay_host_check_till_next_timeperiod_slot(host *hst)
{
time_t timeperiod_start = time(NULL);

if(hst->next_check == 0) {
return;
}

if(check_time_against_period(hst->next_check, hst->check_period_ptr) == OK) {
return;
}

get_next_valid_time(hst->next_check, &timeperiod_start, hst->check_period_ptr);
time_t timeperiod_start = get_random_next_timeperiod_slot(get_host_check_interval_s(hst), hst->check_period_ptr);
if(timeperiod_start == 0) {
return;
}

// add random delay, so not all checks start at the same second
timeperiod_start += ranged_urand(0, retained_scheduling_randomize_window);

log_debug_info(DEBUGL_CHECKS, 1, "delay next service check for %s until check timeperiod starts: %s\n", hst->name, ctime(&timeperiod_start));
log_debug_info(DEBUGL_CHECKS, 1, "delay next host check for %s until check timeperiod starts: %s\n", hst->name, ctime(&timeperiod_start));
schedule_host_check(hst, timeperiod_start, CHECK_OPTION_ALLOW_POSTPONE);
}
4 changes: 2 additions & 2 deletions src/naemon/checks_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ int check_host_dependencies(host *hst, int dependency_type);
/* adjusts current host check attempt when a check is processed */
int adjust_host_check_attempt(host *hst, int is_active);

/* ensure next check falls into check period */
void delay_host_if_next_check_is_outside_timeperiod(host *);
/* move next check into a valid check period slot */
void delay_host_check_till_next_timeperiod_slot(host *);

NAGIOS_END_DECL

Expand Down
19 changes: 3 additions & 16 deletions src/naemon/checks_service.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ static void handle_service_check_event(struct nm_event_execution_properties *evp

/* make sure this is a valid time to check the service */
if (check_time_against_period(time(NULL), temp_service->check_period_ptr) == ERROR) {
delay_service_if_next_check_is_outside_timeperiod(temp_service);
delay_service_check_till_next_timeperiod_slot(temp_service);
return;
}

Expand Down Expand Up @@ -1438,26 +1438,13 @@ static int is_service_result_fresh(service *temp_service, time_t current_time, i
}

/* ensure next check falls into check period */
void delay_service_if_next_check_is_outside_timeperiod(service *svc)
void delay_service_check_till_next_timeperiod_slot(service *svc)
{
time_t timeperiod_start = time(NULL);

if(svc->next_check == 0) {
return;
}

if(check_time_against_period(svc->next_check, svc->check_period_ptr) == OK) {
return;
}

get_next_valid_time(svc->next_check, &timeperiod_start, svc->check_period_ptr);
time_t timeperiod_start = get_random_next_timeperiod_slot(get_service_check_interval_s(svc), svc->check_period_ptr);
if(timeperiod_start == 0) {
return;
}

// add random delay, so not all checks start at the same second
timeperiod_start += ranged_urand(0, retained_scheduling_randomize_window);

log_debug_info(DEBUGL_CHECKS, 1, "delay next service check for %s - %s until check timeperiod starts: %s\n", svc->host_name, svc->description, ctime(&timeperiod_start));
schedule_service_check(svc, timeperiod_start, CHECK_OPTION_ALLOW_POSTPONE);
}
4 changes: 2 additions & 2 deletions src/naemon/checks_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ int handle_async_service_check_result(service *, check_result *);
/* Immutable, check if service is reachable */
int check_service_dependencies(service *, int);

/* ensure next check falls into check period */
void delay_service_if_next_check_is_outside_timeperiod(service *);
/* move next check into a valid check period slot */
void delay_service_check_till_next_timeperiod_slot(service *);

NAGIOS_END_DECL

Expand Down
11 changes: 6 additions & 5 deletions src/naemon/objects_timeperiod.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,9 +678,10 @@ int check_time_against_period(time_t test_time, const timeperiod *tperiod)


/*#define TEST_TIMEPERIODS_B 1*/
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperiod *tperiod);
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, const timeperiod *tperiod);

static void _get_next_invalid_time(time_t pref_time, time_t *invalid_time, timeperiod *tperiod)
/* calculate the next time this period ends */
void get_next_invalid_time(time_t pref_time, time_t *invalid_time, const timeperiod *tperiod)
{
timeperiodexclusion *temp_timeperiodexclusion = NULL;
int depth = 0;
Expand Down Expand Up @@ -795,7 +796,7 @@ static void _get_next_invalid_time(time_t pref_time, time_t *invalid_time, timep


/* Separate this out from public get_next_valid_time for testing */
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperiod *tperiod)
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, const timeperiod *tperiod)
{
timeperiodexclusion *temp_timeperiodexclusion = NULL;
int depth = 0;
Expand Down Expand Up @@ -878,7 +879,7 @@ static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperio
if (check_time_against_period(earliest_time, temp_timeperiodexclusion->timeperiod_ptr) == ERROR) {
continue;
}
_get_next_invalid_time(earliest_time, &excluded_time, temp_timeperiodexclusion->timeperiod_ptr);
get_next_invalid_time(earliest_time, &excluded_time, temp_timeperiodexclusion->timeperiod_ptr);
if (!max_excluded || max_excluded < excluded_time) {
max_excluded = excluded_time;
earliest_time = excluded_time;
Expand All @@ -900,7 +901,7 @@ static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperio


/* given a preferred time, get the next valid time within a time period */
void get_next_valid_time(time_t pref_time, time_t *valid_time, timeperiod *tperiod)
void get_next_valid_time(time_t pref_time, time_t *valid_time, const timeperiod *tperiod)
{
time_t current_time = (time_t)0L;

Expand Down
3 changes: 2 additions & 1 deletion src/naemon/objects_timeperiod.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ struct timeperiod *find_timeperiod(const char *);
void fcache_timeperiod(FILE *fp, const struct timeperiod *temp_timeperiod);

int check_time_against_period(time_t, const timeperiod *); /* check to see if a specific time is covered by a time period */
void get_next_valid_time(time_t, time_t *, timeperiod *); /* get the next valid time in a time period */
void get_next_valid_time(time_t, time_t *, const timeperiod *); /* get the next valid time in a time period */
void get_next_invalid_time(time_t, time_t *, const timeperiod *); /* get the next invalid time in a time period (aka end of the period) */

NAGIOS_END_DECL
#endif
2 changes: 1 addition & 1 deletion t-tap/test_timeperiods.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ static struct timeperiod *test_get_timeperiod(const char *name)
char ct_expect[32], ct_chosen[32], ct_when[32]; \
struct timeperiod *tp; \
tp = test_get_timeperiod(tp_name); \
_get_next_invalid_time(when, &chosen, tp); \
get_next_invalid_time(when, &chosen, tp); \
noeol_ctime(&chosen, ct_chosen); \
noeol_ctime(&t_when, ct_when); \
noeol_ctime(&t_expect, ct_expect); \
Expand Down

0 comments on commit dac3cec

Please sign in to comment.