Skip to content

Commit

Permalink
improve postpone checks outside of check period
Browse files Browse the repository at this point in the history
With #490 we rescheduled checks found outside their check period to the
next time slot in their check period with a random delay of up to
60 seconds (or whatever retained_scheduling_randomize_window was set to).

There are 2 scenarios when this leads to scheduling issues:

1) Lots of checks with a medium check interval (e.g. 2h) and an office hours
   timeperiod. Those checks should be scheduled evenly over the 2h interval,
   but currently they would all start when the office hours begin and from
   then on all run at once every 2 hours, which results in load peaks.
   The solution here is to take the check interval into account when postponing
   the next check.
2) Consider checks with a long check interval (ex.: 24h) and small timeperiods,
   ex.: only 08:00 till 08:05. In this case we need to take the actual time slot
   into account to find a valid next check time slot.

While at it, I merged the code into a generic function which is then used for hosts and services.
  • Loading branch information
sni committed Feb 21, 2025
1 parent 08e3302 commit f9b2699
Show file tree
Hide file tree
Showing 11 changed files with 102 additions and 46 deletions.
60 changes: 60 additions & 0 deletions src/naemon/checks.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "flapping.h"
#include "sehandlers.h"
#include "notifications.h"
#include "objects_timeperiod.h"
#include "logging.h"
#include "globals.h"
#include "nm_alloc.h"
Expand Down Expand Up @@ -544,3 +545,62 @@ int free_check_result(check_result *info)

return OK;
}

/*
 * Pick a randomized start time for the next check inside the next valid
 * block of the given check period.
 *
 * check_interval   - interval of the check in seconds; values <= 0 do not
 *                    limit the random delay
 * check_period_ptr - timeperiod the check has to run in
 *
 * Returns the chosen timestamp, or 0 if no start or end of a valid block was
 * reported or if the usable window is not longer than one second.
 */
time_t get_random_next_timeperiod_slot(time_t check_interval, const timeperiod *check_period_ptr)
{
	char buf_from[32];
	char buf_to[32];
	char buf_final[32];
	time_t now = time(NULL);
	time_t timeperiod_start = now;
	time_t timeperiod_end = now;
	time_t check_slot_duration;
	time_t delay_max;

	/* get start of next check_period block */
	get_next_valid_time(now, &timeperiod_start, check_period_ptr);
	if (timeperiod_start == 0) {
		return 0;
	}

	/* get end of that check_period block */
	get_next_invalid_time(timeperiod_start, &timeperiod_end, check_period_ptr);
	if (timeperiod_end == 0) {
		return 0;
	}

	/* The delay is clamped to the slot duration below, so a slot of one
	 * second or less (or a negative one) can never yield a usable window.
	 */
	check_slot_duration = timeperiod_end - timeperiod_start;
	if (check_slot_duration <= 1) {
		return 0;
	}

	/* Add random delay, so not all checks start at the same second.
	 * The delay is a random number of seconds between 0 and
	 * whatever is smaller, either the duration of the next block in the
	 * timeperiod or the check_interval.
	 * However, it should not be less than the configured randomize window
	 * and it must never exceed the block itself.
	 */
	delay_max = check_slot_duration;
	if (check_interval > 0 && delay_max > check_interval) {
		delay_max = check_interval;
	}
	if (delay_max < retained_scheduling_randomize_window) {
		delay_max = retained_scheduling_randomize_window;
	}
	if (delay_max > check_slot_duration) {
		delay_max = check_slot_duration;
	}

	if (delay_max <= 1) {
		return 0;
	}

	noeol_ctime(&timeperiod_start, buf_from);
	noeol_ctime(&timeperiod_end, buf_to);

	/* reduce by one second because the last second is the start of the invalid time slot already */
	timeperiod_start += ranged_urand(0, (delay_max - 1));
	noeol_ctime(&timeperiod_start, buf_final);

	/* time_t is not guaranteed to be unsigned long, so cast explicitly to
	 * match the %lu conversions.
	 */
	log_debug_info(DEBUGL_CHECKS, 1, "delay next check within next valid timeperiod block (from %s till %s) check_interval: %lu, max delay: %lu -> %s\n",
	               buf_from,
	               buf_to,
	               (unsigned long)check_interval,
	               (unsigned long)delay_max,
	               buf_final);

	return timeperiod_start;
}
2 changes: 2 additions & 0 deletions src/naemon/checks.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#endif

#include "lib/lnae-utils.h"
#include "objects_timeperiod.h"
#include <stdio.h>
#include <sys/resource.h>

Expand Down Expand Up @@ -88,6 +89,7 @@ int process_check_result(check_result *);
int delete_check_result_file(char *);
int init_check_result(check_result *);
int free_check_result(check_result *); /* frees memory associated with a host/service check result */
time_t get_random_next_timeperiod_slot(time_t, const timeperiod *);

NAGIOS_END_DECL

Expand Down
25 changes: 10 additions & 15 deletions src/naemon/checks_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ static int run_async_host_check(host *hst, int check_options, double latency)

/* make sure this is a valid time to check the host */
if (check_time_against_period(time(NULL), hst->check_period_ptr) != OK) {
delay_host_if_next_check_is_outside_timeperiod(hst);
delay_host_check_till_next_timeperiod_slot(hst);
return ERROR;
}

Expand Down Expand Up @@ -1397,26 +1397,21 @@ static int determine_host_reachability(host *hst)
}

/* ensure next check falls into check period */
void delay_host_if_next_check_is_outside_timeperiod(host *hst)
void delay_host_check_till_next_timeperiod_slot(host *hst)
{
time_t timeperiod_start = time(NULL);
time_t timeperiod_start;
time_t check_interval;

if(hst->next_check == 0) {
return;
}

if(check_time_against_period(hst->next_check, hst->check_period_ptr) == OK) {
return;
}
/* A host in a SOFT non-UP state with a configured retry interval is
 * re-checked at the retry interval; in every other case the regular check
 * interval applies.
 */
if (hst->current_state != STATE_UP && hst->state_type == SOFT_STATE && hst->retry_interval != 0.0) {
	check_interval = get_host_retry_interval_s(hst);
} else {
	check_interval = get_host_check_interval_s(hst);
}

get_next_valid_time(hst->next_check, &timeperiod_start, hst->check_period_ptr);
timeperiod_start = get_random_next_timeperiod_slot(check_interval, hst->check_period_ptr);
if(timeperiod_start == 0) {
return;
}

// add random delay, so not all checks start at the same second
timeperiod_start += ranged_urand(0, retained_scheduling_randomize_window);

log_debug_info(DEBUGL_CHECKS, 1, "delay next service check for %s until check timeperiod starts: %s\n", hst->name, ctime(&timeperiod_start));
log_debug_info(DEBUGL_CHECKS, 1, "delay next host check for %s until check timeperiod starts: %s\n", hst->name, ctime(&timeperiod_start));
schedule_host_check(hst, timeperiod_start, CHECK_OPTION_ALLOW_POSTPONE);
}
4 changes: 2 additions & 2 deletions src/naemon/checks_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ int check_host_dependencies(host *hst, int dependency_type);
/* adjusts current host check attempt when a check is processed */
int adjust_host_check_attempt(host *hst, int is_active);

/* ensure next check falls into check period */
void delay_host_if_next_check_is_outside_timeperiod(host *);
/* move next check into a valid check period slot */
void delay_host_check_till_next_timeperiod_slot(host *);

NAGIOS_END_DECL

Expand Down
23 changes: 9 additions & 14 deletions src/naemon/checks_service.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ static void handle_service_check_event(struct nm_event_execution_properties *evp

/* make sure this is a valid time to check the service */
if (check_time_against_period(time(NULL), temp_service->check_period_ptr) == ERROR) {
delay_service_if_next_check_is_outside_timeperiod(temp_service);
delay_service_check_till_next_timeperiod_slot(temp_service);
return;
}

Expand Down Expand Up @@ -1438,26 +1438,21 @@ static int is_service_result_fresh(service *temp_service, time_t current_time, i
}

/* ensure next check falls into check period */
void delay_service_if_next_check_is_outside_timeperiod(service *svc)
void delay_service_check_till_next_timeperiod_slot(service *svc)
{
time_t timeperiod_start = time(NULL);
time_t timeperiod_start;
time_t check_interval;

if(svc->next_check == 0) {
return;
}

if(check_time_against_period(svc->next_check, svc->check_period_ptr) == OK) {
return;
}
/* A service in a SOFT non-OK state with a configured retry interval is
 * re-checked at the retry interval; in every other case the regular check
 * interval applies.
 */
if (svc->current_state != STATE_OK && svc->state_type == SOFT_STATE && svc->retry_interval != 0.0) {
	check_interval = get_service_retry_interval_s(svc);
} else {
	check_interval = get_service_check_interval_s(svc);
}

get_next_valid_time(svc->next_check, &timeperiod_start, svc->check_period_ptr);
timeperiod_start = get_random_next_timeperiod_slot(check_interval, svc->check_period_ptr);
if(timeperiod_start == 0) {
return;
}

// add random delay, so not all checks start at the same second
timeperiod_start += ranged_urand(0, retained_scheduling_randomize_window);

log_debug_info(DEBUGL_CHECKS, 1, "delay next service check for %s - %s until check timeperiod starts: %s\n", svc->host_name, svc->description, ctime(&timeperiod_start));
schedule_service_check(svc, timeperiod_start, CHECK_OPTION_ALLOW_POSTPONE);
}
4 changes: 2 additions & 2 deletions src/naemon/checks_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ int handle_async_service_check_result(service *, check_result *);
/* Immutable, check if service is reachable */
int check_service_dependencies(service *, int);

/* ensure next check falls into check period */
void delay_service_if_next_check_is_outside_timeperiod(service *);
/* move next check into a valid check period slot */
void delay_service_check_till_next_timeperiod_slot(service *);

NAGIOS_END_DECL

Expand Down
11 changes: 6 additions & 5 deletions src/naemon/objects_timeperiod.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,9 +678,10 @@ int check_time_against_period(time_t test_time, const timeperiod *tperiod)


/*#define TEST_TIMEPERIODS_B 1*/
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperiod *tperiod);
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, const timeperiod *tperiod);

static void _get_next_invalid_time(time_t pref_time, time_t *invalid_time, timeperiod *tperiod)
/* calculate the next time this period ends */
void get_next_invalid_time(time_t pref_time, time_t *invalid_time, const timeperiod *tperiod)
{
timeperiodexclusion *temp_timeperiodexclusion = NULL;
int depth = 0;
Expand Down Expand Up @@ -795,7 +796,7 @@ static void _get_next_invalid_time(time_t pref_time, time_t *invalid_time, timep


/* Separate this out from public get_next_valid_time for testing */
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperiod *tperiod)
static void _get_next_valid_time(time_t pref_time, time_t *valid_time, const timeperiod *tperiod)
{
timeperiodexclusion *temp_timeperiodexclusion = NULL;
int depth = 0;
Expand Down Expand Up @@ -878,7 +879,7 @@ static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperio
if (check_time_against_period(earliest_time, temp_timeperiodexclusion->timeperiod_ptr) == ERROR) {
continue;
}
_get_next_invalid_time(earliest_time, &excluded_time, temp_timeperiodexclusion->timeperiod_ptr);
get_next_invalid_time(earliest_time, &excluded_time, temp_timeperiodexclusion->timeperiod_ptr);
if (!max_excluded || max_excluded < excluded_time) {
max_excluded = excluded_time;
earliest_time = excluded_time;
Expand All @@ -900,7 +901,7 @@ static void _get_next_valid_time(time_t pref_time, time_t *valid_time, timeperio


/* given a preferred time, get the next valid time within a time period */
void get_next_valid_time(time_t pref_time, time_t *valid_time, timeperiod *tperiod)
void get_next_valid_time(time_t pref_time, time_t *valid_time, const timeperiod *tperiod)
{
time_t current_time = (time_t)0L;

Expand Down
3 changes: 2 additions & 1 deletion src/naemon/objects_timeperiod.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ struct timeperiod *find_timeperiod(const char *);
void fcache_timeperiod(FILE *fp, const struct timeperiod *temp_timeperiod);

int check_time_against_period(time_t, const timeperiod *); /* check to see if a specific time is covered by a time period */
void get_next_valid_time(time_t, time_t *, timeperiod *); /* get the next valid time in a time period */
void get_next_valid_time(time_t, time_t *, const timeperiod *); /* get the next valid time in a time period */
void get_next_invalid_time(time_t, time_t *, const timeperiod *); /* get the next invalid time in a time period (aka end of the period) */

NAGIOS_END_DECL
#endif
6 changes: 6 additions & 0 deletions src/naemon/shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,12 @@ char *trim(char *c)
return(lstrip(rstrip(c)));
}

/*
 * Write the ctime_r() representation of *when into buf, without the
 * trailing newline.
 *
 * buf must provide room for at least 26 bytes, as required by ctime_r().
 * If the time cannot be converted, buf is set to the empty string.
 */
void noeol_ctime(const time_t *when, char *buf)
{
	size_t len;

	/* ctime_r() returns NULL on failure and leaves buf unspecified */
	if (ctime_r(when, buf) == NULL) {
		buf[0] = '\0';
		return;
	}

	/* only strip a real newline and never index before the buffer */
	len = strlen(buf);
	if (len > 0 && buf[len - 1] == '\n') {
		buf[len - 1] = '\0';
	}
}

/*
* given a date/time in time_t format, produce a corresponding
* date/time string, including timezone
Expand Down
1 change: 1 addition & 0 deletions src/naemon/shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ void strip(char *buffer);
char *rstrip(char *c);
char *lstrip(char *c);
char *trim(char *c);
void noeol_ctime(const time_t *, char *);
void get_datetime_string(time_t *raw_time, char *buffer,
int buffer_length, int type);
void get_time_breakdown(unsigned long raw_time, int *days, int *hours,
Expand Down
9 changes: 2 additions & 7 deletions t-tap/test_timeperiods.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,9 @@
#include "naemon/configuration.h"
#include "naemon/defaults.h"
#include "naemon/globals.h"
#include "naemon/shared.h"
#include "tap.h"

/*
 * Write the ctime_r() representation of *when into buf, without the
 * trailing newline.
 *
 * buf must provide room for at least 26 bytes, as required by ctime_r().
 * If the time cannot be converted, buf is set to the empty string.
 */
static void noeol_ctime(const time_t *when, char *buf)
{
	size_t len;

	/* ctime_r() returns NULL on failure and leaves buf unspecified */
	if (ctime_r(when, buf) == NULL) {
		buf[0] = '\0';
		return;
	}

	/* only strip a real newline and never index before the buffer */
	len = strlen(buf);
	if (len > 0 && buf[len - 1] == '\n') {
		buf[len - 1] = '\0';
	}
}

static struct timeperiod *test_get_timeperiod(const char *name)
{
struct timeperiod *tp;
Expand Down Expand Up @@ -86,7 +81,7 @@ static struct timeperiod *test_get_timeperiod(const char *name)
char ct_expect[32], ct_chosen[32], ct_when[32]; \
struct timeperiod *tp; \
tp = test_get_timeperiod(tp_name); \
_get_next_invalid_time(when, &chosen, tp); \
get_next_invalid_time(when, &chosen, tp); \
noeol_ctime(&chosen, ct_chosen); \
noeol_ctime(&t_when, ct_when); \
noeol_ctime(&t_expect, ct_expect); \
Expand Down

0 comments on commit f9b2699

Please sign in to comment.