-
Notifications
You must be signed in to change notification settings - Fork 6
/
check_griderrors.sh
executable file
·186 lines (174 loc) · 6.63 KB
/
check_griderrors.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/bin/bash
#
# ============================== SUMMARY =====================================
#
# Program : check_griderrors.sh
# Version : 0.1
# Date : March 14 2014
# Author : Dirk Doerflinger - dirk(at)doerflinger(dot)org
# Summary : This is a nagios plugin that checks the status of the queues of an
# SGE (or forks) installation
#
# Licence : MIT
#
# =========================== PROGRAM LICENSE =================================
#
# Copyright (C) 2014 Dirk Doerflinger
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# ===================== INFORMATION ABOUT THIS PLUGIN =========================
#
# This plugin checks the status of all queues of an SGE installation, warns
# when a defined number of queues have errors and logs performance data.
# The host where the script is being executed needs to be a submit or an admin
# host of the SGE installation. NRPE is recommended in this case.
#
# This program is written and maintained by:
# Dirk Doerflinger - dirk(at)doerflinger(dot)org
#
# ============================= SETUP NOTES ====================================
#
# Use NRPE.
# Copy this file to a submit or admin host of your installation.
# Adapt the paths of installation directly below this comment.
# You will need to set the base path of your SGE installation, the name of your
# SGE cell and the path to the common environment settings file of your
# installation.
# If you have overlapping queues (i.e. some nodes belong to more than one queue),
# you will have to ignore some of them; otherwise the total sums of cores will
# be wrong.
#
# Example to test, run on submit host:
#
# ./check_griderrors.sh -w 1 -c 2
#
#
# ========================= SETUP EXAMPLES ==================================
#
# define command{
# command_name check_griderrors
# command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c check_griderrors
# }
#
# nrpe.cfg:
# command[check_griderrors]=/usr/lib64/nagios/plugins/check_griderrors.sh -w 1 -c 2
#
# define service{
# use generic-service
# host_name submithost01
# service_description Check Griderrors
# check_command check_griderrors
# normal_check_interval 3
# retry_check_interval 1
# }
#
#
# ================================ REVISION ==================================
#
# 0.1 Initial release
#
# ============================================================================
# ---------------------------------------------------------------------------
# Site configuration. The values in angle brackets are placeholders and must
# be replaced (brackets included) before the script can run.
# ---------------------------------------------------------------------------
# Plugin name as shown in the help output
package=check_griderrors
# Path to SGE installation
export SGE_ROOT=<BASEPATH TO SGE INSTALLATION>
# Name of the SGE cell to be monitored
export SGE_CELL=<NAME OF THE SGE CELL>
# Import the SGE environment from the installation's common settings file
source $SGE_ROOT/<NAME OF THE INSTALLATION>/common/settings.sh
# Extension of queues; only qstat lines containing this text are evaluated
QEXT=".q"
# List of queues ignored when summing up. Separate by pipes: the value is
# used as an alternation inside an extended regular expression.
IGNORE_QUEUES="<QUEUES.q|TO.q|BE.q|IGNORED.q>"
# ---------------------------------------------------------------------------
# Parse command line options.
#
# Sets (and exports) the globals `warning` and `critical`, the numbers of
# faulty queues that trigger the respective state.
#
# Every usage error ends the script with exit code 3, the code already used
# for the "no arguments" case (Nagios reads 3 as UNKNOWN). --help exits 0.
# ---------------------------------------------------------------------------
if [[ $# -eq 0 ]]; then
  echo "No arguments provided"
  exit 3
fi

while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      echo "$package - Check faulty SGE queues"
      echo " "
      echo "Check all queues for errors and get performance data separately for each queue"
      echo " "
      echo "ATTN: The host running this script needs to be a submit or admin node, it is helpful to use NRPE"
      echo " "
      echo "$package -w <warning> -c <critical>"
      echo " "
      echo "options:"
      echo "-h, --help show brief help"
      echo "-w, --warning Number of faulty queues (not hosts!) triggering a warning"
      echo "-c, --critical Number of faulty queues (not hosts!) triggering a critical error"
      exit 0
      ;;
    -w|--warning)
      # The threshold is the word that follows the option.
      if [[ $# -lt 2 ]]; then
        echo "no warning level specified"
        exit 3
      fi
      export warning=$2
      shift 2
      ;;
    -c|--critical)
      if [[ $# -lt 2 ]]; then
        echo "no critical level specified"
        exit 3
      fi
      export critical=$2
      shift 2
      ;;
    *)
      # Without this arm an unrecognised argument is never shifted away and
      # the loop would spin forever.
      echo "Unknown argument: $1"
      exit 3
      ;;
  esac
done

# The thresholds are compared numerically further down, so reject anything
# that is missing or not a non-negative integer right here.
if [[ -z "${warning:-}" ]]; then
  echo "no warning level specified"
  exit 3
fi
if [[ -z "${critical:-}" ]]; then
  echo "no critical level specified"
  exit 3
fi
if [[ ! "$warning" =~ ^[0-9]+$ ]]; then
  echo "warning level must be a non-negative integer: $warning"
  exit 3
fi
if [[ ! "$critical" =~ ^[0-9]+$ ]]; then
  echo "critical level must be a non-negative integer: $critical"
  exit 3
fi
# ---------------------------------------------------------------------------
# Queue data collection
# ---------------------------------------------------------------------------

# Filter stdin -> stdout, dropping every line that matches one of the queues
# listed in IGNORE_QUEUES (a pipe-separated alternation). An empty list
# passes the input through unchanged; an empty alternation would otherwise
# match, and therefore drop, every line.
filter_ignored_queues() {
  if [[ -n "${IGNORE_QUEUES:-}" ]]; then
    grep -v -E -- "(${IGNORE_QUEUES})"
  else
    cat
  fi
}

# Query SGE and fill the globals used for the status decision and perfdata.
# Globals read:    QEXT, IGNORE_QUEUES
# Globals written: QUEUES            - queue names from `qconf -sql`, minus
#                                      the ignored ones
#                  num_faulty_queues - lines whose 8th qstat column is > 0
#                  used_cores        - sum of qstat column 3
#                  available_cores   - sum of qstat column 5
#                  total_cores       - sum of qstat column 6
#                  all_queue_stats   - " <queue>=<column 3>," per queue
# If qstat/qconf produce no output the counters are 0 and the lists empty.
collect_queue_stats() {
  local cluster_summary considered queue used

  # The ignore list must be expanded by the shell, hence no single quotes
  # around the pattern (see filter_ignored_queues).
  QUEUES=$(qconf -sql | filter_ignored_queues)

  # Take ONE snapshot of the cluster queue summary so that all numbers below
  # describe the same moment. -F: QEXT is literal text, not a regex.
  cluster_summary=$(qstat -r -u "all" -g c | grep -F -- "$QEXT")
  considered=$(printf '%s\n' "$cluster_summary" | filter_ignored_queues)

  # Single pass: count the faulty queues and sum up the core columns.
  # "+ 0" forces a numeric 0 instead of an empty string when nothing matched.
  read -r num_faulty_queues used_cores available_cores total_cores <<<"$(
    printf '%s\n' "$considered" | awk '
      $8 > 0 { faulty++ }
             { used += $3; avail += $5; total += $6 }
      END    { print faulty + 0, used + 0, avail + 0, total + 0 }
    '
  )"

  # Used cores per queue. The queue name is compared for equality with the
  # first column, so "all.q" does not also pick up the line of "small.q".
  all_queue_stats=''
  while read -r queue; do
    [[ -n "$queue" ]] || continue
    used=$(printf '%s\n' "$cluster_summary" \
      | awk -v q="$queue" '$1 == q { print $3 }')
    all_queue_stats+=" $queue=$used,"
  done <<<"$QUEUES"
}

collect_queue_stats
# ---------------------------------------------------------------------------
# Evaluate the number of faulty queues against the thresholds and print the
# plugin result line followed by the performance data.
#
# Reads:  num_faulty_queues, warning, critical, available_cores, used_cores,
#         total_cores, all_queue_stats
# Exit:   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = values could not be evaluated
# ---------------------------------------------------------------------------

# Counting tools may pad their output with blanks; strip them before use.
faulty_count=${num_faulty_queues//[[:space:]]/}

# Performance data. Runs of blanks/newlines are squeezed to single spaces so
# the output stays on one line without relying on unquoted expansion.
# NOTE(review): the comma separators are kept as they were; Nagios perfdata
# is usually space-separated "label=value" pairs - confirm with the consumer
# of this data before changing the format.
perfdata=$(printf '%s' "| available_cores=$available_cores, used_cores=$used_cores, total_cores=$total_cores,$all_queue_stats" \
  | tr -s ' \t\n' ' ')

# A missing or non-numeric operand would make every comparison below fail
# and the check would end up reporting OK; report that state explicitly.
if [[ ! "$faulty_count" =~ ^[0-9]+$ || ! "${warning:-}" =~ ^[0-9]+$ || ! "${critical:-}" =~ ^[0-9]+$ ]]; then
  echo "UNKNOWN: cannot evaluate queue state (faulty='$faulty_count' warning='${warning:-}' critical='${critical:-}') $perfdata"
  exit 3
fi

# `[ ... -ge ... ]` compares as decimal integers.
if [ "$faulty_count" -ge "$critical" ]; then
  echo "CRITICAL: $faulty_count queues have errors $perfdata"
  exit 2
elif [ "$faulty_count" -ge "$warning" ]; then
  echo "WARNING: $faulty_count queues have errors $perfdata"
  exit 1
else
  echo "OK: All queues are fine $perfdata"
  exit 0
fi