1 #!/bin/bash
3 #
4 # Check_procl.sh
5 #
6 # Program: Process load check plugin for Nagios
7 # License : GPL
8 # Copyright (c) 2002 Jerome Tytgat (j.tytgat@sioban.net)
9 #
10 # check_procl.sh,v 1.1 2002/07/04 09:35
11 #
12 # Description :
13 #
# This plugin checks the %cpu, %mem or cputime of one or more processes
15 #
16 # Usage :
17 #
18 # check_procl.sh -p process1,process2,... -w a.b -c c.d --cpu
19 # check_procl.sh -p process1,process2,... -w a.b -c c.d --mem
20 # check_procl.sh -p process1,process2,... -w a:b:c -c d:e:f --cputime
21 #
22 # check_procl.sh -p %all% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
23 # check_procl.sh -p %max% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
24 #
25 # Example :
26 #
# To know the memory eaten by HTTPD processes, be warned when it reaches 50% and be critical when it reaches 75%
28 # check_procl.sh -p httpd -w 50.0 -c 75.0 --mem
29 # > OK - total %MEM for process httpd : 46.1
30 #
# To know the process which eats the most cpu time; as we are under linux and are using kapm, we exclude the kapmd processes :
32 # check_procl.sh -p %max% -e kapmd-idle,kapmd -w 0:1:0 -c 0:2:0 --cputime
33 # > CRITICAL - total CPUTIME for process named : 02:32:10
34 #
35 # Tested on solaris 7/8, Linux Redhat 7.3 and Linux Suse 7.1
36 #
37 # BUGS : problems with handling time on solaris...
# Print the command-line synopsis on stdout.
# The script name shown is whatever $0 holds at call time.
help_usage() {
  local prog=$0
  printf '%s\n' \
    "Usage:" \
    " $prog -p <process_name1,process_name2,... | %all% | %max%>" \
    " [-e <process_name1,process_name2,...>] -w warning -c critical < --cpu | --mem | --cputime>" \
    " $prog (-v | --version)" \
    " $prog (-h | --help)"
}
# Print the version banner, licence notice and credits on stdout.
help_version() {
  local -a banner
  banner=(
    "check_procl.sh (nagios-plugins) 1.1"
    "The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute"
    "copies of the plugins under the terms of the GNU General Public License."
    "For more information about these matters, see the file named COPYING."
    "Copyright (c) 2002 Jerome Tytgat - j.tytgat@sioban.net"
    "Greetings goes to Websurg which kindly let me took time to develop this"
    " Manu Feig and Jacques Kern who were my beta testers, thanks to them !"
  )
  printf '%s\n' "${banner[@]}"
}
# Verify that every command the plugin relies on can be resolved by the shell.
#
# Arguments: optional list of command names to check; with no arguments the
#            plugin's built-in dependency list is used.
# Outputs:   on the first missing command, a two-line diagnostic on stdout.
# Exits:     3 (Nagios UNKNOWN) on the first missing command; returns 0
#            when everything is available.
verify_dep() {
  local needed="bash cut egrep expr grep let ps sed sort tail test tr wc"
  local cmd

  if [ $# -gt 0 ]; then
    needed="$*"
  fi

  # $needed is intentionally unquoted: it is a whitespace-separated list.
  for cmd in $needed; do
    # Only the command name is passed to `type`. The previous form also
    # passed "/dev/null" as a second name, which current bash reports as
    # "not found", so the check failed even when nothing was missing.
    # Any non-zero status (not just 1) means the command is unavailable.
    if ! type "$cmd" > /dev/null 2>&1; then
      echo "I am missing an important component : $cmd"
      echo "Cannot continue, sorry, try to find the missing one..."
      exit 3
    fi
  done
}
# Remember how we were invoked; later ps filtering reads $myself.
myself=$0

# Refuse to go any further when a required tool is missing.
verify_dep

# -h / --help: version banner, short description, synopsis and the full
# option reference, then leave with the UNKNOWN status (3).
case "$1" in
  -h|--help)
    help_version
    printf '%s\n' \
      "" \
      "This plugin will check either the cumulutative %cpu, %mem or cputime" \
      "of a process." \
      ""
    help_usage
    printf '%s\n' \
      "" \
      "Required Arguments:" \
      " -p, --process STRING1,STRING2,..." \
      " names of the processes we want to monitor," \
      " you can add as much as process as you want, separated by comma," \
      " hey will be cumulated" \
      " -p, --process %all%" \
      " The special keyword %all% will check the cumulative cpu/mem/time of all process" \
      " WARNING : Can be very slow on heavy loaded servers, watch your timeout !" \
      " -p, --process %max%" \
      " The special keyword %max% will check the process which eat the most" \
      " WARNING : only select the process which eat the more, not the cumulative," \
      " but return the cumulative" \
      " -w, --warning INTEGER.INTEGER or INTERGER:INTEGER:INTEGER" \
      " generate warning state if process count is outside this range" \
      " -c, --critical INTEGER.INTEGER or INTERGER:INTEGER:INTEGER" \
      " generate critical state if process count is outside this range" \
      " --cpu" \
      " return the current cpu usage for the given process" \
      " --mem" \
      " return the current memory usage for the given process" \
      " --cputime" \
      " return the total cputime usage for the given process" \
      "" \
      "Optional Argument:" \
      " -e, --exclude-process STRING1,STRING2,..." \
      " names of the processes we want don't want to monitor" \
      " only useful when associated with %all% or %max% keywords, else ignored" \
      " ex : kapm-idled on linux is a process which eat memory / cputime but not really... ;-)" \
      ""
    exit 3
    ;;
esac
# -v / --version: print the banner only and leave with the UNKNOWN status (3).
case "$1" in
  -v|--version)
    help_version
    exit 3
    ;;
esac
# Count the command-line words, treating "=" as a separator; anything
# below 7 words cannot be a complete invocation.
arg_words=$(echo $@ | tr "=" " " | wc -w)
if [ $arg_words -lt 7 ]; then
  echo "Bad arguments number (need at least 7)!"
  help_usage
  exit 3
fi
# Parsed command-line state.
#   tt                    selected metric: 0 none, 1 --cpu, 2 --mem, 3 --cputime
#   process_name          -p value, commas turned into "|" (regex alternation)
#   exclude_process_name  -e value, same transformation
#   wt / ct               raw warning / critical thresholds
tt=0
process_name=""
exclude_process_name=""
wt=""
ct=""

# Consume the command line one option at a time. Options that take a value
# shift once themselves; the shift after the case drops the value (or the
# flag itself for the metric switches).
while [ $# -gt 0 ]; do
  case "$1" in
    -p|--process)
      if [ -n "$process_name" ]; then
        echo "Only one --process argument is useful..."
        help_usage
        exit 3
      fi
      shift
      process_name="$(echo $1 | tr "," "|")"
      ;;
    -e|--exclude-process)
      if [ -n "$exclude_process_name" ]; then
        echo "Only one --exclude-process argument is useful..."
        help_usage
        exit 3
      fi
      shift
      exclude_process_name="$(echo $1 | tr "," "|")"
      ;;
    -w|--warning)
      if [ -n "$wt" ]; then
        echo "Only one --warning argument needed... Trying to test bad things ? :-)"
        help_usage
        exit 3
      fi
      shift
      wt=$1
      ;;
    -c|--critical)
      if [ -n "$ct" ]; then
        echo "Only one --critical argument needed... Trying to test bad things ? :-)"
        help_usage
        exit 3
      fi
      shift
      ct=$1
      ;;
    --cpu)
      # A metric was already chosen: refuse the second one.
      if [ $tt -ne 0 ]; then
        echo "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
        help_usage
        exit 3
      fi
      tt=1
      type_arg_aff="%CPU"
      type_arg="pcpu"
      delim="."
      ;;
    --mem)
      if [ $tt -ne 0 ]; then
        echo "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
        help_usage
        exit 3
      fi
      tt=2
      type_arg_aff="%MEM"
      type_arg="pmem"
      delim="."
      ;;
    --cputime)
      if [ $tt -ne 0 ]; then
        echo "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
        help_usage
        exit 3
      fi
      tt=3
      type_arg_aff="TIME"
      type_arg="time"
      delim=":"
      ;;
    *)
      echo "Unknown argument $1"
      help_usage
      exit 3
      ;;
  esac
  shift
done
# Bail out early when a named process cannot be found in the process table.
# The %all% and %max% keywords are not process names, so they skip the test.
# NOTE(review): the message says WARNING while the exit status is 3
# (UNKNOWN) - confirm which one is intended.
running=$(ps -e | egrep "$process_name?")
if [ -z "$running" ] && [ "$process_name" != "%all%" ] && [ "$process_name" != "%max%" ]; then
  echo "WARNING: process $process_name not running !"
  exit 3
fi
# Split the warning (-w) and critical (-c) thresholds into numeric fields
# and validate them.
#
# Reads:  wt, ct (raw thresholds), delim ("." or ":"), tt (metric selector).
# Writes: wt_value1..3 and ct_value1..3; the third field is only parsed for
#         --cputime (tt=3) and is forced to 0 otherwise.
# Exits:  3 (UNKNOWN) with a usage reminder on any malformed threshold.

# Without a metric option there is no delimiter to split on; fail with a
# clear message instead of letting cut complain about an empty delimiter.
if [ -z "$delim" ]; then
  echo "One of the arguments --cpu/--mem/--cputime is required !"
  help_usage
  exit 3
fi

# cut -s suppresses input that lacks the delimiter. Without it cut echoes
# the whole value for every requested field, so "-w 50" was silently
# accepted as 50.50 and "-w 5" with --cputime as 5:5:5.
wt_value1=$(printf '%s\n' "$wt" | cut -s -d"$delim" -f1)
wt_value2=$(printf '%s\n' "$wt" | cut -s -d"$delim" -f2)
ct_value1=$(printf '%s\n' "$ct" | cut -s -d"$delim" -f1)
ct_value2=$(printf '%s\n' "$ct" | cut -s -d"$delim" -f2)

if [ "$tt" -eq 3 ]; then
  wt_value3=$(printf '%s\n' "$wt" | cut -s -d"$delim" -f3)
  ct_value3=$(printf '%s\n' "$ct" | cut -s -d"$delim" -f3)
else
  wt_value3=0
  ct_value3=0
fi

# Integrity check: every field must be present and made of digits only.
for field in "$wt_value1" "$wt_value2" "$wt_value3"; do
  case "$field" in
    ''|*[!0-9]*)
      echo "Bad expression in the WARNING field : $wt"
      help_usage
      exit 3
      ;;
  esac
done

for field in "$ct_value1" "$ct_value2" "$ct_value3"; do
  case "$field" in
    ''|*[!0-9]*)
      echo "Bad expression in the CRITICAL field : $ct"
      help_usage
      exit 3
      ;;
  esac
done
# Build $psline: the whitespace-separated list of metric values (one per
# matching process) that the accumulation loop will sum.
#
# Reads:  process_name, exclude_process_name, type_arg (ps column keyword),
#         type_arg_aff (that column's header text), myself (script's $0).
# Writes: psline; for %max% also pstmp and process_name (replaced by the
#         command name of the top consumer).

# ps prints only the command name in the "comm" column, so the script must
# be filtered out by its basename; the full invocation path held in
# $myself never matches a comm line.
self_name=${myself##*/}

# Lines to drop: this script itself, the ps header, and any excluded names.
if [ -z "$exclude_process_name" ]; then
  skip_pattern="$self_name|$type_arg_aff?"
else
  skip_pattern="$self_name|$type_arg_aff|$exclude_process_name?"
fi

# Strips the blanks ps uses to right-align the numeric column, so that
# cut sees the value as field 1 and the command name as field 2.
strip_pad='s/^ *\([0-9]\)/\1/'

case "$process_name" in
  %all%)
    psline=$(ps -eo $type_arg,comm | egrep -v "$skip_pattern" | sed "$strip_pad" | cut -d" " -f1)
    ;;
  %max%)
    # Find the command name on the line that sorts last, then collect
    # the values of every process matching that name.
    pstmp=$(ps -eo $type_arg,comm | egrep -v "$skip_pattern" | sort | tail -1 | sed "$strip_pad" | cut -d" " -f2)
    if [ -n "$pstmp" ]; then
      psline=$(ps -eo $type_arg,comm | grep "$pstmp" | sed "$strip_pad" | cut -d" " -f1)
    else
      # Nothing survived the filter; an empty grep pattern would match
      # every line, header included.
      psline=""
    fi
    process_name=$pstmp
    ;;
  *)
    psline=$(ps -eo $type_arg,comm | egrep "$process_name?" | sed "$strip_pad" | cut -d" " -f1)
    ;;
esac
# Running totals. For --cpu/--mem: total1 = integer part, total2 = tenths.
# For --cputime: total1 = hours, total2 = minutes, total3 = seconds.
total1=0
total2=0
total3=0

# split_ps_value VALUE KIND
# Split one ps value into the globals value1, value2 and value3 as plain
# decimal integers (leading zeros removed).
#   KIND 3 (cputime): accepts [DD-]HH:MM:SS and the short MM:SS form;
#                     days are folded into the hours field.
#   other KIND:       accepts N.M, or a bare N (treated as N.0).
# Returns 1 without touching the totals when VALUE is not numeric.
split_ps_value() {
  local raw=$1 kind=$2 days=0 rest

  value1=0
  value2=0
  value3=0

  if [ "$kind" = "3" ]; then
    # Long-running processes are printed as DD-HH:MM:SS.
    case "$raw" in
      *-*)
        days=${raw%%-*}
        raw=${raw#*-}
        ;;
    esac
    # Special case for solaris - short times come as MM:SS.
    case "$raw" in
      *:*:*) ;;
      *:*) raw="00:$raw" ;;
      *) return 1 ;;
    esac
    value1=${raw%%:*}
    rest=${raw#*:}
    value2=${rest%%:*}
    value3=${rest#*:}
  else
    case "$raw" in
      *.*)
        value1=${raw%%.*}
        value2=${raw#*.}
        ;;
      *)
        # No fractional part (e.g. "105" for >= 100 %CPU): the value must
        # not be counted a second time as tenths.
        value1=$raw
        value2=0
        ;;
    esac
  fi

  # Reject anything that is not purely digits.
  case "$days$value1$value2$value3" in
    *[!0-9]*) return 1 ;;
  esac

  # 10# forces base 10 so that "08" and "09" are not read as octal.
  value1=$(( 10#${days:-0} * 24 + 10#${value1:-0} ))
  value2=$(( 10#${value2:-0} ))
  value3=$(( 10#${value3:-0} ))
  return 0
}

# add_ps_value VALUE KIND
# Add one ps value to total1/total2/total3 and normalise the carries
# (60 s -> 1 min, 60 min -> 1 h for cputime; 10 tenths -> 1 unit otherwise).
# Returns 1 when VALUE cannot be parsed; the totals are then left unchanged.
add_ps_value() {
  split_ps_value "$1" "$2" || return 1

  total1=$(( total1 + value1 ))
  total2=$(( total2 + value2 ))
  total3=$(( total3 + value3 ))

  # Division/modulo carries stay correct even when a field overflows by
  # more than one unit.
  if [ "$2" = "3" ]; then
    total2=$(( total2 + total3 / 60 ))
    total3=$(( total3 % 60 ))
    total1=$(( total1 + total2 / 60 ))
    total2=$(( total2 % 60 ))
  else
    total1=$(( total1 + total2 / 10 ))
    total2=$(( total2 % 10 ))
  fi
  return 0
}

# fetching the values
for i in $psline; do
  add_ps_value "$i" "$tt" || echo "Ignoring unparsable ps value : $i" >&2
done
# Flags raised when the cumulative value reaches a threshold.
warn=0
crit=0

# Compare the totals with the thresholds field by field, most significant
# first, and build the human-readable value ($return_total) for the report.
case "$tt" in
  1|2)
    # %CPU / %MEM: "integer.tenths"
    return_total="$total1.$total2"
    if [ $total1 -gt $ct_value1 ]; then
      crit=1
    fi
    if [ $total1 -eq $ct_value1 -a $total2 -ge $ct_value2 ]; then
      crit=1
    fi
    if [ $total1 -gt $wt_value1 ]; then
      warn=1
    fi
    if [ $total1 -eq $wt_value1 -a $total2 -ge $wt_value2 ]; then
      warn=1
    fi
    ;;
  3)
    # CPU time: pad single-digit fields with a leading zero for display.
    hh=$total1
    mm=$total2
    ss=$total3
    if [ ${#hh} -eq 1 ]; then hh="0$hh"; fi
    if [ ${#mm} -eq 1 ]; then mm="0$mm"; fi
    if [ ${#ss} -eq 1 ]; then ss="0$ss"; fi
    return_total="$hh:$mm:$ss"

    if [ $total1 -gt $ct_value1 ]; then
      crit=1
    fi
    if [ $total1 -eq $ct_value1 -a $total2 -gt $ct_value2 ]; then
      crit=1
    fi
    if [ $total1 -eq $ct_value1 -a $total2 -eq $ct_value2 -a $total3 -ge $ct_value3 ]; then
      crit=1
    fi
    if [ $total1 -gt $wt_value1 ]; then
      warn=1
    fi
    if [ $total1 -eq $wt_value1 -a $total2 -gt $wt_value2 ]; then
      warn=1
    fi
    if [ $total1 -eq $wt_value1 -a $total2 -eq $wt_value2 -a $total3 -ge $wt_value3 ]; then
      warn=1
    fi
    ;;
esac
# Sanity check: reaching the critical threshold without reaching the
# warning one means the thresholds were given in the wrong order.
if [ $crit -eq 1 ] && [ $warn -eq 0 ]; then
  echo "Critical value must be greater than warning value !"
  help_usage
  exit 3
fi

# Translate the flags into the Nagios state label and exit status.
if [ $crit -eq 1 ]; then
  state="CRITICAL"
  rc=2
elif [ $warn -eq 1 ]; then
  state="WARNING"
  rc=1
else
  state="OK"
  rc=0
fi

# Show the process list the way the user typed it (commas, not "|").
shown_name=$(echo $process_name | tr "|" ",")
echo "$state - total $type_arg_aff for process $shown_name : $return_total"
exit $rc

# Not reachable; kept as a safety net returning UNKNOWN.
exit 3