Code

typo: had commented use Net::SNMP for testing
[nagiosplug.git] / contrib / check_procl.sh
1 #!/bin/bash
3 #
4 # Check_procl.sh 
5
6 # Program: Process load check plugin for Nagios
7 # License : GPL
8 # Copyright (c) 2002 Jerome Tytgat (j.tytgat@sioban.net)
9 #
10 # check_procl.sh,v 1.1 2002/07/04 09:35 
11 #
12 # Description :
13 #   
14 #  This plugin checks the %cpu, %mem or cputime of one or more processes
15 #
16 # Usage :
17 #
18 #  check_procl.sh -p process1,process2,... -w a.b -c c.d --cpu 
19 #  check_procl.sh -p process1,process2,... -w a.b -c c.d --mem
20 #  check_procl.sh -p process1,process2,... -w a:b:c -c d:e:f --cputime
21 #
22 #  check_procl.sh -p %all% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
23 #  check_procl.sh -p %max% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
24 #
25 # Example :
26 #   
27 #  To know the memory used by HTTPD processes, be warned when it reaches 50% and be critical when it reaches 75%
28 #       check_procl.sh -p httpd -w 50.0 -c 75.0 --mem
29 #       > OK - total %MEM for process httpd : 46.1
30 #
31 #  To find the process which uses the most cpu time (we are under linux and using kapm, so the kapm processes are excluded), we do :
32 #       check_procl.sh -p %max% -e kapmd-idle,kapmd -w 0:1:0 -c 0:2:0 --cputime
33 #       > CRITICAL - total CPUTIME for process named : 02:32:10
34 #
35 # Tested on solaris 7/8, Linux Redhat 7.3 and Linux Suse 7.1
36 #
37 # BUGS : problems with handling time on solaris...
# Print the command-line synopsis of the plugin on stdout.
# $0 is expanded so the synopsis shows the name the script was started with.
help_usage() {
        printf '%s\n' \
                "Usage:" \
                " $0 -p <process_name1,process_name2,... | %all% | %max%>" \
                "    [-e <process_name1,process_name2,...>] -w warning -c critical < --cpu | --mem | --cputime>" \
                " $0 (-v | --version)" \
                " $0 (-h | --help)"
}
# Print the version banner, licence notice and credits on stdout,
# one line per entry of the list below.
help_version() {
        local banner_line

        for banner_line in \
                "check_procl.sh (nagios-plugins) 1.1" \
                "The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute" \
                "copies of the plugins under the terms of the GNU General Public License." \
                "For more information about these matters, see the file named COPYING." \
                "Copyright (c) 2002 Jerome Tytgat - j.tytgat@sioban.net" \
                "Greetings goes to Websurg which kindly let me took time to develop this" \
                "                  Manu Feig and Jacques Kern who were my beta testers, thanks to them !"
        do
                echo "$banner_line"
        done
}
# Verify that every command the plugin relies on can be found.
# Arguments: optional list of command names to check; when none are given
#            the plugin's own requirement list is used.
# Outputs:   names the first missing command on stdout.
# Exits:     3 as soon as one command cannot be found; returns normally
#            when all of them are available.
verify_dep() {
        local needed="bash cut egrep expr grep let ps sed sort tail test tr wc"
        local cmd

        if [ $# -gt 0 ]
        then
                needed="$*"
        fi

        # $needed is deliberately unquoted: one command name per word.
        for cmd in $needed
        do
                # Only the command name is handed to type. Any extra operand
                # that is not a command makes type fail on current bash,
                # which would flag every dependency as missing.
                if ! type "$cmd" > /dev/null 2>&1
                then
                        echo "I am missing an important component : $cmd"
                        echo "Cannot continue, sorry, try to find the missing one..."
                        exit 3
                fi
        done
}
# Name the script was started with.
myself=$0

# Stop right away when a required command is missing.
verify_dep

# -h / --help : print the banner, the synopsis and the description of every
# option, then leave with status 3.
if [ "$1" = "-h" ] || [ "$1" = "--help" ]
then
        help_version
        echo ""
        echo "This plugin will check either the cumulative %cpu, %mem or cputime"
        echo "of a process."
        echo ""
        help_usage
        echo ""
        echo "Required Arguments:"
        echo " -p, --process STRING1,STRING2,..."
        echo "    names of the processes we want to monitor,"
        echo "    you can add as many processes as you want, separated by comma,"
        echo "    they will be cumulated"
        echo " -p, --process %all%"
        echo "    The special keyword %all% will check the cumulative cpu/mem/time of all process"
        echo "    WARNING : Can be very slow on heavy loaded servers, watch your timeout !"
        echo " -p, --process %max%"
        echo "    The special keyword %max% will check the process which eat the most"
        echo "    WARNING : only select the process which eat the more, not the cumulative,"
        echo "              but return the cumulative"
        echo " -w, --warning INTEGER.INTEGER or INTEGER:INTEGER:INTEGER"
        echo "    generate warning state if the cumulated value reaches this threshold"
        echo " -c, --critical INTEGER.INTEGER or INTEGER:INTEGER:INTEGER"
        echo "    generate critical state if the cumulated value reaches this threshold"
        echo " --cpu"
        echo "    return the current cpu usage for the given process"
        echo " --mem"
        echo "    return the current memory usage for the given process"
        echo " --cputime"
        echo "    return the total cputime usage for the given process"
        echo ""
        echo "Optional Argument:"
        echo " -e, --exclude-process STRING1,STRING2,..."
        echo "    names of the processes we don't want to monitor"
        echo "    only useful when associated with %all% or %max% keywords, else ignored"
        echo "    ex : kapm-idled on linux is a process which eat memory / cputime but not really... ;-)"
        echo ""
        exit 3
fi
# -v / --version : print the version banner and leave with status 3.
case "$1" in
        -v|--version)
                help_version
                exit 3
                ;;
esac
# Refuse to go on with fewer than 7 words on the command line; "=" counts
# as a word separator.
# The arguments are printed with printf and a quoted "$*" so that they are
# neither glob-expanded nor taken for echo options (a leading -e or -n
# would otherwise disappear from the count).
if [ "$(printf '%s\n' "$*" | tr "=" " " | wc -w)" -lt 7 ]
then
        echo "Bad arguments number (need at least 7)!"
        help_usage
        exit 3
fi
# Settings filled in from the command line.
tt=0                        # measurement type: 0 = none yet, 1 = cpu, 2 = mem, 3 = cputime
process_name=""             # -p value, commas turned into "|"
exclude_process_name=""     # -e value, commas turned into "|"
wt=""                       # -w value, as given
ct=""                       # -c value, as given

# Record the measurement type; only one may be chosen per run.
# Arguments: $1 = numeric type, $2 = display label, $3 = ps column name,
#            $4 = delimiter used inside the values and thresholds
# Exits:     3 when a type has already been selected.
set_check_type() {
        if [ "$tt" -ne 0 ]
        then
                echo "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
                help_usage
                exit 3
        fi
        tt=$1
        type_arg_aff=$2
        type_arg=$3
        delim=$4
}

# Test of the command lines arguments
# Arguments: the script's command line ("$@").
# Globals written: process_name, exclude_process_name, wt, ct, tt,
#                  type_arg_aff, type_arg, delim
# Exits:     3 on an unknown option, a repeated option, an option given
#            without its value, or a second measurement type.
parse_args() {
        while [ $# -gt 0 ]
        do
                # Options that take a value must not be the last word.
                case "$1" in
                        -p|--process|-e|--exclude-process|-w|--warning|-c|--critical)
                                if [ $# -lt 2 ]
                                then
                                        echo "Missing value for argument $1"
                                        help_usage
                                        exit 3
                                fi
                                ;;
                esac

                case "$1" in
                        -p|--process)
                                if [ -n "$process_name" ]
                                then
                                        echo "Only one --process argument is useful..."
                                        help_usage
                                        exit 3
                                fi
                                shift
                                # "a,b" becomes the alternation "a|b"; done with a
                                # parameter expansion so the name is never
                                # glob-expanded or word-split.
                                process_name=${1//,/|}
                                ;;
                        -e|--exclude-process)
                                if [ -n "$exclude_process_name" ]
                                then
                                        echo "Only one --exclude-process argument is useful..."
                                        help_usage
                                        exit 3
                                fi
                                shift
                                exclude_process_name=${1//,/|}
                                ;;
                        -w|--warning)
                                if [ -n "$wt" ]
                                then
                                        echo "Only one --warning argument needed... Trying to test bad things ? :-)"
                                        help_usage
                                        exit 3
                                fi
                                shift
                                wt=$1
                                ;;
                        -c|--critical)
                                if [ -n "$ct" ]
                                then
                                        echo "Only one --critical argument needed... Trying to test bad things ? :-)"
                                        help_usage
                                        exit 3
                                fi
                                shift
                                ct=$1
                                ;;
                        --cpu)
                                set_check_type 1 "%CPU" "pcpu" "."
                                ;;
                        --mem)
                                set_check_type 2 "%MEM" "pmem" "."
                                ;;
                        --cputime)
                                set_check_type 3 "TIME" "time" ":"
                                ;;
                        *)
                                echo "Unknown argument $1"
                                help_usage
                                exit 3
                                ;;
                esac
                shift
        done
}

parse_args "$@"
# Is the process running ?
# The listing is searched for the requested name(s); the keywords %all%
# and %max% are not process names and never trigger the error.
running_procs=$(ps -e | egrep "$process_name?")
if [ -z "$running_procs" ] && [ "$process_name" != "%all%" ] && [ "$process_name" != "%max%" ]
then
        echo "WARNING: process $process_name not running !"
        exit 3
fi
# Cut of warning and critical values

# Print field $3 of the threshold $1 split on the delimiter $2.
# cut -s prints nothing when the delimiter is absent, so a threshold such
# as "50" is rejected below instead of being read as "50.50".
threshold_field() {
        printf '%s\n' "$1" | cut -s -d"$2" -f"$3"
}

# Succeed only when every argument is a non-empty run of decimal digits.
all_digits() {
        local field

        for field in "$@"
        do
                case "$field" in
                        ''|*[!0-9]*)
                                return 1
                                ;;
                esac
        done
        return 0
}

# A measurement type is mandatory: without it there is no delimiter to
# split the thresholds on.
if [ "${tt:-0}" -eq 0 ]
then
        echo "One of the arguments --cpu/--mem/--cputime is required !"
        help_usage
        exit 3
fi

wt_value1=$(threshold_field "$wt" "$delim" 1)
wt_value2=$(threshold_field "$wt" "$delim" 2)
ct_value1=$(threshold_field "$ct" "$delim" 1)
ct_value2=$(threshold_field "$ct" "$delim" 2)

# Only --cputime thresholds (a:b:c) carry a third field.
if [ "$tt" -eq 3 ]
then
        wt_value3=$(threshold_field "$wt" "$delim" 3)
        ct_value3=$(threshold_field "$ct" "$delim" 3)
else
        wt_value3=0
        ct_value3=0
fi

# Integrity check of warning and critical values
if ! all_digits "$wt_value1" "$wt_value2" "$wt_value3"
then
        echo "Bad expression in the WARNING field : $wt"
        help_usage
        exit 3
fi

if ! all_digits "$ct_value1" "$ct_value2" "$ct_value3"
then
        echo "Bad expression in the CRITICAL field : $ct"
        help_usage
        exit 3
fi
# ps line construction set...

# Lines removed from the listing in the %all% and %max% modes: built from
# the script name, the column label and, when given, the excluded names.
if [ -z "$exclude_process_name" ]
then
        skip_re="$myself|$type_arg_aff?"
else
        skip_re="$myself|$type_arg_aff|$exclude_process_name?"
fi

# Remove the blanks in front of a line that starts with a digit, so the
# value becomes the first space-separated field.
strip_indent() {
        sed "s/^ *\([0-9]\)/\1/"
}

case "$process_name" in
        %all%)
                # Values of every process left after filtering.
                psline=$(ps -eo $type_arg,comm | egrep -v "$skip_re" | strip_indent | cut -d" " -f1)
                ;;
        %max%)
                # Name on the last line of the sorted listing, then the
                # values of every process matching that name.
                pstmp=$(ps -eo $type_arg,comm | egrep -v "$skip_re" | sort | tail -1 | strip_indent | cut -d" " -f2)
                # NOTE(review): $pstmp is passed to grep unquoted, as before.
                psline=$(ps -eo $type_arg,comm | grep $pstmp | strip_indent | cut -d" " -f1)
                process_name=$pstmp
                ;;
        *)
                # Values of the processes matching the requested name(s).
                psline=$(ps -eo $type_arg,comm | egrep "$process_name?" | strip_indent | cut -d" " -f1)
                ;;
esac
# fetching the values
#
# Add up every sample listed in $psline.
#   --cpu / --mem (tt = 1 or 2) : total1 = integer part, total2 = tenths,
#                                 total3 = 0
#   --cputime     (tt = 3)      : total1 = hours, total2 = minutes,
#                                 total3 = seconds
# Accepted samples:
#   percentages : "12.3", or "100" with no fractional part (digits after
#                 the first decimal are dropped)
#   times       : "mm:ss", "hh:mm:ss" or "dd-hh:mm:ss"
# A sample in any other form is skipped, so one odd line in the ps output
# cannot wreck the totals.
# Globals read   : psline, tt
# Globals written: total1, total2, total3
sum_process_usage() {
        local time_re='^(([0-9]+)-)?(([0-9]+):)?([0-9]+):([0-9]+)$'
        local pct_re='^([0-9]+)(\.([0-9]))?[0-9]*$'
        local sample days hours minutes seconds
        local secs=0 tenths=0

        total1=0
        total2=0
        total3=0

        # $psline is deliberately unquoted: one sample per word.
        for sample in $psline
        do
                if [ "${tt:-0}" -eq 3 ]
                then
                        [[ $sample =~ $time_re ]] || continue
                        days=${BASH_REMATCH[2]:-0}
                        hours=${BASH_REMATCH[4]:-0}
                        minutes=${BASH_REMATCH[5]}
                        seconds=${BASH_REMATCH[6]}
                        # 10# forces base 10: "08" and "09" are not octal.
                        secs=$(( secs + ((10#$days * 24 + 10#$hours) * 60 + 10#$minutes) * 60 + 10#$seconds ))
                else
                        [[ $sample =~ $pct_re ]] || continue
                        tenths=$(( tenths + 10#${BASH_REMATCH[1]} * 10 + 10#${BASH_REMATCH[3]:-0} ))
                fi
        done

        if [ "${tt:-0}" -eq 3 ]
        then
                total1=$(( secs / 3600 ))
                total2=$(( secs % 3600 / 60 ))
                total3=$(( secs % 60 ))
        else
                total1=$(( tenths / 10 ))
                total2=$(( tenths % 10 ))
        fi
}

sum_process_usage
# Scale a percentage given as integer part $1 and fractional digits $2 to
# thousandths, so that fractions of different lengths compare correctly
# (.3 is larger than .25). Fractional digits beyond the third are dropped.
pct_to_milli() {
        local frac="${2}000"

        echo $(( 10#$1 * 1000 + 10#${frac:0:3} ))
}

# Convert hours $1, minutes $2 and seconds $3 to seconds, so that a
# threshold such as 0:90:0 is compared as 1:30:00.
hms_to_seconds() {
        echo $(( (10#$1 * 60 + 10#$2) * 60 + 10#$3 ))
}

# evaluation of the cumulative values vs warning and critical values
#
# A threshold is reached when the total is greater than or equal to it.
# Globals read   : tt, total1..3, wt_value1..3, ct_value1..3
# Globals written: warn, crit (0 or 1) and return_total (the total as it
#                  is displayed; left untouched when tt is not 1, 2 or 3)
evaluate_thresholds() {
        local measured warn_at crit_at

        warn=0
        crit=0

        case "$tt" in
                1|2)
                        return_total="$total1.$total2"
                        measured=$(pct_to_milli "$total1" "$total2")
                        warn_at=$(pct_to_milli "$wt_value1" "$wt_value2")
                        crit_at=$(pct_to_milli "$ct_value1" "$ct_value2")
                        ;;
                3)
                        # Each field is padded to at least two digits.
                        printf -v return_total '%02d:%02d:%02d' \
                                "$(( 10#$total1 ))" "$(( 10#$total2 ))" "$(( 10#$total3 ))"
                        measured=$(hms_to_seconds "$total1" "$total2" "$total3")
                        warn_at=$(hms_to_seconds "$wt_value1" "$wt_value2" "$wt_value3")
                        crit_at=$(hms_to_seconds "$ct_value1" "$ct_value2" "$ct_value3")
                        ;;
                *)
                        return 0
                        ;;
        esac

        if [ "$measured" -ge "$crit_at" ]
        then
                crit=1
        fi
        if [ "$measured" -ge "$warn_at" ]
        then
                warn=1
        fi
        return 0
}

evaluate_thresholds
# last check ...
# Critical reached without warning being reached: the two thresholds are
# in the wrong order.
if [ $crit -eq 1 ] && [ $warn -eq 0 ]
then
        echo "Critical value must be greater than warning value !"
        help_usage
        exit 3
fi

# Finally Inform Nagios of what we found...
# Process names are shown comma-separated, the way they were given.
shown_name=$(echo $process_name | tr "|" ",")

if [ $crit -eq 1 ]
then
        state_label="CRITICAL"
        state_code=2
elif [ $warn -eq 1 ]
then
        state_label="WARNING"
        state_code=1
else
        state_label="OK"
        state_code=0
fi

echo "$state_label - total $type_arg_aff for process $shown_name : $return_total"
exit $state_code

# Hey what are we doing here ???
exit 3