[torqueusers] Nagios Plugins for Torque

Steve Young chemadm at hamilton.edu
Tue Apr 17 10:11:00 MDT 2007


Resent... not sure if this list likes attachments so I'm including the
check_pbs_E.pl script below


Hi,
I am using Nagios to monitor certain things. Attached is a plugin script
I use to check whether any jobs are in the E state. Back when I used PBSPro,
I needed this to notify me so that I could restart the PBS server in order
to remove stuck error jobs. Torque handles E-state jobs much better. I
also have some basic checks for the different Torque services. I'll
paste my entries below:


checkcommands.cfg
------------------

# 'check_pbs' command definition
# Plain TCP connect test (check_tcp) against port 15001 on the monitored host.
# NOTE(review): presumably the pbs_server port -- confirm against your setup.
define command{
        command_name    check_pbs
        command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15001
        }

# 'check_pbs_mom' command definition
# TCP connect test against port 15002 on the monitored host.
define command{
        command_name    check_pbs_mom
        command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15002
        }

# 'check_pbs_resmon' command definition
# TCP connect test against port 15003 on the monitored host.
define command{
        command_name    check_pbs_resmon
        command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15003
        }

# 'check_pbs_sched' command definition
# TCP connect test against port 15004 on the monitored host.
define command{
        command_name    check_pbs_sched
        command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15004
        }

# 'check_pbs_mom_globus' command definition
# TCP connect test against port 15005 on the monitored host.
define command{
        command_name    check_pbs_mom_globus
        command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15005
        }

# 'check_pbs_resmon_globus' command definition
# TCP connect test against port 15006 on the monitored host.
define command{
        command_name    check_pbs_resmon_globus
        command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15006
        }
# 'check_pbs_E.pl' command definition
# Runs the check_pbs_E.pl plugin on the monitored host through check_by_ssh
# (-t 30 is the plugin timeout). The whole command_line value must stay on
# one physical line; do not let an editor or mailer wrap it.
define command{
        command_name    check_pbs_E
        command_line    $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ -C "/usr/local/global/lib/plugins/check_pbs_E.pl"
        }
# 'restart-pbs-mom' command definition
# Event-handler command: runs the restart_pbsmom script on the monitored host
# through check_by_ssh and sudo, passing the service state, state type and
# check attempt number as its three arguments. Keep command_line on one line.
define command{
        command_name    restart-pbs-mom
        command_line    $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ -C "sudo /usr/local/global/lib/eventhandlers/restart_pbsmom $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$"
        }


services.cfg
--------------

# Service using the check_pbs command. <hostname> is a placeholder to be
# replaced with the real host name.
# NOTE(review): the *_interval values are in Nagios time units (normally
# minutes, depending on interval_length) -- confirm for your installation.
define service{
        host_name               <hostname>
        service_description     PBS
        check_command           check_pbs
        max_check_attempts      5
        normal_check_interval   5
        retry_check_interval    3
        check_period            24x7
        notification_interval   30
        notification_period     24x7
        notification_options    w,c,r
        contact_groups          admins
        }
# Service using the check_pbs_mom command. The host and hostgroup values are
# placeholders. This is the only service here with an event handler: it names
# the restart-pbs-mom command.
define service{
        host_name               <host names>
        hostgroup_name          <hostgroups too>
        service_description     PBS pbs_mom
        servicegroups           pbs_mom
        check_command           check_pbs_mom
        max_check_attempts      5
        normal_check_interval   5
        retry_check_interval    3
        check_period            24x7
        notification_interval   30
        notification_period     24x7
        notification_options    w,c,r
        contact_groups          admins
        event_handler           restart-pbs-mom
        }
# Service using the check_pbs_resmon command. The host and hostgroup values
# are placeholders.
define service{
        host_name               <hostnames>
        hostgroup_name          <hostgroup names>
        service_description     PBS pbs_resmon
        check_command           check_pbs_resmon
        max_check_attempts      5
        normal_check_interval   5
        retry_check_interval    3
        check_period            24x7
        notification_interval   30
        notification_period     24x7
        notification_options    w,c,r
        contact_groups          admins
        }
# Service using the check_pbs_sched command, defined for the host "jake".
define service{
        host_name               jake
        service_description     PBS pbs_sched
        check_command           check_pbs_sched
        max_check_attempts      5
        normal_check_interval   5
        retry_check_interval    3
        check_period            24x7
        notification_interval   30
        notification_period     24x7
        notification_options    w,c,r
        contact_groups          admins
        }


restart_pbsmom:
---------------

#!/bin/sh
#
# Event handler script for restarting the PBS MOM client service.
#
# Usage: restart_pbsmom <service state> <state type> <check attempt>
#   $1  service state: OK, WARNING, UNKNOWN or CRITICAL
#   $2  state type:    SOFT or HARD
#   $3  number of the current check attempt
#
# Note: This script only starts pbs_mom when the service is CRITICAL and
#       either the check has been retried 3 times (in a "soft" state) or
#       the service has somehow fallen into a "hard" error state.
#       It always exits 0.
#

# Recipient of the "restarted" notice. Replace the placeholder with a real
# address; it is quoted so the angle brackets are not parsed as redirections.
MAIL_TO="<email address>"

# File sent as the body of the notice. The path is relative, so it is
# resolved against whatever directory the handler is started in.
MAIL_BODY="pbs.txt"

# What state is the service in?
case "$1" in
OK)
        # The service just came back up, so don't do anything...
        ;;
WARNING)
        # We don't really care about warning states, since the service
        # is probably still running...
        ;;
UNKNOWN)
        # We don't know what might be causing an unknown error, so don't
        # do anything...
        ;;
CRITICAL)
        # Aha!  The service appears to have a problem - perhaps we
        # should restart the daemon...

        # Is this a "soft" or a "hard" state?
        case "$2" in

        # We're in a "soft" state, meaning that Nagios is in the middle
        # of retrying the check before it turns into a "hard" state and
        # contacts get notified...
        SOFT)

                # What check attempt are we on?  We don't want to restart
                # pbs_mom on the first check, because it may just be a
                # fluke!
                case "$3" in

                # Wait until the check has been tried 3 times before
                # restarting the service.  If the check fails on the 4th
                # time (after we restart the service), the state type will
                # turn to "hard" and contacts will be notified of the
                # problem.  Hopefully this will restart the service
                # successfully, so the 4th check will result in a "soft"
                # recovery.  If that happens no one gets notified because
                # we fixed the problem!
                3)
                        # printf instead of "echo -n": -n is not portable
                        # across /bin/sh implementations.
                        printf '%s' "Restarting PBS MOM service (3rd soft critical state)..."
                        # Run the pbs_mom binary directly to bring the
                        # daemon back up.
                        /usr/local/sbin/pbs_mom
                        mailx -s "Restarted PBS_MOM Critical Soft 3 state" "$MAIL_TO" < "$MAIL_BODY"
                        ;;
                esac
                ;;

        # The service somehow managed to turn into a hard error without
        # getting fixed.  It should have been restarted by the code above,
        # but for some reason it didn't.  Let's give it one last try,
        # shall we?
        # Note: Contacts have already been notified of a problem with the
        # service at this point (unless you disabled notifications for
        # this service).
        HARD)
                printf '%s' "Restarting PBS MOM service..."
                # Run the pbs_mom binary directly to bring the daemon
                # back up.
                /usr/local/sbin/pbs_mom
                mailx -s "Restarted PBS_MOM HARD state" "$MAIL_TO" < "$MAIL_BODY"
                ;;
        esac
        ;;
esac
exit 0


check_pbs_E.pl
---------------
#!/usr/bin/perl -w
#
# check_pbs_E.pl 
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# you should have received a copy of the GNU General Public License
# along with this program (or with Nagios);  if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA

# check_pbs_E.pl checks the output of the PBS command qstat for
# jobs that are hung in an error state. Errored jobs will have
# an 'E' in the 'S' (status) column of qstat.

# Tell Perl what we need to use
use strict;
use Getopt::Std;


# Exit statuses returned to Nagios. 'INVALID' is the historical key for
# exit status 3, which Nagios displays as UNKNOWN.
my %exit_codes = (
    'OK'       => 0,
    'WARNING'  => 1,
    'CRITICAL' => 2,
    'INVALID'  => 3,
);

# Set up the PBS environment
#$ENV{PBS_EXEC} = "/usr/pbs";
#$ENV{PBS_HOME} = "/var/spool/PBS";
#$ENV{PBS_SERVER} = `/usr/bsd/hostname`;

# Full path of the qstat binary that supplies the job list.
my $qstat_cmd = '/usr/local/bin/qstat';

# Number of leading lines of qstat output (column titles and the dashed
# separator) that precede the first job line.
my $header_lines = 2;

# Zero-based index of the column holding the job state; this is the fifth
# whitespace-separated field, the 'S' column.
my $state_column = 4;

# Run qstat and keep its raw output. The parsing is done in Perl rather than
# through "tail +3 | awk", because the "+3" form of tail is obsolete and is
# rejected by current GNU coreutils.
my @qstat_output = `$qstat_cmd`;

# A qstat that could not be run, or that exited non-zero, tells us nothing
# about the jobs. Report that instead of claiming there are no E-state jobs.
if ($? != 0) {
    print "PBS UNKNOWN -- unable to run $qstat_cmd.\n";
    exit $exit_codes{'INVALID'};
}

# Drop the header lines and any blank lines; what is left is one line per job.
# The range is empty when qstat printed nothing (no jobs at all).
my @job_lines = grep { /\S/ } @qstat_output[ $header_lines .. $#qstat_output ];
my $total_jobs = scalar @job_lines;

# Count the jobs whose state column is exactly 'E'.
my $error_count = 0;
for my $job_line (@job_lines) {
    # split ' ' ignores leading whitespace, like awk's default field split.
    my @fields = split ' ', $job_line;
    my $state  = $fields[$state_column];
    $error_count++ if defined $state && $state eq 'E';
}

# Figure out what return code we'll be sending. Currently
# only supporting "OK" (no errors) and "CRITICAL" (errors).
if ($error_count == 0) {
    print "PBS OK -- no jobs in E state.\n";
    exit $exit_codes{'OK'};
}

print " PBS CRITICAL -- $error_count out of $total_jobs in E state.\n";
exit $exit_codes{'CRITICAL'};



On Sun, 2007-04-15 at 19:04 +0100, Chris Vaughan wrote:
> Hi,
> 
> I was wondering if anyone has developed any Nagios plugins for Torque?  
> If so would you mind sharing?
> 
> I'm looking for something to monitor the moms and the server.  Can 
> anyone think of any other services related to torque that would be good 
> to monitor?
> 
> Thanks,
> 



More information about the torqueusers mailing list