[torqueusers] Nagios Plugins for Torque

Steve Young chemadm at hamilton.edu
Tue Apr 17 09:50:48 MDT 2007


Hi,
I am using Nagios to monitor certain things. Attached is a plugin script
I use to check whether any jobs are in the E state. Back when I used PBSPro I
needed this to notify me so that I could restart the PBS server in order
to remove stuck error jobs. Torque handles E state jobs much better. I
also have some basic checks for the different Torque services. I'll
paste my entries below:


checkcommands.cfg
------------------

# check_pbs: TCP connect probe against port 15001 on the monitored host.
define command{
    command_name    check_pbs
    command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15001
}

# check_pbs_mom: TCP connect probe against port 15002 on the monitored host.
define command{
    command_name    check_pbs_mom
    command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15002
}

# check_pbs_resmon: TCP connect probe against port 15003 on the monitored host.
define command{
    command_name    check_pbs_resmon
    command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15003
}

# check_pbs_sched: TCP connect probe against port 15004 on the monitored host.
define command{
    command_name    check_pbs_sched
    command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15004
}

# check_pbs_mom_globus: TCP connect probe against port 15005 on the monitored host.
define command{
    command_name    check_pbs_mom_globus
    command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15005
}

# check_pbs_resmon_globus: TCP connect probe against port 15006 on the monitored host.
define command{
    command_name    check_pbs_resmon_globus
    command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p 15006
}
# 'check_pbs_E' command definition
# Runs the check_pbs_E.pl plugin on the monitored host over SSH (30 s timeout).
# NOTE: command_line must stay on a single line; the archived mail had wrapped
# it, which detached the -C argument from the command.
define command{
        command_name    check_pbs_E
        command_line    $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ -C "/usr/local/global/lib/plugins/check_pbs_E.pl"
        }
# 'restart-pbs-mom' command definition
# Event handler: runs the restart_pbsmom script on the monitored host over SSH
# (via sudo, 30 s timeout), passing the service state, state type and check
# attempt number as its three arguments.
# NOTE: command_line must stay on a single line; the archived mail had wrapped
# it, which dropped the remote command and its macro arguments.
define command{
        command_name    restart-pbs-mom
        command_line    $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ -C "sudo /usr/local/global/lib/eventhandlers/restart_pbsmom $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$"
        }


services.cfg
--------------

# Service "PBS": runs check_pbs against <hostname>.
define service{
    service_description     PBS
    host_name               <hostname>
    check_command           check_pbs
    check_period            24x7
    normal_check_interval   5
    retry_check_interval    3
    max_check_attempts      5
    notification_period     24x7
    notification_interval   30
    notification_options    w,c,r
    contact_groups          admins
}
# Service "PBS pbs_mom": runs check_pbs_mom on the listed hosts/hostgroups and
# triggers the restart-pbs-mom event handler on state changes.
define service{
    service_description     PBS pbs_mom
    servicegroups           pbs_mom
    host_name               <host names>
    hostgroup_name          <hostgroups too>
    check_command           check_pbs_mom
    event_handler           restart-pbs-mom
    check_period            24x7
    normal_check_interval   5
    retry_check_interval    3
    max_check_attempts      5
    notification_period     24x7
    notification_interval   30
    notification_options    w,c,r
    contact_groups          admins
}
# Service "PBS pbs_resmon": runs check_pbs_resmon on the listed hosts/hostgroups.
define service{
    service_description     PBS pbs_resmon
    host_name               <hostnames>
    hostgroup_name          <hostgroup names>
    check_command           check_pbs_resmon
    check_period            24x7
    normal_check_interval   5
    retry_check_interval    3
    max_check_attempts      5
    notification_period     24x7
    notification_interval   30
    notification_options    w,c,r
    contact_groups          admins
}
# Service "PBS pbs_sched": runs check_pbs_sched against host "jake".
define service{
    service_description     PBS pbs_sched
    host_name               jake
    check_command           check_pbs_sched
    check_period            24x7
    normal_check_interval   5
    retry_check_interval    3
    max_check_attempts      5
    notification_period     24x7
    notification_interval   30
    notification_options    w,c,r
    contact_groups          admins
}


restart_pbsmom:
---------------

#!/bin/sh
#
# Event handler script for restarting the PBS MOM client service.
#
# Usage: restart_pbsmom <service state> <state type> <check attempt>
#   $1 - service state:  OK | WARNING | UNKNOWN | CRITICAL
#   $2 - state type:     SOFT | HARD
#   $3 - number of the current check attempt
#
# Note: This script only starts pbs_mom when the check has gone CRITICAL
#       for the 3rd time in a row (still a "soft" state) or when the
#       service has fallen into a "hard" CRITICAL state.
#
# Always exits 0.
#

# Placeholder from the original posting -- replace with the real recipient.
ADMIN_EMAIL="<email address>"

# File whose contents become the body of the notification mail.  The path
# is relative, so it is resolved against the directory the handler runs in.
MAIL_BODY_FILE="pbs.txt"

# Start the pbs_mom daemon and mail the administrator about it.
#   $1 - progress message written to stdout (no trailing newline)
#   $2 - subject line for the notification mail
restart_pbs_mom() {
        # printf instead of "echo -n": -n is not portable across /bin/sh
        # implementations.
        printf '%s' "$1"
        # Launch the daemon binary directly (no init script is involved).
        /usr/local/sbin/pbs_mom
        mailx -s "$2" "$ADMIN_EMAIL" < "$MAIL_BODY_FILE"
}

# What state is the service in?
case "$1" in
OK)
        # The service just came back up, so don't do anything...
        ;;
WARNING)
        # We don't really care about warning states, since the service
        # is probably still running...
        ;;
UNKNOWN)
        # We don't know what might be causing an unknown error, so don't
        # do anything...
        ;;
CRITICAL)
        # Aha!  The service appears to have a problem - perhaps we
        # should restart the daemon...

        # Is this a "soft" or a "hard" state?
        case "$2" in

        # We're in a "soft" state, meaning that Nagios is in the middle
        # of retrying the check before it turns into a "hard" state and
        # contacts get notified...
        SOFT)
                # What check attempt are we on?  We don't want to restart
                # pbs_mom on the first check, because it may just be a fluke!
                case "$3" in

                # Wait until the check has been tried 3 times before
                # restarting the service.  Hopefully the restart succeeds,
                # so that a following check results in a "soft" recovery.
                # If that happens no one gets notified because we fixed
                # the problem!  If the checks keep failing, the state type
                # eventually turns to "hard" and contacts are notified.
                3)
                        restart_pbs_mom \
                                "Restarting PBS MOM service (3rd soft critical state)..." \
                                "Restarted PBS_MOM Critical Soft 3 state"
                        ;;
                esac
                ;;

        # The service somehow managed to turn into a hard error without
        # getting fixed.  It should have been restarted by the code above,
        # but for some reason it wasn't.  Let's give it one last try.
        # Note: Contacts have already been notified of a problem with the
        # service at this point (unless notifications are disabled for it).
        HARD)
                restart_pbs_mom \
                        "Restarting PBS MOM service..." \
                        "Restarted PBS_MOM HARD state"
                ;;
        esac
        ;;
esac
exit 0


On Sun, 2007-04-15 at 19:04 +0100, Chris Vaughan wrote: 
> Hi,
> 
> I was wondering if anyone has developed any Nagios plugins for Torque?  
> If so would you mind sharing?
> 
> I'm looking for something to monitor the moms and the server.  Can 
> anyone think of any other services related to torque that would be good 
> to monitor?
> 
> Thanks,
> 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: check_pbs_E.pl
Type: application/x-perl
Size: 1991 bytes
Desc: not available
Url : http://www.supercluster.org/pipermail/torqueusers/attachments/20070417/c969b5b1/check_pbs_E-0001.bin


More information about the torqueusers mailing list