Tag Archives: Nagios

Monitor failed SQL jobs in Nagios

This script can be used to monitor for failed SQL agent jobs in Nagios.

It requires python-argparse and python-pymssql which can be installed like this in debian Linux:

sudo yum install pymssql

Sudo yum install python-argparse

Script is then run like this:

./check_mssql_job_history.py -H server1 -U username -P password -c 1

Hera are oprtional parameters that can be used.

usage: check_mssql_job_history.py [-h] [-t TIMEOUT]

[–query-timeout QUERY_TIMEOUT] [-w WARNING]

[-c CRITICAL] -H HOST [-p PORT] -U USER -P

PASSWORD [-j JOB] [-x EXCLUDE] [-l] [-v]

Check a SQL Server for failed jobs – results based on state of last run for

all jobs or for a specific job if specified

optional arguments:

-h, –help show this help message and exit

-t TIMEOUT, –timeout TIMEOUT

Timeout for connection and login in seconds

–query-timeout QUERY_TIMEOUT

Query timeout in seconds

-w WARNING, –warning WARNING

Failed jobs before a warning alert is generated

-c CRITICAL, –critical CRITICAL

Failed jobs before a critical alert is generated

-H HOST, –host HOST Host name or IP address to connect to

-p PORT, –port PORT SQL Server port number (only change if you know you

need to)

-U USER, –user USER User name to connect as (does not support Windows

built in or Active Directory accounts)

-P PASSWORD, –password PASSWORD

Password to for user you are authenticating as

-j JOB, –job JOB A comma seperate list of jobs to check instead of all

enabled jobs

-x EXCLUDE, –exclude EXCLUDE

A comma seperated list of jobs not to check

-l, –list This will list all jobs in on your server. This does

not return a nagios check and is used for setup and

debugging

-v, –verbose This shows the Transaction SQL code that will be

executed to help debug

#!/usr/bin/env python

# Written by Joseph "jetole" Tole on 10/31/11

# This is a simple script because I saw the idea of monitoring failed jobs at

# <a href="http://goo.gl/qDqGL">http://goo.gl/qDqGL</a> (shortened Nagios Exchange URL) however I really didn't

# want to implement an unnecessarily complex, multi part, compiled java program

# on my nagios hosts to perform this same check. I added a couple features as

# well such as listing the failed jobs and the time they had last run as well

# as allowing you to specify a single job to check which will return

# OK|CRITICAL for that job alone.

import sys

import argparse

import pymssql

# 0 - OK

# 1 - Warning

# 2 - Critical

# 3 - Unknown

def nagios_exit(exit_type, msg):

try:

exit_code=int(exit_type)

if exit_code &lt; 0 or exit_code &gt; 2:

exit_code = 3

except ValueError:

exit_code=3

if exit_code == 0:

status = "OK"

elif exit_code == 1:

status = "WARNING"

elif exit_code == 2:

status = "CRITICAL"

else:

status = "UNKNOWN"

print "%s - %s" % (status, msg)

sys.exit(exit_code)

def run_datetime(run_date, run_time):

run_date = str(run_date)

run_time = str("%06d" % (int(run_time)))

run_year = run_date[0:4]

run_month = run_date[4:6]

run_day = run_date[6:8]

run_hour = run_time[0:2]

run_minute = run_time[2:4]

run_second = run_time[4:6]

return "%s/%s/%s %s:%s:%s" % (run_month, run_day, run_year, run_hour, run_minute, run_second)

parser = argparse.ArgumentParser(description="Check a SQL Server for failed jobs - results based on state of last run for all jobs or for a specific job if specified")

parser.add_argument("-t", "--timeout", action="store", type=int, help="Timeout for connection and login in seconds", default="60", dest="timeout")

parser.add_argument("--query-timeout", action="store", type=int, help="Query timeout in seconds", default="60", dest="query_timeout")

parser.add_argument("-w", "--warning", action="store", type=int, help="Failed jobs before a warning alert is generated", default="0", dest="warning")

parser.add_argument("-c", "--critical", action="store", type=int, help="Failed jobs before a critical alert is generated", default="1", dest="critical")

parser.add_argument("-H", "--host", action="store", help="Host name or IP address to connect to", required=True, dest="host")

parser.add_argument("-p", "--port", action="store", type=int, help="SQL Server port number (only change if you know you need to)", default=1433, dest="port")

parser.add_argument("-U", "--user", action="store", help="User name to connect as (does not support Windows built in or Active Directory accounts)", required=True, dest="user")

parser.add_argument("-P", "--password", action="store", help="Password to for user you are authenticating as", required=True, dest="password")

parser.add_argument("-j", "--job", action="store", help="A comma seperate list of jobs to check instead of all enabled jobs", dest="job");

parser.add_argument("-x", "--exclude", action="store", help="A comma seperated list of jobs not to check", dest="exclude");

parser.add_argument("-l", "--list", action="store_true", help="This will list all jobs in on your server. This does not return a nagios check and is used for setup and debugging", dest="list_jobs")

parser.add_argument("-v", "--verbose", action="store_true", help="This shows the Transaction SQL code that will be executed to help debug", dest="verbose")

results = parser.parse_args()

connect_host = results.host

if results.port != 1433:

connect_host += ":%s" % (str(results.port))

try:

conn = pymssql.connect(user = results.user, password = results.password, host = connect_host, timeout = results.query_timeout, login_timeout = results.timeout)

except:

nagios_exit(3, "Unable to connect to SQL Server")

cur = conn.cursor()

if results.list_jobs:

tsql_cmd = """ SELECT [name], [enabled]

FROM [msdb]..[sysjobs]"""

if results.verbose:

print "%s\n" % (tsql_cmd)

cur.execute(tsql_cmd)

rows = cur.fetchall()

print "Jobs on %s" % (results.host)

print "\"-\" at the begining means the job is disabled\n"

for row in rows:

if int(row[1]) == 1:

print " %s" % (row[0])

else:

print "- %s" % (row[0])

sys.exit()

tsql_cmd = """SELECT [j].[name], [h].[run_date], [h].[run_time]

FROM [msdb]..[sysjobs] [j]

INNER JOIN [msdb]..[sysjobhistory] [h] ON [j].[job_id] = [h].[job_id]

INNER JOIN (

SELECT [job_id], MAX(instance_id) AS max_instance_id

FROM [msdb]..[sysjobhistory]

GROUP BY [job_id]

) [tmp_sjh] ON [h].[job_id] = [tmp_sjh].[job_id] AND [h].[instance_id] = [tmp_sjh].[max_instance_id]

WHERE [j].[enabled] = 1

AND [h].[run_status] = 0"""

if results.job:

tsql_cmd += "\nAND (\n\t[j].[name] = '%s'" % (results.job.split(',')[0].strip())

if len(results.job.split(',')) &gt; 1:

for x in results.job.split(',')[1:]:

tsql_cmd += "\n\tOR [j].[name] = '%s'" % (x.strip())

tsql_cmd += "\n)"

elif results.exclude:

for x in results.exclude.split(','):

tsql_cmd += "\nAND [j].[name] != '%s'" % (x.strip())

if results.verbose:

print "%s\n" % (tsql_cmd)

cur.execute(tsql_cmd)

rows = cur.fetchall()

rowcount = cur.rowcount

if rowcount == 0:

nagios_exit(0, "All jobs completed successfully on their last run")

else:

failed_stats = "Number of failed jobs: %d - Failed Jobs: " % (rowcount)

for row in rows:

failed_stats += "%s last run at %s, " % (row[0], run_datetime(row[1], row[2]))

failed_stats = failed_stats.rstrip(', ')

if rowcount &gt;= results.warning and rowcount &lt; results.critical:

nagios_exit(1, failed_stats)

elif rowcount &lt; results.warning:

nagios_exit(0, "%d failed jobs is below the warning count specified." % (rowcount))

elif rowcount &gt;= results.critical:

nagios_exit(2, failed_stats)

else:

nagios_exit(3, "This should never appear")
Advertisements

Monitor Sql server blocking sessions in Nagios

Here is a script that uses Freetds to monitor sql server blocking it can be used by Nagios to trigger alarms.

#Script is designed to be used by Nagios. It checks for blocking sessions in MSSQL.

#

# Requirements:

#

# FreeTDS 6.0+ (http://www.freetds.org/)

#

# Original check_mssql.sh written by Tom De Blende (tom.deblende@village.uunet.be) in 2003.

# Changed script to check for blocking sessions in a database by Staffan.olofsson@b3it.se in 2013

#

# Script takes 4 parameters 1. Servername 2.username 3.password 4. 2000 (for SQL 2000 and up)

# Freedtds has to be configured with connection information

#

# Current threshold for blocking is 35000 ms. This can be changed in the SQL code below.

#

# You might want to change these values:

tsqlcmd=’/usr/bin/tsql’

catcmd=`which cat`

grepcmd=`which grep`

rmcmd=`which rm`

mktempcmd=`which mktemp`

wccmd=`which wc`

sedcmd=`which sed`

trcmd=`which tr`

uniqcmd=`which uniq`

###################################################################################################################

hostname=$1

usr=$2

pswd=$3

srv=$4

if [ ! "$#" == "4" ]; then

echo -e "\nYou did not supply enough arguments. \nUsage: $0 <host> <username> <password> <version> \n \n$0 checks Microsoft SQL Server connectivity."

elif [ $tsqlcmd == "" ]; then

echo -e "tsql not found! Please verify you have a working version of tsql (included in the FreeTDS version 6.0+) and enter the full path in the script."

fi

exit="3"

# Creating the command file that contains the sql statement that has to be run on the SQL server.

tmpfile=`$mktempcmd /tmp/$hostname.XXXXXX`

if [ $srv == "7" ]; then

spid=7

elif [ $srv == "2000" ]; then

spid=50

else

echo -e "$srv is not a supported MS SQL Server version!" && exit "3"

fi

echo -e "SELECT host_name,program_name,login_time,login_name,convert(nvarchar(4000), [text]),wait_duration_ms

FROM sys.dm_os_waiting_tasks w

INNER JOIN sys.dm_exec_sessions s

ON w.session_id = s.session_id

INNER JOIN sys.dm_exec_requests r

ON s.session_id = r.session_id

OUTER APPLY sys.dm_exec_sql_text (r.sql_handle) t

WHERE s.is_user_process = 1

and wait_duration_ms >35000\ngo" > $tmpfile

# Running tsql to get the results back.

resultfile=`$mktempcmd /tmp/$hostname.XXXXXX`

errorfile=`$mktempcmd /tmp/$hostname.XXXXXX`

$tsqlcmd -S $hostname -U $usr -P $pswd < $tmpfile 2>$errorfile > $resultfile

$grepcmd -q "Login failed for user" $errorfile

if [ "$?" == "0" ]; then

$rmcmd -f $tmpfile $resultfile $errorfile;

echo CRITICAL – Could not make connection to SQL server. Login failed.;

exit 2;

fi

$grepcmd -q "There was a problem connecting to the server" $errorfile

if [ "$?" == "0" ]; then

$rmcmd -f $tmpfile $resultfile $errorfile;

echo CRITICAL – Could not make connection to SQL server. Incorrect server name or SQL service not running.;

exit 2;

fi

resultfileln=`$catcmd $resultfile | $wccmd -l | $sedcmd ‘s/ //g’`

if [ "$resultfileln" == "2" ]; then

$rmcmd -f $tmpfile $resultfile $errorfile;

echo CRITICAL – Could not make connection to SQL server. No data received from host.;

exit 2;

else

nmbr=`$catcmd $resultfile | $grepcmd -v locale | $grepcmd -v charset| $grepcmd -v 1\> | $sedcmd ‘/^$/d’ | $sedcmd ‘s/ //g’ | $wccmd -l | sed ‘s/ //g’`

hosts=`$catcmd $resultfile | $grepcmd -v locale | $grepcmd -v charset| $grepcmd -v 1\> | $sedcmd ‘/^$/d’ | $sedcmd ‘s/ //g’ | $uniqcmd -c | $trcmd \\\n , | $sedcmd ‘s/,$/./g’ | $sedcmd ‘s/,/, /g’ | $sedcmd ‘s/ //g’ | $trcmd \\\t " " | $sedcmd ‘s/ \./\./g’ | $sedcmd ‘s/ ,/,/g’`

$rmcmd -f $tmpfile $resultfile;

if [ "$nmbr" == "0" ]; then

echo "OK – MS SQL Server $srv has $nmbr blocks: $hosts" | sed ‘s/: $/./g’;

else

echo "$hosts $nmbr"

fi

exit 0;

fi

# Cleaning up.

$catcmd $tmpfile

$catcmd $resultfile

$catcmd $errorfile

$rmcmd -f $tmpfile $resultfile $errorfile

echo $stdio

exit $exit