监控脚本:
[root@node10 chapter-10]# python nagios_check.py critical
Status is CRITICAL
[root@node10 chapter-10]# cat nagios_check.py
import sys,json,httplib,base64
# Nagios plugin exit-code demo: maps a status keyword given on the command
# line to the message and exit code a Nagios plugin is expected to return.
# Usage: python nagios_check.py <warning|critical|unknown|anything-else>

# (message, exit code) for every recognised keyword. "unknow" is the spelling
# this script originally matched; it is kept as an alias so existing
# invocations still work, while the correctly spelled "unknown" no longer
# falls through to the OK branch.
STATUS_TABLE = {
    "warning": ("Status is WARN", 1),
    "critical": ("Status is CRITICAL", 2),
    "unknown": ("Status is UNKNOWN", 3),
    "unknow": ("Status is UNKNOWN", 3),
}


def classify_status(status):
    """Return the (message, exit_code) pair for a status keyword.

    Matching is case-insensitive. Any keyword that is not in STATUS_TABLE is
    reported as OK with exit code 0.
    """
    return STATUS_TABLE.get(status.lower(), ("Status is OK", 0))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # A missing argument used to raise IndexError; the traceback's exit
        # status 1 is the same code this script uses for WARNING.
        print("UNKNOWN: usage: nagios_check.py <status>")
        sys.exit(3)
    message, code = classify_status(sys.argv[1])
    print(message)
    sys.exit(code)
使用AMQP模拟检测来确认MQ是否运行
测试:
[root@node10 chapter-10]# python amqp_ping_check.py localhost:5672 / admin admin
OK: Connect to localhost:5672 successfull.
[root@node10 chapter-10]# python amqp_ping_check.py localhost:5672 / admin admin
CRITICAL:Could not connect to localhost:5672!
代码:
[root@node10 chapter-10]# cat amqp_ping_check.py
import sys,pika
# Nagios check: opens an AMQP connection and a channel on the broker to
# confirm that it accepts clients.
# Usage: python amqp_ping_check.py server:port vhost username password
EXIT_OK = 0
EXIT_WARNING = 1
EXIT_CRITICAL = 2
EXIT_UNKNOWN = 3

try:
    server, port = sys.argv[1].split(":")
    port = int(port)
    vhost, username, password = sys.argv[2:5]
except (IndexError, ValueError):
    # Missing or malformed arguments are a plugin usage problem, not a broker
    # failure, so report UNKNOWN instead of dying with a traceback.
    print("UNKNOWN: usage: amqp_ping_check.py server:port vhost username password")
    sys.exit(EXIT_UNKNOWN)

creds_broker = pika.PlainCredentials(username, password)
# The port from the command line is passed explicitly; previously it was only
# used in the output message and the connection always went to pika's default
# port.
conn_params = pika.ConnectionParameters(
    host=server,
    port=port,
    virtual_host=vhost,
    credentials=creds_broker,
)

try:
    conn_broker = pika.BlockingConnection(conn_params)
    conn_broker.channel()
    # Release the connection instead of leaving it for the broker to reap.
    conn_broker.close()
except Exception:
    # Any failure while connecting (refused, bad credentials, bad vhost, ...)
    # is reported as CRITICAL.
    print("CRITICAL:Could not connect to %s:%s!" % (server, port))
    sys.exit(EXIT_CRITICAL)

print("OK: Connect to %s:%s successfull." % (server, port))
sys.exit(EXIT_OK)
通过REST API来检测MQ
测试:
[root@node10 chapter-10]# python api_ping_check.py localhost:15672 / admin admin
OK: Broker alive: {"status":"ok"}
[root@node10 chapter-10]# cat api_ping_check.py
import sys,json,httplib,urllib,base64,socket
# Nagios check: calls the management API's aliveness-test endpoint for a
# vhost and reports whether the broker answered with a success status.
# Usage: python api_ping_check.py server:port vhost username password
#
# Note: the AMQP/pika connection code that used to follow the final exit was
# unreachable and referenced a module this script never imports; it has been
# removed.
EXIT_OK = 0
EXIT_WARNING = 1
EXIT_CRITICAL = 2
EXIT_UNKNOWN = 3

# Upper bound on how long the check may block on the socket.
HTTP_TIMEOUT_SECONDS = 10

try:
    server, port = sys.argv[1].split(":")
    port = int(port)
    vhost, username, password = sys.argv[2:5]
except (IndexError, ValueError):
    # Bad invocation is a plugin usage problem: report UNKNOWN, not a traceback.
    print("UNKNOWN: usage: api_ping_check.py server:port vhost username password")
    sys.exit(EXIT_UNKNOWN)

# safe="" so that the default vhost "/" is percent-encoded into the path.
path = "/api/aliveness-test/%s" % urllib.quote(vhost, safe="")
credentials = base64.b64encode("%s:%s" % (username, password))
headers = {
    "Content-Type": "application/json",
    "Authorization": "Basic " + credentials,
}

conn = httplib.HTTPConnection(server, port, timeout=HTTP_TIMEOUT_SECONDS)
try:
    conn.request("GET", path, "", headers)
    # Reading the reply is inside the try as well: a reset, timeout or
    # malformed response must map to CRITICAL rather than escape as a
    # traceback.
    response = conn.getresponse()
    status = response.status
    body = response.read()
except (socket.error, httplib.HTTPException):
    print("CRITICAL: Could not connect to %s:%s" % (server, port))
    sys.exit(EXIT_CRITICAL)
finally:
    conn.close()

if status > 299:
    print("CRITICAL: Broker not alive: %s" % body)
    sys.exit(EXIT_CRITICAL)

print("OK: Broker alive: %s" % body)
sys.exit(EXIT_OK)
监控队列配置是否被修改
演示:
[root@node10 chapter-10]# python api_config_file_modify_check.py localhost:15672 / admin admin backup_orders true true
WARN:Queue 'backup_orders' - auto_delete flag is NOT True.
[root@node10 chapter-10]# python api_config_file_modify_check.py localhost:15672 / admin admin backup_orders false true
OK: Queue backup_orders configured correctly.
[root@node10 chapter-10]# python api_config_file_modify_check.py localhost:15672 / admin admin backup_orders false false
WARN:Queue 'backup_orders' - durable flag is NOT False.
代码:
[root@node10 chapter-10]# cat api_config_file_modify_check.py
import sys,json,httplib,urllib,base64,socket
# Nagios check: fetches a queue's definition from the management API and
# verifies that its auto_delete and durable flags match the expected values.
# Usage: python api_config_file_modify_check.py server:port vhost username \
#            password queue_name auto_delete(true|false) durable(true|false)
EXIT_OK = 0
EXIT_WARNING = 1
EXIT_CRITICAL = 2
EXIT_UNKNOWN = 3

# Upper bound on how long the check may block on the socket.
HTTP_TIMEOUT_SECONDS = 10


def check_queue_flags(queue, queue_name, auto_delete, durable):
    """Compare a queue definition against the expected flags.

    queue is the decoded JSON object returned by the API for the queue; it
    must contain "auto_delete" and "durable". Returns (exit_code, message).
    auto_delete is checked first, so when both flags differ only the
    auto_delete mismatch is reported.
    """
    if queue["auto_delete"] != auto_delete:
        return (EXIT_WARNING,
                "WARN:Queue '%s' - auto_delete flag is NOT %s." % (queue_name, auto_delete))
    if queue["durable"] != durable:
        return (EXIT_WARNING,
                "WARN:Queue '%s' - durable flag is NOT %s." % (queue_name, durable))
    return (EXIT_OK, "OK: Queue %s configured correctly." % queue_name)


def main(argv):
    """Run the check for the given argument vector and return the exit code."""
    try:
        server, port = argv[1].split(":")
        port = int(port)
        vhost, username, password, queue_name = argv[2:6]
        # "true"/"false" in any letter case are decoded as JSON booleans.
        auto_delete = json.loads(argv[6].lower())
        durable = json.loads(argv[7].lower())
    except (IndexError, ValueError):
        print("UNKNOWN: usage: api_config_file_modify_check.py server:port vhost "
              "username password queue_name auto_delete durable")
        return EXIT_UNKNOWN

    # safe="" on BOTH path segments: a "/" inside the vhost or the queue name
    # must be percent-encoded, otherwise it is read as a path separator and a
    # different resource is requested.
    path = "/api/queues/%s/%s" % (urllib.quote(vhost, safe=""),
                                  urllib.quote(queue_name, safe=""))
    credentials = base64.b64encode("%s:%s" % (username, password))
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Basic " + credentials,
    }

    conn = httplib.HTTPConnection(server, port, timeout=HTTP_TIMEOUT_SECONDS)
    try:
        conn.request("GET", path, "", headers)
        # Reading the reply is inside the try too, so a reset or malformed
        # response is reported as UNKNOWN instead of escaping as a traceback.
        response = conn.getresponse()
        status = response.status
        body = response.read()
    except (socket.error, httplib.HTTPException):
        print("UNKNOWN: Could not connect to %s:%s" % (server, port))
        return EXIT_UNKNOWN
    finally:
        conn.close()

    if status == 404:
        print("CRITICAL: Queue %s does not exits." % queue_name)
        return EXIT_CRITICAL
    if status > 299:
        print("UNKNOWN: Unexpected API error: %s" % body)
        return EXIT_UNKNOWN

    try:
        queue = json.loads(body)
    except ValueError:
        # A success status with a body that is not JSON is still an API error.
        print("UNKNOWN: Unexpected API error: %s" % body)
        return EXIT_UNKNOWN

    code, message = check_queue_flags(queue, queue_name, auto_delete, durable)
    print(message)
    return code


if __name__ == "__main__":
    sys.exit(main(sys.argv))
监控集群状态
演示:
手动查看:
[root@node10 chapter-10]# curl -i -u admin:admin http://localhost:15672/api/nodes
通过api接口:
[root@node10 chapter-10]# python cluster_health_check.py localhost:15672 admin admin rabbit@node6,rabbit@node10 340000 3300000
CRITICAL:Node rabbit@node10 memory usage is 31515144.
[root@node10 chapter-10]# python cluster_health_check.py localhost:15672 admin admin rabbit@node6,rabbit@node10 34000000 335845520
OK:2 node. All memory usage below 335845520.
[root@node10 chapter-10]# python cluster_health_check.py localhost:15672 admin admin rabbit@node6,rabbit@node10 34000000 335845520
UNKNOWN: Could not connect to localhost:15672
[root@node10 chapter-10]# python cluster_health_check.py localhost:15672 admin admin rabbit@node6,rabbit@node10 34000000 335845520
WARNING:Cluster missing nodes: ['rabbit@node6']
代码:
[root@node10 chapter-10]# cat cluster_health_check.py
import sys,json,httplib,urllib,base64,socket
# Nagios check: lists the cluster's nodes through the management API,
# verifies that every expected node is present and running, and compares each
# node's memory usage against the critical and warning thresholds.
# Usage: python cluster_health_check.py server:port username password \
#            node1,node2,... mem_critical mem_warning
EXIT_OK = 0
EXIT_WARNING = 1
EXIT_CRITICAL = 2
EXIT_UNKNOWN = 3

# Upper bound on how long the check may block on the socket.
HTTP_TIMEOUT_SECONDS = 10


def evaluate_cluster(nodes, expected_nodes, mem_critical, mem_warning):
    """Turn the decoded /api/nodes reply into (exit_code, message).

    nodes is the list of node objects returned by the API; expected_nodes is
    the list of node names that must be present and running. Order of checks:
    missing nodes (WARNING), then any node above mem_critical (CRITICAL),
    then any node above mem_warning (WARNING), otherwise OK.
    """
    missing = list(expected_nodes)
    for node in nodes:
        if node["name"] in missing and node["running"] != False:
            missing.remove(node["name"])
    if len(missing):
        return (EXIT_WARNING, "WARNING:Cluster missing nodes: %s" % str(missing))

    # Entries without a "mem_used" field are skipped instead of raising
    # KeyError. NOTE(review): presumably these are stopped nodes that are not
    # in the expected list - confirm against the API output.
    reporting = [node for node in nodes if "mem_used" in node]

    # All nodes are tested against the critical threshold before any warning
    # is considered; a single combined loop would stop at the first node over
    # the warning level and hide a later node that is over the critical level.
    for node in reporting:
        if node["mem_used"] > mem_critical:
            return (EXIT_CRITICAL,
                    "CRITICAL:Node %s memory usage is %d." % (node["name"], node["mem_used"]))
    for node in reporting:
        if node["mem_used"] > mem_warning:
            return (EXIT_WARNING,
                    "WARNING: Node %s memory usage is %d." % (node["name"], node["mem_used"]))

    return (EXIT_OK,
            "OK:%d node. All memory usage below %d." % (len(nodes), mem_warning))


def main(argv):
    """Run the check for the given argument vector and return the exit code."""
    try:
        server, port = argv[1].split(":")
        port = int(port)
        username, password = argv[2:4]
        node_list = argv[4].split(",")
        mem_critical = int(argv[5])
        mem_warning = int(argv[6])
    except (IndexError, ValueError):
        print("UNKNOWN: usage: cluster_health_check.py server:port username "
              "password node1,node2,... mem_critical mem_warning")
        return EXIT_UNKNOWN

    credentials = base64.b64encode("%s:%s" % (username, password))
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Basic " + credentials,
    }

    conn = httplib.HTTPConnection(server, port, timeout=HTTP_TIMEOUT_SECONDS)
    try:
        conn.request("GET", "/api/nodes", "", headers)
        # Reading the reply is inside the try too, so a reset or malformed
        # response is reported as UNKNOWN instead of escaping as a traceback.
        response = conn.getresponse()
        status = response.status
        body = response.read()
    except (socket.error, httplib.HTTPException):
        print("UNKNOWN: Could not connect to %s:%s" % (server, port))
        return EXIT_UNKNOWN
    finally:
        conn.close()

    if status > 299:
        print("UNKNOWN: Unexpected API error: %s" % body)
        return EXIT_UNKNOWN

    try:
        nodes = json.loads(body)
    except ValueError:
        # A success status with a body that is not JSON is still an API error.
        print("UNKNOWN: Unexpected API error: %s" % body)
        return EXIT_UNKNOWN

    code, message = evaluate_cluster(nodes, node_list, mem_critical, mem_warning)
    print(message)
    return code


if __name__ == "__main__":
    sys.exit(main(sys.argv))