Created
August 26, 2013 21:54
-
-
Save joemiller/6347137 to your computer and use it in GitHub Desktop.
Check the outbound bandwidth on a host and compare it against a table of Rackspace Cloud quotas; alert if it is near the limit.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# check-rackspace-bandwidth-limits | |
# ================================ | |
# | |
# Check the bandwidth usage of a network device on a Rackspace Cloud server | |
# and alert if it is nearing the threshold allowed for the image size. | |
# | |
# Requires sar(1) and sadf(1) to be installed (sysstat package, usually) | |
# | |
# Uses the last 60 minutes of sar data to calculate an average usage over that time | |
# and alerts if that average is within a certain percentage of hitting the rackspace | |
# bandwidth limits. | |
# | |
# examples: | |
# | |
# check if eth1 is within 90% of its limits over the last hour: | |
# | |
# ./check-rackspace-bandwidth-limits.rb --crit 90 -i eth1
# CheckBandwidth CRITICAL: eth1 average TX rate is 91.16666667 % of max allowed (547 / 600 mbps)
# | |
require 'rubygems' if RUBY_VERSION < '1.9.0' | |
require 'sensu-plugin/check/cli' | |
# Sensu check that compares the average outbound (TX) rate of one network
# interface, as recorded by sar(1), against the Rackspace Cloud bandwidth
# quota for this server's size, and reports warning/critical when the usage
# percentage reaches the configured thresholds.
class CheckBandwidth < Sensu::Plugin::Check::CLI
  option :verbose, :short => '-v', :boolean => true, :default => false
  option :iface, :short => '-i IFACE', :default => 'eth0'
  # Thresholds are percentages of the quota. The short flags are additions;
  # the long flags keep working as before.
  option :warn, :short => '-w PERCENT', :long => '--warn PERCENT', :default => 90,
                :proc => Proc.new { |pct| pct.to_f }
  option :crit, :short => '-c PERCENT', :long => '--crit PERCENT', :default => 95,
                :proc => Proc.new { |pct| pct.to_f }

  # Rackspace Cloud bandwidth quotas for each instance size on this page:
  # http://www.rackspace.com/knowledge_center/product-faq/cloud-servers
  RACKSPACE_CLOUD_BW_LIMITS = {
    '512MB'   => { 'eth0' => 20,  'eth1' => 40 },  # mbits/sec
    '1024MB'  => { 'eth0' => 30,  'eth1' => 60 },
    '2048MB'  => { 'eth0' => 60,  'eth1' => 120 },
    '4096MB'  => { 'eth0' => 100, 'eth1' => 200 },
    '8192MB'  => { 'eth0' => 150, 'eth1' => 300 },
    '15872MB' => { 'eth0' => 200, 'eth1' => 400 },
    '30720MB' => { 'eth0' => 300, 'eth1' => 600 },
  }.freeze

  # Key prefixes of the per-device statistics, in the same order as the
  # rxpck/s, txpck/s, rxkB/s, txkB/s columns parsed from `sadf -d` output.
  NETSTAT_METRICS = %w[rxpkts txpkts rxKB txKB].freeze

  # Try to figure out the size of this Rackspace cloud server instance.
  # The exact amount of RAM varies between generations of their software stack,
  # so we use some heuristics to figure out what we are.
  #
  # Returns one of the keys of RACKSPACE_CLOUD_BW_LIMITS.
  # Raises RuntimeError when the detected memory matches no known size.
  def rackspace_image_size
    mem_mb = total_memory_mb
    case mem_mb
    when 400..600     then '512MB'
    when 900..1200    then '1024MB'
    when 1800..2200   then '2048MB'
    # This size has a quota entry but previously had no branch here, so 4 GB
    # servers always raised. Range picked by analogy with the neighbours.
    when 3700..4300   then '4096MB'
    when 7800..8400   then '8192MB'
    when 14000..16200 then '15872MB'
    when 28000..33000 then '30720MB'
    else raise "Could not determine rackspace image type from mem MB: #{mem_mb}"
    end
  end

  # start_time is passed to date(1) -d param
  #
  # Uses sar(1) data to return a set of statistics about the networking
  # devices of the system from start_time until now, eg:
  #
  #   get_netstats('60 minutes ago') =>
  #     { 'eth0' => { 'num_samples'        => 6,
  #                   'txpkts_sec_sum'     => 7200.0,
  #                   'txpkts_sec_average' => 1200.0,
  #                   'txpkts_sec_max'     => 5000.0,
  #                   'rxpkts_sec_sum'     => 7200.0,
  #                   'rxpkts_sec_average' => 1200.0,
  #                   'rxpkts_sec_max'     => 5000.0,
  #                   'txKB_sec_sum'       => 192199.2,
  #                   'txKB_sec_average'   => 32033.2,
  #                   'txKB_sec_max'       => 52123.2,
  #                   'rxKB_sec_sum'       => 192199.2,
  #                   'rxKB_sec_average'   => 32033.2,
  #                   'rxKB_sec_max'       => 52123.2
  #                 },
  #       'eth1' => { ... }
  #     }
  #
  # The returned hash auto-vivifies: looking up an unknown device creates an
  # entry whose counters all read 0, so use `key?` to test for presence.
  #
  # Reports OK (via the plugin's `ok` helper) when sadf yields no samples.
  def get_netstats(start_time='60 minutes ago')
    # sysstat names its daily files saDD with a zero-padded day of month, so
    # the plain integer day would miss the file on the 1st through the 9th.
    sa_file = "/var/log/sa/sa#{Time.now.strftime('%d')}"
    # strip the trailing newline before splicing the time into the command
    start = `date +%H:%M:%S -d '#{start_time}'`.strip

    # auto-vivification - http://alisdair.mcdiarmid.org/2012/09/01/auto-vivifying-hash.html
    stats = Hash.new { |h, k| h[k] = Hash.new(0) }

    out = `sadf -d #{sa_file} -- -n DEV -s #{start}`
    out.split("\n").each do |raw|
      line = raw.strip
      next if line.empty? || line =~ /^#/

      # host;interval;timestamp;dev;rxpck/s;txpck/s;rxkB/s;txkB/s;...
      fields = line.split(';')
      # Skip records that do not carry the four rate columns (for example
      # restart markers) instead of counting them as a device sample.
      next if fields.length < 8

      dev_stats = stats[fields[3]]
      dev_stats['num_samples'] += 1
      rates = fields[4, 4].map { |f| f.to_f }

      NETSTAT_METRICS.zip(rates).each do |metric, rate|
        max_key = "#{metric}_sec_max"
        sum_key = "#{metric}_sec_sum"
        dev_stats[max_key] = rate if rate > dev_stats[max_key]
        # cumulative average
        dev_stats[sum_key] += rate
        dev_stats["#{metric}_sec_average"] = dev_stats[sum_key] / dev_stats['num_samples']
      end
    end

    # Covers both completely empty output and output with only header lines.
    if stats.empty?
      ok "no data returned by sadf. It's possible that the sar data just rolled over to a new day and needs some time to collect data"
    end

    stats
  end

  # Entry point of the check: detect the image size, gather the sar
  # statistics and report the average TX usage of the configured interface
  # as a percentage of its quota.
  def run
    dev        = config[:iface]
    image_type = rackspace_image_size
    puts "Detected Rackspace image size: #{image_type}" if config[:verbose]

    max_mbps = RACKSPACE_CLOUD_BW_LIMITS[image_type][dev]
    if max_mbps.nil?
      # Only the interfaces listed in the quota table can be checked; without
      # this guard the percentage computation below would divide by nil.
      unknown "no Rackspace bandwidth limit known for interface #{dev} on a #{image_type} server"
      return
    end

    stats = get_netstats
    puts "Checking stats on interface: #{dev}" if config[:verbose]

    unless stats.key?(dev)
      # An absent interface would otherwise read as 0 % and report OK.
      unknown "no sar samples found for interface #{dev}"
      return
    end

    # don't forget to convert stats data from bytes to bits
    tx_rate_mbps = (stats[dev]['txKB_sec_average'] * 8) / 1000.0
    puts "Interface #{dev} tx rate: #{tx_rate_mbps} mbps" if config[:verbose]

    usage_percent = (tx_rate_mbps / max_mbps) * 100
    puts "Interface #{dev} tx rate is #{usage_percent}%" if config[:verbose]

    msg = "#{dev} average TX rate is #{usage_percent} % of max allowed (#{tx_rate_mbps.to_i} / #{max_mbps} mbps)"

    if usage_percent >= config[:crit]
      critical msg
    elsif usage_percent >= config[:warn]
      warning msg
    else
      ok msg
    end
  end

  private

  # Total system memory in whole megabytes, read from the MemTotal line of
  # /proc/meminfo. Returns 0 when the file cannot be read or has no such
  # line, which makes rackspace_image_size raise its descriptive error.
  def total_memory_mb
    mem_kb = File.read('/proc/meminfo')[/^MemTotal:\s+(\d+)/, 1].to_i
    mem_kb / 1024
  rescue SystemCallError
    0
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment