#!/usr/bin/env ruby
# frozen_string_literal: true
#
# PVE Plugin
# ==
# Author: Marco Peterseil
# Created: 12-2017
# License: GPLv3 - http://www.gnu.org/licenses
# URL: https://gitlab.com/6uellerBpanda/check_pve
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
require 'optparse'
require 'net/https'
require 'json'
require 'date'
# Plugin version; interpolated into the banner and printed by --version.
version = 'v0.2.4'
# optparser
# Help banner shown by --help (and when the plugin is started without
# arguments). The trailing "\n" escapes add a blank line after the first two
# lines.
banner = <<~HEREDOC
check_pve #{version} [https://gitlab.com/6uellerBpanda/check_pve]\n
This plugin checks various parameters of Proxmox Virtual Environment via API(v2)\n
Mode:
Cluster:
cluster Checks quorum of cluster
Node:
smart Checks SMART health of disks
updates Checks for available updates
subscription Checks for valid subscription
services Checks if services are running
storage Checks storage usage in percentage
cpu Checks CPU usage in percentage
memory Checks Memory usage in percentage
io_wait Checks IO wait in percentage
net_in Checks inbound network usage in kilobytes
net_out Checks outbound network usage in kilobytes
ksm Checks KSM sharing usage in megabytes
VM:
vm_cpu Checks CPU usage in percentage
vm_disk_read Checks how many kb last 60s was read (timeframe: hour)
vm_disk_write Checks how many kb last 60s was written (timeframe: hour)
vm_net_in Checks incoming kb from last 60s (timeframe: hour)
vm_net_out Checks outgoing kb from last 60s (timeframe: hour)
Usage: #{File.basename(__FILE__)} [mode] [options]
HEREDOC
# Parsed command-line options keyed by symbol; only switches that were actually
# given on the command line end up in this hash.
options = {}
OptionParser.new do |opts|
  opts.banner = banner
  opts.separator ''
  opts.separator 'Options:'
  # Every value-carrying switch just stores its argument under one key, so the
  # switches are declared as a table: option key => OptionParser#on arguments.
  # Declaration order is kept because it is also the order of the help output.
  {
    address: ['-s', '--address ADDRESS', '-H', 'PVE host address'],
    insecure: ['-k', '--insecure', 'No SSL verification'],
    mode: ['-m', '--mode MODE', 'Mode to check'],
    node: ['-n', '--node NODE', 'PVE Node name'],
    username: ['-u', '--username USERNAME', 'Username with auth realm e.g. monitoring@pve'],
    password: ['-p', '--password PASSWORD', 'Password'],
    warning: ['-w', '--warning WARNING', 'Warning threshold'],
    critical: ['-c', '--critical CRITICAL', 'Critical threshold'],
    name: ['--name NAME', 'Name for storage'],
    vmid: ['-i', '--vmid VMID', 'Vmid of lxc,qemu'],
    type: ['-t', '--type TYPE', 'VM type lxc or qemu'],
    exclude: ['-x', '--exclude EXCLUDE', 'Exclude (regex)'],
    timeframe: ['--timeframe TIMEFRAME', 'Timeframe for vm checks: hour,day,week,month or year'],
    cf: ['--cf CONSOLIDATION_FUNCTION', 'RRD cf: average or max']
  }.each do |key, spec|
    opts.on(*spec) { |value| options[key] = value }
  end
  opts.on('-v', '--version', 'Print version information') do
    puts "check_pve #{version}"
  end
  opts.on('-h', '--help', 'Show this help message') do
    puts opts
  end
  # Started without any argument: behave as if --help had been given.
  ARGV.push('-h') if ARGV.empty?
end.parse!
# check pve
#
# Monitoring plugin (Naemon/Nagios style) for the Proxmox VE API (v2).
#
# Creating an instance runs the check selected by options[:mode]: the matching
# check method queries the API, prints a single status line (with perfdata for
# the threshold based checks) and terminates the process with the plugin exit
# code: 0 OK, 1 Warning, 2 Critical, 3 Unknown.
class CheckPve
  # Check methods tried in order by #initialize. Each one returns immediately
  # unless options[:mode] selects it.
  CHECKS = %i[
    cluster smart updates services subscription cpu mem ksm io_wait storage
    net_in net_out vm_disk_write vm_disk_read vm_cpu vm_net_in vm_net_out
  ].freeze

  # Divisor that scales a byte count into the unit used as key.
  UNIT_DIVISORS = { 'kb' => 1024, 'mb' => 1024**2, 'gb' => 1024**3 }.freeze

  # options - Hash of parsed command-line options keyed by symbol (:address,
  #           :mode, :node, :username, :password, :warning, :critical, ...).
  def initialize(options)
    @options = options
    init_arr
    CHECKS.each { |check| send(check) }
    # A check that matched the mode has already exited the process. Getting
    # here with a mode set means the mode is not supported; report that instead
    # of ending silently with exit code 0. Without any mode (e.g. only --help
    # or --version was given) nothing is reported, as before.
    unk_msg("Mode '#{@options[:mode]}' is not supported") if @options[:mode]
  end

  # Reset the collectors for perfdata and for the per-state messages.
  def init_arr
    @perfdata = []
    @message = []
    @critical = []
    @warning = []
    @okays = []
  end

  #--------#
  # HELPER #
  #--------#
  # define some helper methods for naemon with appropriate exit codes
  def ok_msg(message)
    puts "OK - #{message}"
    exit 0
  end

  def crit_msg(message)
    puts "Critical - #{message}"
    exit 2
  end

  def warn_msg(message)
    puts "Warning - #{message}"
    exit 1
  end

  def unk_msg(message)
    puts "Unknown - #{message}"
    exit 3
  end

  # helper to convert bytes
  #
  # Scales data (converted with to_i) down to unit ('kb', 'mb' or 'gb') using
  # integer division, stores the result in @usage and returns it. For any other
  # unit nothing is stored and nil is returned.
  def convert_bytes_to_unit(data:, unit:)
    divisor = UNIT_DIVISORS[unit]
    return unless divisor

    @usage = data.to_i / divisor
  end

  # check only one value: exit Critical with message when data is truthy
  def check_single_data(data:, message:)
    crit_msg(message) if data
  end

  # check only two values: exit Warning when data is truthy, OK otherwise
  def check_multiple_data(data:, warn_msg:, ok_msg:)
    if data
      warn_msg(warn_msg)
    else
      ok_msg(ok_msg)
    end
  end

  # helper for excluding
  #
  # Removes (in place) every item of data whose item[value] matches the
  # --exclude regular expression. An invalid expression ends the run as
  # Unknown instead of raising RegexpError.
  def exclude(data:, value:)
    # compiled once instead of once per item
    pattern = Regexp.new(@options[:exclude].to_s)
    data.delete_if { |item| pattern.match(item[value]) }
  rescue RegexpError => e
    unk_msg("Invalid exclude pattern: #{e.message}")
  end

  # generate perfdata: appends "<perfdata>;<warning>;<critical>"
  def build_perfdata(perfdata:)
    @perfdata << "#{perfdata};#{@options[:warning]};#{@options[:critical]}"
  end

  # build service output
  def build_output(msg:)
    @message = msg
  end

  # helper for threshold checking
  #
  # Files the current message under critical, warning or ok depending on which
  # threshold data exceeds, then prints the result and exits. A threshold that
  # was not given on the command line is never exceeded.
  def check_thresholds(data:)
    if threshold_exceeded?(data, :critical)
      @critical << @message
    elsif threshold_exceeded?(data, :warning)
      @warning << @message
    else
      @okays << @message
    end
    # make the final step
    build_final_output
  end

  # mix everything together for exit
  def build_final_output
    perf_output = " | #{@perfdata.join(' ')}"
    if @critical.any?
      crit_msg(@critical.join(', ') + perf_output)
    elsif @warning.any?
      warn_msg(@warning.join(', ') + perf_output)
    else
      ok_msg(@okays.join(', ') + perf_output)
    end
  end

  #----------#
  # API AUTH #
  #----------#
  # Sends one HTTPS request to port 8006 of options[:address] and stores the
  # response in @response.
  #
  # path - request path (and query) below the host
  # req  - 'post' sends username/password as form data (ticket request); any
  #        other value fetches an auth ticket first and then issues a GET
  #        carrying that ticket as cookie
  #
  # Any StandardError raised on the way ends the run as Unknown.
  def url(path:, req: 'get')
    uri = URI("https://#{@options[:address]}:8006/#{path}")
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE if @options[:insecure]
    @response = http.request(build_request(uri, req))
  rescue StandardError => e
    unk_msg(e)
  end

  # check http response: everything but status 200 ends the run as Unknown
  def check_http_response
    unk_msg(@response.message) if @response.code != '200'
  end

  # init http req
  def http_connect(path:, req: 'get')
    url(path: path, req: req)
    check_http_response
  end

  # get cookie: requests a ticket and keeps it as cookie string in @token
  def fetch_cookie
    http_connect(path: 'api2/json/access/ticket', req: 'post')
    @token = "PVEAuthCookie=#{response_data['ticket']}"
  end

  #--------#
  # CHECKS #
  #--------#
  ###--- CLUSTER CHECK ---###
  def cluster
    return unless @options[:mode] == 'cluster'

    http_connect(path: 'api2/json/cluster/status')
    entries = Array(response_data)
    # Prefer the entry explicitly typed as cluster; fall back to the first
    # entry, which is what was evaluated before.
    # NOTE(review): assumes cluster/status entries carry a 'type' field - confirm
    cluster = entries.find { |entry| entry['type'] == 'cluster' } || entries.first
    unk_msg('No cluster status returned') unless cluster
    check_multiple_data(
      data: cluster['quorate'] != 1,
      warn_msg: "#{cluster['name'].upcase}: Cluster not ready - no quorum",
      ok_msg: "#{cluster['name'].upcase}: Cluster ready - quorum is ok"
    )
  end

  ###--- SMART CHECK ---###
  def smart
    return unless @options[:mode] == 'smart'

    http_connect(path: "api2/json/nodes/#{@options[:node]}/disks/list")
    unhealthy = response_data.reject { |item| item['health'] == 'PASSED' }
    exclude(data: unhealthy, value: 'devpath') unless @options[:exclude].to_s.empty?
    if unhealthy.any?
      warn_msg(unhealthy.map { |item| "#{item['model']}:#{item['used']}-#{item['devpath']} SMART error detected" }.join(', '))
    end
    ok_msg('No SMART errors detected')
  end

  ###--- UPDATE CHECK ---###
  def updates
    # 'updates' is the mode name shown in the help text, 'update' the one that
    # was accepted so far; both are honoured.
    return unless %w[update updates].include?(@options[:mode])

    http_connect(path: "api2/json/nodes/#{@options[:node]}/apt/update")
    check_multiple_data(
      data: response_data.any?,
      warn_msg: 'New updates available',
      ok_msg: 'System up to date'
    )
  end

  ###--- SERVICES CHECK ---###
  def services
    return unless @options[:mode] == 'services'

    http_connect(path: "api2/json/nodes/#{@options[:node]}/services")
    services_down = response_data.reject { |item| item['state'] == 'running' }
    exclude(data: services_down, value: 'name') unless @options[:exclude].to_s.empty?
    warn_msg("#{services_down.map { |item| item['name'].to_s }.join(', ')} not running") if services_down.any?
    ok_msg('All services running')
  end

  ###--- SUBSCRIPTION CHECK ---###
  # Critical when the status is 'Inactive' or no due date is reported, Warning
  # when the due date is less than --warning days away, OK otherwise.
  def subscription
    return unless @options[:mode] == 'subscription'

    http_connect(path: "api2/json/nodes/#{@options[:node]}/subscription")
    data = response_data
    due_date = data['nextduedate']
    check_single_data(data: data['status'] == 'Inactive', message: 'Subscription not valid')
    # Without a due date Date.parse(nil) would raise TypeError further down.
    check_single_data(data: due_date.to_s.empty?, message: 'Subscription not valid')
    check_multiple_data(
      data: parse_due_date(due_date) < Date.today + @options[:warning].to_i,
      warn_msg: "Subscription will end at #{due_date}",
      ok_msg: "Subscription is valid till #{due_date}"
    )
  end

  ###--- NODE CHECKS ---###
  # Turns a fraction (0.25) into a whole percentage (25). The value is first
  # formatted with two decimals and then rounded.
  def format_float(float_data:)
    format('%.2f', float_data * 100).to_f.round
  end

  # helper for percentage values
  #
  # Fetches nodes/<node>/<path> and reports data[value] as a percentage. When
  # value_to_compare: is passed, data[value] / data[value_to_compare] is
  # reported instead.
  def node_helper_to_pct(path: 'status', value:, output_msg:, perf_label: 'Usage', **args)
    http_connect(path: "api2/json/nodes/#{@options[:node]}/#{path}")
    report_pct(response_data, value, output_msg, perf_label, args)
  end

  # helper for nested percentage values
  #
  # Same as node_helper_to_pct, but the values are read from data[key].
  def node_helper_to_nested_pct(path: 'status', key:, value:, output_msg:, perf_label: 'Usage', **args)
    http_connect(path: "api2/json/nodes/#{@options[:node]}/#{path}")
    report_pct(response_data[key], value, output_msg, perf_label, args)
  end

  # helper for unit values
  #
  # Fetches nodes/<node>/<path> and reports data[type][value] (a byte count)
  # scaled to unit.
  def node_helper_to_units(path: 'status', type:, value: 'used', output_msg:, unit: 'gb', perf_label: 'Usage')
    http_connect(path: "api2/json/nodes/#{@options[:node]}/#{path}")
    convert_bytes_to_unit(data: response_data[type][value], unit: unit)
    report_usage(output_msg, unit, perf_label)
  end

  # helper for rrddata
  #
  # With --vmid the guest path "<node>/<type>/<vmid>" is used, else the node.
  def check_rrddata_path
    @rrddata_path = if @options[:vmid]
                      "#{@options[:node]}/#{@options[:type]}/#{@options[:vmid]}"
                    else
                      @options[:node]
                    end
  end

  # Fetches the rrddata of the node or guest and reports the most recent
  # sample of value, either as percentage (unit '%') or scaled from bytes to
  # unit.
  def rrddata_helper(unit: 'kb', value:, output_msg:, perf_label: 'Usage')
    check_rrddata_path
    http_connect(path: "api2/json/nodes/#{@rrddata_path}/rrddata?#{rrddata_query}")
    sample = latest_rrd_sample(response_data, value)
    @usage = if unit == '%'
               format_float(float_data: sample)
             else
               convert_bytes_to_unit(data: sample, unit: unit)
             end
    report_usage(output_msg, unit, perf_label)
  end

  ### node: cpu
  def cpu
    return unless @options[:mode] == 'cpu'

    node_helper_to_pct(value: 'cpu', output_msg: 'CPU usage')
  end

  ### node: io wait
  def io_wait
    return unless @options[:mode] == 'io_wait'

    node_helper_to_pct(value: 'wait', output_msg: 'IO Wait', perf_label: 'Wait')
  end

  ### node: memory
  def mem
    return unless @options[:mode] == 'memory'

    node_helper_to_nested_pct(
      key: 'memory',
      value: 'used',
      value_to_compare: 'total',
      output_msg: 'Memory usage'
    )
  end

  ### node: ksm
  def ksm
    return unless @options[:mode] == 'ksm'

    node_helper_to_units(type: 'ksm', value: 'shared', unit: 'mb', output_msg: 'KSM sharing')
  end

  ### node: storage
  def storage
    return unless @options[:mode] == 'storage'

    node_helper_to_pct(
      path: "storage/#{@options[:name]}/status",
      value: 'used',
      value_to_compare: 'total',
      output_msg: 'Storage usage'
    )
  end

  ### node: netin
  def net_in
    return unless @options[:mode] == 'net_in'

    rrddata_helper(value: 'netin', output_msg: 'Network usage in')
  end

  ### node: netout
  def net_out
    return unless @options[:mode] == 'net_out'

    rrddata_helper(value: 'netout', output_msg: 'Network usage out')
  end

  ###--- QEMU, LXC CHECKS ---###
  # disk
  def vm_disk_write
    return unless @options[:mode] == 'vm_disk_write'

    rrddata_helper(value: 'diskwrite', output_msg: 'Disk write')
  end

  def vm_disk_read
    return unless @options[:mode] == 'vm_disk_read'

    rrddata_helper(value: 'diskread', output_msg: 'Disk read')
  end

  # cpu
  def vm_cpu
    return unless @options[:mode] == 'vm_cpu'

    rrddata_helper(unit: '%', value: 'cpu', output_msg: 'CPU usage')
  end

  # network
  def vm_net_in
    return unless @options[:mode] == 'vm_net_in'

    rrddata_helper(value: 'netin', output_msg: 'Network usage in')
  end

  def vm_net_out
    return unless @options[:mode] == 'vm_net_out'

    rrddata_helper(value: 'netout', output_msg: 'Network usage out')
  end

  private

  # True when the threshold for level (:warning or :critical) was given and
  # data is greater than it. Without this guard a missing threshold would be
  # treated as 0 and flag every positive value.
  def threshold_exceeded?(data, level)
    limit = @options[level].to_s.strip
    return false if limit.empty?

    data > limit.to_f
  end

  # Builds the request object for #url: a form POST with the credentials, or a
  # GET that carries a freshly fetched auth cookie.
  def build_request(uri, req)
    if req == 'post'
      request = Net::HTTP::Post.new(uri.request_uri)
      request.set_form_data('username' => @options[:username].to_s, 'password' => @options[:password].to_s)
    else
      fetch_cookie
      request = Net::HTTP::Get.new(uri.request_uri)
      request['cookie'] = @token
    end
    request
  end

  # 'data' member of the JSON body of the last response; a body that is not
  # valid JSON ends the run as Unknown.
  def response_data
    JSON.parse(@response.body)['data']
  rescue JSON::ParserError => e
    unk_msg("Invalid JSON in API response: #{e.message}")
  end

  # Parses the subscription due date; an unparsable value ends the run as
  # Unknown instead of raising.
  def parse_due_date(due_date)
    Date.parse(due_date)
  rescue ArgumentError, TypeError
    unk_msg("Cannot parse subscription due date '#{due_date}'")
  end

  # Shared tail of the percentage helpers: computes the percentage from data,
  # then builds message and perfdata and evaluates the thresholds (which exits).
  def report_pct(data, value, output_msg, perf_label, args)
    ratio = args.empty? ? data[value] : data[value].to_f / data[args[:value_to_compare]]
    pct = format_float(float_data: ratio)
    build_output(msg: "#{output_msg}: #{pct}%")
    build_perfdata(perfdata: "#{perf_label}=#{pct}%")
    check_thresholds(data: pct)
  end

  # Shared tail of the unit helpers: reports @usage with the upcased unit.
  def report_usage(output_msg, unit, perf_label)
    build_output(msg: "#{output_msg}: #{@usage}#{unit.upcase}")
    build_perfdata(perfdata: "#{perf_label}=#{@usage}#{unit.upcase}")
    check_thresholds(data: @usage)
  end

  # Query string for the rrddata request. The timeframe falls back to 'hour'
  # (the timeframe the help text refers to) and cf is only sent when --cf was
  # given, so a missing --cf no longer raises NoMethodError on nil.
  # NOTE(review): assumes the API treats cf as optional - confirm
  def rrddata_query
    params = { timeframe: @options[:timeframe] || 'hour' }
    params[:cf] = @options[:cf].upcase if @options[:cf]
    URI.encode_www_form(params)
  end

  # Most recent rrddata entry that actually contains value. Entries without the
  # metric are skipped, since reading them yields nil (a crash for '%', a bogus
  # 0 for byte values). Ends the run as Unknown when no entry has the metric.
  # NOTE(review): presumably the newest entry can be incomplete - confirm
  def latest_rrd_sample(entries, value)
    entry = Array(entries).reverse_each.find { |item| !item[value].nil? }
    unk_msg("No RRD data available for '#{value}'") unless entry
    entry[value]
  end
end
# Entry point: hand the parsed options to the plugin. The constructor calls
# every check method; the one matching options[:mode] prints its result and
# exits the process with the corresponding status code.
CheckPve.new(options)