share/check_mk/checks/diskstat (1.5.0p7: fixes diskstat crashes with kernel 4.19.*):
#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# | ____ _ _ __ __ _ __ |
# | / ___| |__ ___ ___| | __ | \/ | |/ / |
# | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
# | | |___| | | | __/ (__| < | | | | . \ |
# | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
# | |
# | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation in version 2. check_mk is distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
# out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more de-
# tails. You should have received a copy of the GNU General Public
# License along with GNU Make; see the file COPYING. If not, write
# to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301 USA.
# <<<diskstat>>>
# 1300264105
# 8 0 sda 691860 951191 13559915 491748 234686 197346 3359512 94944 0 56844 586312
# 8 32 sdb 791860 91191 23589915 491748 234686 197346 3359512 94944 0 56844 586312
# Newer agent output also dm-* and Veritas devices and if
# available the following additional information for name rewriting:
# <<<diskstat>>>
# 1338931242
# 8 0 sda 6142 327 219612 2244 3190 6233 74075 8206 0 6523 10446
# 253 0 dm-0 4579 0 181754 2343 9249 0 73960 259491 0 1208 261833
# 253 1 dm-1 342 0 2736 47 3 0 11796464 5016 0 5063 5063
# 253 2 dm-2 160 0 1274 27 11 0 56 3 0 27 30
# 8 16 sdb 464 858 7717 336 1033 0 311454 3899 0 3007 4231
# 8 32 sdc 855 13352 106777 1172 915 0 154467 2798 0 3012 3967
# 8 48 sdd 1217 861 109802 1646 118 0 56151 1775 0 2736 3420
# 8 80 sdf 359 1244 58323 792 66 0 4793 388 0 765 1178
# 8 64 sde 310 1242 6964 268 118 0 56151 1607 0 1307 1872
# 8 96 sdg 1393 1242 314835 3759 129 0 56172 1867 0 4027 5619
# 199 27000 VxVM27000 131 0 990 61 11 0 21 29 0 89 90
# 199 27001 VxVM27001 0 0 0 0 0 0 0 0 0 0 0
# [dmsetup_info]
# vg_zwei-lv_home 253:2 vg_zwei lv_home
# vg_zwei-lv_swap 253:1 vg_zwei lv_swap
# vg_zwei-lv_root 253:0 vg_zwei lv_root
# [vx_dsk]
# c7 6978 /dev/vx/dsk/datadg/lalavol
# c7 6979 /dev/vx/dsk/datadg/oravol
# output may have zeros appended
#
# 8 0 sda 111918756 929875 3960367050 349083041 20142495 1149711 1021234448 851284769 0 233177192 1197549009 0 0 0 0
# 8 1 sda1 226 0 27481 3388 381 3 31472 35862 0 8123 39260 0 0 0 0
# 8 2 sda2 111918500 929875 3960337473 349079568 20142114 1149708 1021202976 851248906 0 233176504 1197492420 0 0 0 0
# 253 0 dm-0 883953 0 92124097 10287533 108572 0 2251672 809814 0 7545567 11097424 0 0 0 0
# 253 1 dm-1 21046 0 172072 157766 164020 0 1312160 29292970 0 124138 29451007 0 0 0 0
# 253 2 dm-2 750714 0 19747073 7702216 1445987 0 36811608 9817313 0 7159271 17520030 0 0 0 0
# Fields in /proc/diskstats
# Index 0 -- major number
# Index 1 -- minor number
# Index 2 -- device name --> used by check
# Index 3 -- # of reads issued
# Index 4 -- # of reads merged
# Index 5 -- # of sectors read (a 512 Byte) --> used by check
# Index 6 -- # of milliseconds spent reading
# Index 7 -- # of writes completed
# Index 8 -- # of writes merged
# Index 9 -- # of sectors written (a 512 Byte) --> used by check
# Index 10 -- # of milliseconds spent writing
# Index 11 -- # of I/Os currently in progress
# Index 12 -- # of milliseconds spent doing I/Os
# Index 13 -- weighted # of milliseconds spent doing I/Os
# Convert information to generic format also generated
# by winperf_phydisk
# [ now, [( disk, readctr, writectr ), ... ]]
# where counters are in sectors (512 bytes)
# Parse /proc/diskstats and additional information into a nice canonical
# dictionary of the form:
# disks = {
# "hda" : {
# 'average_read_request_size' : 0.0,
# 'average_read_wait' : 0.0,
# 'average_request_size' : 40569.90476190476,
# 'average_wait' : 0.761904761904762,
# 'average_write_request_size' : 40569.90476190476,
# 'average_write_wait' : 0.0007619047619047619,
# 'node' : None,
# 'read_ios' : 0.0,
# 'read_throughput' : 0.0,
# 'latency' : 0.00038095238095238096,
# 'utilization' : 0.0006153846153846154,
# 'write_ios' : 1.6153846153846154,
# 'write_throughput' : 65536.0,
# },
# "LVM foobar" : {
# ...
# }
# }
#
# Returns that dictionary. The timestamp of the section is only used
# internally for computing the rates and is not part of the result.
def parse_diskstat(info):
    """Turn the diskstat agent section into a dict of per-device I/O figures.

    info is the raw section, each line prefixed with the node name. It is
    split by diskstat_extract_name_info() into the timestamp, the plain
    /proc/diskstats lines and the table used for rewriting device names.

    Returns a dict mapping the item name to a dict of values. The item name
    is the kernel device name, or the rewritten name if the name table has
    an entry for the device's (node, major, minor). Each value dict has the
    keys "node", "read_ios", "write_ios", "read_throughput",
    "write_throughput", "utilization", "latency", "average_request_size",
    "average_wait", "average_read_wait", "average_read_request_size",
    "average_write_wait", "average_write_request_size" and "queue_length".

    Partitions are left out if the disk they belong to is reported as well.
    """
    timestamp_str, proc_diskstat, name_info = diskstat_extract_name_info(info)

    # Lines may carry extra trailing columns. Keep only the node name plus
    # the 14 fields that are unpacked below.
    proc_diskstat = [ds[:15] for ds in proc_diskstat]

    timestamp = int(timestamp_str)

    # Here we discover real partitions and exclude them. A partition is
    # "real" if its name without the trailing number is reported as a
    # device as well. Partitions without such a disk are kept - this is
    # typical in XEN virtual setups, e.g. there are xvda1, xvda2, but no
    # xvda.
    device_names = set(line[3] for line in proc_diskstat)
    real_partitions = set(
        device_name for device_name in device_names
        if diskstat_diskless_pattern.match(device_name)
        and re.sub('[0-9]+$', '', device_name) in device_names)

    disks = {}
    for line in proc_diskstat:
        if line[3] in real_partitions:
            continue

        node_name, major, minor, device, \
            read_ios, _read_merges, read_sectors, read_ticks, \
            write_ios, _write_merges, write_sectors, write_ticks, \
            ios_in_prog, total_ticks, _rq_ticks = line

        # Use the LVM/DM/VxVM name instead of the kernel name if known
        name_key = (node_name, int(major), int(minor))
        if name_key in name_info:
            device = name_info[name_key]

        counter_base = "diskstat.%s." % device

        # Some of the following computations were learned from Munin. Thanks
        # to that project!

        # There are 1000 ticks per second

        # Note: we use onwrap=0.0 here because the parse function is being
        # used also during service discovery. If we raise a counter wrap
        # exception here, then nothing will be inventorized.
        read_ticks_rate = get_rate(counter_base + "read_ticks", timestamp,
                                   int(read_ticks), onwrap=0.0)
        write_ticks_rate = get_rate(counter_base + "write_ticks", timestamp,
                                    int(write_ticks), onwrap=0.0)
        total_ticks_rate = get_rate(counter_base + "total_ticks", timestamp,
                                    int(total_ticks), onwrap=0.0)
        read_ios_rate = get_rate(counter_base + "read_ios", timestamp,
                                 int(read_ios), onwrap=0.0)
        write_ios_rate = get_rate(counter_base + "write_ios", timestamp,
                                  int(write_ios), onwrap=0.0)
        total_ios_rate = read_ios_rate + write_ios_rate

        # not percent, but 0...1. Divide by a float like the other tick
        # conversions below so the result never gets truncated.
        utilization = total_ticks_rate / 1000.0

        # The sector counters are in units of 512 bytes
        read_bytes_rate = get_rate(counter_base + "read_sectors", timestamp,
                                   int(read_sectors), onwrap=0.0) * 512
        write_bytes_rate = get_rate(counter_base + "write_sectors", timestamp,
                                    int(write_sectors), onwrap=0.0) * 512
        total_bytes_rate = read_bytes_rate + write_bytes_rate

        # The service time is computed from the utilization. If we work
        # e.g. 0.34 (34%) of the time and we can do 17 operations in that
        # time then the average latency is time * 0.34 / 17
        if total_ios_rate:
            latency = utilization / total_ios_rate
            average_wait = (read_ticks_rate + write_ticks_rate) / total_ios_rate / 1000.0
            average_request_size = total_bytes_rate / total_ios_rate
        else:
            latency = 0.0
            average_wait = 0.0
            average_request_size = 0.0

        # Average read and write rate, from end to end, including queuing,
        # etc. and average size of one request
        if read_ticks_rate and read_ios_rate > 0:
            average_read_wait = read_ticks_rate / read_ios_rate / 1000.0
            average_read_size = read_bytes_rate / read_ios_rate
        else:
            average_read_wait = 0.0
            average_read_size = 0.0

        if write_ticks_rate and write_ios_rate > 0:
            average_write_wait = write_ticks_rate / write_ios_rate / 1000.0
            average_write_size = write_bytes_rate / write_ios_rate
        else:
            average_write_wait = 0.0
            average_write_size = 0.0

        disks[device] = {
            "node"                       : node_name,
            "read_ios"                   : read_ios_rate,
            "write_ios"                  : write_ios_rate,
            "read_throughput"            : read_bytes_rate,
            "write_throughput"           : write_bytes_rate,
            "utilization"                : utilization,
            "latency"                    : latency,
            "average_request_size"       : average_request_size,
            "average_wait"               : average_wait,
            "average_read_wait"          : average_read_wait,
            "average_read_request_size"  : average_read_size,
            "average_write_wait"         : average_write_wait,
            "average_write_request_size" : average_write_size,
            # This one is a current value, not a rate
            "queue_length"               : int(ios_in_prog),
        }

    return disks
### # Index 0 -- major number
### # Index 1 -- minor number
### # Index 2 -- device name --> used by check
### # Index 3 -- # of reads issued
### # Index 4 -- # of reads merged
### # Index 5 -- # of sectors read (a 512 Byte) --> used by check
### # Index 6 -- # of milliseconds spent reading
### # Index 7 -- # of writes completed
### # Index 8 -- # of writes merged
### # Index 9 -- # of sectors written (a 512 Byte) --> used by check
### # Index 10 -- # of milliseconds spent writing
### # Index 11 -- # of I/Os currently in progress
### # Index 12 -- # of milliseconds spent doing I/Os
### # Index 13 -- weighted # of milliseconds spent doing I/Os
### for line in proc_diskstat:
### node = line[0]
###
###
###
### # For multipath devices use the entries for dm-?? and rename
### # them with their multipath UUID/alias - and drop the according
### # sdXY that belong to the paths.
### multipath_name_info = {}
### skipped_devices = set([])
###
### # The generic function takes the following values per line:
### # 0: None or node name
### # 1: devname
### # 2: read bytes counter
### # 3: write bytes counter
### # Optional ones:
### # 4: number of reads
### # 5: number of writes
### # 6: timems
### # 7: read queue length *counters*
### # 8: write queue length *counters*
### rewritten = [
### ( l[0], # node name or None
### diskstat_rewrite_device(name_info, multipath_name_info, l[0:4]),
### int(l[6]),
### int(l[10]),
### int(l[4]),
### int(l[8]),
### # int(l[13])
### ) for l in info[1:] if len(l) >= 14
### ]
###
### # Remove device mapper devices without a translated name
### return [ line for line in rewritten
### if not line[1].startswith("dm-")
### and not line[1] in skipped_devices ]
# Extract additional information from the diskstat section about
# LVM and DM devices. This information is encapsulated
# in the [dmsetup_info] and [vx_dsk] subsections. Example for
# name_info:
# {
# (None, 253, 0): 'LVM vg00-rootvol',
# (None, 253, 1): 'LVM vg00-tmpvol',
# (None, 253, 2): 'LVM vg00-varvol',
# (None, 253, 3): 'LVM vg00-optvol',
# (None, 253, 4): 'LVM vg00-usrvol',
# (None, 253, 5): 'LVM vg00-swapvol',
# (None, 253, 6): 'LVM vgappl-applvol',
# }
def diskstat_extract_name_info(info):
    """Split the diskstat section into timestamp, stat lines and name table.

    info is a list of lines, each a list of words whose first element is
    the node name. The section starts in the 'info' phase and switches to
    the 'dmsetup_info' / 'vx_dsk' phase when the respective marker line is
    seen.

    Returns a triple (timestamp, info_plain, name_info):
      timestamp  -- int taken from the last two-word line seen in the
                    'info' phase, or None if there was none
      info_plain -- all other lines of the 'info' phase, unmodified
      name_info  -- dict from (node, major, minor) to the item name
                    ("LVM ...", "DM ..." or "VxVM ...")
    """
    name_info = {} # dict from (node, major, minor) to itemname
    timestamp = None
    info_plain = []
    phase = 'info'
    node = None

    for line in info:
        if node is None:
            node = line[0]

        marker = line[1]
        if marker == '[dmsetup_info]':
            phase = 'dmsetup_info'
        elif marker == '[vx_dsk]':
            phase = 'vx_dsk'

        # new node in case of a cluster, restart with info phase.
        # Note: the line that triggers the switch is not parsed itself.
        elif line[0] != node:
            phase = 'info'
            node = line[0]

        elif phase == 'info':
            # A line with just one word after the node name is the timestamp
            if len(line) == 2:
                timestamp = int(line[1])
            else:
                info_plain.append(line)

        elif phase == 'dmsetup_info':
            try:
                major, minor = map(int, line[2].split(':'))
            except (ValueError, IndexError):
                continue # ignore such crap as "No Devices Found"

            # Lines that also carry two further columns (volume group and
            # logical volume in the sample output) are LVM volumes
            if len(line) == 5:
                name = "LVM %s" % line[1]
            else:
                name = "DM %s" % line[1]
            name_info[node, major, minor] = name

        elif phase == 'vx_dsk':
            # major and minor are given in hexadecimal here
            major = int(line[1], 16)
            minor = int(line[2], 16)
            group, disk = line[3].split('/')[-2:]
            name = "VxVM %s-%s" % (group, disk)
            name_info[(node, major, minor)] = name

    return timestamp, info_plain, name_info
def diskstat_convert_info(parsed):
    """Apply the multipath information to the parsed disk table.

    parsed is a pair (disks, multipath_info). disks maps device names to
    their data. multipath_info may be empty/None, or a dict mapping a
    multipath UUID to a dict with the keys "device", "paths" and
    optionally "alias".

    Returns a new dict in which
      - multipath devices (found by their "device" name or as
        "DM <alias>") are listed under their UUID,
      - the path devices of such multipath devices are removed,
      - all remaining "dm-*" devices are removed.

    Neither disks nor multipath_info is modified.
    """
    disks, multipath_info = parsed
    converted_disks = dict(disks.items()) # we must not modify info!

    # If we have information about multipathing, then remove the
    # physical path devices from the disks array. But only do this,
    # when there are information for the multipath device available.
    #
    # For multipath entries: Rename the generic names like "dm-8"
    # with multipath names like "SDataCoreSANsymphony_DAT07-fscl"
    if multipath_info:
        for uuid, multipath in multipath_info.items():
            device_name = multipath["device"]
            # A missing alias counts as the empty string. Do not store
            # that default back into the multipath data: it is not ours.
            alias_name = "DM %s" % multipath.get("alias", "")

            if device_name in converted_disks or alias_name in converted_disks:
                for path in multipath["paths"]:
                    converted_disks.pop(path, None)

            if device_name in converted_disks:
                converted_disks[uuid] = converted_disks[device_name]
                del converted_disks[device_name]

            if alias_name in converted_disks:
                converted_disks[uuid] = converted_disks[alias_name]
                del converted_disks[alias_name]

    # Remove any left-over device mapper devices that are not part of a
    # known multipath device, LVM device or whatever. Iterate over a
    # snapshot of the keys, because we delete while looping.
    for device in list(converted_disks):
        if device.startswith("dm-"):
            del converted_disks[device]

    return converted_disks
def inventory_diskstat(parsed):
    """Discover the disk services from the parsed section.

    The disks are first run through diskstat_convert_info(). The result is
    handed to the generic diskstat inventory function that is also used by
    other Disk IO checks and expects a table of (node, device, ...).
    """
    node_device_table = []
    for device, disk in diskstat_convert_info(parsed).items():
        node_device_table.append((disk["node"], device))
    return inventory_diskstat_generic(node_device_table)
def check_diskstat(item, params, parsed):
    """Check one disk: convert the parsed data and delegate to check_diskstat_dict()."""
    converted_disks = diskstat_convert_info(parsed)
    return check_diskstat_dict(item, params, converted_disks)
# Registration of the check
check_info["diskstat"] = {
    "parse_function"      : parse_diskstat,
    "inventory_function"  : inventory_diskstat,
    "check_function"      : check_diskstat,
    "service_description" : "Disk IO %s",
    "has_perfdata"        : True,
    "group"               : "diskstat",
    # add first column with actual host name
    "node_info"           : True,
    "includes"            : ["diskstat.include"],
    # diskstat_convert_info() unpacks the multipath data from "parsed"
    "extra_sections"      : ["multipath"],
}