share/check_mk/checks/diskstat (1.5.0p7: fixes diskstat crashes with kernel 4.19.*):
#!/usr/bin/python # -*- encoding: utf-8; py-indent-offset: 4 -*- # +------------------------------------------------------------------+ # | ____ _ _ __ __ _ __ | # | / ___| |__ ___ ___| | __ | \/ | |/ / | # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / | # | | |___| | | | __/ (__| < | | | | . \ | # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ | # | | # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de | # +------------------------------------------------------------------+ # # This file is part of Check_MK. # The official homepage is at http://mathias-kettner.de/check_mk. # # check_mk is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation in version 2. check_mk is distributed # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with- # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. See the GNU General Public License for more de- # tails. You should have received a copy of the GNU General Public # License along with GNU Make; see the file COPYING. If not, write # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, # Boston, MA 02110-1301 USA. 
# <<<diskstat>>> # 1300264105 # 8 0 sda 691860 951191 13559915 491748 234686 197346 3359512 94944 0 56844 586312 # 8 32 sdb 791860 91191 23589915 491748 234686 197346 3359512 94944 0 56844 586312 # Newer agent output also dm-* and Veritas devices and if # available the following additional information for name rewriting: # <<<diskstat>>> # 1338931242 # 8 0 sda 6142 327 219612 2244 3190 6233 74075 8206 0 6523 10446 # 253 0 dm-0 4579 0 181754 2343 9249 0 73960 259491 0 1208 261833 # 253 1 dm-1 342 0 2736 47 3 0 11796464 5016 0 5063 5063 # 253 2 dm-2 160 0 1274 27 11 0 56 3 0 27 30 # 8 16 sdb 464 858 7717 336 1033 0 311454 3899 0 3007 4231 # 8 32 sdc 855 13352 106777 1172 915 0 154467 2798 0 3012 3967 # 8 48 sdd 1217 861 109802 1646 118 0 56151 1775 0 2736 3420 # 8 80 sdf 359 1244 58323 792 66 0 4793 388 0 765 1178 # 8 64 sde 310 1242 6964 268 118 0 56151 1607 0 1307 1872 # 8 96 sdg 1393 1242 314835 3759 129 0 56172 1867 0 4027 5619 # 199 27000 VxVM27000 131 0 990 61 11 0 21 29 0 89 90 # 199 27001 VxVM27001 0 0 0 0 0 0 0 0 0 0 0 # [dmsetup_info] # vg_zwei-lv_home 253:2 vg_zwei lv_home # vg_zwei-lv_swap 253:1 vg_zwei lv_swap # vg_zwei-lv_root 253:0 vg_zwei lv_root # [vx_dsk] # c7 6978 /dev/vx/dsk/datadg/lalavol # c7 6979 /dev/vx/dsk/datadg/oravol # output may have zeros appended # # 8 0 sda 111918756 929875 3960367050 349083041 20142495 1149711 1021234448 851284769 0 233177192 1197549009 0 0 0 0 # 8 1 sda1 226 0 27481 3388 381 3 31472 35862 0 8123 39260 0 0 0 0 # 8 2 sda2 111918500 929875 3960337473 349079568 20142114 1149708 1021202976 851248906 0 233176504 1197492420 0 0 0 0 # 253 0 dm-0 883953 0 92124097 10287533 108572 0 2251672 809814 0 7545567 11097424 0 0 0 0 # 253 1 dm-1 21046 0 172072 157766 164020 0 1312160 29292970 0 124138 29451007 0 0 0 0 # 253 2 dm-2 750714 0 19747073 7702216 1445987 0 36811608 9817313 0 7159271 17520030 0 0 0 0 # Fields in /proc/diskstats # Index 0 -- major number # Index 1 -- minor number # Index 2 -- device name --> used by check # 
#  Index 3 -- # of reads issued
#  Index 4 -- # of reads merged
#  Index 5 -- # of sectors read (a 512 Byte)    --> used by check
#  Index 6 -- # of milliseconds spent reading
#  Index 7 -- # of writes completed
#  Index 8 -- # of writes merged
#  Index 9 -- # of sectors written (a 512 Byte)    --> used by check
#  Index 10 -- # of milliseconds spent writing
#  Index 11 -- # of I/Os currently in progress
#  Index 12 -- # of milliseconds spent doing I/Os
#  Index 13 -- weighted # of milliseconds spent doing I/Os

# Convert information to generic format also generated
# by winperf_phydisk
# [ now, [( disk, readctr, writectr ), ... ]]
# where counters are in sectors (512 bytes)

# Parse /proc/diskstat and additional information into a nice canonical
# dictionary of the form:
# disks = {
#     "hda" : {
#         'average_read_request_size'  : 0.0,
#         'average_read_wait'          : 0.0,
#         'average_request_size'       : 40569.90476190476,
#         'average_wait'               : 0.761904761904762,
#         'average_write_request_size' : 40569.90476190476,
#         'average_write_wait'         : 0.0007619047619047619,
#         'node'                       : None,
#         'read_ios'                   : 0.0,
#         'read_throughput'            : 0.0,
#         'latency'                    : 0.00038095238095238096,
#         'utilization'                : 0.0006153846153846154,
#         'write_ios'                  : 1.6153846153846154,
#         'write_throughput'           : 65536.0,
#     },
#     "LVM foobar" : {
#         ...
#     }
# }
#
# Returns that dictionary. The timestamp of the section is only used
# internally as the time base of the rate computation.
def parse_diskstat(info):
    """Parse the diskstat section into a dictionary of per-device rates.

    Args:
        info: Raw section rows. Each row starts with the node name (the
            check is registered with "node_info"), followed by the columns
            of /proc/diskstats, so the device name sits at index 3. Rows
            belonging to the [dmsetup_info] / [vx_dsk] subsections are
            split off by diskstat_extract_name_info().

    Returns:
        A dict mapping the item name (the kernel device name, or the
        translated name from name_info if one exists) to a dict with the
        keys "node", "read_ios", "write_ios", "read_throughput",
        "write_throughput", "utilization", "latency",
        "average_request_size", "average_wait", "average_read_wait",
        "average_read_request_size", "average_write_wait",
        "average_write_request_size" and "queue_length".
        An empty dict is returned when the section holds no usable rows.

    Raises:
        TypeError: if disk rows are present but no timestamp row was found.
    """
    timestamp_str, proc_diskstat, name_info = diskstat_extract_name_info(info)

    # Limit every row to the first 15 columns (node + 14 known fields)
    # before the actual parsing: newer kernels append further columns.
    # Rows with fewer columns cannot be unpacked below and are skipped
    # instead of crashing the whole check.
    proc_diskstat = [ds[:15] for ds in proc_diskstat if len(ds) >= 15]
    if not proc_diskstat:
        return {}

    timestamp = int(timestamp_str)

    # Here we discover real partitions and exclude them:
    # A "real" partition is one whose parent disk is listed as well
    # (sda1 next to sda). Partitions without a disk - typical in XEN
    # virtual setups, e.g. there are xvda1, xvda2, but no xvda - must be
    # kept, because they are the only source of data for that device.
    device_names = set(ds[3] for ds in proc_diskstat)
    real_partitions = set(
        device_name for device_name in device_names
        if diskstat_diskless_pattern.match(device_name)
        and re.sub('[0-9]+$', '', device_name) in device_names)

    disks = {}
    for line in proc_diskstat:
        if line[3] in real_partitions:
            continue

        node_name, major, minor, device, \
        read_ios, _read_merges, read_sectors, read_ticks, \
        write_ios, _write_merges, write_sectors, write_ticks, \
        ios_in_prog, total_ticks, _rq_ticks = line

        # Prefer the translated LVM/DM/VxVM name if one is known.
        device = name_info.get((node_name, int(major), int(minor)), device)

        counter_base = "diskstat.%s." % device

        # Some of the following computations were learned from Munin. Thanks
        # to that project!

        # There are 1000 ticks per second

        # Note: we use onwrap=0.0 here because the parse function is being used also during
        # service discovery. If we raise a counter wrap exception here, then nothing will
        # be inventorized.
        read_ticks_rate = get_rate(counter_base + "read_ticks", timestamp,
                                   int(read_ticks), onwrap=0.0)
        write_ticks_rate = get_rate(counter_base + "write_ticks", timestamp,
                                    int(write_ticks), onwrap=0.0)
        total_ticks_rate = get_rate(counter_base + "total_ticks", timestamp,
                                    int(total_ticks), onwrap=0.0)
        read_ios_rate = get_rate(counter_base + "read_ios", timestamp,
                                 int(read_ios), onwrap=0.0)
        write_ios_rate = get_rate(counter_base + "write_ios", timestamp,
                                  int(write_ios), onwrap=0.0)
        total_ios_rate = read_ios_rate + write_ios_rate
        utilization = total_ticks_rate / 1000.0  # not percent, but 0...1
        read_bytes_rate = get_rate(counter_base + "read_sectors", timestamp,
                                   int(read_sectors), onwrap=0.0) * 512
        write_bytes_rate = get_rate(counter_base + "write_sectors", timestamp,
                                    int(write_sectors), onwrap=0.0) * 512
        total_bytes_rate = read_bytes_rate + write_bytes_rate

        # The service time is computed from the utilization. If we work
        # e.g. 0.34 (34%) of the time and we can do 17 operations in that
        # time then the average latency is time * 0.34 / 17
        if total_ios_rate:
            latency = utilization / total_ios_rate
            average_wait = (read_ticks_rate + write_ticks_rate) / total_ios_rate / 1000.0
            average_request_size = total_bytes_rate / total_ios_rate
        else:
            latency = 0.0
            average_wait = 0.0
            average_request_size = 0.0

        # Average read and write rate, from end to end, including queuing, etc.
        # and average size of one request
        if read_ticks_rate and read_ios_rate > 0:
            average_read_wait = read_ticks_rate / read_ios_rate / 1000.0
            average_read_size = read_bytes_rate / read_ios_rate
        else:
            average_read_wait = 0.0
            average_read_size = 0.0

        if write_ticks_rate and write_ios_rate > 0:
            average_write_wait = write_ticks_rate / write_ios_rate / 1000.0
            average_write_size = write_bytes_rate / write_ios_rate
        else:
            average_write_wait = 0.0
            average_write_size = 0.0

        disks[device] = {
            "node"                       : node_name,
            "read_ios"                   : read_ios_rate,
            "write_ios"                  : write_ios_rate,
            "read_throughput"            : read_bytes_rate,
            "write_throughput"           : write_bytes_rate,
            "utilization"                : utilization,
            "latency"                    : latency,
            "average_request_size"       : average_request_size,
            "average_wait"               : average_wait,
            "average_read_wait"          : average_read_wait,
            "average_read_request_size"  : average_read_size,
            "average_write_wait"         : average_write_wait,
            "average_write_request_size" : average_write_size,
            "queue_length"               : int(ios_in_prog),
        }

    return disks


###     #  Index 0 -- major number
###     #  Index 1 -- minor number
###     #  Index 2 -- device name                        --> used by check
###     #  Index 3 -- # of reads issued
###     #  Index 4 -- # of reads merged
###     #  Index 5 -- # of sectors read (a 512 Byte)     --> used by check
###     #  Index 6 -- # of milliseconds spent reading
###     #  Index 7 -- # of writes completed
###     #  Index 8 -- # of writes merged
###     #  Index 9 -- # of sectors written (a 512 Byte)  --> used by check
###     #  Index 10 -- # of milliseconds spent writing
###     #  Index 11 -- # of I/Os currently in progress
###     #  Index 12 -- # of
###     #  milliseconds spent doing I/Os
###     #  Index 13 -- weighted # of milliseconds spent doing I/Os
###     for line in proc_diskstat:
###         node = line[0]
###
###
###
###     # For multipath devices use the entries for dm-?? and rename
###     # them with their multipath UUID/alias - and drop the corresponding
###     # sdXY that belong to the paths.
###     multipath_name_info = {}
###     skipped_devices = set([])
###
###     # The generic function takes the following values per line:
###     #  0: None or node name
###     #  1: devname
###     #  2: read bytes counter
###     #  3: write bytes counter
###     # Optional ones:
###     #  4: number of reads
###     #  5: number of writes
###     #  6: timems
###     #  7: read queue length *counters*
###     #  8: write queue length *counters*
###     rewritten = [
###         ( l[0], # node name or None
###         diskstat_rewrite_device(name_info, multipath_name_info, l[0:4]),
###         int(l[6]),
###         int(l[10]),
###         int(l[4]),
###         int(l[8]),
###         # int(l[13])
###         ) for l in info[1:] if len(l) >= 14
###     ]
###
###     # Remove device mapper devices without a translated name
###     return [ line for line in rewritten
###              if not line[1].startswith("dm-")
###              and not line[1] in skipped_devices ]

# Extract additional information from diskstat section about
# LVM and DM devices. This information is encapsulated
# with [dmsetup_info] and [vx_dsk] subsections.
# Example for name_info:
# {
#     (None, 253, 0): 'LVM vg00-rootvol',
#     (None, 253, 1): 'LVM vg00-tmpvol',
#     (None, 253, 2): 'LVM vg00-varvol',
#     (None, 253, 3): 'LVM vg00-optvol',
#     (None, 253, 4): 'LVM vg00-usrvol',
#     (None, 253, 5): 'LVM vg00-swapvol',
#     (None, 253, 6): 'LVM vgappl-applvol',
# }
def diskstat_extract_name_info(info):
    """Split the raw section into timestamp, diskstat rows and name mapping.

    Args:
        info: Raw section rows, each prefixed with the node name.

    Returns:
        A tuple (timestamp, info_plain, name_info):
        timestamp is the integer from the last two-column row seen in the
        plain part of the section (None if there was none), info_plain is
        the list of the remaining plain rows and name_info maps
        (node, major, minor) to the item name ("LVM ...", "DM ..." or
        "VxVM ...-...") taken from the [dmsetup_info] / [vx_dsk] subsections.
    """
    name_info = {}  # dict from (node, major, minor) to itemname
    timestamp = None

    info_plain = []
    phase = 'info'
    node = None
    for line in info:
        if node is None:
            node = line[0]

        if line[1] == '[dmsetup_info]':
            phase = 'dmsetup_info'
        elif line[1] == '[vx_dsk]':
            phase = 'vx_dsk'
        elif line[0] != node:
            # New node in case of a cluster, restart with info phase.
            # NOTE(review): the row that triggers the switch is itself not
            # evaluated - presumably it is always the timestamp row of the
            # new node; confirm against the agent output.
            phase = 'info'
            node = line[0]
        elif phase == 'info':
            if len(line) == 2:
                timestamp = int(line[1])
            else:
                info_plain.append(line)
        elif phase == 'dmsetup_info':
            try:
                major, minor = map(int, line[2].split(':'))
            except (ValueError, IndexError):
                continue  # ignore such crap as "No Devices Found"
            if len(line) == 5:
                name = "LVM %s" % line[1]
            else:
                name = "DM %s" % line[1]
            name_info[node, major, minor] = name
        elif phase == 'vx_dsk':
            # major and minor are given as hexadecimal numbers here
            major = int(line[1], 16)
            minor = int(line[2], 16)
            group, disk = line[3].split('/')[-2:]
            name = "VxVM %s-%s" % (group, disk)
            name_info[(node, major, minor)] = name

    return timestamp, info_plain, name_info


def diskstat_convert_info(parsed):
    """Merge multipath information into a copy of the parsed disks.

    Args:
        parsed: Pair (disks, multipath_info): the dictionary produced by
            parse_diskstat and the parsed "multipath" extra section
            (may be empty/None). Neither of them is modified.

    Returns:
        A new dict of disks in which multipath devices are stored under
        their UUID, their physical paths are removed and all remaining
        "dm-*" devices are dropped.
    """
    disks, multipath_info = parsed
    converted_disks = dict(disks)  # we must not modify info!

    # If we have information about multipathing, then remove the
    # physical path devices from the disks array. But only do this
    # when there is information for the multipath device available.
    #
    # For multipath entries: Rename the generic names like "dm-8"
    # with multipath names like "SDataCoreSANsymphony_DAT07-fscl"
    if multipath_info:
        for uuid, multipath in multipath_info.items():
            device = multipath["device"]
            # A missing alias is treated like an empty one, without writing
            # the default back into the caller's multipath data.
            dm_alias = "DM %s" % multipath.get("alias", "")

            if device in converted_disks or dm_alias in converted_disks:
                for path in multipath["paths"]:
                    converted_disks.pop(path, None)

            if device in converted_disks:
                converted_disks[uuid] = converted_disks.pop(device)

            if dm_alias in converted_disks:
                converted_disks[uuid] = converted_disks.pop(dm_alias)

    # Remove any left-over device mapper devices that are not part of a
    # known multipath device, LVM device or whatever. Iterate over a
    # snapshot of the keys, because entries are deleted inside the loop.
    for device in list(converted_disks):
        if device.startswith("dm-"):
            del converted_disks[device]

    return converted_disks


def inventory_diskstat(parsed):
    """Discover one service per disk left after the multipath conversion."""
    converted_disks = diskstat_convert_info(parsed)

    # Use generic diskstat inventory function that is used also for other
    # Disk IO checks. That expects a table of (node, device, ...)
    return inventory_diskstat_generic(
        [(disk["node"], device) for device, disk in converted_disks.items()])


def check_diskstat(item, params, parsed):
    """Check one item by handing the converted disks to check_diskstat_dict."""
    return check_diskstat_dict(item, params, diskstat_convert_info(parsed))


check_info["diskstat"] = {
    'parse_function'      : parse_diskstat,
    'inventory_function'  : inventory_diskstat,
    'check_function'      : check_diskstat,
    'service_description' : 'Disk IO %s',
    'has_perfdata'        : True,
    'group'               : 'diskstat',
    "node_info"           : True, # add first column with actual host name
    'includes'            : [ "diskstat.include" ],
    'extra_sections'      : [ "multipath" ],
}