Chotaire Wiki

Stuff you didn't know

User Tools

Site Tools


linux-check_mk-fc29

Check_MK diskstat patch to run with Fedora 29 kernels

This is now fixed and has been merged into Check_MK Version 1.5.0p9.

share/check_mk/checks/diskstat (1.5.0p7: fixes diskstat crashes with kernel 4.19.*):

#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2014             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software;  you can redistribute it and/or modify it
# under the  terms of the  GNU General Public License  as published by
# the Free Software Foundation in version 2.  check_mk is  distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
# out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
# PARTICULAR PURPOSE. See the  GNU General Public License for more de-
# tails. You should have  received  a copy of the  GNU  General Public
# License along with GNU Make; see the file  COPYING.  If  not,  write
# to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
# Boston, MA 02110-1301 USA.

# <<<diskstat>>>
# 1300264105
#    8       0 sda 691860 951191 13559915 491748 234686 197346 3359512 94944 0 56844 586312
#    8      32 sdb 791860 91191 23589915 491748 234686 197346 3359512 94944 0 56844 586312

# Newer agents also output dm-* and Veritas devices and, if
# available, the following additional information for name rewriting:

# <<<diskstat>>>
# 1338931242
#    8       0 sda 6142 327 219612 2244 3190 6233 74075 8206 0 6523 10446
#  253       0 dm-0 4579 0 181754 2343 9249 0 73960 259491 0 1208 261833
#  253       1 dm-1 342 0 2736 47 3 0 11796464 5016 0 5063 5063
#  253       2 dm-2 160 0 1274 27 11 0 56 3 0 27 30
#    8      16 sdb 464 858 7717 336 1033 0 311454 3899 0 3007 4231
#    8      32 sdc 855 13352 106777 1172 915 0 154467 2798 0 3012 3967
#    8      48 sdd 1217 861 109802 1646 118 0 56151 1775 0 2736 3420
#    8      80 sdf 359 1244 58323 792 66 0 4793 388 0 765 1178
#    8      64 sde 310 1242 6964 268 118 0 56151 1607 0 1307 1872
#    8      96 sdg 1393 1242 314835 3759 129 0 56172 1867 0 4027 5619
#  199   27000 VxVM27000 131 0 990 61 11 0 21 29 0 89 90
#  199   27001 VxVM27001 0 0 0 0 0 0 0 0 0 0 0
# [dmsetup_info]
# vg_zwei-lv_home 253:2 vg_zwei lv_home
# vg_zwei-lv_swap 253:1 vg_zwei lv_swap
# vg_zwei-lv_root 253:0 vg_zwei lv_root
# [vx_dsk]
# c7 6978 /dev/vx/dsk/datadg/lalavol
# c7 6979 /dev/vx/dsk/datadg/oravol

# output may have zeros appended
#
# 8 0 sda 111918756 929875 3960367050 349083041 20142495 1149711 1021234448 851284769 0 233177192 1197549009 0 0 0 0
# 8 1 sda1 226 0 27481 3388 381 3 31472 35862 0 8123 39260 0 0 0 0
# 8 2 sda2 111918500 929875 3960337473 349079568 20142114 1149708 1021202976 851248906 0 233176504 1197492420 0 0 0 0
# 253 0 dm-0 883953 0 92124097 10287533 108572 0 2251672 809814 0 7545567 11097424 0 0 0 0
# 253 1 dm-1 21046 0 172072 157766 164020 0 1312160 29292970 0 124138 29451007 0 0 0 0
# 253 2 dm-2 750714 0 19747073 7702216 1445987 0 36811608 9817313 0 7159271 17520030 0 0 0 0

# Fields in /proc/diskstats
#  Index 0 -- major number
#  Index 1 -- minor number
#  Index 2 -- device name                        --> used by check
#  Index 3 -- # of reads issued
#  Index 4 -- # of reads merged
#  Index 5 -- # of sectors read (a 512 Byte)     --> used by check
#  Index 6 -- # of milliseconds spent reading
#  Index 7 -- # of writes completed
#  Index 8 -- # of writes merged
#  Index 9 -- # of sectors written (a 512 Byte)  --> used by check
#  Index 10 -- # of milliseconds spent writing
#  Index 11 -- # of I/Os currently in progress
#  Index 12 -- # of milliseconds spent doing I/Os
#  Index 13 -- weighted # of milliseconds spent doing I/Os

# Convert information to generic format also generated
# by winperf_phydisk
# [ now, [( disk, readctr, writectr ), ... ]]
# where counters are in sectors (512 bytes)

# Parse /proc/diskstat and additional information into a nice canonical
# dictionary of the form:
# disks = {
#     "hda" : {
#       'average_read_request_size'  : 0.0,
#       'average_read_wait'          : 0.0,
#       'average_request_size'       : 40569.90476190476,
#       'average_wait'               : 0.761904761904762,
#       'average_write_request_size' : 40569.90476190476,
#       'average_write_wait'         : 0.0007619047619047619,
#       'node'                       : None,
#       'read_ios'                   : 0.0,
#       'read_throughput'            : 0.0,
#       'latency'                    : 0.00038095238095238096,
#       'utilization'                : 0.0006153846153846154,
#       'write_ios'                  : 1.6153846153846154,
#       'write_throughput'           : 65536.0,
#     },
#     "LVM foobar" : {
#         ...
#     }
# }
#
# Returns that dictionary. The timestamp of the section is only used
# internally for the rate computation and is not part of the result.


def parse_diskstat(info):
    """Parse the diskstat agent section into a dict of per-device rates.

    `info` is the raw section table; every line carries the node name in
    its first column, followed by the /proc/diskstats fields. The section
    is first split by diskstat_extract_name_info() into the timestamp, the
    plain statistic lines and a mapping from (node, major, minor) to a
    readable item name.

    Returns a dict that maps the device name (or its rewritten name from
    the name mapping) to a dict with the keys "node", "read_ios",
    "write_ios", "read_throughput", "write_throughput", "utilization",
    "latency", "average_request_size", "average_wait",
    "average_read_wait", "average_read_request_size",
    "average_write_wait", "average_write_request_size" and
    "queue_length".

    NOTE(review): a statistic line with fewer than 15 columns makes the
    tuple unpacking below raise ValueError - confirm agents never send
    shorter lines.
    """
    timestamp_str, proc_diskstat, name_info = diskstat_extract_name_info(info)
    # Lines may carry additional trailing columns. Only the node name plus
    # the first 14 fields are unpacked below, so cut everything after that.
    proc_diskstat = [ds[:15] for ds in proc_diskstat]
    timestamp = int(timestamp_str)

    # Here we discover real partitions and exclude them:
    # A partition is "real" if the disk it belongs to (its name without the
    # trailing digits) is reported as well. Partitions without a disk -
    # typical in XEN virtual setups, e.g. there are xvda1, xvda2, but no
    # xvda - are kept, because they are the only source of data.
    device_names = {line[3] for line in proc_diskstat}
    real_partitions = {device_name for device_name in device_names
                       if diskstat_diskless_pattern.match(device_name)
                          and re.sub('[0-9]+$', '', device_name) in device_names}
    disks = {}
    for line in proc_diskstat:
        if line[3] in real_partitions:
            continue

        node_name, major, minor, device, \
            read_ios, _read_merges, read_sectors, read_ticks, \
            write_ios, _write_merges, write_sectors, write_ticks, \
            ios_in_prog, total_ticks, _rq_ticks = line

        # Prefer the readable LVM/DM/VxVM name if one is known
        name_key = (node_name, int(major), int(minor))
        if name_key in name_info:
            device = name_info[name_key]

        counter_base = "diskstat.%s." % device

        # Some of the following computations were learned from Munin. Thanks
        # to that project!

        # There are 1000 ticks per second
        # Note: we use onwrap=0.0 here because the parse function is being used also during
        # service discovery. If we raise a counter wrap exception here, then nothing will
        # be inventorized.
        read_ticks_rate  = get_rate(counter_base + "read_ticks",   timestamp, int(read_ticks), onwrap=0.0)
        write_ticks_rate = get_rate(counter_base + "write_ticks",  timestamp, int(write_ticks), onwrap=0.0)
        total_ticks_rate = get_rate(counter_base + "total_ticks",  timestamp, int(total_ticks), onwrap=0.0)
        read_ios_rate    = get_rate(counter_base + "read_ios",     timestamp, int(read_ios), onwrap=0.0)
        write_ios_rate   = get_rate(counter_base + "write_ios",    timestamp, int(write_ios), onwrap=0.0)
        total_ios_rate   = read_ios_rate + write_ios_rate
        utilization      = total_ticks_rate / 1000 # not percent, but 0...1
        # Sector counters are converted to bytes (512 bytes per sector)
        read_bytes_rate  = get_rate(counter_base + "read_sectors",  timestamp, int(read_sectors), onwrap=0.0) * 512
        write_bytes_rate = get_rate(counter_base + "write_sectors", timestamp, int(write_sectors), onwrap=0.0) * 512
        total_bytes_rate = read_bytes_rate + write_bytes_rate

        # The service time is computed from the utilization. If we work
        # e.g. 0.34 (34%) of the time and we can do 17 operations in that
        # time then the average latency is time * 0.34 / 17
        if total_ios_rate:
            latency              = utilization / total_ios_rate
            average_wait         = (read_ticks_rate + write_ticks_rate) / total_ios_rate / 1000.0
            average_request_size = total_bytes_rate / total_ios_rate
        else:
            # No I/O at all: avoid a division by zero
            latency              = 0.0
            average_wait         = 0.0
            average_request_size = 0.0

        # Average read and write rate, from end to end, including queuing, etc.
        # and average size of one request
        if read_ticks_rate and read_ios_rate > 0:
            average_read_wait = read_ticks_rate / read_ios_rate / 1000.0
            average_read_size = read_bytes_rate / read_ios_rate
        else:
            average_read_wait = 0.0
            average_read_size = 0.0

        if write_ticks_rate and write_ios_rate > 0:
            average_write_wait = write_ticks_rate / write_ios_rate / 1000.0
            average_write_size = write_bytes_rate / write_ios_rate
        else:
            average_write_wait = 0.0
            average_write_size = 0.0

        disks[device] = {
            "node"                       : node_name,
            "read_ios"                   : read_ios_rate,
            "write_ios"                  : write_ios_rate,
            "read_throughput"            : read_bytes_rate,
            "write_throughput"           : write_bytes_rate,
            "utilization"                : utilization,
            "latency"                    : latency,
            "average_request_size"       : average_request_size,
            "average_wait"               : average_wait,
            "average_read_wait"          : average_read_wait,
            "average_read_request_size"  : average_read_size,
            "average_write_wait"         : average_write_wait,
            "average_write_request_size" : average_write_size,
            "queue_length"               : int(ios_in_prog),
        }

    return disks


### #  Index 0 -- major number
### #  Index 1 -- minor number
### #  Index 2 -- device name                        --> used by check
### #  Index 3 -- # of reads issued
### #  Index 4 -- # of reads merged
### #  Index 5 -- # of sectors read (a 512 Byte)     --> used by check
### #  Index 6 -- # of milliseconds spent reading
### #  Index 7 -- # of writes completed
### #  Index 8 -- # of writes merged
### #  Index 9 -- # of sectors written (a 512 Byte)  --> used by check
### #  Index 10 -- # of milliseconds spent writing
### #  Index 11 -- # of I/Os currently in progress
### #  Index 12 -- # of milliseconds spent doing I/Os
### #  Index 13 -- weighted # of milliseconds spent doing I/Os
###     for line in proc_diskstat:
###         node = line[0]
###
###
###
###     # For multipath devices use the entries for dm-?? and rename
###     # them with their multipath UUID/alias - and drop the according
###     # sdXY that belong to the paths.
###     multipath_name_info = {}
###     skipped_devices = set([])
###
###     # The generic function takes the following values per line:
###     #  0: None or node name
###     #  1: devname
###     #  2: read bytes counter
###     #  3: write bytes counter
###     # Optional ones:
###     #  4: number of reads
###     #  5: number of writes
###     #  6: timems
###     #  7: read queue length *counters*
###     #  8: write queue length *counters*
###     rewritten = [
###         ( l[0], # node name or None
###         diskstat_rewrite_device(name_info, multipath_name_info, l[0:4]),
###         int(l[6]),
###         int(l[10]),
###         int(l[4]),
###         int(l[8]),
###         # int(l[13])
###         ) for l in info[1:] if len(l) >= 14
###     ]
###
###     # Remove device mapper devices without a translated name
###     return [ line for line in rewritten
###              if not line[1].startswith("dm-")
###                 and not line[1] in skipped_devices ]


# Extract additional information from the diskstat section about
# LVM and DM devices. This information is enclosed in the
# [dmsetup_info] and [vx_dsk] subsections. Example for
# name_info:
# {
#     (None, 253, 0): 'LVM vg00-rootvol',
#     (None, 253, 1): 'LVM vg00-tmpvol',
#     (None, 253, 2): 'LVM vg00-varvol',
#     (None, 253, 3): 'LVM vg00-optvol',
#     (None, 253, 4): 'LVM vg00-usrvol',
#     (None, 253, 5): 'LVM vg00-swapvol',
#     (None, 253, 6): 'LVM vgappl-applvol',
# }
def diskstat_extract_name_info(info):
    """Split the raw diskstat section into timestamp, stat lines and names.

    `info` is a list of lines, each a list of columns whose first column
    is the node name (or None). The section consists of three phases:

    * the plain part: a two-column line holds the timestamp, every other
      line is collected unchanged as a statistic line,
    * lines after a "[dmsetup_info]" marker: "<name> <major>:<minor> ..."
      entries, named "LVM <name>" if the line has five columns and
      "DM <name>" otherwise,
    * lines after a "[vx_dsk]" marker: hexadecimal major and minor number
      followed by a device path, named "VxVM <group>-<disk>" after the
      last two path components.

    A line whose node name differs from the current one switches back to
    the plain phase for that node; that line itself is not evaluated.

    Returns a tuple (timestamp, info_plain, name_info): the timestamp as
    int (None if no timestamp line was seen), the list of statistic lines
    and a dict from (node, major, minor) to the item name.
    """
    name_info = {} # dict from (node, major, minor) to itemname
    timestamp = None

    info_plain = []
    phase = 'info'
    node = None
    for line in info:
        if node is None:
            node = line[0]

        if line[1] == '[dmsetup_info]':
            phase = 'dmsetup_info'
        elif line[1] == '[vx_dsk]':
            phase = 'vx_dsk'
        elif line[0] != node:
            # new node in case of a cluster, restart with info phase
            phase = 'info'
            node = line[0]
        elif phase == 'info':
            if len(line) == 2:
                timestamp = int(line[1])
            else:
                info_plain.append(line)
        elif phase == 'dmsetup_info':
            try:
                major, minor = [int(part) for part in line[2].split(':')]
            except (IndexError, ValueError):
                # ignore such crap as "No Devices Found": the third column
                # is missing or is not of the form <major>:<minor>
                continue
            prefix = "LVM" if len(line) == 5 else "DM"
            name_info[node, major, minor] = "%s %s" % (prefix, line[1])
        elif phase == 'vx_dsk':
            # major and minor number are given in hexadecimal notation
            major = int(line[1], 16)
            minor = int(line[2], 16)
            group, disk = line[3].split('/')[-2:]
            name_info[(node, major, minor)] = "VxVM %s-%s" % (group, disk)

    return timestamp, info_plain, name_info

def diskstat_convert_info(parsed):
    """Merge multipath information into the parsed disks.

    `parsed` is a pair (disks, multipath_info). `disks` maps device names
    to their data. `multipath_info` may be empty/None; otherwise it maps a
    multipath UUID to a dict with the keys "device", "paths" and
    optionally "alias".

    For every multipath entry whose device (e.g. "dm-8") or alias
    ("DM <alias>") is present in the disks, the entries of its physical
    paths are removed and the device/alias entry is renamed to the UUID.
    Finally all remaining "dm-*" entries are dropped.

    Returns a new dict; neither `disks` nor `multipath_info` is modified.
    """
    disks, multipath_info = parsed
    converted_disks = dict(disks) # shallow copy: we must not modify info!

    # If we have information about multipathing, then remove the
    # physical path devices from the disks array. But only do this,
    # when there are information for the multipath device available.
    #
    # For multipath entries: Rename the generic names like "dm-8"
    # with multipath names like "SDataCoreSANsymphony_DAT07-fscl"
    if multipath_info:
        for uuid, multipath in multipath_info.items():
            device = multipath["device"]
            # A missing alias counts as empty. Do not write the default
            # back into the multipath data, it belongs to the caller.
            alias = "DM %s" % multipath.get("alias", "")

            if device in converted_disks or alias in converted_disks:
                for path in multipath["paths"]:
                    converted_disks.pop(path, None)

            if device in converted_disks:
                converted_disks[uuid] = converted_disks[device]
                del converted_disks[device]

            if alias in converted_disks:
                converted_disks[uuid] = converted_disks[alias]
                del converted_disks[alias]

    # Remove any left-over device mapper devices that are not part of a
    # known multipath device, LVM device or whatever. Iterate over a
    # snapshot of the keys, because entries are deleted on the way.
    for device in list(converted_disks):
        if device.startswith("dm-"):
            del converted_disks[device]

    return converted_disks


def inventory_diskstat(parsed):
    """Discover the disk IO services for the parsed diskstat section.

    The disks are first run through diskstat_convert_info(). The generic
    diskstat inventory function, which is used also for other Disk IO
    checks, expects a table of (node, device, ...) tuples, so one
    (node, device) pair is built per remaining disk and handed over.
    """
    node_device_table = []
    for device, disk in diskstat_convert_info(parsed).items():
        node_device_table.append((disk["node"], device))
    return inventory_diskstat_generic(node_device_table)


def check_diskstat(item, params, parsed):
    """Check one disk: convert the parsed data, then delegate.

    The disks are run through diskstat_convert_info() and the result is
    passed together with `item` and `params` to check_diskstat_dict(),
    whose return value is returned unchanged.
    """
    converted_disks = diskstat_convert_info(parsed)
    return check_diskstat_dict(item, params, converted_disks)


# Register the parse, inventory and check functions defined above under the
# name "diskstat".
# NOTE(review): check_info is not defined in this file; presumably it is
# provided by the Check_MK environment that loads this check - confirm.
check_info["diskstat"] = {
    'parse_function'      : parse_diskstat,
    'inventory_function'  : inventory_diskstat,
    'check_function'      : check_diskstat,
    'service_description' : 'Disk IO %s',
    'has_perfdata'        : True,
    'group'               : 'diskstat',
    "node_info"           : True, # add first column with actual host name
    # NOTE(review): inventory_diskstat_generic, check_diskstat_dict and
    # diskstat_diskless_pattern are used above but not defined here;
    # presumably they come from this include file - TODO confirm.
    'includes'            : [ "diskstat.include" ],
    # diskstat_convert_info() unpacks its argument as (disks, multipath_info);
    # presumably the second element stems from this extra section - TODO confirm.
    'extra_sections'      : [ "multipath" ],
}



linux-check_mk-fc29.txt · Last modified: 2019/04/20 16:59 by chotaire