conf.d/health.d/disks.conf

   1 # -----------------------------------------------------------------------------
   2 # make sure we collect values for each disk
   3
   4 # for mount points
   5 template: disk_space_last_collected_secs
   6       on: disk.space
   7     calc: $now - $last_collected_t
   8    units: seconds ago
   9    every: 10s
  10     warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
  11     crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
  12    delay: down 5m multiplier 1.5 max 1h
  13     info: number of seconds since the last successful data collection of the mount point
  14       to: sysadmin
  15
  16 # for block devices
  17 template: disk_last_collected_secs
  18       on: disk.io
  19     calc: $now - $last_collected_t
  20    units: seconds ago
  21    every: 10s
  22     warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
  23     crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
  24    delay: down 5m multiplier 1.5 max 1h
  25     info: number of seconds since the last successful data collection of the block device
  26       to: sysadmin
  27
  28
  29 # -----------------------------------------------------------------------------
  30 # low disk space
  31
  32 # using the latest collected values,
  33 # raise an alarm if the disk is low on
  34 # available disk space or inodes
  35
  36 template: disk_space_usage
  37       on: disk.space
  38     calc: $used * 100 / ($avail + $used)
  39    units: %
  40    every: 1m
  41     warn: $this > (($status >= $WARNING ) ? (70) : (80))
  42     crit: $this > (($status == $CRITICAL) ? (85) : (95))
  43    delay: up 1m down 15m multiplier 1.5 max 1h
  44     info: current disk space usage
  45       to: sysadmin
  46
  47 template: disk_inode_usage
  48       on: disk.inodes
  49     calc: $used * 100 / ($avail + $used)
  50    units: %
  51    every: 1m
  52     warn: $this > (($status >= $WARNING)  ? (75) : (80))
  53     crit: $this > (($status == $CRITICAL) ? (90) : (95))
  54    delay: up 1m down 15m multiplier 1.5 max 1h
  55     info: current disk inode usage
  56       to: sysadmin
  57
  58
  59 # -----------------------------------------------------------------------------
  60 # disk fill rate
  61
  62 # calculate the rate at which the disk fills up,
  63 # based on the change in available space
  64 # during the last hour
  65
  66 # this is just a calculation - it has no alarm
  67 # we will use it in the next template to find
  68 # the hours remaining
  69
  70 template: disk_fill_rate
  71       on: disk.space
  72   lookup: min -10m at -50m unaligned of avail
  73     calc: ($this - $avail) / (($now - $after) / 3600)
  74    every: 1m
  75    units: GB/hour
  76     info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
  77
  78
  79 # calculate the hours remaining
  80 # if the disk continues to fill
  81 # at this rate
  82
  83 template: out_of_disk_space_time
  84       on: disk.space
  85     calc: $avail / $disk_fill_rate
  86    units: hours
  87    every: 10s
  88     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
  89     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
  90    delay: down 15m multiplier 1.2 max 1h
  91     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
  92       to: sysadmin
  93
  94
  95 # -----------------------------------------------------------------------------
  96 # disk congestion
  97
  98 # raise an alarm if the disk is congested
  99 # by calculating the average disk utilization
 100 # for the last 10 minutes
 101
 102 template: 10min_disk_utilization
 103       on: disk.util
 104   lookup: average -10m unaligned
 105    units: %
 106    every: 1m
 107    green: 90
 108      red: 98
 109     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
 110     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
 111    delay: down 15m multiplier 1.2 max 1h
 112     info: the percentage of time the disk was busy, during the last 10 minutes
 113       to: sysadmin
 114
 115
 116 # raise an alarm if the disk backlog
 117 # is above 2000ms (2s) per second
 118 # on average over the last 10 minutes
 119 # (i.e. the disk cannot catch up)
 120
 121 template: 10min_disk_backlog
 122       on: disk.backlog
 123   lookup: average -10m unaligned
 124    units: ms
 125    every: 1m
 126    green: 2000
 127      red: 5000
 128     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
 129     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
 130    delay: down 15m multiplier 1.2 max 1h
 131     info: average of the kernel estimated disk backlog, for the last 10 minutes
 132       to: sysadmin