conf.d/health.d/disks.conf

   1 # -----------------------------------------------------------------------------
   2 # make sure we collect values for each disk
   3
   4 # for mount points (disk.space): warn when no data for more than 5 update intervals, critical for more than 60
   5 template: disk_space_last_collected_secs
   6       on: disk.space
   7 families: *
   8     calc: $now - $last_collected_t
   9    units: seconds ago
  10    every: 10s
  11     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
  12     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
  13    delay: down 5m multiplier 1.5 max 1h
  14     info: number of seconds since the last successful data collection of the mount point
  15       to: sysadmin
  16
  17 # for block devices (disk.io): warn when no data for more than 5 update intervals, critical for more than 60
  18 template: disk_last_collected_secs
  19       on: disk.io
  20 families: *
  21     calc: $now - $last_collected_t
  22    units: seconds ago
  23    every: 10s
  24     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
  25     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
  26    delay: down 5m multiplier 1.5 max 1h
  27     info: number of seconds since the last successful data collection of the block device
  28       to: sysadmin
  29
  30
  31 # -----------------------------------------------------------------------------
  32 # low disk space
  33
  34 # checking the latest collected values:
  35 # warn when disk space usage exceeds 90% (critical above 98%);
  36 # a raised alarm is kept while usage stays above 80% (critical: 90%)
  37
  38 template: disk_space_usage
  39       on: disk.space
  40 families: *
  41     calc: $used * 100 / ($avail + $used)
  42    units: %
  43    every: 1m
  44     warn: $this > (($status >= $WARNING ) ? (80) : (90))
  45     crit: $this > (($status == $CRITICAL) ? (90) : (98))
  46    delay: up 1m down 15m multiplier 1.5 max 1h
  47     info: current disk space usage
  48       to: sysadmin
  49 # inode usage: warn above 90% (critical above 98%); a raised alarm is kept while above 80% (critical: 90%)
  50 template: disk_inode_usage
  51       on: disk.inodes
  52 families: *
  53     calc: $used * 100 / ($avail + $used)
  54    units: %
  55    every: 1m
  56     warn: $this > (($status >= $WARNING)  ? (80) : (90))
  57     crit: $this > (($status == $CRITICAL) ? (90) : (98))
  58    delay: up 1m down 15m multiplier 1.5 max 1h
  59     info: current disk inode usage
  60       to: sysadmin
  61
  62
  63 # -----------------------------------------------------------------------------
  64 # disk fill rate
  65
  66 # calculate the rate at which the disk fills up:
  67 # compare the minimum available space seen between
  68 # 60 and 50 minutes ago with the space available now
  69
  70 # this is just a calculation - it has no alarm;
  71 # the next template uses it to estimate
  72 # the hours remaining
  73
  74 template: disk_fill_rate
  75       on: disk.space
  76 families: *
  77   lookup: min -10m at -50m unaligned of avail
  78     calc: ($this - $avail) / (($now - $after) / 3600)
  79    every: 1m
  80    units: GB/hour
  81     info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
  82
  83
  84 # estimate the hours remaining until the disk is full,
  85 # if it keeps filling at the rate given by disk_fill_rate
  86 # (infinite when the disk is not filling up)
  87
  88 template: out_of_disk_space_time
  89       on: disk.space
  90 families: *
  91     calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
  92    units: hours
  93    every: 10s
  94     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
  95     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
  96    delay: down 15m multiplier 1.2 max 1h
  97     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
  98       to: sysadmin
  99
 100
 101 # -----------------------------------------------------------------------------
 102 # disk congestion
 103
 104 # raise an alarm if the disk is congested:
 105 # warning when the average disk utilization of the last
 106 # 10 minutes exceeds $green (90%), critical above $red (98%)
 107
 108 template: 10min_disk_utilization
 109       on: disk.util
 110 families: *
 111   lookup: average -10m unaligned
 112    units: %
 113    every: 1m
 114    green: 90
 115      red: 98
 116     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
 117     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
 118    delay: down 15m multiplier 1.2 max 1h
 119     info: the percentage of time the disk was busy, during the last 10 minutes
 120       to: sysadmin
 121
 122
 123 # raise an alarm if the average disk backlog
 124 # of the last 10 minutes is above $green (2000ms, warning)
 125 # or above $red (5000ms, critical)
 126 # (i.e. the disk cannot catch up)
 127
 128 template: 10min_disk_backlog
 129       on: disk.backlog
 130 families: *
 131   lookup: average -10m unaligned
 132    units: ms
 133    every: 1m
 134    green: 2000
 135      red: 5000
 136     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
 137     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
 138    delay: down 15m multiplier 1.2 max 1h
 139     info: average of the kernel estimated disk backlog, for the last 10 minutes
 140       to: sysadmin