From edb92fe4e8792f998a677c127f1c8b0ee434214b Mon Sep 17 00:00:00 2001
From: Chris Grindstaff
Date: Mon, 18 Nov 2024 15:08:37 -0500
Subject: [PATCH] feat: Harvest should monitor `wafl.dir.size.warning`

Fixes: #3243
---
 conf/ems/9.6.0/ems.yaml                  |  7 +++++++
 container/prometheus/ems_alert_rules.yml | 25 +++++++++++++++++++++++-
 docs/resources/ems-alert-runbook.md      | 12 ++++++++++++
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/conf/ems/9.6.0/ems.yaml b/conf/ems/9.6.0/ems.yaml
index c8b1014d4..4ebfe64ce 100644
--- a/conf/ems/9.6.0/ems.yaml
+++ b/conf/ems/9.6.0/ems.yaml
@@ -944,6 +944,13 @@ events:
       - parameters.mirror_config_id => mirror_config_id
       - parameters.primary_config_id => primary_config_id
 
+  - name: wafl.dir.size.warning
+    exports:
+      - parameters.fileid => directory_inum
+      - parameters.vol => volume
+      - parameters.app => app
+      - parameters.volident => vol_ident
+
   - name: wafl.readdir.expired
     exports:
       - parameters.app => app
diff --git a/container/prometheus/ems_alert_rules.yml b/container/prometheus/ems_alert_rules.yml
index e69d452a4..2319ef212 100644
--- a/container/prometheus/ems_alert_rules.yml
+++ b/container/prometheus/ems_alert_rules.yml
@@ -2008,4 +2008,27 @@ groups:
             {{- end -}}
         annotations:
           summary: "SnapMirror active sync planned failover operation completed for Destination path: \"{{ $labels.dstpath }}\"."
-          impact: "Protection"
\ No newline at end of file
+          impact: "Protection"
+
+      - alert: Directory size is approaching the maximum directory size (maxdirsize) limit
+        expr: last_over_time(ems_events{message="wafl.dir.size.warning"}[5m]) == 1
+        labels:
+          severity: >
+            {{- if $labels.severity -}}
+              {{- if eq $labels.severity "alert" -}}
+                critical
+              {{- else if eq $labels.severity "error" -}}
+                warning
+              {{- else if eq $labels.severity "emergency" -}}
+                critical
+              {{- else if eq $labels.severity "notice" -}}
+                info
+              {{- else if eq $labels.severity "informational" -}}
+                info
+              {{- else -}}
+                {{ $labels.severity }}
+              {{- end -}}
+            {{- end -}}
+        annotations:
+          summary: "Directory size for file ID \"{{ $labels.directory_inum }}\" in volume \"{{ $labels.volume }}\" is approaching the maximum directory size (maxdirsize) limit."
+          impact: "Availability"
diff --git a/docs/resources/ems-alert-runbook.md b/docs/resources/ems-alert-runbook.md
index ecc2fd5bd..74981f4ee 100644
--- a/docs/resources/ems-alert-runbook.md
+++ b/docs/resources/ems-alert-runbook.md
@@ -52,6 +52,18 @@ If you use Cloud Volumes ONTAP, perform the following corrective actions:
 2. Ensure that the login and connectivity information is still valid.
 Contact NetApp technical support if the issue persists.
 
+### Directory size is approaching the maximum directory size (maxdirsize) limit
+
+**Impact**: Availability
+
+**EMS Event**: `wafl.dir.size.warning`
+
+This message occurs when the size of a directory surpasses a configured percentage (default: 90%) of its current maximum directory size (maxdirsize) limit.
+
+**Remediation**
+
+Use the "volume file show-inode" command with the file ID and volume name information to find the file path. Reduce the number of files in the directory. If not possible, use the (privilege:advanced) option "volume modify -volume vol_name -maxdir-size new_value" to increase the maximum number of files per directory. However, doing so could impact system performance. If you need to increase the maximum directory size, contact NetApp technical support.
+
 ### Disk Out of Service
 
 **Impact**: Availability