Creating Prometheus rules to generate alerts for storage issues
Wait until user monitoring is running, and then create a Prometheus rule that generates alerts for storage issues.
-
Before you run the script, run the following command to set the Cloud Pak for AIOps namespace, or replace ${PROJECT_CP4AIOPS} in the script with the name of that namespace.
export PROJECT_CP4AIOPS=<aiops-namespace>
-
Example: alerts that inform you when storage is running low.
# Create (or update) the PrometheusRule "aiops-storage-k8s-rules" in the
# namespace named by PROJECT_CP4AIOPS by piping the manifest to "oc apply".
#
# The here-document delimiter is deliberately unquoted so that the shell
# expands ${PROJECT_CP4AIOPS}. Every other "$" in the manifest belongs to a
# Prometheus template ($labels, $value) and is written as "\$" so that it is
# passed through literally instead of being expanded by the shell to an
# empty string. The sequence \" is not special inside a here-document and is
# passed through unchanged, where YAML reads it as an escaped double quote.
#
# NOTE(review): confirm that the metrics used by HighPVUsage
# (kube_persistent_volume_used_bytes / kube_persistent_volume_capacity_bytes)
# are actually exposed in your cluster; if they are not, that alert never fires.
cat << EOF | oc apply --validate -f -
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    role: alert-rules
    app: events.ibm.com
  name: aiops-storage-k8s-rules
  namespace: ${PROJECT_CP4AIOPS}
spec:
  groups:
  - name: aiops storage alerts
    rules:
    - alert: LowNodeStorage
      expr: sum(node_filesystem_avail_bytes{mountpoint="/"}) / sum(node_filesystem_size_bytes{mountpoint="/"}) < 0.1
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Critical: Node storage running low"
        description: "One or more nodes in the cluster have less than 10% free storage available on the root partition."
    - alert: HighPodStorageUsage
      expr: sum(container_fs_usage_bytes{pod=~".*"}) / sum(container_fs_limit_bytes{pod=~".*"}) > 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Warning: High Pod Storage Usage"
        description: "A pod is exceeding 80% of its storage limit."
    - alert: NodeDiskUsage
      expr: 100 - (100 * (node_filesystem_avail_bytes / node_filesystem_size_bytes)) > 85
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "{{ \$labels.node }}: High Disk Usage detected"
        description: "{{ \$labels.node }}: Disk usage is above 85% on {{ \$labels.device }} (current value is: {{ \$value | printf \"%.2f\" }}%)"
        resolved_desc: "{{ \$labels.node }}: *RESOLVED*: Disk usage is within thresholds on {{ \$labels.device }} (current value is: {{ \$value | printf \"%.2f\" }}%)"
    - alert: KubePersistentVolumeFillingUpInAIopsNamespace
      expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.85
      for: 10m
      labels:
        severity: warning
      annotations:
        description: The PersistentVolume claimed by {{ \$labels.persistentvolumeclaim }} in Namespace {{ \$labels.namespace }} is filled up more than 85%. Currently {{ \$value | humanizePercentage }}.
        summary: PersistentVolume {{ \$labels.persistentvolumeclaim }} is filling up.
    - alert: HighPVUsage
      expr: (kube_persistent_volume_used_bytes / kube_persistent_volume_capacity_bytes) > 0.8
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Persistent Volume usage is high on {{ \$labels.persistentvolume }}"
        description: "Persistent Volume {{ \$labels.persistentvolume }} is currently using more than 80% of its capacity."
EOF