Creating a Prometheus rule to generate OpenShift alerts
Wait for user workload monitoring to be running, and then create a Prometheus rule to generate OpenShift alerts.
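For example, you can confirm that user workload monitoring is running by listing the pods in the openshift-user-workload-monitoring project and checking that the prometheus-user-workload pods are in the Running state (a minimal check, assuming the default monitoring configuration):
oc -n openshift-user-workload-monitoring get pods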
-
Run the following command before running the script, or replace ${PROJECT_CP4AIOPS} in the script with your Cloud Pak for AIOps namespace.
export PROJECT_CP4AIOPS=<aiops-namespace>
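For example, if Cloud Pak for AIOps is installed in a project named cp4aiops (a hypothetical value; substitute your own namespace):
export PROJECT_CP4AIOPS=cp4aiops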
-
Example: PodMonitor resources and alerting rules for the Cloud Pak for AIOps Kafka and Zookeeper instances, including alerts that inform you when containers are down, disk space is running low, and partitions are under-replicated.
cat << EOF | oc apply --validate -f -
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: cluster-operator-metrics
  namespace: ${PROJECT_CP4AIOPS}
  labels:
    app: ibmevents.ibm.com
spec:
  selector:
    matchLabels:
      ibmevents.ibm.com/kind: cluster-operator
  namespaceSelector:
    matchNames:
      - ${PROJECT_CP4AIOPS}
  podMetricsEndpoints:
    - path: /metrics
      port: http
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: entity-operator-metrics
  namespace: ${PROJECT_CP4AIOPS}
  labels:
    app: ibmevents.ibm.com
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: entity-operator
  namespaceSelector:
    matchNames:
      - ${PROJECT_CP4AIOPS}
  podMetricsEndpoints:
    - path: /metrics
      port: healthcheck
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: kafka-resources-metrics
  namespace: ${PROJECT_CP4AIOPS}
  labels:
    app: ibmevents.ibm.com
spec:
  selector:
    matchExpressions:
      - key: ibmevents.ibm.com/kind
        operator: In
        values: [ Kafka ]
  namespaceSelector:
    matchNames:
      - ${PROJECT_CP4AIOPS}
  podMetricsEndpoints:
    - path: /metrics
      port: tcp-prometheus
      relabelings:
        - separator: ;
          regex: __meta_kubernetes_pod_label_(ibmevents_ibm_com_.+)
          replacement: \$1
          action: labelmap
        - sourceLabels: [__meta_kubernetes_namespace]
          separator: ;
          regex: (.*)
          targetLabel: namespace
          replacement: \$1
          action: replace
        - sourceLabels: [__meta_kubernetes_pod_name]
          separator: ;
          regex: (.*)
          targetLabel: kubernetes_pod_name
          replacement: \$1
          action: replace
        - sourceLabels: [__meta_kubernetes_pod_node_name]
          separator: ;
          regex: (.*)
          targetLabel: node_name
          replacement: \$1
          action: replace
        - sourceLabels: [__meta_kubernetes_pod_host_ip]
          separator: ;
          regex: (.*)
          targetLabel: node_ip
          replacement: \$1
          action: replace
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    role: alert-rules
    app: events.ibm.com
  name: prometheus-k8s-rules
  namespace: ${PROJECT_CP4AIOPS}
spec:
  groups:
    - name: kafka
      rules:
        - alert: AIOpsKafkaRunningOutOfSpace
          expr: round(kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} * 100 / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} < 15, 0.01)
          for: 10s
          labels:
            severity: critical
          annotations:
            summary: 'Kafka is running out of free disk space'
            description: |-
              There are only {{ \$value }} percent available at {{ \$labels.persistentvolumeclaim }} PVC
              See https://www.ibm.com/docs/en/cloud-paks/cloud-pak-aiops/4.6.1?topic=pak-adjusting-pvcs for how to add additional disk space
        - alert: AIOpsKafkaUnderReplicatedPartitions
          expr: kafka_server_replicamanager_underreplicatedpartitions > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka under replicated partitions'
            description: 'There are {{ \$value }} under replicated partitions on {{ \$labels.kubernetes_pod_name }}'
        - alert: AIOpsKafkaAbnormalControllerState
          expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (ibmevents_ibm_com_name) != 1
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka abnormal controller state'
            description: 'There are {{ \$value }} active controllers in the cluster'
        - alert: AIOpsKafkaOfflinePartitions
          expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka offline partitions'
            description: 'One or more partitions have no leader'
        - alert: AIOpsKafkaUnderMinIsrPartitionCount
          expr: kafka_server_replicamanager_underminisrpartitioncount > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka under min ISR partitions'
            description: 'There are {{ \$value }} partitions under the min ISR on {{ \$labels.kubernetes_pod_name }}'
        - alert: AIOpsKafkaOfflineLogDirectoryCount
          expr: kafka_log_logmanager_offlinelogdirectorycount > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka offline log directories'
            description: 'There are {{ \$value }} offline log directories on {{ \$labels.kubernetes_pod_name }}'
        - alert: AIOpsKafkaScrapeProblem
          expr: up{kubernetes_namespace!~"openshift-.+",kubernetes_pod_name=~".+-kafka-[0-9]+"} == 0
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'OpenShift monitoring is unable to gather metrics from {{ \$labels.kubernetes_pod_name }}/{{ \$labels.instance }}'
            description: 'OpenShift monitoring was unable to scrape metrics from {{ \$labels.kubernetes_pod_name }}/{{ \$labels.instance }} for more than 3 minutes'
        - alert: AIOpsKafkaClusterOperatorContainerDown
          expr: count((container_last_seen{container="ibm-events-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="ibm-events-operator"})
          for: 1m
          labels:
            severity: major
          annotations:
            summary: 'The AIOps Kafka cluster operator is down'
            description: 'The AIOps Kafka cluster operator/controller has been down for longer than 90 seconds'
        - alert: AIOpsKafkaBrokerContainersDown
          expr: absent(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'All kafka containers down or in CrashLoopBackOff status'
            description: 'All kafka containers have been down or in CrashLoopBackOff status for 3 minutes'
        - alert: AIOpsKafkaContainerRestartedInTheLast5Minutes
          expr: count(count_over_time(container_last_seen{container="kafka"}[5m])) > 2 * count(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"})
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'One or more Kafka containers restarted too often'
            description: 'One or more Kafka containers were restarted too often within the last 5 minutes'
    - name: zookeeper
      rules:
        - alert: AIOpsZookeeperAvgRequestLatency
          expr: zookeeper_avgrequestlatency > 10
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Zookeeper average request latency'
            description: 'The average request latency is {{ \$value }} on {{ \$labels.kubernetes_pod_name }}'
        - alert: AIOpsZookeeperOutstandingRequests
          expr: zookeeper_outstandingrequests > 10
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Zookeeper outstanding requests'
            description: 'There are {{ \$value }} outstanding requests on {{ \$labels.kubernetes_pod_name }}'
        - alert: AIOpsZookeeperRunningOutOfSpace
          expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data-(.+)-zookeeper-[0-9]+"} < 5368709120
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Zookeeper is running out of free disk space'
            description: 'There are only {{ \$value }} bytes available at {{ \$labels.persistentvolumeclaim }} PVC'
        - alert: AIOpsZookeeperContainerRestartedInTheLast5Minutes
          expr: count(count_over_time(container_last_seen{container="zookeeper"}[5m])) > 2 * count(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"})
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'One or more Zookeeper containers were restarted too often'
            description: 'One or more Zookeeper containers were restarted too often within the last 5 minutes. This alert can be ignored when the Zookeeper cluster is scaling up'
        - alert: AIOpsZookeeperContainersDown
          expr: absent(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'All zookeeper containers in the Zookeeper pods down or in CrashLoopBackOff status'
            description: 'All zookeeper containers in the Zookeeper pods have been down or in CrashLoopBackOff status for 3 minutes'
    - name: entityOperator
      rules:
        - alert: AIOpsKafkaTopicOperatorContainerDown
          expr: absent(container_last_seen{container="topic-operator",pod=~".+-entity-operator-.+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'Container topic-operator in Entity Operator pod down or in CrashLoopBackOff status'
            description: 'Container topic-operator in Entity Operator pod has been down or in CrashLoopBackOff status for 3 minutes'
        - alert: AIOpsKafkaUserOperatorContainerDown
          expr: absent(container_last_seen{container="user-operator",pod=~".+-entity-operator-.+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'Container user-operator in Entity Operator pod down or in CrashLoopBackOff status'
            description: 'Container user-operator in Entity Operator pod has been down or in CrashLoopBackOff status for 3 minutes'
EOF
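After the command completes, you can verify that the monitoring resources were created, for example (a quick check; the fully qualified resource names are used to avoid ambiguity with other installed CRDs):
oc -n ${PROJECT_CP4AIOPS} get podmonitors.monitoring.coreos.com,prometheusrules.monitoring.coreos.com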
-
Example: Red Hat OpenShift Container Platform alert when Kafka is running out of space.