From 279bed060aed53197819e6b47527b7cfecdeb158 Mon Sep 17 00:00:00 2001
From: "lucas.mathieu" <lmlucas.mathieu@gmail.com>
Date: Tue, 30 Dec 2025 00:12:30 +0100
Subject: [PATCH] feat(monitoring): add cadvisor alerts

---
 .../prometheus/alerts/google-cadvisor.yml     | 90 +++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 monitoring/prometheus/alerts/google-cadvisor.yml

diff --git a/monitoring/prometheus/alerts/google-cadvisor.yml b/monitoring/prometheus/alerts/google-cadvisor.yml
new file mode 100644
index 0000000..c427a55
--- /dev/null
+++ b/monitoring/prometheus/alerts/google-cadvisor.yml
@@ -0,0 +1,90 @@
+# Alerting rules for cAdvisor container metrics (container_* series).
+groups:
+
+- name: GoogleCadvisor
+
+  rules:
+
+    # Disabled: this rule is very noisy in dynamic infrastructure, where containers legitimately start, stop, and get redeployed.
+    # - alert: ContainerKilled
+    #   expr: 'time() - container_last_seen > 60'
+    #   for: 0m
+    #   labels:
+    #     severity: warning
+    #   annotations:
+    #     summary: Container killed (instance {{ $labels.instance }})
+    #     description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Disabled: this rule is very noisy in dynamic infrastructure, where containers legitimately start, stop, and get redeployed.
+    # - alert: ContainerAbsent
+    #   expr: 'absent(container_last_seen)'
+    #   for: 5m
+    #   labels:
+    #     severity: warning
+    #   annotations:
+    #     summary: Container absent (instance {{ $labels.instance }})
+    #     description: "A container is absent for 5 min\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ContainerHighCpuUtilization  # CPU usage rate as a percentage of the CPU quota (quota / period), per container
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (instance, namespace, pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (instance, namespace, pod, container) * 100) > 80'
+      for: 2m  # must stay above 80% for 2 minutes before firing
+      labels:
+        severity: warning
+      annotations:  # expr keeps instance and namespace so the instance template resolves and same-named pods in different namespaces are not merged
+        summary: Container High CPU utilization (instance {{ $labels.instance }})
+        description: "Container CPU utilization is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
+    - alert: ContainerHighMemoryUsage  # working-set bytes as a percentage of the memory limit, per (instance, name)
+      expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
+      for: 2m  # must stay above 80% for 2 minutes before firing
+      labels:
+        severity: warning
+      annotations:  # the "> 0" filter in expr drops limit series that are not positive (presumably containers without a limit - confirm)
+        summary: Container High Memory usage (instance {{ $labels.instance }})
+        description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ContainerVolumeUsage  # used filesystem inodes as a percentage of total inodes, summed per instance
+      expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80'
+      for: 2m  # must stay above 80% for 2 minutes before firing
+      labels:
+        severity: warning
+      annotations:  # free and total inodes use the same name!="" selector so the ratio covers the same set of series
+        summary: Container Volume usage (instance {{ $labels.instance }})
+        description: "Container Volume inode usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ContainerHighThrottleRate  # throttled CFS periods as a fraction of all CFS periods over the last 5m, per container
+      expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (instance, container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{container!=""}[5m])) by (instance, container, pod, namespace) > ( 25 / 100 )'
+      for: 5m  # ratio must stay above 0.25 for 5 minutes before firing
+      labels:
+        severity: warning
+      annotations:  # expr keeps the instance label so the template below resolves; VALUE is a 0-1 ratio, not a percentage
+        summary: Container high throttle rate (instance {{ $labels.instance }})
+        description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ContainerHighLowChangeCpuUsage  # absolute difference between the current 1m CPU rate (x100) and the rate one minute earlier, per (instance, name)
+      expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
+      for: 0m  # no hold time: fires on the first evaluation that matches
+      labels:
+        severity: info
+      annotations:  # NOTE(review): with "or", the 5m-baseline branch only contributes series absent from the 1m-baseline branch - confirm this is intended
+        summary: Container high low change CPU usage (instance {{ $labels.instance }})
+        description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ContainerLowCpuUtilization  # CPU usage rate as a percentage of the CPU quota (quota / period), per container
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (instance, namespace, pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (instance, namespace, pod, container) * 100) < 20'
+      for: 7d  # must stay under 20% for a full week before firing
+      labels:
+        severity: info
+      annotations:  # expr keeps instance and namespace so the instance template resolves and same-named pods in different namespaces are not merged
+        summary: Container Low CPU utilization (instance {{ $labels.instance }})
+        description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ContainerLowMemoryUsage  # working-set bytes as a percentage of the memory limit, per (instance, name)
+      expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
+      for: 7d  # must stay under 20% for a full week before firing
+      labels:
+        severity: info
+      annotations:  # the "> 0" filter in expr drops limit series that are not positive (presumably containers without a limit - confirm)
+        summary: Container Low Memory usage (instance {{ $labels.instance }})
+        description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"