From 279bed060aed53197819e6b47527b7cfecdeb158 Mon Sep 17 00:00:00 2001
From: "lucas.mathieu"
Date: Tue, 30 Dec 2025 00:12:30 +0100
Subject: [PATCH] feat(monitoring): add cadvisor alerts

---
 .../prometheus/alerts/google-cadvisor.yml     | 100 +++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 monitoring/prometheus/alerts/google-cadvisor.yml

diff --git a/monitoring/prometheus/alerts/google-cadvisor.yml b/monitoring/prometheus/alerts/google-cadvisor.yml
new file mode 100644
index 0000000..c427a55
--- /dev/null
+++ b/monitoring/prometheus/alerts/google-cadvisor.yml
@@ -0,0 +1,100 @@
+groups:
+
+# Alerting rules built on cAdvisor container metrics: CPU, memory, filesystem inodes and CFS throttling.
+- name: GoogleCadvisor
+
+
+  rules:
+
+    # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
+    # - alert: ContainerKilled
+    #   expr: 'time() - container_last_seen > 60'
+    #   for: 0m
+    #   labels:
+    #     severity: warning
+    #   annotations:
+    #     summary: Container killed (instance {{ $labels.instance }})
+    #     description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
+    # - alert: ContainerAbsent
+    #   expr: 'absent(container_last_seen)'
+    #   for: 5m
+    #   labels:
+    #     severity: warning
+    #   annotations:
+    #     summary: Container absent (instance {{ $labels.instance }})
+    #     description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # CPU seconds used per second over 5m, as a percentage of the CPU limit (quota / period).
+    # The aggregation keeps only the pod and container labels, so the summary templates those.
+    - alert: ContainerHighCpuUtilization
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Container High CPU utilization (pod {{ $labels.pod }}, container {{ $labels.container }})
+        description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Working-set memory as a percentage of the memory limit; series whose limit is not > 0 are filtered out.
+    # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
+    - alert: ContainerHighMemoryUsage
+      expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Container High Memory usage (instance {{ $labels.instance }})
+        description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Measures filesystem inode usage (not bytes), summed per instance.
+    # Both sides use the same name!="" filter so free and total cover the same set of series.
+    - alert: ContainerVolumeUsage
+      expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Container Volume usage (instance {{ $labels.instance }})
+        description: "Container Volume inode usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Fraction of CFS periods over 5m that were throttled; only container, pod and namespace labels survive aggregation.
+    - alert: ContainerHighThrottleRate
+      expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Container high throttle rate (namespace {{ $labels.namespace }}, pod {{ $labels.pod }}, container {{ $labels.container }})
+        description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Compares the current 1m CPU rate with the 1m rate (or else the 5m rate) taken one minute earlier; fires immediately (for: 0m).
+    - alert: ContainerHighLowChangeCpuUsage
+      expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
+      for: 0m
+      labels:
+        severity: info
+      annotations:
+        summary: Container high low change CPU usage (instance {{ $labels.instance }})
+        description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Same ratio as ContainerHighCpuUtilization, alerting when it stays below 20% for 7d; only pod and container labels survive.
+    - alert: ContainerLowCpuUtilization
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
+      for: 7d
+      labels:
+        severity: info
+      annotations:
+        summary: Container Low CPU utilization (pod {{ $labels.pod }}, container {{ $labels.container }})
+        description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Same ratio as ContainerHighMemoryUsage, alerting when it stays below 20% for 7d.
+    - alert: ContainerLowMemoryUsage
+      expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
+      for: 7d
+      labels:
+        severity: info
+      annotations:
+        summary: Container Low Memory usage (instance {{ $labels.instance }})
+        description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"