Created
December 20, 2022 08:59
-
-
Save rewanthtammana/d4315690bc80012cc01b1ff4dc88a80f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Prometheus rule unit tests (promtool `test rules` format) for the Loki
# alerts LokiRequestPanics, LokiRequestErrors and LokiRingUnhealthy.
rule_files:
  - loki.all.rules.yml

tests:
  # One sample per minute; every `values` expansion below is in 1m steps.
  - interval: 1m
    input_series:
      # Ring membership: nothing unhealthy for 20 minutes, then 1 member.
      - series: 'cortex_ring_members{container="service", job="zj88t-prometheus/workload-zj88t/0", name="service", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", pod="loki-service-676b8c897b-rq298", provider="aws", service_priority="highest", state="Unhealthy"}'
        values: "0+0x20 1+0x160"  # 1 unhealthy value after 20 minutes
      # Panic counter: flat at 0, then a single panic.
      - series: 'loki_panic_total{app="loki-service", container="service", job="zj88t-prometheus/workload-zj88t/0", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", pod="loki-service-676b8c897b-rq29", provider="aws", service_priority="highest"}'
        values: "0+0x20 1+0x160"  # 1 panic after 20 minutes
      # Successful pushes (status 204): counter grows by 60 per 1m sample.
      - series: 'loki_request_duration_seconds_count{app="loki-distributor", container="distributor", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="204", ws="false"}'
        values: "0+60x180"  # 1 request per second OK for 3 hours
      # Failed pushes (status 503): none at first, then 30 per 1m sample.
      - series: 'loki_request_duration_seconds_count{app="loki-distributor", container="distributor", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}'
        values: "0+0x20 0+30x160"  # After 20 minutes, we also have 0.5 rq/s failing

    alert_rule_test:
      # --- LokiRequestPanics ---
      - alertname: LokiRequestPanics
        eval_time: 15m  # should be OK after 15 minutes
        exp_alerts: []
      - alertname: LokiRequestPanics
        eval_time: 25m  # After 25 minutes, should fire an alert for the t+20 error
        exp_alerts:
          - exp_labels:
              area: services
              # Label values are strings, so boolean-looking values are quoted.
              cancel_if_apiserver_down: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_outside_working_hours: "false"
              cancel_if_scrape_timeout: "true"
              job: zj88t-prometheus/workload-zj88t/0
              namespace: loki
              severity: page
              topic: observability
            exp_annotations:
              description: 'This alert checks that we have no panic errors on Loki.'
      - alertname: LokiRequestPanics
        eval_time: 40m  # After 40 minutes, all should be back to normal
        exp_alerts: []

      # --- LokiRequestErrors ---
      - alertname: LokiRequestErrors
        eval_time: 15m  # should be OK after 15 minutes
        exp_alerts: []
      - alertname: LokiRequestErrors
        eval_time: 160m  # Alert after more than 120m of incident
        exp_alerts:
          - exp_labels:
              area: services
              cancel_if_apiserver_down: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_outside_working_hours: "false"
              cancel_if_scrape_timeout: "true"
              job: zj88t-prometheus/workload-zj88t/0
              namespace: loki
              route: loki_api_v1_push
              severity: page
              topic: observability
            exp_annotations:
              description: 'This alert checks that we have less than 10% errors on Loki requests.'

      # --- LokiRingUnhealthy ---
      - alertname: LokiRingUnhealthy
        eval_time: 15m  # should be OK after 15 minutes
        exp_alerts: []
      - alertname: LokiRingUnhealthy
        eval_time: 25m  # after 25 minutes we have an unhealthy member, but we want to filter too short events. So no alert yet.
        exp_alerts: []
      - alertname: LokiRingUnhealthy
        eval_time: 40m  # now the event has been there for 20 minutes, we should have an alert.
        exp_alerts:
          - exp_labels:
              # NOTE(review): the cortex_ring_members input series above has
              # no `app` label; confirm that the rule in loki.all.rules.yml
              # supplies it, otherwise this expectation cannot match.
              app: loki-service
              cancel_if_apiserver_down: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_scrape_timeout: "true"
              cancel_if_outside_working_hours: "true"
              container: service
              name: service
              namespace: loki
              # Same pod as the cortex_ring_members input series and as the
              # expected description below (was "...-rq29", which matched
              # neither).
              pod: loki-service-676b8c897b-rq298
              severity: page
              topic: observability
            exp_annotations:
              description: "Loki pod loki-service-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment