docs/mixin/alerts/alerts.yaml

# Note: Alert thresholds are experimental. Feel free to change them or suggest improvements
# in the Promscale channel on the TimescaleDB Slack.
groups:
- name: promscale-general
  rules:
  # Fires when no scrape target whose job name contains "promscale" reports as
  # healthy. The `== 1` filter makes absent() trigger both when the `up` series
  # is missing entirely and when targets still exist but fail their scrapes
  # (up == 0); a bare absent(up{...}) would stay silent in the latter case.
  - alert: PromscaleDown
    expr: absent(up{job=~".*promscale.*"} == 1)
    labels:
      severity: critical
    annotations:
      summary: Promscale is down.
      # absent() cannot derive labels from a regex matcher, so the resulting
      # series has no `instance`/`job` labels to interpolate here.
      description: "No Promscale target is reporting as up."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDown.md
- name: promscale-ingest
  rules:
  # Warning tier: fires when, per (job, instance, namespace, type), more than 5%
  # of ingest requests over the last 5 minutes returned a 5xx code.
  # No `for:` clause is set, so the alert fires on the first evaluation that
  # crosses the threshold.
  - alert: PromscaleIngestHighErrorRate
    expr: |
      (
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total{code=~"5.."}[5m])
        )
      /
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total[5m])
        )
      ) > 0.05
    labels:
      severity: warning
    annotations:
      summary: High error rate in Promscale ingestion.
      description: "Promscale ingestion is having a {{ $value | humanizePercentage }} error rate."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
  # Critical tier of the ingest error-rate alert: same ratio of 5xx ingest
  # requests to all ingest requests over 5 minutes, with the threshold raised
  # to 10%.
  - alert: PromscaleIngestHighErrorRate
    expr: |
      (
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total{code=~"5.."}[5m])
        )
      /
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_requests_total[5m])
        )
      ) > 0.1
    labels:
      severity: critical
    annotations:
      summary: High error rate in Promscale ingestion.
      description: "Promscale ingestion is having a {{ $value | humanizePercentage }} error rate."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
  # Warning tier: fires when the 90th-percentile ingest duration, per
  # (job, instance, namespace, type), stays above 10 seconds for 5 minutes
  # while ingest requests are actually being observed.
  - alert: PromscaleIngestHighLatency
    # The `> 0` guard is applied to the request-rate operand of `and` (the same
    # shape as the query latency alerts in this file) so that it checks for
    # traffic rather than re-filtering the already-thresholded quantile.
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_ingest_duration_seconds_bucket[5m])
          )
        ) > 10
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Slow Promscale ingestion.
      # humanizeDuration already renders the unit (e.g. "12s"), so no literal
      # "seconds" suffix is appended.
      description: "Slowest 10% of ingestion batch took more than {{ $value | humanizeDuration }} to ingest."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
  # Critical tier: fires when the 90th-percentile ingest duration, per
  # (job, instance, namespace, type), stays above 30 seconds for 5 minutes
  # while ingest requests are actually being observed.
  - alert: PromscaleIngestHighLatency
    # The `> 0` guard is applied to the request-rate operand of `and` (the same
    # shape as the query latency alerts in this file) so that it checks for
    # traffic rather than re-filtering the already-thresholded quantile.
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_ingest_duration_seconds_bucket[5m])
          )
        ) > 30
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_ingest_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Slow Promscale ingestion.
      # humanizeDuration already renders the unit (e.g. "12s"), so no literal
      # "seconds" suffix is appended.
      description: "Slowest 10% of ingestion batch took more than {{ $value | humanizeDuration }} to ingest."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
  # Fires when the per-(job, namespace) rate of duplicate samples, computed
  # over a 15-minute window, stays above zero for 30 minutes. Only series with
  # kind="sample" are considered.
  - alert: PromscaleIngestHighDataDuplication
    expr: |
      sum by (job, namespace) (rate(promscale_ingest_duplicates_total{kind="sample"}[15m])) > 0
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: Duplicate data being inserted.
      description: "More than {{ $value | humanize }} samples/sec are rejected as duplicates by promscale."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighDataDuplication.md
- name: promscale-query
  rules:
  # Warning tier for every handler except /api/v1/query_range: fires when, per
  # (job, instance, namespace, type, handler), more than 5% of non-canceled
  # query requests over the last 5 minutes returned a 5xx code.
  - alert: PromscaleQueryHighErrorRate
    expr: |
      (
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{code=~"5..",handler!="/api/v1/query_range",err!="canceled"}[5m])
        )
      /
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{handler!="/api/v1/query_range",err!="canceled"}[5m])
        )
      ) > 0.05
    labels:
      severity: warning
    annotations:
      summary: High error rate in querying Promscale.
      description: "Evaluating queries via Promscale {{ $labels.handler }} endpoint has {{ $value | humanizePercentage }} error rate."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
  # Warning tier for the /api/v1/query_range handler only: same 5xx ratio over
  # non-canceled requests in a 5-minute window, but with a higher 10% threshold
  # than the one applied to the other handlers.
  - alert: PromscaleQueryHighErrorRate
    expr: |
      (
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{code=~"5..",handler="/api/v1/query_range",err!="canceled"}[5m])
        )
      /
        sum by (job, instance, namespace, type, handler) (
          rate(promscale_query_requests_total{handler="/api/v1/query_range",err!="canceled"}[5m])
        )
      ) > 0.1
    labels:
      severity: warning
    annotations:
      summary: High error rate in querying Promscale.
      description: "Evaluating queries via Promscale {{ $labels.handler }} endpoint has {{ $value | humanizePercentage }} error rate."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
  # Critical tier across all handlers: fires when, per
  # (job, instance, namespace, type), more than 10% of non-canceled query
  # requests over the last 5 minutes returned a 5xx code.
  - alert: PromscaleQueryHighErrorRate
    # Canceled requests are excluded from BOTH numerator and denominator, as in
    # the warning-tier rules of the same name; counting them only in the
    # denominator would dilute the computed error ratio.
    expr: |
      (
        sum by (job, instance, namespace, type) (
          rate(promscale_query_requests_total{code=~"5..",err!="canceled"}[5m])
        )
      /
        sum by (job, instance, namespace, type) (
          rate(promscale_query_requests_total{err!="canceled"}[5m])
        )
      ) > 0.1
    labels:
      severity: critical
    annotations:
      summary: High error rate in querying Promscale.
      description: "Evaluating queries via Promscale had {{ $value | humanizePercentage }} error rate."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
  # Warning tier: fires when the 90th-percentile query duration, per
  # (job, instance, namespace, type), stays above 5 seconds for 5 minutes
  # while the query duration histogram is receiving observations.
  - alert: PromscaleQueryHighLatency
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_query_duration_seconds_bucket[5m])
          )
        ) > 5
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_query_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Slow Promscale querying.
      # humanizeDuration already renders the unit (e.g. "12s"), so no literal
      # "seconds" suffix is appended.
      description: "Slowest 10% of the queries took more than {{ $value | humanizeDuration }} to evaluate."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
  # Critical tier: fires when the 90th-percentile query duration, per
  # (job, instance, namespace, type), stays above 10 seconds for 5 minutes
  # while the query duration histogram is receiving observations.
  - alert: PromscaleQueryHighLatency
    expr: |
      (
        histogram_quantile(
          0.90,
          sum by (job, instance, namespace, type, le) (
            rate(promscale_query_duration_seconds_bucket[5m])
          )
        ) > 10
      and
        sum by (job, instance, namespace, type) (
          rate(promscale_query_duration_seconds_bucket[5m])
        ) > 0
      )
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Slow Promscale querying.
      # humanizeDuration already renders the unit (e.g. "12s"), so no literal
      # "seconds" suffix is appended.
      description: "Slowest 10% of the queries took {{ $value | humanizeDuration }} to evaluate."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
- name: promscale-cache
  rules:
  # Fires when a cache's per-second eviction rate (over 5 minutes) exceeds 20%
  # of that cache's element capacity.
  - alert: PromscaleCacheHighNumberOfEvictions
    expr: |
      (
        rate(promscale_cache_evictions_total[5m])
        /
        promscale_cache_capacity_elements
      ) > 0.2
    labels:
      severity: warning
    annotations:
      summary: High cache eviction in Promscale.
      # $value is the eviction rate divided by capacity (a ratio), not an
      # absolute entries-per-second count, so it is rendered as a percentage.
      description: "Promscale {{ $labels.name }} is evicting {{ $value | humanizePercentage }} of its capacity per second."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCacheHighNumberOfEvictions.md
- name: promscale-database-connection
  rules:
  # Fires when, per (job, instance, namespace, method), database request
  # errors exceed 5% of database requests over the last 5 minutes.
  - alert: PromscaleStorageHighErrorRate
    expr: |
      (
        sum by (job, instance, namespace, method) (
          rate(promscale_database_request_errors_total[5m])
        )
      /
        sum by (job, instance, namespace, method) (
          rate(promscale_database_requests_total[5m])
        )
      ) > 0.05
    labels:
      severity: warning
    annotations:
      summary: Promscale experiences a high error rate when connecting to the database.
      description: "Promscale connection with the database has an error rate of {{ $value | humanizePercentage }}."
      # NOTE(review): the runbook file name (PromscaleDBHighErrorRate) does not
      # match the alert name, unlike the other rules in this file — confirm the
      # link resolves.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDBHighErrorRate.md
  # Fires when the 90th-percentile database request duration, per
  # (job, instance, namespace, method), exceeds 5 seconds over the last
  # 5 minutes while requests are being made. Requests with method="exec" are
  # excluded from both the quantile and the traffic guard.
  - alert: PromscaleStorageHighLatency
    expr: |
      (
        histogram_quantile(0.9,
          sum by (job, instance, namespace, method, le) (
            rate(promscale_database_requests_duration_seconds_bucket{method!="exec"}[5m])
          )
        ) > 5
      and
        sum by (job, instance, namespace, method) (
          rate(promscale_database_requests_duration_seconds_count{method!="exec"}[5m])
        ) > 0
      )
    labels:
      severity: warning
    annotations:
      summary: Slow database response.
      # humanizeDuration already renders the unit (e.g. "12s"), so no literal
      # "seconds" suffix is appended.
      description: "Slowest 10% of database requests with method {{ $labels.method }} are taking more than {{ $value | humanizeDuration }} to respond."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageHighLatency.md
- name: promscale-database
  rules:
  # Fires when, per (job, instance, namespace), database health-check errors
  # exceed 5% of all health checks over the last 5 minutes.
  - alert: PromscaleStorageUnhealthy
    expr: |
      (
        sum by (job, instance, namespace) (
          rate(promscale_sql_database_health_check_errors_total[5m])
        )
      /
        sum by (job, instance, namespace) (
          rate(promscale_sql_database_health_check_total[5m])
        )
      ) > 0.05
    labels:
      severity: warning
    annotations:
      summary: Promscale database is unhealthy.
      description: "Promscale connection with the database has an error of {{ $value | humanizePercentage }}."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageUnhealthy.md
  # Fires when more than one hour (30 * 60 * 2 seconds) has elapsed since the
  # recorded maintenance job start timestamp. The second operand of `and`
  # ignores series whose start timestamp is 0.
  - alert: PromscaleMaintenanceJobRunningTooLong
    expr: |
      (
        (
          (
            time()
          -
            promscale_sql_database_worker_maintenance_job_start_timestamp_seconds
          )
            >
              30 * 60 * 2 # 30 mins (we launch maintenance jobs scheduled at 30 mins) * 60 (to seconds) * 2 (wait max for 2 complete scans before firing alert).
        )
      and
        promscale_sql_database_worker_maintenance_job_start_timestamp_seconds > 0
      )
    labels:
      severity: warning
    annotations:
      summary: Promscale maintenance jobs taking too long to complete.
      # $value is the time elapsed since the job start timestamp; humanizeDuration
      # already renders the unit, so no literal "seconds" suffix is appended.
      description: "Promscale maintenance job has been running for {{ $value | humanizeDuration }}."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
  # Fires when any of four chunk backlogs (metrics uncompressed, metrics
  # expired, traces uncompressed, traces expired) satisfies both conditions:
  #   1. its minimum over the last hour exceeds promscale_sql_database_metric_count, and
  #   2. it increased over the last 10 minutes (delta > 0).
  # The four checks are combined with `or`.
  # NOTE(review): the traces backlogs are also compared against the *metric*
  # count — confirm this is the intended baseline for traces.
  - alert: PromscaleMaintenanceJobNotKeepingup
    expr: |
      (
          (
            min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > promscale_sql_database_metric_count
          )
        and
          (
            delta(promscale_sql_database_chunks_metrics_uncompressed_count[10m]) > 0
          )
      )
      or
      (
          (
            min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > promscale_sql_database_metric_count
          )
        and
          (
            delta(promscale_sql_database_chunks_metrics_expired_count[10m]) > 0
          )
      )
      or
      (
          (
            min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > promscale_sql_database_metric_count
          )
        and
          (
            delta(promscale_sql_database_chunks_traces_uncompressed_count[10m]) > 0
          )
      )
      or
      (
          (
            min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > promscale_sql_database_metric_count
          )
        and
          (
            delta(promscale_sql_database_chunks_traces_expired_count[10m]) > 0
          )
      )
    labels:
      severity: warning
    annotations:
      summary: Promscale maintenance jobs are not keeping up.
      description: "The amount of work for the promscale maintenance {{ $labels.name }} job is not decreasing for long time."
      # NOTE(review): this links to the PromscaleMaintenanceJobRunningTooLong
      # runbook rather than one named after this alert — confirm whether a
      # dedicated runbook exists.
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
  # Fires as soon as the maintenance-job failure metric equals 1 (no `for:`
  # delay is configured).
  - alert: PromscaleMaintenanceJobFailures
    expr: promscale_sql_database_worker_maintenance_job_failed == 1
    labels:
      severity: warning
    annotations:
      summary: Promscale maintenance job failed.
      description: "Maintenance job for Promscale instance {{ $labels.instance }} failed to successfully execute."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobFailures.md
  # Fires when the number of uncompressed chunks (total chunks minus compressed
  # chunks) divided by the metric count exceeds 4, and only for series where
  # promscale_sql_database_compression_status equals 1.
  - alert: PromscaleCompressionLow
    expr: |
      (
        (
          (promscale_sql_database_chunks_count - promscale_sql_database_chunks_compressed_count) # Number of uncompressed chunks.
        /
          promscale_sql_database_metric_count
        ) > 4 # If total number of average uncompressed chunk per metric is more than 4 chunks at maximum, we should alert.
      and
        promscale_sql_database_compression_status == 1
      )
    labels:
      severity: warning
    annotations:
      summary: High uncompressed data.
      description: "High uncompressed data in Promscale, on average, {{ $value }} uncompressed chunks per metric."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCompressionLow.md
  # Fires when the combined table and index size of open chunks stays larger
  # than the shared buffers size (ratio > 1) for 10 minutes.
  - alert: PromscalePostgreSQLSharedBuffersLow
    expr: |
      (
        ((promscale_sql_database_open_chunks_total_table_size + promscale_sql_database_open_chunks_total_index_size)
        /
        promscale_sql_database_shared_buffers_size)
        > 1 )
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: Promscale database performance will be affected.
      description: "Currently open chunks are {{ $value | humanizePercentage }} of PostgreSQL shared_buffers. This will impact database performance."
      runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscalePostgreSQLSharedBuffersLow.md