Skip to content

Commit

Permalink
feat: add default descriptions to alerts (#10)
Browse files Browse the repository at this point in the history
* feat: add default descriptions to alerts
  • Loading branch information
Perttu Savolainen authored Dec 17, 2020
1 parent 4ff10ec commit 9465251
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 20 deletions.
60 changes: 60 additions & 0 deletions src/lib/monitoring/alarmDescriptions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
 * Generic placeholder text for an alarm description.
 *
 * It is a checklist reminding the author what a good description should
 * contain. The lines are joined with `\n` and carry no leading or trailing
 * whitespace.
 */
export const defaultAlarmDescriptionTemplate = [
  'Consider the following:',
  '- Be precise: character limit of 1024 for the alert description',
  '- Actionability: Is it actionable, is the alert even needed?',
  '- Prefer examples: share e.g. CloudWatch Insights queries',
  '- Reusability: If instruction is generic open a PR to mca-cli :)',
].join('\n');

/**
 * Description text for the Lambda errors alarm.
 *
 * Walks the responder through triage: judge criticality, locate the failing
 * requestId with a CloudWatch Insights query, pull that request's logs, and
 * make sure a development ticket exists. `requestIdHere` is a placeholder the
 * reader replaces by hand.
 */
export const lambdaErrorsAlarmDescriptionTemplate = [
  '- Evaluate the criticality of alert:',
  '* Check the amount of errors',
  '* If there are a lot of errors inform the product owner immediately',
  '- Find the requestId of the error with CloudWatch Insights query:',
  'fields @timestamp, @message',
  '| sort @timestamp desc',
  '| filter @message like /ERROR/',
  '- Get the logs for the requestId:',
  'fields @timestamp, @message',
  '| sort @timestamp desc',
  '| filter @requestId = "requestIdHere"',
  '- Check if a development ticket exists of this issue',
  '* If not create one',
].join('\n');

/**
 * Description text for the Lambda duration alarm.
 *
 * Points the responder at the metric history and supplies a CloudWatch
 * Insights query for finding slow invocations. `durationThresholdHere` is a
 * placeholder the reader replaces by hand.
 */
export const lambdaDurationAlarmDescriptionTemplate = [
  '- Check metric history for changes to durations',
  '- Evaluate whether alarm threshold or applications needs to change',
  '- CloudWatch Insights query to find offending durations:',
  'fields @timestamp, @message',
  '| sort @timestamp desc',
  '| filter @duration > durationThresholdHere',
].join('\n');

/**
 * Description text for the Lambda invocations alarm.
 *
 * Points the responder at the metric history and supplies a CloudWatch
 * Insights query that counts `START RequestId:` log lines in 5-minute bins.
 */
export const lambdaInvocationsAlarmDescriptionTemplate = [
  '- Check metric history for changes to invocations',
  '- Evaluate whether alarm threshold or applications needs to change',
  '- CloudWatch Insights query to check the invocation counts:',
  'fields @timestamp, @message',
  '| sort @timestamp desc',
  '| filter @message like /START RequestId:/',
  '| stats count() by bin(5m)',
].join('\n');

/**
 * Description text for the Lambda throttles alarm.
 *
 * A severity checklist: review throttle history, check whether callers retry,
 * escalate to AWS support for a concurrency limit extension when urgent, and
 * check data integrity.
 */
export const lambdaThrottlesAlarmDescriptionTemplate = [
  '- Check metric history for throttles',
  '- Evaluate the severity',
  '* Check how retry logic has been implemented',
  '-> if retry logic is missing, issue is CRITICAL',
  '* If situation is critical and urgent, request',
  'concurrency limit extension from AWS support immediately',
  '* Check whether the issue affects data integrity',
  "* Add retry logic if it's missing!",
].join('\n');

/**
 * Lambda alarm description templates grouped by the metric they describe
 * (errors, duration, invocations, throttles), so callers can look one up as
 * `lambda.<metric>`.
 */
export const lambda = {
errors: lambdaErrorsAlarmDescriptionTemplate,
duration: lambdaDurationAlarmDescriptionTemplate,
invocations: lambdaInvocationsAlarmDescriptionTemplate,
throttles: lambdaThrottlesAlarmDescriptionTemplate,
};
55 changes: 35 additions & 20 deletions src/lib/monitoring/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,17 @@ import {
ConfigCustomDefaults,
ConfigLogGroupAlarms,
} from './types';
import * as descriptions from './alarmDescriptions';
import { Args, AWSItem } from './types';
import diff from './diff';

// Shorthand for ConfigLocals<ConfigMetricAlarms> (both declared in ./types).
// NOTE(review): presumably a map from resource identifier to that resource's
// metric alarm overrides — confirm against ./types.
type AlarmMetricConfig = ConfigLocals<ConfigMetricAlarms>;

/**
 * Baseline values for a `critical` alarm entry: one evaluation period and the
 * generic description template.
 *
 * Meant to be spread as the first member of a `critical` object literal, so
 * any property written after the spread (for example a metric-specific
 * `alarmDescription` or a different `evaluationPeriods`) overrides the value
 * given here.
 */
const defaultGenericCriticalConfig = {
evaluationPeriods: 1,
alarmDescription: descriptions.defaultAlarmDescriptionTemplate,
};

export class ConfigGenerator {
private config: Config;

Expand Down Expand Up @@ -346,8 +352,9 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
alarmDescription: descriptions.lambda.errors,
threshold: 1,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -360,8 +367,9 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
alarmDescription: descriptions.lambda.invocations,
threshold: 1000,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -374,8 +382,9 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
alarmDescription: descriptions.lambda.duration,
threshold: 2000,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -388,8 +397,9 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
alarmDescription: descriptions.lambda.throttles,
threshold: 1,
evaluationPeriods: 1,
},
},
metric: {
Expand Down Expand Up @@ -432,8 +442,8 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 100,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -446,8 +456,8 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 200,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -460,8 +470,8 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 2000,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -474,8 +484,8 @@ export class ConfigGenerator {
autoResolve: false,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 10,
evaluationPeriods: 1,
},
},
metric: {
Expand Down Expand Up @@ -550,8 +560,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 90,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -563,8 +573,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 90,
evaluationPeriods: 1,
},
},
metric: {
Expand Down Expand Up @@ -600,8 +610,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 1,
evaluationPeriods: 1,
},
},
metric: {
Expand Down Expand Up @@ -641,8 +651,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 1,
evaluationPeriods: 1,
},
},
metric: {
Expand Down Expand Up @@ -689,6 +699,7 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 75,
evaluationPeriods: 5,
},
Expand All @@ -703,8 +714,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 1000000000, // 1GB
evaluationPeriods: 1,
comparisonOperator: 'LESS_THAN_THRESHOLD',
},
},
Expand All @@ -718,8 +729,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 25,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -732,8 +743,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 75000000, // 75MB
evaluationPeriods: 1,
comparisonOperator: 'LESS_THAN_THRESHOLD',
},
},
Expand All @@ -747,8 +758,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 1,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -761,6 +772,7 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 2,
evaluationPeriods: 1,
},
Expand All @@ -775,8 +787,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 60,
evaluationPeriods: 1,
},
},
metric: {
Expand Down Expand Up @@ -808,7 +820,7 @@ export class ConfigGenerator {
this.config = {
...this.config,
rdsInstances: rdsInstances.reduce(
(acc, i) => ({ ...acc, [i.DBInstanceIdentifier || '']: {} }),
(acc, instance) => ({ ...acc, [instance.DBInstanceIdentifier || '']: {} }),
{} as AlarmMetricConfig,
),
custom: {
Expand All @@ -832,8 +844,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 1,
evaluationPeriods: 1,
},
},
metric: {
Expand All @@ -845,6 +857,7 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 75,
evaluationPeriods: 5,
},
Expand All @@ -859,6 +872,7 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 75,
evaluationPeriods: 5,
},
Expand All @@ -873,6 +887,7 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 20,
evaluationPeriods: 1,
},
Expand Down Expand Up @@ -929,8 +944,8 @@ export class ConfigGenerator {
enabled: true,
alarm: {
critical: {
...defaultGenericCriticalConfig,
threshold: 10,
evaluationPeriods: 1,
},
},
metric: {
Expand Down

0 comments on commit 9465251

Please sign in to comment.