// Origin: patch "feat(pulumi/examples): function to create cloudwatch alarms",
// file `examples/pulumi/aws/create alarms for ec2 instances.ts`.
import * as aws from "@pulumi/aws";
import * as pulumi from "@pulumi/pulumi";

/** Partition, region and account ID taken from an EC2 instance's ARN. */
interface InstanceLocation {
    partition: string;
    region: string;
    accountId: string;
}

/** Per-alarm settings; name, tags, dimensions and actions are filled in by the alarm factory. */
type AlarmSettings = Omit<aws.cloudwatch.MetricAlarmArgs, "name" | "tags" | "dimensions" | "alarmActions">;

/**
 * Splits an ARN shaped like `arn:<partition>:<service>:<region>:<account-id>:<resource>` into the parts
 * needed to build instance action ARNs.
 *
 * @param arn The ARN of the EC2 instance
 * @returns The partition, region and account ID found in the ARN
 * @throws Error when the ARN does not carry all three parts
 */
function parseInstanceArn(arn: string): InstanceLocation {
    const [scheme, partition, , region, accountId] = arn.split(":");
    if (scheme !== "arn" || !partition || !region || !accountId) {
        throw new Error(`Cannot derive partition, region and account ID from the instance ARN '${arn}'`);
    }
    return { partition, region, accountId };
}

/**
 * Derives the prefix for the alarms' Pulumi resource names from the alarms' name prefix.
 *
 * Replaces EVERY underscore with a hyphen (the previous implementation only replaced the first one) and
 * lowercases ASCII letters.
 */
function toResourceNamePrefix(namePrefix: string): string {
    return namePrefix.replace(/_/g, "-").replace(/[A-Z]/g, (letter) => letter.toLowerCase());
}

/**
 * Creates CloudWatch alarms for an EC2 instance.
 *
 * The alarms' settings and descriptions appear to follow AWS' "Recommended alarms" guidance for Amazon EC2.
 * NOTE(review): the original link to the reference page was lost; confirm and restore it.
 *
 * Every alarm is created inside an `apply` because its resource name depends on values that are only known
 * as outputs (by default, the instance's ID). The result is therefore an `Output` wrapping each alarm.
 *
 * NOTE(review): the type arguments of the `pulumi.Output` parameters were reconstructed from how the values
 * are used (`.id` and `.arn` are read from them); confirm against the callers.
 *
 * @param ec2Instance The EC2 instance to create alarms for
 * @param snsTopic The SNS topic to send notifications to; no notification is configured when omitted
 * @param namePrefix The prefix to use for the alarms' name (defaults to the instance's ID)
 * @param [cpuUtilization=true] Whether to create an alarm for the instance's CPUUtilization metric
 * @param [instanceStatusCheck=true] Whether to create an alarm for the instance's instance status check
 * @param [systemStatusCheck=true] Whether to create an alarm for the instance's system status check
 * @param [memUsedPercent=false] Whether to create an alarm for the instance's mem_used_percent metric
 * @param [diskUsedPercent=false] Whether to create an alarm for the instance's disk_used_percent metric
 * @param [extraTags={}] Extra tags to apply to the alarms; they override the default tags on conflict
 * @param [protectFromDeletion=true] Whether to protect the alarms from deletion
 * @returns An object with the alarms that were created; alarms that were not requested are `undefined`
 */
export function createCloudwatchAlarmsForEc2Instance(
    ec2Instance: aws.ec2.Instance | pulumi.Output<aws.ec2.Instance> | pulumi.Output<aws.ec2.GetInstanceResult>,
    snsTopic?: aws.sns.Topic | pulumi.Output<aws.sns.Topic> | pulumi.Output<aws.sns.GetTopicResult>,
    namePrefix: string | pulumi.Output<string> = pulumi.interpolate`${ec2Instance.id}`,
    cpuUtilization: boolean = true,
    instanceStatusCheck: boolean = true,
    systemStatusCheck: boolean = true,
    memUsedPercent: boolean = false, // requires CWAgent configured
    diskUsedPercent: boolean = false, // requires CWAgent configured
    extraTags: { [key: string]: string } = {},
    protectFromDeletion: boolean = true,
) {
    // Normalize every input to an Output so that all of them can be resolved together.
    const alarmNamePrefix = pulumi.interpolate`${namePrefix}`;
    const resourceNamePrefix = alarmNamePrefix.apply(toResourceNamePrefix);
    const ec2InstanceId = pulumi.interpolate`${ec2Instance.id}`;
    const ec2InstanceArn = pulumi.interpolate`${ec2Instance.arn}`;

    // Always an array, so that no `undefined` entry needs to be filtered out of the alarms' actions.
    const notificationActions: pulumi.Output<string[]> = snsTopic === undefined
        ? pulumi.output<string[]>([])
        : pulumi.interpolate`${snsTopic.arn}`.apply((topicArn) => [topicArn]);

    /**
     * Creates a single alarm for the instance, or nothing when the alarm is not enabled.
     *
     * @param enabled Whether the alarm must be created
     * @param resourceNameSuffix Suffix for the Pulumi resource name (`<resource prefix>-<suffix>`)
     * @param alarmNameSuffix Suffix for the alarm's name in CloudWatch (`<name prefix>_<suffix>`)
     * @param settings Metric and threshold settings specific to the alarm
     * @param instanceActions Builds the actions to execute on the instance, after the notifications
     */
    const createAlarm = (
        enabled: boolean,
        resourceNameSuffix: string,
        alarmNameSuffix: string,
        settings: AlarmSettings,
        instanceActions?: (location: InstanceLocation) => string[],
    ): pulumi.Output<aws.cloudwatch.MetricAlarm> | undefined => {
        if (!enabled) {
            return undefined;
        }

        return pulumi
            .all([resourceNamePrefix, alarmNamePrefix, ec2InstanceId, ec2InstanceArn, notificationActions])
            .apply(([resourcePrefix, alarmPrefix, instanceId, instanceArn, topicActions]) =>
                new aws.cloudwatch.MetricAlarm(
                    `${resourcePrefix}-${resourceNameSuffix}`,
                    {
                        ...settings,
                        name: `${alarmPrefix}_${alarmNameSuffix}`,
                        tags: {
                            Controls: "SOC2/CC7.2",
                            ...extraTags,
                        },
                        dimensions: {
                            InstanceId: instanceId,
                        },
                        alarmActions: [
                            ...topicActions,
                            // The ARN is only parsed when the alarm acts on the instance.
                            ...(instanceActions === undefined ? [] : instanceActions(parseInstanceArn(instanceArn))),
                        ],
                    },
                    {
                        protect: protectFromDeletion,
                    },
                ),
            );
    };

    return {
        /**
         * Helps monitoring the CPU utilization of an EC2 instance.
         *
         * Depending on the application, consistently high utilization levels might be normal. But if
         * performance is degraded, and the application is not constrained by disk I/O, memory, or network
         * resources, then a maxed-out CPU might indicate a resource bottleneck or application performance
         * problems. High CPU utilization might indicate that an upgrade to a more CPU intensive instance is
         * required.
         * If detailed monitoring is enabled, one can change the period to 60 seconds instead of 300 seconds.
         *
         * Triggers when the CPU utilization is > 80% for 3 consecutive times over 15 minutes.
         * It only sends a notification to the given SNS topic (if any was given).
         */
        cpuUtilization: createAlarm(cpuUtilization, "cpuUtilization", "CPUUtilization", {
            alarmDescription: "Notify the team when the CPU utilization is > 80% 3 consecutive times over 15 minutes.",
            namespace: "AWS/EC2",
            metricName: "CPUUtilization",
            statistic: "Average",
            comparisonOperator: "GreaterThanThreshold",
            threshold: 80,
            period: 300,
            evaluationPeriods: 3,
            datapointsToAlarm: 3,
        }),

        /**
         * Helps monitoring an instance's InstanceStatus check.
         *
         * Triggers when the Instance status check fails 2 consecutive times over 10 minutes.
         * It sends a notification to the given SNS topic (if any was given), then tries to automatically
         * reboot the instance. The reboot action is built for the instance's own region and account.
         */
        instanceStatusCheck: createAlarm(
            instanceStatusCheck,
            "instanceStatusCheck",
            "InstanceStatusCheck",
            {
                alarmDescription: "Notify the team and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
                namespace: "AWS/EC2",
                metricName: "StatusCheckFailed_Instance",
                statistic: "Maximum",
                unit: "Count",
                comparisonOperator: "GreaterThanOrEqualToThreshold",
                threshold: 1,
                period: 300,
                evaluationPeriods: 2,
                datapointsToAlarm: 2,
            },
            ({ partition, region, accountId }) => [
                `arn:${partition}:swf:${region}:${accountId}:action/actions/AWS_EC2.InstanceId.Reboot/1.0`,
            ],
        ),

        /**
         * Helps monitoring an instance's SystemStatus check.
         *
         * Triggers when the System status check fails 2 consecutive times over 10 minutes.
         * It sends a notification to the given SNS topic (if any was given), then tries to automatically
         * recover the instance. The recover action is built for the instance's own region.
         */
        systemStatusCheck: createAlarm(
            systemStatusCheck,
            "systemStatusCheck",
            "SystemStatusCheck",
            {
                alarmDescription: "Notify the team and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
                namespace: "AWS/EC2",
                metricName: "StatusCheckFailed_System",
                statistic: "Maximum",
                unit: "Count",
                comparisonOperator: "GreaterThanOrEqualToThreshold",
                threshold: 1,
                period: 300,
                evaluationPeriods: 2,
                datapointsToAlarm: 2,
            },
            ({ partition, region }) => [
                `arn:${partition}:automate:${region}:ec2:recover`,
            ],
        ),

        /**
         * Helps monitoring the memory utilization of an EC2 instance.
         *
         * High memory utilization might indicate that an upgrade to a more memory oriented instance is
         * required.
         * Requires the CloudWatch Agent to be configured to send mem_used_percent data.
         *
         * Triggers when the memory utilization is > 85% for 3 consecutive times over 15 minutes.
         * It only sends a notification to the given SNS topic (if any was given).
         */
        memUsedPercent: createAlarm(memUsedPercent, "memUsedPercent", "MemUsedPercent", {
            alarmDescription: "Notify the team when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
            namespace: "CWAgent",
            metricName: "mem_used_percent",
            statistic: "Average",
            comparisonOperator: "GreaterThanThreshold",
            threshold: 85,
            period: 300,
            evaluationPeriods: 3,
            datapointsToAlarm: 3,
        }),

        /**
         * Helps monitoring the disk utilization of an EC2 instance.
         *
         * Requires the CloudWatch Agent to be configured to send disk_used_percent data.
         *
         * Triggers when the disk utilization is > 85% for 3 consecutive times over 15 minutes.
         * It only sends a notification to the given SNS topic (if any was given).
         */
        diskUsedPercent: createAlarm(diskUsedPercent, "diskUsedPercent", "DiskUsedPercent", {
            alarmDescription: "Notify the team when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
            namespace: "CWAgent",
            metricName: "disk_used_percent",
            statistic: "Average",
            comparisonOperator: "GreaterThanThreshold",
            threshold: 85,
            period: 300,
            evaluationPeriods: 3,
            datapointsToAlarm: 3,
        }),
    } as const;
}