feat(pulumi/examples): function to create cloudwatch alarms

This commit is contained in:
Michele Cereda
2025-02-13 21:57:08 +01:00
parent 18870ca937
commit 685182fb92

View File

@@ -0,0 +1,277 @@
import * as aws from "@pulumi/aws";
import * as pulumi from "@pulumi/pulumi";
/**
 * Creates CloudWatch alarms for an EC2 instance.
 *
 * Refer <https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2>
 * for details on the alarms.
 *
 * Every alarm:
 * - is named `<namePrefix>_<MetricLabel>` in CloudWatch;
 * - gets a Pulumi resource name of `<sanitized namePrefix>-<metricLabel>`, where the sanitized prefix is `namePrefix`
 *   lower-cased and with underscores replaced by dashes;
 * - is tagged with `Controls: "SOC2/CC7.2"` plus `extraTags` (which take precedence on key clashes);
 * - evaluates 300-seconds periods on the `InstanceId` dimension;
 * - notifies the given SNS topic, if any.
 *
 * The alarms are created inside an `apply()` because their resource names depend on `namePrefix`, which may be an
 * Output. As a consequence, every returned alarm is wrapped in an Output.
 *
 * @param ec2Instance The EC2 instance to create alarms for
 * @param snsTopic The SNS topic to send notifications to (no notification is sent when omitted)
 * @param namePrefix The prefix to use for the alarms' name (defaults to the instance's ID)
 * @param [cpuUtilization=true] Whether to create an alarm for the instance's CPUUtilization metric
 * @param [instanceStatusCheck=true] Whether to create an alarm for the instance's instance status check
 * @param [systemStatusCheck=true] Whether to create an alarm for the instance's system status check
 * @param [memUsedPercent=false] Whether to create an alarm for the instance's mem_used_percent metric
 * @param [diskUsedPercent=false] Whether to create an alarm for the instance's disk_used_percent metric
 * @param [extraTags={}] Extra tags to apply to the alarms
 * @param [protectFromDeletion=true] Whether to protect the alarms from deletion
 * @returns An object with one entry per alarm kind; each entry is the created alarm (wrapped in an Output), or
 *          `undefined` when the corresponding flag was `false`
 */
export function createCloudwatchAlarmsForEc2Instance(
    ec2Instance: aws.ec2.Instance | pulumi.Output<aws.ec2.Instance> | pulumi.Output<aws.ec2.GetInstanceResult>,
    snsTopic?: aws.sns.Topic | pulumi.Output<aws.sns.Topic> | pulumi.Output<aws.sns.GetTopicResult>,
    namePrefix: string | pulumi.Output<string> = pulumi.interpolate `${ ec2Instance.id }`,
    cpuUtilization: boolean = true,
    instanceStatusCheck: boolean = true,
    systemStatusCheck: boolean = true,
    memUsedPercent: boolean = false,  // requires CWAgent configured
    diskUsedPercent: boolean = false,  // requires CWAgent configured
    extraTags: { [key: string]: string } = {},
    protectFromDeletion: boolean = true,
) {
    // Normalize the prefix to an Output without reassigning the parameter.
    const alarmNamePrefix = pulumi.interpolate `${ namePrefix }`;

    // Prefix for the Pulumi resource names.
    // The global regex replaces *every* underscore; a plain string pattern would only replace the first one.
    const idPrefix = alarmNamePrefix.apply(
        (prefix: string) => prefix.replace(/_/g, "-").replace(/-?\w/g, match => match.toLowerCase()),
    );

    const ec2InstanceId = pulumi.interpolate `${ ec2Instance.id }`;
    const snsTopicArn = snsTopic !== undefined ? pulumi.interpolate `${ snsTopic.arn }` : undefined;

    // All the values the alarms need as plain strings. Resolved once, shared by every alarm.
    const alarmInputs = pulumi.all([ idPrefix, alarmNamePrefix, ec2InstanceId, snsTopicArn ]);

    /**
     * Creates a single alarm for the instance, or nothing when the alarm is disabled.
     *
     * @param enabled Whether the alarm shall be created at all
     * @param idSuffix Suffix appended (after a dash) to the sanitized prefix to form the Pulumi resource name
     * @param nameSuffix Suffix appended (after an underscore) to the prefix to form the CloudWatch alarm name
     * @param metricArgs The alarm's metric-specific settings
     * @param automatedActions Action ARNs to execute on ALARM in addition to notifying the SNS topic
     */
    const createAlarm = (
        enabled: boolean,
        idSuffix: string,
        nameSuffix: string,
        metricArgs: Omit<aws.cloudwatch.MetricAlarmArgs, "name" | "tags" | "dimensions" | "period" | "alarmActions">,
        automatedActions: readonly string[] = [],
    ): pulumi.Output<aws.cloudwatch.MetricAlarm> | undefined => {
        if (!enabled) {
            return undefined;
        }

        return alarmInputs.apply(
            ([ resourceNamePrefix, resolvedNamePrefix, instanceId, topicArn ]: [ string, string, string, aws.ARN | undefined ]) =>
                new aws.cloudwatch.MetricAlarm(
                    `${ resourceNamePrefix }-${ idSuffix }`,
                    {
                        ...metricArgs,
                        name: `${ resolvedNamePrefix }_${ nameSuffix }`,
                        tags: {
                            Controls: "SOC2/CC7.2",
                            ...extraTags,
                        },
                        dimensions: {
                            InstanceId: instanceId,
                        },
                        period: 300,
                        // The type predicate drops the topic's slot when no topic was given.
                        alarmActions: [
                            topicArn,
                            ...automatedActions,
                        ].filter((action): action is string => action !== undefined),
                    },
                    {
                        protect: protectFromDeletion,
                    },
                ),
        );
    };

    /**
     * Helps monitoring the CPU utilization of an EC2 instance.
     *
     * Depending on the application, consistently high utilization levels might be normal. But if performance is
     * degraded, and the application is not constrained by disk I/O, memory, or network resources, then a maxed-out CPU
     * might indicate a resource bottleneck or application performance problems.
     * High CPU utilization might indicate that an upgrade to a more CPU intensive instance is required.
     *
     * This alarm will trigger when the CPU utilization is > 80% for 3 consecutive times over 15 minutes.
     * It only sends a notification to the given SNS topic (if any was given).
     */
    const cpuUtilizationAlarm = createAlarm(
        cpuUtilization,
        "cpuUtilization",
        "CPUUtilization",
        {
            alarmDescription: "Notify the team when the CPU utilization is > 80% 3 consecutive times over 15 minutes.",
            namespace: "AWS/EC2",
            metricName: "CPUUtilization",
            statistic: "Average",
            comparisonOperator: "GreaterThanThreshold",
            threshold: 80,
            evaluationPeriods: 3,
            datapointsToAlarm: 3,
        },
    );

    /**
     * Helps monitoring an instance's InstanceStatus check.
     *
     * This alarm is used to detect underlying problems with instances.
     * Should this status check fail, this alarm should be in ALARM state.
     *
     * This alarm will trigger when the Instance status check fails 2 consecutive times over 10 minutes.
     * It:
     * - sends a notification to the given SNS topic (if any was given);
     * - tries to automatically restart the instance.
     */
    const instanceStatusCheckAlarm = createAlarm(
        instanceStatusCheck,
        "instanceStatusCheck",
        "InstanceStatusCheck",
        {
            alarmDescription: "Notify the team and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
            namespace: "AWS/EC2",
            metricName: "StatusCheckFailed_Instance",
            statistic: "Maximum",
            unit: "Count",
            comparisonOperator: "GreaterThanOrEqualToThreshold",
            threshold: 1,
            evaluationPeriods: 2,
            datapointsToAlarm: 2,
        },
        [
            // NOTE(review): the region (eu-west-1) and the account ID (012345678901) are hard-coded in this ARN.
            // Adjust them to match the instance's region and account before using this elsewhere.
            "arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",
        ],
    );

    /**
     * Helps monitoring an instance's SystemStatus check.
     *
     * This alarm is used to detect underlying problems with instances.
     * Should this status check fail, this alarm should be in the ALARM state.
     *
     * This alarm will trigger when the System status check fails 2 consecutive times over 10 minutes.
     * It:
     * - sends a notification to the given SNS topic (if any was given);
     * - tries to automatically recover the instance.
     */
    const systemStatusCheckAlarm = createAlarm(
        systemStatusCheck,
        "systemStatusCheck",
        "SystemStatusCheck",
        {
            alarmDescription: "Notify the team and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
            namespace: "AWS/EC2",
            metricName: "StatusCheckFailed_System",
            statistic: "Maximum",
            unit: "Count",
            comparisonOperator: "GreaterThanOrEqualToThreshold",
            threshold: 1,
            evaluationPeriods: 2,
            datapointsToAlarm: 2,
        },
        [
            // NOTE(review): the region (eu-west-1) is hard-coded in this ARN.
            // Adjust it to match the instance's region before using this elsewhere.
            "arn:aws:automate:eu-west-1:ec2:recover",
        ],
    );

    /**
     * Helps monitoring the memory utilization of an EC2 instance.
     *
     * High memory utilization might indicate that an upgrade to a more memory oriented instance is required.
     *
     * This alarm requires the CloudWatch Agent to be configured to send mem_used_percent data.
     * NOTE(review): the alarm matches on the InstanceId dimension alone; presumably the agent must publish the metric
     * with exactly that dimension for the alarm to receive data - confirm against the agent's configuration.
     *
     * This alarm will trigger when the memory utilization is > 85% for 3 consecutive times over 15 minutes.
     * It only sends a notification to the given SNS topic (if any was given).
     */
    const memUsedPercentAlarm = createAlarm(
        memUsedPercent,
        "memUsedPercent",
        "MemUsedPercent",
        {
            alarmDescription: "Notify the team when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
            namespace: "CWAgent",
            metricName: "mem_used_percent",
            statistic: "Average",
            comparisonOperator: "GreaterThanThreshold",
            threshold: 85,
            evaluationPeriods: 3,
            datapointsToAlarm: 3,
        },
    );

    /**
     * Helps monitoring the disk utilization of an EC2 instance.
     *
     * This alarm requires the CloudWatch Agent to be configured to send disk_used_percent data.
     * NOTE(review): the alarm matches on the InstanceId dimension alone; presumably the agent must publish the metric
     * with exactly that dimension for the alarm to receive data - confirm against the agent's configuration.
     *
     * This alarm will trigger when the disk utilization is > 85% for 3 consecutive times over 15 minutes.
     * It only sends a notification to the given SNS topic (if any was given).
     */
    const diskUsedPercentAlarm = createAlarm(
        diskUsedPercent,
        "diskUsedPercent",
        "DiskUsedPercent",
        {
            alarmDescription: "Notify the team when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
            namespace: "CWAgent",
            metricName: "disk_used_percent",
            statistic: "Average",
            comparisonOperator: "GreaterThanThreshold",
            threshold: 85,
            evaluationPeriods: 3,
            datapointsToAlarm: 3,
        },
    );

    return {
        cpuUtilization: cpuUtilizationAlarm,
        instanceStatusCheck: instanceStatusCheckAlarm,
        systemStatusCheck: systemStatusCheckAlarm,
        memUsedPercent: memUsedPercentAlarm,
        diskUsedPercent: diskUsedPercentAlarm,
    } as const;
}