mirror of
https://gitea.com/mcereda/oam.git
synced 2026-02-08 21:34:25 +00:00
feat(pulumi/examples): function to create cloudwatch alarms
This commit is contained in:
277
examples/pulumi/aws/create alarms for ec2 instances.ts
Normal file
277
examples/pulumi/aws/create alarms for ec2 instances.ts
Normal file
@@ -0,0 +1,277 @@
|
||||
import * as aws from "@pulumi/aws";
|
||||
import * as pulumi from "@pulumi/pulumi";
|
||||
|
||||
/**
|
||||
* Creates CloudWatch alarms for EC2 instances.
|
||||
*
|
||||
* Refer <https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2>
|
||||
* for details on the alarms.
|
||||
*
|
||||
* @param instance The EC2 instance to create alarms for
|
||||
* @param topic The SNS topic to send notifications to
|
||||
* @param namePrefix The prefix to use for the alarms' name (defaults to the instance's ID)
|
||||
* @param [cpuUtilization=true] Whether to create an alarm for the instance's CPUUtilization metric
|
||||
* @param [instanceStatusCheck=true] Whether to create an alarm for the instance's instance status check
|
||||
* @param [systemStatusCheck=true] Whether to create an alarm for the instance's system status check
|
||||
* @param [memUsedPercent=false] Whether to create an alarm for the instance's mem_used_percent metric
|
||||
* @param [diskUsedPercent=false] Whether to create an alarm for the instance's disk_used_percent metric
|
||||
* @param [extraTags={}] Extra tags to apply to the alarms
|
||||
* @param [protectFromDeletion=true] Whether to protect the alarms from deletion
|
||||
* @returns An object with the alarms that were created
|
||||
*/
|
||||
export function createCloudwatchAlarmsForEc2Instance(
|
||||
ec2Instance: aws.ec2.Instance | pulumi.Output<aws.ec2.Instance> | pulumi.Output<aws.ec2.GetInstanceResult>,
|
||||
snsTopic?: aws.sns.Topic | pulumi.Output<aws.sns.Topic> | pulumi.Output<aws.sns.GetTopicResult>,
|
||||
namePrefix: string | pulumi.Output<string> = pulumi.interpolate `${ ec2Instance.id }`,
|
||||
cpuUtilization: boolean = true,
|
||||
instanceStatusCheck: boolean = true,
|
||||
systemStatusCheck: boolean = true,
|
||||
memUsedPercent: boolean = false, // requires CWAgent configured
|
||||
diskUsedPercent: boolean = false, // requires CWAgent configured
|
||||
extraTags: { [key: string]: string } = {},
|
||||
protectFromDeletion: boolean = true,
|
||||
) {
|
||||
namePrefix = pulumi.interpolate `${ namePrefix }`;
|
||||
const idPrefix = pulumi.interpolate `${ namePrefix.apply((s: string) => s.replace('_','-').replace(/-?\w/g, match => match.toLowerCase()))}`;
|
||||
const ec2InstanceId = pulumi.interpolate `${ ec2Instance.id }`;
|
||||
const snsTopicArn = snsTopic !== undefined ? pulumi.interpolate `${ snsTopic.arn }` : undefined;
|
||||
|
||||
/**
|
||||
* Helps monitoring the CPU utilization of an EC2 instance.
|
||||
*
|
||||
* Depending on the application, consistently high utilization levels might be normal. But if performance is
|
||||
* degraded, and the application is not constrained by disk I/O, memory, or network resources, then a maxed-out CPU
|
||||
* might indicate a resource bottleneck or application performance problems.
|
||||
* High CPU utilization might indicate that an upgrade to a more CPU intensive instance is required.
|
||||
* If detailed monitoring is enabled, one can change the period to 60 seconds instead of 300 seconds.
|
||||
*
|
||||
* This alarm will trigger when the CPU utilization is > 80% for 3 consecutive times over 15 minutes.
|
||||
* It only sends a notification to the given SNS topics (if any was given).
|
||||
*/
|
||||
const cpuUtilization_cloudwatchMetricAlarm = cpuUtilization ? pulumi
|
||||
.all([ idPrefix, namePrefix, ec2InstanceId, snsTopicArn ])
|
||||
.apply(
|
||||
([ idPrefix, namePrefix, instanceId, topicArn ]: [ string, string, string, aws.ARN|undefined ]) =>
|
||||
new aws.cloudwatch.MetricAlarm(
|
||||
`${ idPrefix }-cpuUtilization`,
|
||||
{
|
||||
name: `${ namePrefix }_CPUUtilization`,
|
||||
alarmDescription: "Notify the team when the CPU utilization is > 80% 3 consecutive times over 15 minutes.",
|
||||
tags: {
|
||||
Controls: "SOC2/CC7.2",
|
||||
...extraTags,
|
||||
},
|
||||
|
||||
namespace: "AWS/EC2",
|
||||
dimensions: {
|
||||
InstanceId: instanceId,
|
||||
},
|
||||
metricName: "CPUUtilization",
|
||||
statistic: "Average",
|
||||
comparisonOperator: "GreaterThanThreshold",
|
||||
threshold: 80,
|
||||
period: 300,
|
||||
evaluationPeriods: 3,
|
||||
datapointsToAlarm: 3,
|
||||
alarmActions: [
|
||||
topicArn,
|
||||
].filter(item => item !== undefined),
|
||||
},
|
||||
{
|
||||
protect: protectFromDeletion,
|
||||
},
|
||||
),
|
||||
) : undefined;
|
||||
|
||||
/**
|
||||
* Helps monitoring an instance's InstanceStatus check.
|
||||
*
|
||||
* This alarm is used to detect underlying problems with instances.
|
||||
* Should this status check fail, this alarm should be in ALARM state.
|
||||
*
|
||||
* This alarm will trigger when the Instance status check fails 2 consecutive times over 10 minutes.
|
||||
* It:
|
||||
* - sends a notification to the given SNS topics (if any was given);
|
||||
* - tries to automatically restart the instance.
|
||||
*/
|
||||
const instanceStatusCheck_cloudwatchMetricAlarm = instanceStatusCheck ? pulumi
|
||||
.all([ idPrefix, namePrefix, ec2InstanceId, snsTopicArn ])
|
||||
.apply(
|
||||
([ idPrefix, namePrefix, instanceId, topicArn ]: [ string, string, string, aws.ARN|undefined ]) =>
|
||||
new aws.cloudwatch.MetricAlarm(
|
||||
`${ idPrefix }-instanceStatusCheck`,
|
||||
{
|
||||
name: `${ namePrefix ?? ec2InstanceId }_InstanceStatusCheck`,
|
||||
alarmDescription: "Notify the team and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
|
||||
tags: {
|
||||
Controls: "SOC2/CC7.2",
|
||||
...extraTags,
|
||||
},
|
||||
|
||||
namespace: "AWS/EC2",
|
||||
dimensions: {
|
||||
InstanceId: instanceId,
|
||||
},
|
||||
metricName: "StatusCheckFailed_Instance",
|
||||
statistic: "Maximum",
|
||||
unit: "Count",
|
||||
comparisonOperator: "GreaterThanOrEqualToThreshold",
|
||||
threshold: 1,
|
||||
period: 300,
|
||||
evaluationPeriods: 2,
|
||||
datapointsToAlarm: 2,
|
||||
alarmActions: [
|
||||
topicArn,
|
||||
"arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",
|
||||
].filter(item => item !== undefined),
|
||||
},
|
||||
{
|
||||
protect: protectFromDeletion,
|
||||
},
|
||||
),
|
||||
) : undefined;
|
||||
|
||||
/**
|
||||
* Helps monitoring an instance's SystemStatus check.
|
||||
*
|
||||
* This alarm is used to detect underlying problems with instances.
|
||||
* Should this status check fail, this alarm should be in the ALARM state.
|
||||
*
|
||||
* This alarm will trigger when the System status check fails 2 consecutive times over 10 minutes.
|
||||
* It:
|
||||
* - sends a notification to the given SNS topics (if any was given);
|
||||
* - tries to automatically recover the instance.
|
||||
*/
|
||||
const systemStatusCheck_cloudwatchMetricAlarm = systemStatusCheck ? pulumi
|
||||
.all([ idPrefix, namePrefix, ec2InstanceId, snsTopicArn ])
|
||||
.apply(
|
||||
([ idPrefix, namePrefix, instanceId, topicArn ]: [ string, string, string, aws.ARN|undefined ]) =>
|
||||
new aws.cloudwatch.MetricAlarm(
|
||||
`${ idPrefix }-systemStatusCheck`,
|
||||
{
|
||||
name: `${ namePrefix ?? ec2InstanceId }_SystemStatusCheck`,
|
||||
alarmDescription: "Notify the team and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
|
||||
tags: {
|
||||
Controls: "SOC2/CC7.2",
|
||||
...extraTags,
|
||||
},
|
||||
|
||||
namespace: "AWS/EC2",
|
||||
dimensions: {
|
||||
InstanceId: instanceId,
|
||||
},
|
||||
metricName: "StatusCheckFailed_System",
|
||||
statistic: "Maximum",
|
||||
unit: "Count",
|
||||
comparisonOperator: "GreaterThanOrEqualToThreshold",
|
||||
threshold: 1,
|
||||
period: 300,
|
||||
evaluationPeriods: 2,
|
||||
datapointsToAlarm: 2,
|
||||
alarmActions: [
|
||||
topicArn,
|
||||
"arn:aws:automate:eu-west-1:ec2:recover",
|
||||
].filter(item => item !== undefined),
|
||||
},
|
||||
{
|
||||
protect: protectFromDeletion,
|
||||
},
|
||||
),
|
||||
) : undefined;
|
||||
|
||||
/**
|
||||
* Helps monitoring the memory utilization of an EC2 instance.
|
||||
*
|
||||
* High memory utilization might indicate that an upgrade to a more memory oriented instance is required.
|
||||
*
|
||||
* This alarm requires the CloudWatch Agent to be configured to send mem_used_percent data.
|
||||
*
|
||||
* This alarm will trigger when the memory utilization is > 85% for 3 consecutive times over 15 minutes.
|
||||
* It only sends a notification to the given SNS topics (if any was given).
|
||||
*/
|
||||
const memUsedPercent_cloudwatchMetricAlarm = memUsedPercent ? pulumi
|
||||
.all([ idPrefix, namePrefix, ec2InstanceId, snsTopicArn ])
|
||||
.apply(
|
||||
([ idPrefix, namePrefix, instanceId, topicArn ]: [ string, string, string, aws.ARN|undefined ]) =>
|
||||
new aws.cloudwatch.MetricAlarm(
|
||||
`${ idPrefix }-memUsedPercent`,
|
||||
{
|
||||
name: `${ namePrefix ?? ec2InstanceId }_MemUsedPercent`,
|
||||
alarmDescription: "Notify the team when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
|
||||
tags: {
|
||||
Controls: "SOC2/CC7.2",
|
||||
...extraTags,
|
||||
},
|
||||
|
||||
namespace: "CWAgent",
|
||||
dimensions: {
|
||||
InstanceId: instanceId,
|
||||
},
|
||||
metricName: "mem_used_percent",
|
||||
statistic: "Average",
|
||||
comparisonOperator: "GreaterThanThreshold",
|
||||
threshold: 85,
|
||||
period: 300,
|
||||
evaluationPeriods: 3,
|
||||
datapointsToAlarm: 3,
|
||||
alarmActions: [
|
||||
topicArn,
|
||||
].filter(item => item !== undefined),
|
||||
},
|
||||
{
|
||||
protect: protectFromDeletion,
|
||||
},
|
||||
),
|
||||
) : undefined;
|
||||
|
||||
/**
|
||||
* Helps monitoring the disk utilization of an EC2 instance.
|
||||
*
|
||||
* This alarm requires the CloudWatch Agent to be configured to send disk_used_percent data.
|
||||
*
|
||||
* This alarm will trigger when the disk utilization is > 85% for 3 consecutive times over 15 minutes.
|
||||
* It only sends a notification to the given SNS topics (if any was given).
|
||||
*/
|
||||
const diskUsedPercent_cloudwatchMetricAlarm = diskUsedPercent ? pulumi
|
||||
.all([ idPrefix, namePrefix, ec2InstanceId, snsTopicArn ])
|
||||
.apply(
|
||||
([ idPrefix, namePrefix, instanceId, topicArn ]: [ string, string, string, aws.ARN|undefined ]) =>
|
||||
new aws.cloudwatch.MetricAlarm(
|
||||
`${ idPrefix }-diskUsedPercent`,
|
||||
{
|
||||
name: `${ namePrefix ?? ec2InstanceId }_DiskUsedPercent`,
|
||||
alarmDescription: "Notify the team when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
|
||||
tags: {
|
||||
Controls: "SOC2/CC7.2",
|
||||
...extraTags,
|
||||
},
|
||||
|
||||
namespace: "CWAgent",
|
||||
dimensions: {
|
||||
InstanceId: instanceId,
|
||||
},
|
||||
metricName: "disk_used_percent",
|
||||
statistic: "Average",
|
||||
comparisonOperator: "GreaterThanThreshold",
|
||||
threshold: 85,
|
||||
period: 300,
|
||||
evaluationPeriods: 3,
|
||||
datapointsToAlarm: 3,
|
||||
alarmActions: [
|
||||
topicArn,
|
||||
].filter(item => item !== undefined),
|
||||
},
|
||||
{
|
||||
protect: protectFromDeletion,
|
||||
},
|
||||
),
|
||||
) : undefined;
|
||||
|
||||
return {
|
||||
cpuUtilization: cpuUtilization_cloudwatchMetricAlarm,
|
||||
instanceStatusCheck: instanceStatusCheck_cloudwatchMetricAlarm,
|
||||
systemStatusCheck: systemStatusCheck_cloudwatchMetricAlarm,
|
||||
memUsedPercent: memUsedPercent_cloudwatchMetricAlarm,
|
||||
diskUsedPercent: diskUsedPercent_cloudwatchMetricAlarm,
|
||||
} as const;
|
||||
};
|
||||
Reference in New Issue
Block a user