chore(aws,pulumi): improve alarms definition example

This commit is contained in:
Michele Cereda
2025-02-28 17:27:01 +03:00
parent 1d9dc5e367
commit 13e24473e9

View File

@@ -1,3 +1,4 @@
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
const instance_output = aws.ec2.getInstanceOutput({
@@ -68,137 +69,187 @@ new aws.chatbot.SlackChannelConfiguration(
},
);
instance_output.id.apply( ( instanceId: string ) => {
// refer https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2
pulumi.
all([ instance_output.id, instance_output.ami, instance_output.instanceType ])
.apply( ([ instanceId, instanceAmi, instanceType ]: [ string, string, string ] ) => {
// refer https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2
new aws.cloudwatch.MetricAlarm(
`${instanceId}_systemStatus`,
{
name: `${instanceId}_SystemStatus`,
alarmDescription: "Notify on Slack and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
new aws.cloudwatch.MetricAlarm(
`${instanceId}_systemStatus`,
{
name: `${instanceId}_SystemStatus`,
alarmDescription: "Notify on Slack and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
namespace: "AWS/EC2",
dimensions: {
InstanceId: instanceId,
namespace: "AWS/EC2",
dimensions: {
InstanceId: instanceId,
},
metricName: "StatusCheckFailed_System",
statistic: "Maximum",
unit: "Count",
comparisonOperator: "GreaterThanOrEqualToThreshold",
threshold: 1,
period: 300,
evaluationPeriods: 2,
datapointsToAlarm: 2,
alarmActions: [
notifications_snsTopic.arn,
"arn:aws:automate:eu-west-1:ec2:recover",
],
okActions: [
notifications_snsTopic.arn,
],
},
metricName: "StatusCheckFailed_System",
statistic: "Maximum",
unit: "Count",
comparisonOperator: "GreaterThanOrEqualToThreshold",
threshold: 1,
period: 300,
evaluationPeriods: 2,
datapointsToAlarm: 2,
alarmActions: [
notifications_snsTopic.arn,
"arn:aws:automate:eu-west-1:ec2:recover",
],
},
);
);
new aws.cloudwatch.MetricAlarm(
`${instanceId}_instanceStatus`,
{
name: `${instanceId}_InstanceStatus`,
alarmDescription: "Notify on Slack and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
new aws.cloudwatch.MetricAlarm(
`${instanceId}_instanceStatus`,
{
name: `${instanceId}_InstanceStatus`,
alarmDescription: "Notify on Slack and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
namespace: "AWS/EC2",
dimensions: {
InstanceId: instanceId,
namespace: "AWS/EC2",
dimensions: {
InstanceId: instanceId,
},
metricName: "StatusCheckFailed_Instance",
statistic: "Maximum",
unit: "Count",
comparisonOperator: "GreaterThanOrEqualToThreshold",
threshold: 1,
period: 300,
evaluationPeriods: 2,
datapointsToAlarm: 2,
alarmActions: [
notifications_snsTopic.arn,
"arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",
],
okActions: [
notifications_snsTopic.arn,
],
},
metricName: "StatusCheckFailed_Instance",
statistic: "Maximum",
unit: "Count",
comparisonOperator: "GreaterThanOrEqualToThreshold",
threshold: 1,
period: 300,
evaluationPeriods: 2,
datapointsToAlarm: 2,
alarmActions: [
notifications_snsTopic.arn,
"arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",
],
},
);
);
new aws.cloudwatch.MetricAlarm(
`${instanceId}-cpuUtilization`,
{
name: `${instanceId}_CPUUtilization`,
alarmDescription: "Notify on Slack when the CPU utilization is above 80% 3 consecutive times over 15 minutes.",
tags: {
Controls: "SOC2/CC7.2",
// CPU utilization alarm for the instance identified by `instanceId`.
// Evaluates the 5-minute (period: 300) average of AWS/EC2 'CPUUtilization' and goes into ALARM when
// it is strictly above 80 for 3 out of 3 evaluation periods.
// Both the ALARM and the OK transitions are sent to `notifications_snsTopic`.
new aws.cloudwatch.MetricAlarm(
`${instanceId}-cpuUtilization`,
{
name: `${instanceId}_CPUUtilization`,
alarmDescription: "Notify on Slack when the CPU utilization is above 80% 3 consecutive times over 15 minutes.",
tags: {
Controls: "SOC2/CC7.2",
},
namespace: "AWS/EC2",
dimensions: {
InstanceId: instanceId
},
metricName: "CPUUtilization",
statistic: "Average",
comparisonOperator: "GreaterThanThreshold",
threshold: 80,
period: 300,
// 3 periods of 300 seconds each = the 15 minutes mentioned in the description
evaluationPeriods: 3,
datapointsToAlarm: 3,
alarmActions: [
notifications_snsTopic.arn,
],
// also notify when the alarm goes back to OK
okActions: [
notifications_snsTopic.arn,
],
},
);
namespace: "AWS/EC2",
dimensions: {
InstanceId: instanceId
// requires the host to have the cloudwatch agent installed and configured to send 'mem_used_percent' metrics
new aws.cloudwatch.MetricAlarm(
`${instanceId}-memUsedPercent`,
{
name: `${instanceId}_MemUsedPercent`,
alarmDescription: "Notify on Slack when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
tags: {
Controls: "SOC2/CC7.2", // FIXME
},
namespace: "CWAgent",
dimensions: {
InstanceId: instanceId,
},
metricName: "mem_used_percent",
statistic: "Average",
comparisonOperator: "GreaterThanThreshold",
threshold: 85,
period: 300,
evaluationPeriods: 3,
datapointsToAlarm: 3,
alarmActions: [
notifications_snsTopic.arn,
],
okActions: [
notifications_snsTopic.arn,
],
},
metricName: "CPUUtilization",
statistic: "Average",
comparisonOperator: "GreaterThanThreshold",
threshold: 80,
period: 300,
evaluationPeriods: 3,
datapointsToAlarm: 3,
alarmActions: [
notifications_snsTopic.arn,
],
},
);
);
// requires the host to have the cloudwatch agent installed and configured to send 'mem_used_percent' metrics
new aws.cloudwatch.MetricAlarm(
`${instanceId}-memUsedPercent`,
{
name: `${instanceId}_MemUsedPercent`,
alarmDescription: "Notify on Slack when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
tags: {
Controls: "SOC2/CC7.2", // FIXME
// requires the host to have the cloudwatch agent installed and configured to send 'disk_used_percent' metrics
// Disk utilization alarm: evaluates the 5-minute (period: 300) average of CWAgent 'disk_used_percent'
// and goes into ALARM when it is strictly above 85 for 3 out of 3 evaluation periods.
// Both the ALARM and the OK transitions are sent to `notifications_snsTopic`.
// NOTE(review): CloudWatch matches an alarm against the exact dimension set of a metric. Confirm the
// agent publishes 'disk_used_percent' with 'InstanceId' as its only dimension (e.g. through
// 'aggregation_dimensions'); otherwise this alarm may never receive data.
new aws.cloudwatch.MetricAlarm(
`${instanceId}-diskUsedPercent`,
{
name: `${instanceId}_DiskUsedPercent`,
alarmDescription: "Notify on Slack when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
tags: {
Controls: "SOC2/CC7.2", // FIXME
},
namespace: "CWAgent",
dimensions: {
InstanceId: instanceId,
},
metricName: "disk_used_percent",
statistic: "Average",
comparisonOperator: "GreaterThanThreshold",
threshold: 85,
period: 300,
// 3 periods of 300 seconds each = the 15 minutes mentioned in the description
evaluationPeriods: 3,
datapointsToAlarm: 3,
alarmActions: [
notifications_snsTopic.arn,
],
// also notify when the alarm goes back to OK
okActions: [
notifications_snsTopic.arn,
],
},
);
// Root-disk alarm (CloudWatch alarm name `${instanceId}_DiskUsedPercent_rootDisk`).
// The Pulumi resource name carries a '-rootDisk' suffix: another MetricAlarm in this program is
// already registered as `${instanceId}-diskUsedPercent`, and two resources of the same type cannot
// share a name (Pulumi would reject the program with a duplicate URN error).
// NOTE(review): the lines below repeat some keys (namespace, dimensions, metricName, ...) because
// they interleave two revisions of the file; only the resource name was changed here.
new aws.cloudwatch.MetricAlarm(
`${instanceId}-diskUsedPercent-rootDisk`,
{
name: `${instanceId}_DiskUsedPercent_rootDisk`,
alarmDescription: "Notify on Slack when the root disk utilization is > 85% 3 consecutive times over 15 minutes.",
tags: {
Controls: "SOC2/CC7.2", // FIXME
},
namespace: "CWAgent",
dimensions: {
InstanceId: instanceId,
namespace: "CWAgent",
dimensions: {
InstanceId: instanceId,
ImageId: instanceAmi,
InstanceType: instanceType,
device: "nvme0n1p1",
fstype: "xfs",
path: "/",
},
metricName: "disk_used_percent",
statistic: "Average",
comparisonOperator: "GreaterThanThreshold",
threshold: 85,
period: 300,
evaluationPeriods: 3,
datapointsToAlarm: 3,
alarmActions: [
notifications_snsTopic.arn,
],
okActions: [
notifications_snsTopic.arn,
],
},
metricName: "mem_used_percent",
statistic: "Average",
comparisonOperator: "GreaterThanThreshold",
threshold: 85,
period: 300,
evaluationPeriods: 3,
datapointsToAlarm: 3,
alarmActions: [
notifications_snsTopic.arn,
],
},
);
);
// requires the host to have the cloudwatch agent installed and configured to send 'disk_used_percent' metrics
// Disk utilization alarm: evaluates the 5-minute (period: 300) average of CWAgent 'disk_used_percent'
// and goes into ALARM when it is strictly above 85 for 3 out of 3 evaluation periods.
// Only the ALARM transition is sent to `notifications_snsTopic`; no OK action is configured here.
// NOTE(review): CloudWatch matches an alarm against the exact dimension set of a metric. Confirm the
// agent publishes 'disk_used_percent' with 'InstanceId' as its only dimension (e.g. through
// 'aggregation_dimensions'); otherwise this alarm may never receive data.
new aws.cloudwatch.MetricAlarm(
`${instanceId}-diskUsedPercent`,
{
name: `${instanceId}_DiskUsedPercent`,
alarmDescription: "Notify on Slack when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
tags: {
Controls: "SOC2/CC7.2", // FIXME
},
namespace: "CWAgent",
dimensions: {
InstanceId: instanceId,
},
metricName: "disk_used_percent",
statistic: "Average",
comparisonOperator: "GreaterThanThreshold",
threshold: 85,
period: 300,
// 3 periods of 300 seconds each = the 15 minutes mentioned in the description
evaluationPeriods: 3,
datapointsToAlarm: 3,
alarmActions: [
notifications_snsTopic.arn,
],
},
);
});
});