chore(aws,pulumi): improve alarms definition example

2026-02-09 05:44:23 +00:00 · 2025-02-28 17:27:01 +03:00
parent 1d9dc5e367
commit 13e24473e9
1 changed files with 170 additions and 119 deletions
--- a/snippets/pulumi/aws/notify
+++ b/snippets/pulumi/aws/notify
@@ -1,3 +1,4 @@
+import * as pulumi from "@pulumi/pulumi";
 import * as aws from "@pulumi/aws";

 const instance_output = aws.ec2.getInstanceOutput({
@@ -68,137 +69,187 @@ new aws.chatbot.SlackChannelConfiguration(
    },
 );

-instance_output.id.apply( ( instanceId: string ) => {
-    // refer https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2
+pulumi.
+    all([ instance_output.id, instance_output.ami, instance_output.instanceType ])
+    .apply( ([ instanceId, instanceAmi, instanceType ]: [ string, string, string ] ) => {
+        // refer https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2

-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}_systemStatus`,
-        {
-            name: `${instanceId}_SystemStatus`,
-            alarmDescription: "Notify on Slack and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
+        new aws.cloudwatch.MetricAlarm(  // alarm on the EC2 system status check: notify and recover
+            `${instanceId}_systemStatus`,  // Pulumi logical resource name
+            {
+                name: `${instanceId}_SystemStatus`,  // alarm name as shown in CloudWatch
+                alarmDescription: "Notify on Slack and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",

-            namespace: "AWS/EC2",
-            dimensions: {
-                InstanceId: instanceId,
+                namespace: "AWS/EC2",
+                dimensions: {
+                    InstanceId: instanceId,
+                },
+                metricName: "StatusCheckFailed_System",
+                statistic: "Maximum",
+                unit: "Count",
+                comparisonOperator: "GreaterThanOrEqualToThreshold",
+                threshold: 1,
+                period: 300,  // seconds; 2 periods x 5 minutes = the 10 minutes stated in alarmDescription
+                evaluationPeriods: 2,
+                datapointsToAlarm: 2,  // both evaluated periods must breach, i.e. 2 consecutive failures
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                    "arn:aws:automate:eu-west-1:ec2:recover",  // region is hard-coded; presumably must match the instance's region
+                ],
+                okActions: [  // also notify when the alarm returns to OK
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "StatusCheckFailed_System",
-            statistic: "Maximum",
-            unit: "Count",
-            comparisonOperator: "GreaterThanOrEqualToThreshold",
-            threshold: 1,
-            period: 300,
-            evaluationPeriods: 2,
-            datapointsToAlarm: 2,
-            alarmActions: [
-                notifications_snsTopic.arn,
-                "arn:aws:automate:eu-west-1:ec2:recover",
-            ],
-        },
-    );
+        );

-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}_instanceStatus`,
-        {
-            name: `${instanceId}_InstanceStatus`,
-            alarmDescription: "Notify on Slack and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
+        new aws.cloudwatch.MetricAlarm(  // alarm on the EC2 instance status check: notify and reboot
+            `${instanceId}_instanceStatus`,  // Pulumi logical resource name
+            {
+                name: `${instanceId}_InstanceStatus`,  // alarm name as shown in CloudWatch
+                alarmDescription: "Notify on Slack and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",

-            namespace: "AWS/EC2",
-            dimensions: {
-                InstanceId: instanceId,
+                namespace: "AWS/EC2",
+                dimensions: {
+                    InstanceId: instanceId,
+                },
+                metricName: "StatusCheckFailed_Instance",
+                statistic: "Maximum",
+                unit: "Count",
+                comparisonOperator: "GreaterThanOrEqualToThreshold",
+                threshold: 1,
+                period: 300,  // seconds; 2 periods x 5 minutes = the 10 minutes stated in alarmDescription
+                evaluationPeriods: 2,
+                datapointsToAlarm: 2,  // both evaluated periods must breach, i.e. 2 consecutive failures
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                    "arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",  // NOTE(review): account ID looks like a placeholder; region is hard-coded
+                ],
+                okActions: [  // also notify when the alarm returns to OK
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "StatusCheckFailed_Instance",
-            statistic: "Maximum",
-            unit: "Count",
-            comparisonOperator: "GreaterThanOrEqualToThreshold",
-            threshold: 1,
-            period: 300,
-            evaluationPeriods: 2,
-            datapointsToAlarm: 2,
-            alarmActions: [
-                notifications_snsTopic.arn,
-                "arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",
-            ],
-        },
-    );
+        );

-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}-cpuUtilization`,
-        {
-            name: `${instanceId}_CPUUtilization`,
-            alarmDescription: "Notify on Slack when the CPU utilization is above 80% 3 consecutive times over 15 minutes.",
-            tags: {
-                Controls: "SOC2/CC7.2",
+        new aws.cloudwatch.MetricAlarm(  // alarm on sustained high CPU utilization: notify only
+            `${instanceId}-cpuUtilization`,  // Pulumi logical resource name
+            {
+                name: `${instanceId}_CPUUtilization`,  // alarm name as shown in CloudWatch
+                alarmDescription: "Notify on Slack when the CPU utilization is above 80% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",
+                },
+
+                namespace: "AWS/EC2",
+                dimensions: {
+                    InstanceId: instanceId
+                },
+                metricName: "CPUUtilization",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold:  80,  // percent, per alarmDescription
+                period: 300,  // seconds; 3 periods x 5 minutes = the 15 minutes stated in alarmDescription
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,  // all 3 evaluated periods must breach
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [  // also notify when the alarm returns to OK
+                    notifications_snsTopic.arn,
+                ],
             },
+        );

-            namespace: "AWS/EC2",
-            dimensions: {
-                InstanceId: instanceId
+        // Requires the host to have the CloudWatch agent installed and configured to send 'mem_used_percent' metrics.
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}-memUsedPercent`,  // Pulumi logical resource name
+            {
+                name: `${instanceId}_MemUsedPercent`,  // alarm name as shown in CloudWatch
+                alarmDescription: "Notify on Slack when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",  // FIXME
+                },
+
+                namespace: "CWAgent",
+                dimensions: {
+                    InstanceId: instanceId,
+                },
+                metricName: "mem_used_percent",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold: 85,  // percent, per alarmDescription
+                period: 300,  // seconds; 3 periods x 5 minutes = the 15 minutes stated in alarmDescription
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,  // all 3 evaluated periods must breach
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [  // also notify when the alarm returns to OK
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "CPUUtilization",
-            statistic: "Average",
-            comparisonOperator: "GreaterThanThreshold",
-            threshold:  80,
-            period: 300,
-            evaluationPeriods: 3,
-            datapointsToAlarm: 3,
-            alarmActions: [
-                notifications_snsTopic.arn,
-            ],
-        },
-    );
+        );

-    // requires the host to have the cloudwatch agent installed and configured to send 'mem_used_percent' metrics
-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}-memUsedPercent`,
-        {
-            name: `${instanceId}_MemUsedPercent`,
-            alarmDescription: "Notify on Slack when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
-            tags: {
-                Controls: "SOC2/CC7.2",  // FIXME
+        // Requires the host to have the CloudWatch agent installed and configured to send 'disk_used_percent' metrics.
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}-diskUsedPercent`,  // Pulumi logical resource name
+            {
+                name: `${instanceId}_DiskUsedPercent`,  // alarm name as shown in CloudWatch
+                alarmDescription: "Notify on Slack when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",  // FIXME
+                },
+
+                namespace: "CWAgent",
+                dimensions: {
+                    InstanceId: instanceId,  // NOTE(review): the agent may publish this metric only with extra dimensions (device, fstype, path); confirm an InstanceId-only metric exists
+                },
+                metricName: "disk_used_percent",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold: 85,  // percent, per alarmDescription
+                period: 300,  // seconds; 3 periods x 5 minutes = the 15 minutes stated in alarmDescription
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,  // all 3 evaluated periods must breach
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [  // also notify when the alarm returns to OK
+                    notifications_snsTopic.arn,
+                ],
             },
+        );
+        new aws.cloudwatch.MetricAlarm(  // root-disk variant of the disk usage alarm; needs the CloudWatch agent to send 'disk_used_percent'
+            `${instanceId}-diskUsedPercent-rootDisk`,  // logical name must be unique per resource type; it previously duplicated the generic disk alarm's name
+            {
+                name: `${instanceId}_DiskUsedPercent_rootDisk`,  // alarm name as shown in CloudWatch
+                alarmDescription: "Notify on Slack when the root disk utilization is > 85% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",  // FIXME
+                },

-            namespace: "CWAgent",
-            dimensions: {
-                InstanceId: instanceId,
+                namespace: "CWAgent",
+                dimensions: {  // presumably must match the dimension set the agent publishes for this metric; confirm against the agent config
+                    InstanceId: instanceId,
+                    ImageId: instanceAmi,
+                    InstanceType: instanceType,
+                    device: "nvme0n1p1",  // host-specific values: adjust device, fstype and path to the instance's root volume
+                    fstype: "xfs",
+                    path: "/",
+                },
+                metricName: "disk_used_percent",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold: 85,  // percent, per alarmDescription
+                period: 300,  // seconds; 3 periods x 5 minutes = the 15 minutes stated in alarmDescription
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,  // all 3 evaluated periods must breach
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [  // also notify when the alarm returns to OK
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "mem_used_percent",
-            statistic: "Average",
-            comparisonOperator: "GreaterThanThreshold",
-            threshold: 85,
-            period: 300,
-            evaluationPeriods: 3,
-            datapointsToAlarm: 3,
-            alarmActions: [
-                notifications_snsTopic.arn,
-            ],
-        },
-    );
+        );

-    // requires the host to have the cloudwatch agent installed and configured to send 'disk_used_percent' metrics
-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}-diskUsedPercent`,
-        {
-            name: `${instanceId}_DiskUsedPercent`,
-            alarmDescription: "Notify on Slack when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
-            tags: {
-                Controls: "SOC2/CC7.2",  // FIXME
-            },
-
-            namespace: "CWAgent",
-            dimensions: {
-                InstanceId: instanceId,
-            },
-            metricName: "disk_used_percent",
-            statistic: "Average",
-            comparisonOperator: "GreaterThanThreshold",
-            threshold: 85,
-            period: 300,
-            evaluationPeriods: 3,
-            datapointsToAlarm: 3,
-            alarmActions: [
-                notifications_snsTopic.arn,
-            ],
-        },
-    );
-
-});
+    });