diff --git a/snippets/pulumi/aws/notify slack channel and take action when instance is impaired.ts b/snippets/pulumi/aws/notify slack channel and take action when instance is impaired.ts
index 460d268..f927c76 100644
--- a/snippets/pulumi/aws/notify slack channel and take action when instance is impaired.ts
+++ b/snippets/pulumi/aws/notify slack channel and take action when instance is impaired.ts
@@ -1,3 +1,4 @@
+import * as pulumi from "@pulumi/pulumi";
 import * as aws from "@pulumi/aws";
 
 const instance_output = aws.ec2.getInstanceOutput({
@@ -68,137 +69,189 @@ new aws.chatbot.SlackChannelConfiguration(
     },
 );
 
-instance_output.id.apply( ( instanceId: string ) => {
-    // refer https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2
+pulumi.
+    all([ instance_output.id, instance_output.ami, instance_output.instanceType ])
+    .apply( ([ instanceId, instanceAmi, instanceType ]: [ string, string, string ] ) => {
+        // refer https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Best_Practice_Recommended_Alarms_AWS_Services.html#EC2
 
-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}_systemStatus`,
-        {
-            name: `${instanceId}_SystemStatus`,
-            alarmDescription: "Notify on Slack and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}_systemStatus`,
+            {
+                name: `${instanceId}_SystemStatus`,
+                alarmDescription: "Notify on Slack and recover the instance when the System status check fails 2 consecutive times over 10 minutes.",
 
-            namespace: "AWS/EC2",
-            dimensions: {
-                InstanceId: instanceId,
+                namespace: "AWS/EC2",
+                dimensions: {
+                    InstanceId: instanceId,
+                },
+                metricName: "StatusCheckFailed_System",
+                statistic: "Maximum",
+                unit: "Count",
+                comparisonOperator: "GreaterThanOrEqualToThreshold",
+                threshold: 1,
+                period: 300,
+                evaluationPeriods: 2,
+                datapointsToAlarm: 2,
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                    "arn:aws:automate:eu-west-1:ec2:recover",
+                ],
+                okActions: [
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "StatusCheckFailed_System",
-            statistic: "Maximum",
-            unit: "Count",
-            comparisonOperator: "GreaterThanOrEqualToThreshold",
-            threshold: 1,
-            period: 300,
-            evaluationPeriods: 2,
-            datapointsToAlarm: 2,
-            alarmActions: [
-                notifications_snsTopic.arn,
-                "arn:aws:automate:eu-west-1:ec2:recover",
-            ],
-        },
-    );
+        );
 
-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}_instanceStatus`,
-        {
-            name: `${instanceId}_InstanceStatus`,
-            alarmDescription: "Notify on Slack and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}_instanceStatus`,
+            {
+                name: `${instanceId}_InstanceStatus`,
+                alarmDescription: "Notify on Slack and restart the instance when the Instance status check fails 2 consecutive times over 10 minutes.",
 
-            namespace: "AWS/EC2",
-            dimensions: {
-                InstanceId: instanceId,
+                namespace: "AWS/EC2",
+                dimensions: {
+                    InstanceId: instanceId,
+                },
+                metricName: "StatusCheckFailed_Instance",
+                statistic: "Maximum",
+                unit: "Count",
+                comparisonOperator: "GreaterThanOrEqualToThreshold",
+                threshold: 1,
+                period: 300,
+                evaluationPeriods: 2,
+                datapointsToAlarm: 2,
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                    "arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",
+                ],
+                okActions: [
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "StatusCheckFailed_Instance",
-            statistic: "Maximum",
-            unit: "Count",
-            comparisonOperator: "GreaterThanOrEqualToThreshold",
-            threshold: 1,
-            period: 300,
-            evaluationPeriods: 2,
-            datapointsToAlarm: 2,
-            alarmActions: [
-                notifications_snsTopic.arn,
-                "arn:aws:swf:eu-west-1:012345678901:action/actions/AWS_EC2.InstanceId.Reboot/1.0",
-            ],
-        },
-    );
+        );
 
-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}-cpuUtilization`,
-        {
-            name: `${instanceId}_CPUUtilization`,
-            alarmDescription: "Notify on Slack when the CPU utilization is above 80% 3 consecutive times over 15 minutes.",
-            tags: {
-                Controls: "SOC2/CC7.2",
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}-cpuUtilization`,
+            {
+                name: `${instanceId}_CPUUtilization`,
+                alarmDescription: "Notify on Slack when the CPU utilization is above 80% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",
+                },
+
+                namespace: "AWS/EC2",
+                dimensions: {
+                    InstanceId: instanceId
+                },
+                metricName: "CPUUtilization",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold: 80,
+                period: 300,
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [
+                    notifications_snsTopic.arn,
+                ],
             },
+        );
 
-            namespace: "AWS/EC2",
-            dimensions: {
-                InstanceId: instanceId
+        // requires the host to have the cloudwatch agent installed and configured to send 'mem_used_percent' metrics
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}-memUsedPercent`,
+            {
+                name: `${instanceId}_MemUsedPercent`,
+                alarmDescription: "Notify on Slack when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",  // FIXME
+                },
+
+                namespace: "CWAgent",
+                dimensions: {
+                    InstanceId: instanceId,
+                },
+                metricName: "mem_used_percent",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold: 85,
+                period: 300,
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "CPUUtilization",
-            statistic: "Average",
-            comparisonOperator: "GreaterThanThreshold",
-            threshold: 80,
-            period: 300,
-            evaluationPeriods: 3,
-            datapointsToAlarm: 3,
-            alarmActions: [
-                notifications_snsTopic.arn,
-            ],
-        },
-    );
+        );
 
-    // requires the host to have the cloudwatch agent installed and configured to send 'mem_used_percent' metrics
-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}-memUsedPercent`,
-        {
-            name: `${instanceId}_MemUsedPercent`,
-            alarmDescription: "Notify on Slack when the memory utilization is > 85% 3 consecutive times over 15 minutes.",
-            tags: {
-                Controls: "SOC2/CC7.2",  // FIXME
+        // requires the host to have the cloudwatch agent installed and configured to send 'disk_used_percent' metrics
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}-diskUsedPercent`,
+            {
+                name: `${instanceId}_DiskUsedPercent`,
+                alarmDescription: "Notify on Slack when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",  // FIXME
+                },
+
+                namespace: "CWAgent",
+                dimensions: {
+                    InstanceId: instanceId,
+                },
+                metricName: "disk_used_percent",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold: 85,
+                period: 300,
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [
+                    notifications_snsTopic.arn,
+                ],
             },
+        );
+
+        // same metric scoped to the root volume; the logical name must differ from the alarm above, as pulumi rejects duplicate resource names
+        new aws.cloudwatch.MetricAlarm(
+            `${instanceId}-diskUsedPercent-rootDisk`,
+            {
+                name: `${instanceId}_DiskUsedPercent_rootDisk`,
+                alarmDescription: "Notify on Slack when the root disk utilization is > 85% 3 consecutive times over 15 minutes.",
+                tags: {
+                    Controls: "SOC2/CC7.2",  // FIXME
+                },
 
-            namespace: "CWAgent",
-            dimensions: {
-                InstanceId: instanceId,
+                namespace: "CWAgent",
+                dimensions: {
+                    InstanceId: instanceId,
+                    ImageId: instanceAmi,
+                    InstanceType: instanceType,
+                    device: "nvme0n1p1",
+                    fstype: "xfs",
+                    path: "/",
+                },
+                metricName: "disk_used_percent",
+                statistic: "Average",
+                comparisonOperator: "GreaterThanThreshold",
+                threshold: 85,
+                period: 300,
+                evaluationPeriods: 3,
+                datapointsToAlarm: 3,
+                alarmActions: [
+                    notifications_snsTopic.arn,
+                ],
+                okActions: [
+                    notifications_snsTopic.arn,
+                ],
             },
-            metricName: "mem_used_percent",
-            statistic: "Average",
-            comparisonOperator: "GreaterThanThreshold",
-            threshold: 85,
-            period: 300,
-            evaluationPeriods: 3,
-            datapointsToAlarm: 3,
-            alarmActions: [
-                notifications_snsTopic.arn,
-            ],
-        },
-    );
+        );
 
-    // requires the host to have the cloudwatch agent installed and configured to send 'disk_used_percent' metrics
-    new aws.cloudwatch.MetricAlarm(
-        `${instanceId}-diskUsedPercent`,
-        {
-            name: `${instanceId}_DiskUsedPercent`,
-            alarmDescription: "Notify on Slack when the disk utilization is > 85% 3 consecutive times over 15 minutes.",
-            tags: {
-                Controls: "SOC2/CC7.2",  // FIXME
-            },
-
-            namespace: "CWAgent",
-            dimensions: {
-                InstanceId: instanceId,
-            },
-            metricName: "disk_used_percent",
-            statistic: "Average",
-            comparisonOperator: "GreaterThanThreshold",
-            threshold: 85,
-            period: 300,
-            evaluationPeriods: 3,
-            datapointsToAlarm: 3,
-            alarmActions: [
-                notifications_snsTopic.arn,
-            ],
-        },
-    );
-
-});
+    });