From 203d3619984defbb2c0d7aa1ad1b4b283b8ffe70 Mon Sep 17 00:00:00 2001 From: Chris Larsen Date: Tue, 5 Dec 2023 12:41:21 -0800 Subject: [PATCH] atlas cloudwatch: enable polling for EC2 status check failures. (#511) Polls at 60s. Disabled the other EC2 system metrics to avoid polling. Also log the polling offset at completion and log the accounts resolved from the account config. --- atlas-cloudwatch/src/main/resources/ec2.conf | 196 ++++++++++-------- .../src/main/resources/reference.conf | 4 +- .../cloudwatch/AwsConfigAccountSupplier.scala | 1 + .../atlas/cloudwatch/CloudWatchPoller.scala | 15 +- .../cloudwatch/CloudWatchPollerSuite.scala | 3 +- 5 files changed, 119 insertions(+), 100 deletions(-) diff --git a/atlas-cloudwatch/src/main/resources/ec2.conf b/atlas-cloudwatch/src/main/resources/ec2.conf index 5c9319c3..25ff063c 100644 --- a/atlas-cloudwatch/src/main/resources/ec2.conf +++ b/atlas-cloudwatch/src/main/resources/ec2.conf @@ -5,130 +5,142 @@ atlas { // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html ec2 = { namespace = "AWS/EC2" - period = 5m - end-period-offset = 3 + period = 1m timeout = 20m + end-period-offset = 1 + poll-offset = 1m dimensions = [ "AutoScalingGroupName" ] metrics = [ +// { +// name = "CPUUtilization" +// alias = "aws.ec2.cpuUtilization" +// conversion = "max" +// }, +// { +// name = "NetworkIn" +// alias = "aws.ec2.networkThroughput" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "in" +// } +// ] +// }, +// { +// name = "NetworkOut" +// alias = "aws.ec2.networkThroughput" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "out" +// } +// ] +// }, +// { +// name = "NetworkPacketsIn" +// alias = "aws.ec2.networkPackets" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "in" +// } +// ] +// }, +// { +// name = "NetworkPacketsOut" +// alias = "aws.ec2.networkPackets" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "out" +// } +// ] +// }, +// { +// name = "DiskReadBytes" +// alias = "aws.ec2.ioThroughput" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "read" +// } +// ] +// }, +// { +// name = "DiskWriteBytes" +// alias = "aws.ec2.ioThroughput" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "write" +// } +// ] +// }, +// { +// name = "DiskReadOps" +// alias = "aws.ec2.iops" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "read" +// } +// ] +// }, +// { +// name = "DiskWriteOps" +// alias = "aws.ec2.iops" +// conversion = "sum,rate" +// tags = [ +// { +// key = "id" +// value = "write" +// } +// ] +// }, { - name = "CPUUtilization" - alias = "aws.ec2.cpuUtilization" + name = "StatusCheckFailed_Instance" + alias = "aws.ec2.badInstances" conversion = "max" - }, - { - name = "NetworkIn" - alias = "aws.ec2.networkThroughput" - conversion = "sum,rate" - tags = [ - { - key = "id" - value = "in" - } - ] - }, - { - name = "NetworkOut" - alias = "aws.ec2.networkThroughput" - conversion = "sum,rate" tags = [ { key = "id" - value = "out" - } - ] - }, - { - name = "NetworkPacketsIn" - alias = "aws.ec2.networkPackets" - conversion = "sum,rate" - tags = [ - { - key = "id" - value = "in" - } - ] - }, - { - name = "NetworkPacketsOut" - alias = "aws.ec2.networkPackets" - conversion = "sum,rate" - tags = [ - { - key = "id" - value = "out" - } - ] - }, - { - name = "DiskReadBytes" - alias = "aws.ec2.ioThroughput" - conversion = "sum,rate" - tags = [ - { - key = "id" - value = "read" - } - ] - }, - { - name = "DiskWriteBytes" - alias = "aws.ec2.ioThroughput" - conversion = "sum,rate" - tags = [ - { - key = "id" - value = "write" - } - ] - }, - { - name = "DiskReadOps" - alias = "aws.ec2.iops" - conversion = "sum,rate" - tags = [ - { - key = "id" - value = "read" - } - ] - }, - { - name = "DiskWriteOps" - alias = "aws.ec2.iops" - conversion = "sum,rate" - tags = [ - { - key = "id" - value = "write" + value = "instance" } ] }, { - name = "StatusCheckFailed_Instance" + name = "StatusCheckFailed_System" alias = "aws.ec2.badInstances" conversion = "max" tags = [ { key = "id" - value = "instance" + value = "system" } ] }, { - name = "StatusCheckFailed_System" + name = "StatusCheckFailed_AttachedEBS" alias = "aws.ec2.badInstances" conversion = "max" tags = [ { key = "id" - value = "system" + value = "ebs" } ] - }, + } ] } diff --git a/atlas-cloudwatch/src/main/resources/reference.conf b/atlas-cloudwatch/src/main/resources/reference.conf index f9a2ac0d..909389c0 100644 --- a/atlas-cloudwatch/src/main/resources/reference.conf +++ b/atlas-cloudwatch/src/main/resources/reference.conf @@ -14,9 +14,9 @@ aws-poller-io-dispatcher { type = Dispatcher executor = "thread-pool-executor" thread-pool-executor { - fixed-pool-size = 8 + fixed-pool-size = 16 } - throughput = 16 + throughput = 32 } iep.leader { diff --git a/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/AwsConfigAccountSupplier.scala b/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/AwsConfigAccountSupplier.scala index a24051eb..1907b840 100644 --- a/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/AwsConfigAccountSupplier.scala +++ b/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/AwsConfigAccountSupplier.scala @@ -168,6 +168,7 @@ class AwsConfigAccountSupplier( logger.info( s"Finished loading ${rawAccountResources.size} (${filtered.size} filtered) AWS accounts and resources in ${(System.currentTimeMillis() - start) / 1000.0} seconds" ) + logger.info(s"Final AWS accounts: ${filtered}") registry .timer("atlas.cloudwatch.account.supplier.aws.loadTime") .record(System.currentTimeMillis() - start, TimeUnit.MILLISECONDS) diff --git a/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/CloudWatchPoller.scala b/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/CloudWatchPoller.scala index 8930d4e1..6d83dd48 100644 --- a/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/CloudWatchPoller.scala +++ b/atlas-cloudwatch/src/main/scala/com/netflix/atlas/cloudwatch/CloudWatchPoller.scala @@ -167,7 +167,7 @@ class CloudWatchPoller( Optional.of(region) ) val catFutures = filtered.map { category => - val runner = Poller(now, category, client, account, region) + val runner = Poller(now, category, client, account, region, offset) this.synchronized { runners += runner runner.execute @@ -246,7 +246,8 @@ class CloudWatchPoller( category: MetricCategory, client: CloudWatchClient, account: String, - region: Region + region: Region, + offset: Int ) { private val nowMillis = now.toEpochMilli @@ -254,7 +255,9 @@ class CloudWatchPoller( private[cloudwatch] val got = new AtomicInteger() private[cloudwatch] def execute: Future[Done] = { - logger.info(s"Polling for account ${account} and category ${category.namespace} in ${region}") + logger.info( + s"Polling for account ${account} at ${offset}s and category ${category.namespace} in ${region}" + ) val futures = category.toListRequests.map { tuple => val (definition, request) = tuple val promise = Promise[Done]() @@ -265,7 +268,7 @@ class CloudWatchPoller( Future.reduceLeft(futures)((_, _) => Done).andThen { case Success(_) => logger.info( - s"Finished polling with ${got.get()} of ${expecting.get()} for ${account} and ${ + s"Finished polling with ${got.get()} of ${expecting.get()} for ${account} at ${offset} and ${ category.namespace } in region ${region} in ${(System.currentTimeMillis() - nowMillis) / 1000.0} s" ) @@ -399,7 +402,9 @@ class CloudWatchPoller( } catch { case ex: Exception => logger.error( - s"Error getting metric ${metric.metricName()} for ${account} and ${category.namespace} ${definition.name} in region ${region}", + s"Error getting metric ${metric.metricName()} for ${account} at ${offset} and ${ + category.namespace + } ${definition.name} in region ${region}", ex ) registry diff --git a/atlas-cloudwatch/src/test/scala/com/netflix/atlas/cloudwatch/CloudWatchPollerSuite.scala b/atlas-cloudwatch/src/test/scala/com/netflix/atlas/cloudwatch/CloudWatchPollerSuite.scala index 068c14aa..b9c4a6f8 100644 --- a/atlas-cloudwatch/src/test/scala/com/netflix/atlas/cloudwatch/CloudWatchPollerSuite.scala +++ b/atlas-cloudwatch/src/test/scala/com/netflix/atlas/cloudwatch/CloudWatchPollerSuite.scala @@ -453,7 +453,8 @@ class CloudWatchPollerSuite extends FunSuite with TestKitBase { category, client, account, - region + region, + 60 ) }