Skip to content

Commit

Permalink
atlas cloudwatch: enable polling for EC2 status check failures. (#511)
Browse files Browse the repository at this point in the history
Polls at a 60s offset. The other EC2 system metrics (CPU, network, and disk) are commented out so that they are not polled.
Also logs the polling offset when polling completes, and logs the accounts resolved from the account config.
  • Loading branch information
manolama authored Dec 5, 2023
1 parent 9824207 commit 203d361
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 100 deletions.
196 changes: 104 additions & 92 deletions atlas-cloudwatch/src/main/resources/ec2.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,130 +5,142 @@ atlas {
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/viewing_metrics_with_cloudwatch.html
ec2 = {
namespace = "AWS/EC2"
period = 5m
end-period-offset = 3
period = 1m
timeout = 20m
end-period-offset = 1
poll-offset = 1m

dimensions = [
"AutoScalingGroupName"
]

metrics = [
// {
// name = "CPUUtilization"
// alias = "aws.ec2.cpuUtilization"
// conversion = "max"
// },
// {
// name = "NetworkIn"
// alias = "aws.ec2.networkThroughput"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "in"
// }
// ]
// },
// {
// name = "NetworkOut"
// alias = "aws.ec2.networkThroughput"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "out"
// }
// ]
// },
// {
// name = "NetworkPacketsIn"
// alias = "aws.ec2.networkPackets"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "in"
// }
// ]
// },
// {
// name = "NetworkPacketsOut"
// alias = "aws.ec2.networkPackets"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "out"
// }
// ]
// },
// {
// name = "DiskReadBytes"
// alias = "aws.ec2.ioThroughput"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "read"
// }
// ]
// },
// {
// name = "DiskWriteBytes"
// alias = "aws.ec2.ioThroughput"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "write"
// }
// ]
// },
// {
// name = "DiskReadOps"
// alias = "aws.ec2.iops"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "read"
// }
// ]
// },
// {
// name = "DiskWriteOps"
// alias = "aws.ec2.iops"
// conversion = "sum,rate"
// tags = [
// {
// key = "id"
// value = "write"
// }
// ]
// },
{
name = "CPUUtilization"
alias = "aws.ec2.cpuUtilization"
name = "StatusCheckFailed_Instance"
alias = "aws.ec2.badInstances"
conversion = "max"
},
{
name = "NetworkIn"
alias = "aws.ec2.networkThroughput"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "in"
}
]
},
{
name = "NetworkOut"
alias = "aws.ec2.networkThroughput"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "out"
}
]
},
{
name = "NetworkPacketsIn"
alias = "aws.ec2.networkPackets"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "in"
}
]
},
{
name = "NetworkPacketsOut"
alias = "aws.ec2.networkPackets"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "out"
}
]
},
{
name = "DiskReadBytes"
alias = "aws.ec2.ioThroughput"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "read"
}
]
},
{
name = "DiskWriteBytes"
alias = "aws.ec2.ioThroughput"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "write"
}
]
},
{
name = "DiskReadOps"
alias = "aws.ec2.iops"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "read"
}
]
},
{
name = "DiskWriteOps"
alias = "aws.ec2.iops"
conversion = "sum,rate"
tags = [
{
key = "id"
value = "write"
value = "instance"
}
]
},
{
name = "StatusCheckFailed_Instance"
name = "StatusCheckFailed_System"
alias = "aws.ec2.badInstances"
conversion = "max"
tags = [
{
key = "id"
value = "instance"
value = "system"
}
]
},
{
name = "StatusCheckFailed_System"
name = "StatusCheckFailed_AttachedEBS"
alias = "aws.ec2.badInstances"
conversion = "max"
tags = [
{
key = "id"
value = "system"
value = "ebs"
}
]
},
}
]
}

Expand Down
4 changes: 2 additions & 2 deletions atlas-cloudwatch/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ aws-poller-io-dispatcher {
type = Dispatcher
executor = "thread-pool-executor"
thread-pool-executor {
fixed-pool-size = 8
fixed-pool-size = 16
}
throughput = 16
throughput = 32
}

iep.leader {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ class AwsConfigAccountSupplier(
logger.info(
s"Finished loading ${rawAccountResources.size} (${filtered.size} filtered) AWS accounts and resources in ${(System.currentTimeMillis() - start) / 1000.0} seconds"
)
logger.info(s"Final AWS accounts: ${filtered}")
registry
.timer("atlas.cloudwatch.account.supplier.aws.loadTime")
.record(System.currentTimeMillis() - start, TimeUnit.MILLISECONDS)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ class CloudWatchPoller(
Optional.of(region)
)
val catFutures = filtered.map { category =>
val runner = Poller(now, category, client, account, region)
val runner = Poller(now, category, client, account, region, offset)
this.synchronized {
runners += runner
runner.execute
Expand Down Expand Up @@ -246,15 +246,18 @@ class CloudWatchPoller(
category: MetricCategory,
client: CloudWatchClient,
account: String,
region: Region
region: Region,
offset: Int
) {

private val nowMillis = now.toEpochMilli
private[cloudwatch] val expecting = new AtomicInteger()
private[cloudwatch] val got = new AtomicInteger()

private[cloudwatch] def execute: Future[Done] = {
logger.info(s"Polling for account ${account} and category ${category.namespace} in ${region}")
logger.info(
s"Polling for account ${account} at ${offset}s and category ${category.namespace} in ${region}"
)
val futures = category.toListRequests.map { tuple =>
val (definition, request) = tuple
val promise = Promise[Done]()
Expand All @@ -265,7 +268,7 @@ class CloudWatchPoller(
Future.reduceLeft(futures)((_, _) => Done).andThen {
case Success(_) =>
logger.info(
s"Finished polling with ${got.get()} of ${expecting.get()} for ${account} and ${
s"Finished polling with ${got.get()} of ${expecting.get()} for ${account} at ${offset} and ${
category.namespace
} in region ${region} in ${(System.currentTimeMillis() - nowMillis) / 1000.0} s"
)
Expand Down Expand Up @@ -399,7 +402,9 @@ class CloudWatchPoller(
} catch {
case ex: Exception =>
logger.error(
s"Error getting metric ${metric.metricName()} for ${account} and ${category.namespace} ${definition.name} in region ${region}",
s"Error getting metric ${metric.metricName()} for ${account} at ${offset} and ${
category.namespace
} ${definition.name} in region ${region}",
ex
)
registry
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,8 @@ class CloudWatchPollerSuite extends FunSuite with TestKitBase {
category,
client,
account,
region
region,
60
)
}

Expand Down

0 comments on commit 203d361

Please sign in to comment.