Skip to content

Commit

Permalink
Change handling of graceful case of LoadStatsReporting onRemoteClose (#…
Browse files Browse the repository at this point in the history
…37076)

Currently, we treat all remote grpc stream closes as errors, and log
warnings and increment failure metrics for every instance. A remote grpc
stream close with a 0 code is not a failure, but instead a graceful
termination, so it should be a lower log level and should not increment
failure metrics.

Risk Level: Low

---------

Signed-off-by: Brian Sonnenberg <[email protected]>
  • Loading branch information
briansonnenberg authored Nov 20, 2024
1 parent 005eb77 commit 4e8706d
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 6 deletions.
3 changes: 3 additions & 0 deletions changelogs/current.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ minor_behavior_changes:
bug_fixes:
# *Changes expected to improve the state of the world and are unlikely to have negative effects*
- area: lrs
change: |
Fixes errors stat being incremented and warning log spamming for LoadStatsReporting graceful stream close.
- area: tls
change: |
Support operations on IP SANs when the IP version is not supported by the host operating system, for example
Expand Down
19 changes: 14 additions & 5 deletions source/common/upstream/load_stats_reporter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,16 @@ LoadStatsReporter::LoadStatsReporter(const LocalInfo::LocalInfo& local_info,
time_source_(dispatcher.timeSource()) {
request_.mutable_node()->MergeFrom(local_info.node());
request_.mutable_node()->add_client_features("envoy.lrs.supports_send_all_clusters");
retry_timer_ = dispatcher.createTimer([this]() -> void { establishNewStream(); });
retry_timer_ = dispatcher.createTimer([this]() -> void {
stats_.retries_.inc();
establishNewStream();
});
response_timer_ = dispatcher.createTimer([this]() -> void { sendLoadStatsRequest(); });
establishNewStream();
}

void LoadStatsReporter::setRetryTimer() {
ENVOY_LOG(info, "Load reporter stats stream/connection will retry in {} ms.", RETRY_DELAY_MS);
retry_timer_->enableTimer(std::chrono::milliseconds(RETRY_DELAY_MS));
}

Expand Down Expand Up @@ -150,8 +154,6 @@ void LoadStatsReporter::sendLoadStatsRequest() {
}

void LoadStatsReporter::handleFailure() {
ENVOY_LOG(warn, "Load reporter stats stream/connection failure, will retry in {} ms.",
RETRY_DELAY_MS);
stats_.errors_.inc();
setRetryTimer();
}
Expand Down Expand Up @@ -243,10 +245,17 @@ void LoadStatsReporter::onReceiveTrailingMetadata(Http::ResponseTrailerMapPtr&&
}

void LoadStatsReporter::onRemoteClose(Grpc::Status::GrpcStatus status, const std::string& message) {
ENVOY_LOG(warn, "{} gRPC config stream closed: {}, {}", service_method_.name(), status, message);
response_timer_->disableTimer();
stream_ = nullptr;
handleFailure();
if (status != Grpc::Status::WellKnownGrpcStatus::Ok) {
ENVOY_LOG(warn, "{} gRPC config stream closed: {}, {}", service_method_.name(), status,
message);
handleFailure();
} else {
ENVOY_LOG(debug, "{} gRPC config stream closed gracefully, {}", service_method_.name(),
message);
setRetryTimer();
}
}

} // namespace Upstream
Expand Down
4 changes: 3 additions & 1 deletion source/common/upstream/load_stats_reporter.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ namespace Upstream {
#define ALL_LOAD_REPORTER_STATS(COUNTER) \
COUNTER(requests) \
COUNTER(responses) \
COUNTER(errors)
COUNTER(errors) \
COUNTER(retries)

/**
* Struct definition for all load reporter stats. @see stats_macros.h
Expand All @@ -43,6 +44,7 @@ class LoadStatsReporter
std::unique_ptr<envoy::service::load_stats::v3::LoadStatsResponse>&& message) override;
void onReceiveTrailingMetadata(Http::ResponseTrailerMapPtr&& metadata) override;
void onRemoteClose(Grpc::Status::GrpcStatus status, const std::string& message) override;
const LoadReporterStats& getStats() { return stats_; };

// TODO(htuch): Make this configurable or some static.
const uint32_t RETRY_DELAY_MS = 5000;
Expand Down
16 changes: 16 additions & 0 deletions test/common/upstream/load_stats_reporter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,24 @@ TEST_F(LoadStatsReporterTest, RemoteStreamClose) {
EXPECT_CALL(*async_client_, startRaw(_, _, _, _)).WillOnce(Return(&async_stream_));
expectSendMessage({});
retry_timer_cb_();
EXPECT_EQ(load_stats_reporter_->getStats().errors_.value(), 1);
EXPECT_EQ(load_stats_reporter_->getStats().retries_.value(), 1);
}

// Validate that errors stat is not incremented for a graceful stream termination.
TEST_F(LoadStatsReporterTest, RemoteStreamGracefulClose) {
EXPECT_CALL(*async_client_, startRaw(_, _, _, _)).WillOnce(Return(&async_stream_));
expectSendMessage({});
createLoadStatsReporter();
EXPECT_CALL(*response_timer_, disableTimer());
EXPECT_CALL(*retry_timer_, enableTimer(_, _));
load_stats_reporter_->onRemoteClose(Grpc::Status::WellKnownGrpcStatus::Ok, "");
EXPECT_CALL(*async_client_, startRaw(_, _, _, _)).WillOnce(Return(&async_stream_));
expectSendMessage({});
retry_timer_cb_();
EXPECT_EQ(load_stats_reporter_->getStats().errors_.value(), 0);
EXPECT_EQ(load_stats_reporter_->getStats().retries_.value(), 1);
}
} // namespace
} // namespace Upstream
} // namespace Envoy

0 comments on commit 4e8706d

Please sign in to comment.