Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Aligning timeouts to reflect real-world scenarios #399

Merged
merged 4 commits into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ address = '0.0.0.0:28101'
max_message_size_bytes = 4_194_304
request_limit = 3
request_buffer_size = 16
message_timeout_secs = 30
client_access_timeout_secs = 2
message_timeout_secs = 10
zajko marked this conversation as resolved.
Show resolved Hide resolved
client_access_timeout_secs = 10

[rpc_server.speculative_exec_server]
enable_server = true
Expand Down
39 changes: 37 additions & 2 deletions metrics/src/rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ const RESPONSE_SIZE_BUCKETS: &[f64; 8] = &[
5e+2_f64, 1e+3_f64, 2e+3_f64, 5e+3_f64, 5e+4_f64, 5e+5_f64, 5e+6_f64, 5e+7_f64,
];

const RESPONSE_TIME_MS_BUCKETS: &[f64; 8] = &[
1_f64, 5_f64, 10_f64, 30_f64, 50_f64, 100_f64, 200_f64, 300_f64,
const RESPONSE_TIME_MS_BUCKETS: &[f64; 9] = &[
1_f64, 5_f64, 10_f64, 30_f64, 50_f64, 100_f64, 300_f64, 1000_f64, 3000_f64,
];

static ENDPOINT_CALLS: Lazy<IntCounterVec> = Lazy::new(|| {
Expand All @@ -24,6 +24,21 @@ static ENDPOINT_CALLS: Lazy<IntCounterVec> = Lazy::new(|| {
counter
});

/// Counter family for requests that failed because an internal timeout elapsed.
///
/// The single `timer` label selects which timer the count belongs to. The
/// initialiser creates the metric and registers a clone of it in `REGISTRY`.
///
/// # Panics
///
/// The initialiser panics if the metric cannot be created or cannot be
/// registered in `REGISTRY`.
static TIMEOUT_COUNTERS: Lazy<IntCounterVec> = Lazy::new(|| {
    let counter = IntCounterVec::new(
        Opts::new(
            "rpc_server_timeout_counts",
            "Counters for how many of the requests failed due to internal timeout",
        ),
        &["timer"],
    )
    // Name the failing metric in the panic message (as `MISMATCHED_IDS` does)
    // instead of a bare `unwrap`, so a startup failure is attributable.
    .expect("rpc_server_timeout_counts metric can't be created");
    REGISTRY
        .register(Box::new(counter.clone()))
        .expect("cannot register metric");
    counter
});

static RESPONSE_TIMES_MS: Lazy<HistogramVec> = Lazy::new(|| {
let histogram = HistogramVec::new(
HistogramOpts {
Expand Down Expand Up @@ -56,6 +71,18 @@ static RECONNECT_TIMES_MS: Lazy<Histogram> = Lazy::new(|| {
histogram
});

/// Gauge holding the number of mismatched-ID events observed in responses
/// from the binary port.
///
/// The initialiser builds the gauge and registers a clone of it in `REGISTRY`;
/// it panics if either step fails.
// NOTE(review): the visible code only ever calls `inc()` on this metric, so a
// counter type may describe it better than a gauge — confirm before changing.
static MISMATCHED_IDS: Lazy<IntGauge> = Lazy::new(|| {
    let name = "rpc_server_mismatched_ids";
    let help = "Number of mismatched ID events observed in responses from binary port";
    let gauge =
        IntGauge::new(name, help).expect("rpc_server_mismatched_ids metric can't be created");
    REGISTRY
        .register(Box::new(gauge.clone()))
        .expect("cannot register metric");
    gauge
});

static DISCONNECT_EVENTS: Lazy<IntGauge> = Lazy::new(|| {
let counter = IntGauge::new(
"rpc_server_disconnects",
Expand Down Expand Up @@ -108,3 +135,11 @@ pub fn register_request_size(method: &str, payload_size: f64) {
.with_label_values(&[method])
.observe(payload_size);
}

/// Increments the timeout counter whose `timer` label equals `timer_name`.
pub fn register_timeout(timer_name: &str) {
    let timeouts_for_timer = TIMEOUT_COUNTERS.with_label_values(&[timer_name]);
    timeouts_for_timer.inc();
}

pub fn register_mismatched_id() {
MISMATCHED_IDS.inc();
}
6 changes: 2 additions & 4 deletions resources/example_configs/EXAMPLE_NCTL_CONFIG.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ cors_origin = ""
ip_address = "0.0.0.0"
port = 28102
max_message_size_bytes = 4194304
request_limit = 3
request_buffer_size = 16
message_timeout_secs = 30
client_access_timeout_secs = 2
message_timeout_secs = 10
client_access_timeout_secs = 10
keepalive_timeout_ms = 10_000

[rpc_server.node_client.exponential_backoff]
Expand Down
6 changes: 2 additions & 4 deletions resources/example_configs/EXAMPLE_NCTL_POSTGRES_CONFIG.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ cors_origin = ""
ip_address = "0.0.0.0"
port = 28102
max_message_size_bytes = 4194304
request_limit = 3
request_buffer_size = 16
message_timeout_secs = 30
client_access_timeout_secs = 2
message_timeout_secs = 10
client_access_timeout_secs = 10
keepalive_timeout_ms = 10_000

[rpc_server.node_client.exponential_backoff]
Expand Down
6 changes: 2 additions & 4 deletions resources/example_configs/EXAMPLE_NODE_CONFIG.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ cors_origin = ""
ip_address = "3.20.57.210"
port = 7777
max_message_size_bytes = 4194304
request_limit = 10
request_buffer_size = 50
message_timeout_secs = 60
client_access_timeout_secs = 60
message_timeout_secs = 10
client_access_timeout_secs = 10
keepalive_timeout_ms = 10_000

[rpc_server.node_client.exponential_backoff]
Expand Down
8 changes: 2 additions & 6 deletions resources/example_configs/default_debian_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,10 @@ ip_address = '127.0.0.1'
port = 7779
# Maximum size of a message in bytes.
max_message_size_bytes = 4_194_304
# Maximum number of in-flight node requests.
request_limit = 3
# Number of node requests that can be buffered.
request_buffer_size = 16
# Timeout for a node request in seconds.
message_timeout_secs = 30
message_timeout_secs = 10
# Timeout specifying how long to wait for binary port client to be available.
client_access_timeout_secs = 2
client_access_timeout_secs = 10
# The amount of time in milliseconds to wait between sending keepalive requests.
keepalive_timeout_ms = 10_000

Expand Down
4 changes: 2 additions & 2 deletions resources/example_configs/default_rpc_only_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ request_limit = 3
# Number of node requests that can be buffered.
request_buffer_size = 16
# Timeout for a node request in seconds.
message_timeout_secs = 30
message_timeout_secs = 10
# Timeout specifying how long to wait for binary port client to be available.
client_access_timeout_secs = 2
client_access_timeout_secs = 10
# The amount of time in milliseconds to wait between sending keepalive requests.
keepalive_timeout_ms = 10_000

Expand Down
2 changes: 1 addition & 1 deletion resources/example_configs/default_sse_only_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ port = 18888
max_concurrent_requests = 50
max_requests_per_second = 50

[admin_server]
[admin_api_server]
enable_server = true
port = 18887
max_concurrent_requests = 1
Expand Down
Loading
Loading