Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Improve metrics push #19460

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/mysten-common/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use prometheus::Encoder;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::{debug, error, info};

const DEFAULT_METRICS_PUSH_TIMEOUT: Duration = Duration::from_secs(30);
const METRICS_PUSH_TIMEOUT: Duration = Duration::from_secs(45);

pub struct MetricsPushClient {
certificate: std::sync::Arc<sui_tls::SelfSignedCertificate>,
Expand Down Expand Up @@ -77,7 +77,7 @@ pub async fn push_metrics(
.header(reqwest::header::CONTENT_ENCODING, "snappy")
.header(reqwest::header::CONTENT_TYPE, prometheus::PROTOBUF_FORMAT)
.body(compressed)
.timeout(DEFAULT_METRICS_PUSH_TIMEOUT)
.timeout(METRICS_PUSH_TIMEOUT)
.send()
.await?;

Expand Down
16 changes: 11 additions & 5 deletions crates/sui-bridge/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ use prometheus::{
};
use std::time::Duration;
use sui_types::crypto::NetworkKeyPair;
use tokio::time::sleep;

const FINE_GRAINED_LATENCY_SEC_BUCKETS: &[f64] = &[
0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9,
Expand Down Expand Up @@ -54,15 +53,22 @@ pub fn start_metrics_push_task(
let mut interval = tokio::time::interval(interval);
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

let mut errors = 0;
loop {
interval.tick().await;

// Retry pushing metrics if there is an error.
while let Err(error) = push_metrics(&client, &url, &registry).await {
tracing::warn!("unable to push metrics: {error}; new client will be created");
sleep(Duration::from_secs(1)).await;
if let Err(error) = push_metrics(&client, &url, &registry).await {
errors += 1;
if errors >= 10 {
// If we hit 10 failures in a row, start logging errors.
tracing::error!("unable to push metrics: {error}; new client will be created");
} else {
tracing::warn!("unable to push metrics: {error}; new client will be created");
}
// aggressively recreate our client connection if we hit an error
client = MetricsPushClient::new(metrics_key_pair.copy());
} else {
errors = 0;
}
}
});
Expand Down
15 changes: 11 additions & 4 deletions crates/sui-node/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,22 @@ pub fn start_metrics_push_task(config: &sui_config::NodeConfig, registry: Regist
let mut interval = tokio::time::interval(interval);
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

let mut errors = 0;
loop {
interval.tick().await;

// Retry pushing metrics if there is an error.
while let Err(error) = push_metrics(&client, &url, &registry).await {
tracing::warn!("unable to push metrics: {error}; new client will be created");
sleep(Duration::from_secs(1)).await;
if let Err(error) = push_metrics(&client, &url, &registry).await {
errors += 1;
if errors >= 10 {
// If we hit 10 failures in a row, start logging errors.
tracing::error!("unable to push metrics: {error}; new client will be created");
} else {
tracing::warn!("unable to push metrics: {error}; new client will be created");
}
// aggressively recreate our client connection if we hit an error
client = MetricsPushClient::new(config_copy.network_key_pair().copy());
} else {
errors = 0;
}
}
});
Expand Down
Loading