Skip to content

Commit

Permalink
fix: Better monitoring experience (#33)
Browse files Browse the repository at this point in the history
* fix(better_monitoring): Renamed some errors

* fix(better_monitoring): Templates working... maybe? + Bump version

* fix(better_monitoring): long tail assets

* fix(better_monitoring): Added hashmap for config

* fix(better_monitoring): verify_indexer_sync util

* fix(better_monitoring): README

* fix(better_monitoring): bool instead of result for sync check

* fix: remove long tail asset for future data

---------

Co-authored-by: 0xevolve <[email protected]>
  • Loading branch information
akhercha and EvolveArt authored Jul 18, 2024
1 parent 7d9bb51 commit c7b78dd
Show file tree
Hide file tree
Showing 16 changed files with 257 additions and 80 deletions.
2 changes: 1 addition & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ IGNORE_SOURCES=BITSTAMP,DEFILLAMA
IGNORE_PUBLISHERS=BINANCE

# Prometheus
TELEGRAM_BOT_TOKEN=
TELEGRAM_TOKEN=
OPSGENIE_API_KEY=

# Server PORT
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ It then processes the data and computes the following metrics:
- `time_since_last_update_pair_id{network, pair, type}`: Time since an update has been published for a given pair. (in seconds)
- `price_deviation{network, pair, source, type}`: Deviation of the price from a reference price (DefiLlama API) given source and pair. (in percents)
- `price_deviation_source{network, pair, source, type}`: Deviation of the price from the on-chain aggregated median price given source and pair. (in percents)
- `long_tail_asset_threshold{pair}`: Deviation threshold configuration for long tail assets.
- `long_tail_asset_deviation{network, pair, type, source1, source2}`: Deviation between two sources for long tail assets.
- `publisher_balance{network, publisher}`: Balance of a publisher. (in ETH)
- `vrf_balance{network}`: Balance of the VRF contract. (in ETH)
- `vrf_requests_count{network, status}`: Number of VRF requests handled for a given network.
Expand Down Expand Up @@ -60,7 +62,7 @@ IGNORE_SOURCES=BITSTAMP,DEFILLAMA
IGNORE_PUBLISHERS=BINANCE

# Prometheus
TELEGRAM_BOT_TOKEN=
TELEGRAM_TOKEN=
OPSGENIE_API_KEY=
```

Expand Down
2 changes: 1 addition & 1 deletion compose.dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ services:
- grafana-data:/var/lib/grafana

alertmanager:
image: prom/alertmanager:v0.23.0
image: prom/alertmanager:v0.27.0
restart: unless-stopped
ports:
- "9093:9093"
Expand Down
3 changes: 3 additions & 0 deletions grafana/provisioning/datasources/prometheus_ds.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# config file version
apiVersion: 1

datasources:
- name: Prometheus
access: proxy
Expand Down
8 changes: 5 additions & 3 deletions prometheus/alertmanager.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,18 @@ receivers:
telegram_configs:
- bot_token: "${{TELEGRAM_TOKEN}}"
chat_id: -1001904637278
parse_mode: ""
parse_mode: "HTML"
message: '{{ template "telegram.default.message" . }}'
- name: "internal-critical"
opsgenie_configs:
- api_key: "${{OPS_GENIE_API_KEY}}"
- name: "public"
telegram_configs:
- bot_token: "${{TELEGRAM_TOKEN}}"
chat_id: -1002060420752
parse_mode: ""
parse_mode: "HTML"
message: '{{ template "telegram.default.message" . }}'

# The directory from which notification templates are read.
templates:
- "/etc/alertmanager/template/*.tmpl"
- "/config/templates/alert.tmpl"
30 changes: 25 additions & 5 deletions prometheus/alerts.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@ groups:
annotations:
summary: "Time since the last update is too high"
description: "The time since the last update from {{ $labels.publisher }} has exceeded 1800 seconds."

- alert: WrongPrice
expr: abs(price_deviation) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "Price deviation is too high"
description: "The price deviation of {{ $labels.pair }} from {{ $labels.source }} has exceeded 5%."
description: "The price of {{ $labels.pair }} from {{ $labels.source }} has deviated by more than 5% from the DefiLlama reference price."

- alert: TooFewSources
expr: num_sources < 5
for: 5m
Expand All @@ -25,14 +27,16 @@ groups:
annotations:
summary: "Too few sources"
description: "The number of sources for {{ $labels.pair }} has fallen below 5."

- alert: SourceDeviation
expr: abs(price_deviation_source) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "Source deviation is too high"
description: "The source deviation of {{ $labels.pair }} from {{ $labels.source }} has exceeded 5%."
description: "The price of {{ $labels.pair }} from {{ $labels.source }} has deviated by more than 5% from the Oracle median price."

- alert: IndexerDown
expr: indexer_blocks_left > 10
for: 5m
Expand All @@ -41,6 +45,7 @@ groups:
annotations:
summary: "Indexer is down"
description: "The {{ $labels.network }} indexer for {{ $labels.type }} is lagging behind 10 blocks."

- alert: PublisherBalanceLow
expr: publisher_balance < 0.1
for: 5m
Expand All @@ -49,14 +54,25 @@ groups:
annotations:
summary: "Publisher balance is low"
description: "The {{ $labels.publisher }} balance is below 0.1 ETH."

- alert: PriceDeviationTooHigh
expr: abs(on_off_price_deviation) > 0.025
for: 5m
labels:
severity: critical
annotations:
summary: "Price deviation is too high"
description: "The on-chain price of {{ $labels.pair }} from the reference price has exceeded 2.5%."
description: "The median on-chain price of {{ $labels.pair }} has deviated by more than 2.5% from the DefiLlama reference price."

- alert: LongTailAssetDeviation
expr: abs(long_tail_asset_deviation) > on(pair) group_left long_tail_asset_threshold
for: 5m
labels:
severity: warning
annotations:
summary: "Long tail asset deviation is too high"
description: 'The deviation between sources for {{ $labels.pair }} ({{ $labels.type }}) from {{ $labels.source1 }} vs {{ $labels.source2 }} is {{ $value | printf "%.2f" }}, which exceeds the configured threshold.'

- name: API
rules:
- alert: TimeSinceLastUpdateTooHigh
Expand All @@ -68,6 +84,7 @@ groups:
annotations:
summary: "Time since the last update is too high"
description: "The time since the last update for {{ $labels.pair }} has exceeded 1800 seconds."

- alert: WrongPrice
expr: abs(api_price_deviation) > 0.025
for: 5m
Expand All @@ -76,7 +93,8 @@ groups:
group: API
annotations:
summary: "Price deviation is too high"
description: "The price deviation of {{ $labels.pair }} from DefiLlama has exceeded 2.5%."
description: "The median price of {{ $labels.pair }} from our API has deviated by more than 2.5% from the DefiLlama reference price."

- alert: TooFewSources
expr: api_num_sources < 1
for: 5m
Expand All @@ -86,6 +104,7 @@ groups:
annotations:
summary: "Too few sources"
description: "The number of sources for {{ $labels.pair }} has fallen below 1."

- alert: SequencerDeviation
expr: abs(api_sequencer_deviation) > 0.02
for: 5m
Expand All @@ -94,7 +113,8 @@ groups:
group: API
annotations:
summary: "Sequencer deviation is too high"
description: "The ETH/STRK price has deviated from the sequencer price by more than 2%."
description: "The ETH/STRK price has deviated by more than 2% from the sequencer price."

- name: VRF
rules:
- alert: TimeInPendingStatusTooLong
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,20 @@
{{ end }}
{{ end }}

{{ define "__pragma_text_alert_list" }}{{ range . }}
{{ define "__pragma_text_alert_list" }}
{{ range . }}
---
🪪 <b>{{ .Labels.alertname }}</b>
{{- if .Annotations.summary }}
📝 {{ .Annotations.summary }}{{ end }}
📝 {{ .Annotations.summary }}
{{ end }}
{{- if .Annotations.description }}
📖 {{ .Annotations.description }}{{ end }}
📖 {{ .Annotations.description }}
{{ end }}

🌍 <b>Network:</b> {{ .Labels.network }}
🧩 <b>Publisher:</b> {{ .Labels.publisher }}
🔍 <b>Severity:</b> {{ .Labels.severity | toUpper }}
💿 <b>Data Type:</b> {{ .Labels.type | title }}
{{ end }}
{{ end }}
19 changes: 18 additions & 1 deletion src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ use strum::{Display, EnumString, IntoStaticStr};
use tokio::sync::OnceCell;
use url::Url;

use crate::{constants::CONFIG_UPDATE_INTERVAL, utils::try_felt_to_u32};
use crate::{
constants::{CONFIG_UPDATE_INTERVAL, LONG_TAIL_ASSETS, LONG_TAIL_ASSET_THRESHOLD},
utils::try_felt_to_u32,
};

#[derive(Debug, Clone, EnumString, IntoStaticStr)]
pub enum NetworkName {
Expand Down Expand Up @@ -197,6 +200,8 @@ pub async fn periodic_config_update() {
let mut next_update = Instant::now() + interval;

loop {
log::info!("[CONFIG] Updating config...");

let new_config = Config::create_from_env().await;
let updated_config = ArcSwap::from_pointee(new_config.clone());

Expand Down Expand Up @@ -441,6 +446,18 @@ async fn init_future_config(
}
}

/// Publishes the configured deviation threshold of every long tail asset.
///
/// Walks over each `(pair, threshold)` entry of `LONG_TAIL_ASSETS` and stores the
/// threshold in the `LONG_TAIL_ASSET_THRESHOLD` gauge under the matching `pair` label.
///
/// TODO: LONG_TAIL_ASSETS should be an independent (db, yaml...) configuration?
#[allow(dead_code)]
pub fn init_long_tail_asset_configuration() {
    LONG_TAIL_ASSETS.iter().for_each(|(asset_pair, max_deviation)| {
        let gauge = LONG_TAIL_ASSET_THRESHOLD.with_label_values(&[asset_pair]);
        gauge.set(*max_deviation);
    });
}

/// Parse pairs from a comma separated string.
/// e.g BTC/USD,ETH/USD
pub fn parse_pairs(pairs: &str) -> Vec<String> {
Expand Down
56 changes: 52 additions & 4 deletions src/constants.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::collections::HashMap;

use lazy_static::lazy_static;
use phf::phf_map;
use prometheus::{opts, register_gauge_vec, register_int_gauge_vec, GaugeVec, IntGaugeVec};
Expand All @@ -19,6 +21,52 @@ pub(crate) static COINGECKO_IDS: phf::Map<&'static str, &'static str> = phf_map!
};

lazy_static! {
/// TODO: Storing the long tail assets here is not ideal.
/// We should probably store them either in a yaml config file or a
/// database (benefit of a database => we could update the thresholds/pairs
/// without restarting the monitoring service).
pub static ref LONG_TAIL_ASSETS: HashMap<String, f64> = {
let mut map = HashMap::new();
map.insert("ZEND/USD".to_string(), 0.1);
map.insert("NSTR/USD".to_string(), 0.15);
map
};

/// We have a list of assets that are defined as long tail assets.
/// They have lower liquidity and higher volatility - thus, it is trickier
/// to track their prices and have good alerting.
/// Our way of dealing with those assets is:
/// - we don't use the usual metrics "price_deviation" below
/// - instead, we compare all the sources one to one and if the deviation
/// between the two is greater than a certain threshold, we send an alert.
///
/// "LONG_TAIL_ASSET_THRESHOLD" will contain the long tail assets pairs
/// and the threshold.
/// "LONG_TAIL_ASSET_DEVIATION" will contain the deviation between two sources.
///
/// We define all the long tail assets in the config::init_long_tail_asset_configuration
/// function.
///
pub static ref LONG_TAIL_ASSET_THRESHOLD: GaugeVec = register_gauge_vec!(
opts!(
"long_tail_asset_threshold",
"Deviation threshold configuration for long tail assets"
),
&["pair"]
)
.unwrap();

pub static ref LONG_TAIL_ASSET_DEVIATION: GaugeVec = register_gauge_vec!(
opts!(
"long_tail_asset_deviation",
"Deviation between two sources for long tail assets"
),
&["network", "pair", "type", "source1", "source2"]
)
.unwrap();

// Regular metrics below

pub static ref TIME_SINCE_LAST_UPDATE_PUBLISHER: GaugeVec = register_gauge_vec!(
opts!(
"time_since_last_update_seconds",
Expand All @@ -43,15 +91,15 @@ lazy_static! {
pub static ref PRICE_DEVIATION: GaugeVec = register_gauge_vec!(
opts!(
"price_deviation",
"Price deviation from the reference price."
"Price deviation for a source compared to a reference price (DefiLlama)."
),
&["network", "pair", "source", "type"]
)
.unwrap();
pub static ref PRICE_DEVIATION_SOURCE: GaugeVec = register_gauge_vec!(
opts!(
"price_deviation_source",
"Price deviation from the reference price."
"Price deviation for a source compared to our oracle price."
),
&["network", "pair", "source", "type"]
)
Expand Down Expand Up @@ -80,15 +128,15 @@ lazy_static! {
pub static ref API_PRICE_DEVIATION: GaugeVec = register_gauge_vec!(
opts!(
"api_price_deviation",
"Price deviation from the reference price."
"Price deviation for our API compared to a reference price (DefiLlama)."
),
&["network", "pair"]
)
.unwrap();
pub static ref ON_OFF_PRICE_DEVIATION: GaugeVec = register_gauge_vec!(
opts!(
"on_off_price_deviation",
"On chain price deviation from the reference price"
"Median on chain price deviation compared to a reference price (Defillama)."
),
&["network", "pair", "type"]
)
Expand Down
Loading

0 comments on commit c7b78dd

Please sign in to comment.