From 2433cdbfb275bc65da8be96a2b4313365d5f00ee Mon Sep 17 00:00:00 2001 From: Junfan Zhang Date: Fri, 24 Jan 2025 15:44:17 +0800 Subject: [PATCH 1/2] [#2350] improvement(coordinator): Add metrics of active/lost server number --- .../apache/uniffle/coordinator/SimpleClusterManager.java | 9 +++++++++ .../uniffle/coordinator/metric/CoordinatorMetrics.java | 8 ++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java index d1daf565b1..ed842a31bb 100644 --- a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java +++ b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java @@ -25,6 +25,7 @@ import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.nio.charset.StandardCharsets; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -171,6 +172,14 @@ void nodesCheck() { CoordinatorMetrics.gaugeUnhealthyServerNum.set(unhealthyNodes.size()); CoordinatorMetrics.gaugeTotalServerNum.set(servers.size()); + CoordinatorMetrics.gaugeLostServerNum.set(lostNodes.size()); + + // get the active server num. 
+ Set allServers = new HashSet<>(servers.keySet()); + allServers.removeAll(excludedNodes); + allServers.removeAll(unhealthyNodes); + CoordinatorMetrics.gaugeActiveServerNum.set(allServers.size()); + } catch (Exception e) { LOG.warn("Error happened in nodesCheck", e); } diff --git a/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java b/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java index a97892526e..0b61011f68 100644 --- a/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java +++ b/coordinator/src/main/java/org/apache/uniffle/coordinator/metric/CoordinatorMetrics.java @@ -33,7 +33,8 @@ import org.apache.uniffle.common.util.RssUtils; public class CoordinatorMetrics { - + private static final String ACTIVE_SERVER_NUM = "active_server_num"; + private static final String LOST_SERVER_NUM = "lost_server_num"; private static final String TOTAL_SERVER_NUM = "total_server_num"; private static final String RUNNING_APP_NUM = "running_app_num"; private static final String TOTAL_APP_NUM = "total_app_num"; @@ -46,7 +47,8 @@ public class CoordinatorMetrics { public static final String REMOTE_STORAGE_IN_USED_PREFIX = "remote_storage_in_used_"; public static final String APP_NUM_TO_USER = "app_num"; public static final String USER_LABEL = "user_name"; - + public static Gauge gaugeLostServerNum; + public static Gauge gaugeActiveServerNum; public static Gauge gaugeTotalServerNum; public static Gauge gaugeExcludeServerNum; public static Gauge gaugeUnhealthyServerNum; @@ -107,6 +109,8 @@ public static void updateDynamicGaugeForRemoteStorage(String storageHost, double } private static void setUpMetrics() { + gaugeLostServerNum = metricsManager.addGauge(LOST_SERVER_NUM); + gaugeActiveServerNum = metricsManager.addGauge(ACTIVE_SERVER_NUM); gaugeTotalServerNum = metricsManager.addGauge(TOTAL_SERVER_NUM); gaugeExcludeServerNum = metricsManager.addGauge(EXCLUDE_SERVER_NUM); 
gaugeUnhealthyServerNum = metricsManager.addGauge(UNHEALTHY_SERVER_NUM); From d8aa241a51874c3a82cefcda40825dc0fc58ddbd Mon Sep 17 00:00:00 2001 From: Junfan Zhang Date: Fri, 24 Jan 2025 17:38:10 +0800 Subject: [PATCH 2/2] fix(coordinator): remove unhealthy nodes from the active server set by server id --- .../org/apache/uniffle/coordinator/SimpleClusterManager.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java index ed842a31bb..5d9e88db37 100644 --- a/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java +++ b/coordinator/src/main/java/org/apache/uniffle/coordinator/SimpleClusterManager.java @@ -177,7 +177,9 @@ void nodesCheck() { // get the active server num. Set allServers = new HashSet<>(servers.keySet()); allServers.removeAll(excludedNodes); - allServers.removeAll(unhealthyNodes); + for (ServerNode unhealthyNode : unhealthyNodes) { + allServers.remove(unhealthyNode.getId()); + } CoordinatorMetrics.gaugeActiveServerNum.set(allServers.size()); } catch (Exception e) {