Skip to content

Commit

Permalink
[#2350] improvement(coordinator): Add metrics of active/lost server number (#2351)
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

Add coordinator metrics for the number of active servers and the number of lost servers.

for #2350 

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

No tests were needed.

---------

Co-authored-by: Junfan Zhang <[email protected]>
  • Loading branch information
zuston and Junfan Zhang authored Jan 26, 2025
1 parent deb5de3 commit 7e63bde
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -171,6 +172,16 @@ void nodesCheck() {

CoordinatorMetrics.gaugeUnhealthyServerNum.set(unhealthyNodes.size());
CoordinatorMetrics.gaugeTotalServerNum.set(servers.size());
CoordinatorMetrics.gaugeLostServerNum.set(lostNodes.size());

// get the active server num.
Set<String> allServers = new HashSet<>(servers.keySet());
allServers.removeAll(excludedNodes);
for (ServerNode unhealthyNode : unhealthyNodes) {
allServers.remove(unhealthyNode.getId());
}
CoordinatorMetrics.gaugeActiveServerNum.set(allServers.size());

} catch (Exception e) {
LOG.warn("Error happened in nodesCheck", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
import org.apache.uniffle.common.util.RssUtils;

public class CoordinatorMetrics {

private static final String ACTIVE_SERVER_NUM = "active_server_num";
private static final String LOST_SERVER_NUM = "lost_server_num";
private static final String TOTAL_SERVER_NUM = "total_server_num";
private static final String RUNNING_APP_NUM = "running_app_num";
private static final String TOTAL_APP_NUM = "total_app_num";
Expand All @@ -46,7 +47,8 @@ public class CoordinatorMetrics {
public static final String REMOTE_STORAGE_IN_USED_PREFIX = "remote_storage_in_used_";
public static final String APP_NUM_TO_USER = "app_num";
public static final String USER_LABEL = "user_name";

public static Gauge gaugeLostServerNum;
public static Gauge gaugeActiveServerNum;
public static Gauge gaugeTotalServerNum;
public static Gauge gaugeExcludeServerNum;
public static Gauge gaugeUnhealthyServerNum;
Expand Down Expand Up @@ -107,6 +109,8 @@ public static void updateDynamicGaugeForRemoteStorage(String storageHost, double
}

private static void setUpMetrics() {
gaugeLostServerNum = metricsManager.addGauge(LOST_SERVER_NUM);
gaugeActiveServerNum = metricsManager.addGauge(ACTIVE_SERVER_NUM);
gaugeTotalServerNum = metricsManager.addGauge(TOTAL_SERVER_NUM);
gaugeExcludeServerNum = metricsManager.addGauge(EXCLUDE_SERVER_NUM);
gaugeUnhealthyServerNum = metricsManager.addGauge(UNHEALTHY_SERVER_NUM);
Expand Down

0 comments on commit 7e63bde

Please sign in to comment.