From d4edc6f646a6613ee70011c258b97bce9b57563a Mon Sep 17 00:00:00 2001 From: Slava Tutrinov Date: Mon, 18 Nov 2024 15:11:12 +0300 Subject: [PATCH] HDDS-11742. don't reset SCM leadership metric to null if the latter already defined at raft server init time --- .../scm/server/StorageContainerManager.java | 19 ++++++++-- .../hadoop/ozone/MiniOzoneClusterImpl.java | 13 ++++++- .../hadoop/ozone/MiniOzoneHAClusterImpl.java | 36 ++++++++++++------- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 9390318a29f..7fce4d41d2b 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -1595,8 +1595,22 @@ public void start() throws IOException { setStartTime(); - // At this point leader is not known - scmHAMetricsUpdate(null); + AtomicBoolean leaderIsDefined = new AtomicBoolean(false); + if (SCMHAUtils.isSCMHAEnabled(configuration)) { + getScmHAManager().getRatisServer().getRatisRoles().stream() + .filter(peerState -> peerState.contains("LEADER")) + .findFirst() + .ifPresent(peerState -> { + leaderIsDefined.set(true); + String[] peerInfo = peerState.split(":"); + scmHAMetricsUpdate(peerInfo[3]); + }); + } + + if (!leaderIsDefined.get()) { + // At this point leader is not known + scmHAMetricsUpdate(null); + } if (scmCertificateClient != null) { // In case root CA certificate is rotated during this SCM is offline @@ -2298,7 +2312,6 @@ public void scmHAMetricsUpdate(String leaderId) { // unregister, in case metrics already exist // so that the metric tags will get updated. SCMHAMetrics.unRegister(); - scmHAMetrics = SCMHAMetrics.create(getScmId(), leaderId); } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java index 3594996856a..feb313c6c98 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java @@ -145,7 +145,7 @@ public class MiniOzoneClusterImpl implements MiniOzoneCluster { private CertificateClient caClient; private final Set clients = ConcurrentHashMap.newKeySet(); private SecretKeyClient secretKeyClient; - private static MockedStatic mockDNStatic = Mockito.mockStatic(HddsDatanodeService.class); + private static volatile MockedStatic mockDNStatic; /** * Creates a new MiniOzoneCluster with Recon. @@ -571,6 +571,16 @@ private void setSecretKeyClient(SecretKeyClient client) { this.secretKeyClient = client; } + public static void mockDatanode() { + if (mockDNStatic == null) { + synchronized (MiniOzoneClusterImpl.class) { + if (mockDNStatic == null) { + mockDNStatic = Mockito.mockStatic(HddsDatanodeService.class); + } + } + } + } + private static void stopDatanodes( Collection hddsDatanodes) { if (!hddsDatanodes.isEmpty()) { @@ -867,6 +877,7 @@ protected Gateway createS3G() { */ protected List createHddsDatanodes() throws IOException { + mockDatanode(); List hddsDatanodes = new ArrayList<>(); // Override default datanode initial and current version if necessary diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java index 39c2250b73c..93bf0c73723 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java @@ -50,6 +50,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.function.Function; @@ -540,7 +541,7 @@ protected OMHAService createOMService() throws IOException, } /** - * Start OM service with multiple OMs. + * Start SCM service with multiple SCMs. */ protected SCMHAService createSCMService() throws IOException, AuthenticationException { @@ -553,6 +554,7 @@ protected SCMHAService createSCMService() int retryCount = 0; + CountDownLatch scmStartCounter = new CountDownLatch(numOfSCMs); while (true) { try { initSCMHAConfig(); @@ -584,19 +586,29 @@ protected SCMHAService createSCMService() } scmList.add(scm); - if (i <= numOfActiveSCMs) { - scm.start(); - activeSCMs.add(scm); - LOG.info("Started SCM RPC server at {}", - scm.getClientRpcAddress()); - } else { - inactiveSCMs.add(scm); - LOG.info("Intialized SCM at {}. This SCM is currently " - + "inactive (not running).", scm.getClientRpcAddress()); - } + int scmCounter = i; + new Thread(() -> { + if (scmCounter <= numOfActiveSCMs) { + try { + scm.start(); + } catch (IOException e) { + scmStartCounter.countDown(); + throw new RuntimeException(e); + } + activeSCMs.add(scm); + LOG.info("Started SCM RPC server at {}", + scm.getClientRpcAddress()); + } else { + inactiveSCMs.add(scm); + LOG.info("Initialized SCM at {}. This SCM is currently " + + "inactive (not running).", scm.getClientRpcAddress()); + } + scmStartCounter.countDown(); + }).start(); } + scmStartCounter.await(); break; - } catch (BindException e) { + } catch (RuntimeException | InterruptedException e) { for (StorageContainerManager scm : scmList) { scm.stop(); scm.join();