Skip to content

Commit

Permalink
HDDS-8876. Enable TestOMRatisSnapshots, increase timeouts, disable flaky assertions (apache#5673)
Browse files Browse the repository at this point in the history
  • Loading branch information
adoroszlai authored Nov 27, 2023
1 parent 60bb060 commit 36a23f9
Showing 1 changed file with 20 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@
import org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils;
import org.apache.hadoop.ozone.om.snapshot.OmSnapshotUtils;
import org.apache.ozone.test.GenericTestUtils;
import org.apache.ozone.test.tag.Unhealthy;
import org.apache.ratis.server.protocol.TermIndex;
import org.assertj.core.api.Fail;
import org.junit.jupiter.api.AfterEach;
Expand Down Expand Up @@ -105,7 +104,6 @@
* Tests the Ratis snapshots feature in OM.
*/
@Timeout(5000)
@Unhealthy("HDDS-8876")
public class TestOMRatisSnapshots {

private MiniOzoneHAClusterImpl cluster = null;
Expand Down Expand Up @@ -260,7 +258,7 @@ public void testInstallSnapshot(int numSnapshotsToCreate) throws Exception {
GenericTestUtils.waitFor(() -> {
return followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex()
>= leaderOMSnapshotIndex - 1;
}, 100, 10000);
}, 100, 30_000);

long followerOMLastAppliedIndex =
followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex();
Expand Down Expand Up @@ -296,7 +294,7 @@ public void testInstallSnapshot(int numSnapshotsToCreate) throws Exception {
// Verify RPC server is running
GenericTestUtils.waitFor(() -> {
return followerOM.isOmRpcServerRunning();
}, 100, 5000);
}, 100, 30_000);

assertLogCapture(logCapture,
"Install Checkpoint is finished");
Expand Down Expand Up @@ -420,7 +418,7 @@ public void testInstallIncrementalSnapshot(@TempDir Path tempDir)
// Wait for the follower to download the snapshot, but get stuck by the injector
GenericTestUtils.waitFor(() -> {
return followerOM.getOmSnapshotProvider().getNumDownloaded() == 1;
}, 1000, 10000);
}, 1000, 30_000);

// Get two incremental tarballs, adding new keys/snapshot for each.
IncrementData firstIncrement = getNextIncrementalTarball(160, 2, leaderOM,
Expand All @@ -445,7 +443,7 @@ public void testInstallIncrementalSnapshot(@TempDir Path tempDir)
GenericTestUtils.waitFor(() -> {
return followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex()
>= leaderOMSnapshotIndex - 1;
}, 1000, 30000);
}, 1000, 30_000);

assertEquals(3, followerOM.getOmSnapshotProvider().getNumDownloaded());
// Verify that the follower OM's DB contains the transactions which were
Expand Down Expand Up @@ -480,7 +478,7 @@ public void testInstallIncrementalSnapshot(@TempDir Path tempDir)
// Verify RPC server is running
GenericTestUtils.waitFor(() -> {
return followerOM.isOmRpcServerRunning();
}, 100, 5000);
}, 100, 30_000);

// Read & Write after snapshot installed.
List<String> newKeys = writeKeys(1);
Expand All @@ -493,7 +491,7 @@ public void testInstallIncrementalSnapshot(@TempDir Path tempDir)
} catch (IOException e) {
throw new RuntimeException(e);
}
}, 100, 10000);
}, 100, 30_000);

// Verify the follower candidate directory gets cleaned
String[] filesInCandidate = followerOM.getOmSnapshotProvider().
Expand Down Expand Up @@ -552,7 +550,7 @@ private IncrementData getNextIncrementalTarball(
// by injector
GenericTestUtils.waitFor(() ->
followerOM.getOmSnapshotProvider().getNumDownloaded() ==
expectedNumDownloads, 1000, 10000);
expectedNumDownloads, 1000, 30_000);

assertTrue(followerOM.getOmRatisServer().
getLastAppliedTermIndex().getIndex()
Expand Down Expand Up @@ -627,7 +625,7 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception {
// Wait for the follower to download the snapshot, but get stuck by the injector
GenericTestUtils.waitFor(() -> {
return followerOM.getOmSnapshotProvider().getNumDownloaded() == 1;
}, 1000, 10000);
}, 1000, 30_000);

// Do some transactions, let leader OM take a new snapshot and purge the
// old logs, so that follower must download the new snapshot again.
Expand All @@ -644,7 +642,7 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception {
// by injector
GenericTestUtils.waitFor(() -> {
return followerOM.getOmSnapshotProvider().getNumDownloaded() == 2;
}, 1000, 10000);
}, 1000, 30_000);

// Corrupt the mixed checkpoint in the candidate DB dir
File followerCandidateDir = followerOM.getOmSnapshotProvider().
Expand Down Expand Up @@ -675,7 +673,7 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception {
GenericTestUtils.waitFor(() -> {
return followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex()
>= leaderOMSnapshotIndex - 1;
}, 1000, 10000);
}, 1000, 30_000);

// Verify that the follower OM's DB contains the transactions which were
// made while it was inactive.
Expand All @@ -697,28 +695,30 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception {
}

// Verify the metrics
/* HDDS-8876
GenericTestUtils.waitFor(() -> {
DBCheckpointMetrics dbMetrics =
leaderOM.getMetrics().getDBCheckpointMetrics();
return dbMetrics.getLastCheckpointStreamingNumSSTExcluded() == 0;
}, 100, 10000);
}, 100, 30_000);
GenericTestUtils.waitFor(() -> {
DBCheckpointMetrics dbMetrics =
leaderOM.getMetrics().getDBCheckpointMetrics();
return dbMetrics.getNumIncrementalCheckpoints() >= 1;
}, 100, 10000);
}, 100, 30_000);
GenericTestUtils.waitFor(() -> {
DBCheckpointMetrics dbMetrics =
leaderOM.getMetrics().getDBCheckpointMetrics();
return dbMetrics.getNumCheckpoints() >= 3;
}, 100, 10000);
}, 100, 30_000);
*/

// Verify RPC server is running
GenericTestUtils.waitFor(() -> {
return followerOM.isOmRpcServerRunning();
}, 100, 5000);
}, 100, 30_000);

// Read & Write after snapshot installed.
List<String> newKeys = writeKeys(1);
Expand All @@ -731,7 +731,7 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception {
} catch (IOException e) {
throw new RuntimeException(e);
}
}, 100, 10000);
}, 100, 30_000);

// Verify the follower candidate directory gets cleaned
String[] filesInCandidate = followerOM.getOmSnapshotProvider().
Expand Down Expand Up @@ -790,7 +790,7 @@ public void testInstallSnapshotWithClientWrite() throws Exception {
GenericTestUtils.waitFor(() -> {
return followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex()
>= leaderOMSnapshotIndex - 1;
}, 100, 3000);
}, 100, 30_000);

// Verify that checkpoint installation happened.
String msg = "Reloaded OM state";
Expand Down Expand Up @@ -894,7 +894,7 @@ public void testInstallSnapshotWithClientRead() throws Exception {
GenericTestUtils.waitFor(() -> {
return followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex()
>= leaderOMSnapshotIndex - 1;
}, 100, 3000);
}, 100, 30_000);

long followerOMLastAppliedIndex =
followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex();
Expand Down Expand Up @@ -1112,7 +1112,7 @@ private void assertLogCapture(GenericTestUtils.LogCapturer logCapture,
throws InterruptedException, TimeoutException {
GenericTestUtils.waitFor(() -> {
return logCapture.getOutput().contains(msg);
}, 100, 5000);
}, 100, 30_000);
}

// Returns temp dir where tarball was untarred.
Expand Down

0 comments on commit 36a23f9

Please sign in to comment.