diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index cbf5b24129e..49337692b83 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -21,13 +21,14 @@ import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL; import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.QUASI_CLOSED; import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -46,6 +47,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import java.util.Objects; import java.util.Set; import java.util.UUID; import java.util.concurrent.CountDownLatch; @@ -100,7 +102,6 @@ import org.apache.hadoop.ozone.protocol.commands.SCMCommand; import org.apache.ozone.test.GenericTestUtils; import org.apache.ozone.test.LambdaTestUtils; -import org.apache.ozone.test.tag.Flaky; import org.apache.ratis.protocol.RaftGroupId; import org.apache.ratis.protocol.exceptions.StateMachineException; import org.apache.ratis.server.storage.FileInfo; @@ -109,11 +110,15 @@ import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.Order; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; /** * Tests the containerStateMachine failure handling. */ +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class TestContainerStateMachineFailures { private static MiniOzoneCluster cluster; @@ -138,7 +143,9 @@ public static void init() throws Exception { conf.setTimeDuration(HDDS_PIPELINE_REPORT_INTERVAL, 200, TimeUnit.MILLISECONDS); conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 200, TimeUnit.MILLISECONDS); - conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 30, TimeUnit.SECONDS); + conf.setTimeDuration(HDDS_NODE_REPORT_INTERVAL, 1, TimeUnit.SECONDS); + conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, TimeUnit.SECONDS); + conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 6, TimeUnit.SECONDS); conf.set(OzoneConfigKeys.OZONE_SCM_CLOSE_CONTAINER_WAIT_DURATION, "2s"); conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_SCRUB_INTERVAL, "2s"); conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, "5s"); @@ -254,7 +261,6 @@ public void testContainerStateMachineCloseOnMissingPipeline() } @Test - @Flaky("HDDS-12215") public void testContainerStateMachineRestartWithDNChangePipeline() throws Exception { try (OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -304,7 +310,12 @@ public void testContainerStateMachineRestartWithDNChangePipeline() } } + // This test case is placed at the end because it resets the Ratis storage location. + // This causes pipelines to break. Those pipelines are closed passively + // via client-side retries rather than by the ScrubbingService. + // Running this test earlier would leave a dirty pipeline pool for subsequent tests. @Test + @Order(Integer.MAX_VALUE) public void testContainerStateMachineFailures() throws Exception { OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -476,16 +487,16 @@ public void testApplyTransactionFailure() throws Exception { getHddsDatanodes().get(index), omKeyLocationInfo.getPipeline()); SimpleStateMachineStorage storage = (SimpleStateMachineStorage) stateMachine.getStateMachineStorage(); - stateMachine.takeSnapshot(); + long containerID = omKeyLocationInfo.getContainerID(); + // delete the container db file + FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); final FileInfo snapshot = getSnapshotFileInfo(storage); final Path parentPath = snapshot.getPath(); // Since the snapshot threshold is set to 1, since there are // applyTransactions, we should see snapshots - assertThat(parentPath.getParent().toFile().listFiles().length).isGreaterThan(0); + assertThat(Objects.requireNonNull(parentPath.getParent().toFile().listFiles()).length) + .isGreaterThan(0); assertNotNull(snapshot); - long containerID = omKeyLocationInfo.getContainerID(); - // delete the container db file - FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); Pipeline pipeline = cluster.getStorageContainerLocationClient() .getContainerWithPipeline(containerID).getPipeline(); XceiverClientSpi xceiverClient = @@ -506,12 +517,13 @@ public void testApplyTransactionFailure() throws Exception { xceiverClientManager.releaseClient(xceiverClient, false); } // Make sure the container is marked unhealthy - assertSame(dn.getDatanodeStateMachine() + GenericTestUtils.waitFor(() -> dn.getDatanodeStateMachine() .getContainer().getContainerSet().getContainer(containerID) - .getContainerState(), UNHEALTHY); + .getContainerState() == UNHEALTHY, 100, 5000); try { // try to take a new snapshot, ideally it should just fail stateMachine.takeSnapshot(); + fail("Should have thrown StateMachineException because it is UNHEALTHY"); } catch (IOException ioe) { assertInstanceOf(StateMachineException.class, ioe); } @@ -521,8 +533,7 @@ public void testApplyTransactionFailure() throws Exception { try { final FileInfo latestSnapshot = getSnapshotFileInfo(storage); assertEquals(snapshot.getPath(), latestSnapshot.getPath()); - } catch (Throwable e) { - assertFalse(snapshot.getPath().toFile().exists()); + } catch (IOException ioe) { } } @@ -532,7 +543,6 @@ public void testApplyTransactionFailure() throws Exception { } @Test - @Flaky("HDDS-6115") void testApplyTransactionIdempotencyWithClosedContainer() throws Exception { OzoneOutputStream key = @@ -590,6 +600,8 @@ void testApplyTransactionIdempotencyWithClosedContainer() .getContainerState(), ContainerProtos.ContainerDataProto.State.CLOSED); assertTrue(stateMachine.isStateMachineHealthy()); + GenericTestUtils.waitFor(() -> stateMachine.getLastAppliedTermIndex().getIndex() != markIndex1, + 1000, 30000); try { stateMachine.takeSnapshot(); } finally { @@ -618,7 +630,6 @@ void testApplyTransactionIdempotencyWithClosedContainer() // not be marked unhealthy and pipeline should not fail if container gets // closed here. @Test - @Flaky("HDDS-13482") void testWriteStateMachineDataIdempotencyWithClosedContainer() throws Exception { OzoneOutputStream key = @@ -683,7 +694,7 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() }; Runnable r2 = () -> { try { - ByteString data = ByteString.copyFromUtf8("hello"); + ByteString data = ByteString.copyFromUtf8("ratis"); ContainerProtos.ContainerCommandRequestProto.Builder writeChunkRequest = ContainerTestHelper.newWriteChunkRequestBuilder(pipeline, omKeyLocationInfo.getBlockID(), data.size()); @@ -698,7 +709,7 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() failCount.incrementAndGet(); } String message = e.getMessage(); - assertThat(message).doesNotContain("hello"); + assertThat(message).doesNotContain("ratis"); assertThat(message).contains(HddsUtils.REDACTED.toStringUtf8()); } }; @@ -745,7 +756,6 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() } @Test - @Flaky("HDDS-14101") void testContainerStateMachineSingleFailureRetry() throws Exception { try (OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -776,7 +786,6 @@ void testContainerStateMachineSingleFailureRetry() } @Test - @Flaky("HDDS-14101") void testContainerStateMachineDualFailureRetry() throws Exception { OzoneOutputStream key =