From 63b20e08a4f5b9f427baed3b742c72619a636113 Mon Sep 17 00:00:00 2001 From: chungen0126 Date: Sun, 31 May 2026 09:11:11 +0800 Subject: [PATCH 1/6] HDDS-13482. Intermittent failure in TestContainerStateMachineFailures --- .../TestContainerStateMachineFailures.java | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index cbf5b24129e..7254b306165 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -21,9 +21,11 @@ import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL; import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.QUASI_CLOSED; import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -100,7 +102,6 @@ import org.apache.hadoop.ozone.protocol.commands.SCMCommand; import org.apache.ozone.test.GenericTestUtils; import org.apache.ozone.test.LambdaTestUtils; -import org.apache.ozone.test.tag.Flaky; import org.apache.ratis.protocol.RaftGroupId; import org.apache.ratis.protocol.exceptions.StateMachineException; import org.apache.ratis.server.storage.FileInfo; @@ -109,11 +110,15 @@ import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.Order; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; /** * Tests the containerStateMachine failure handling. */ +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class TestContainerStateMachineFailures { private static MiniOzoneCluster cluster; @@ -138,7 +143,9 @@ public static void init() throws Exception { conf.setTimeDuration(HDDS_PIPELINE_REPORT_INTERVAL, 200, TimeUnit.MILLISECONDS); conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 200, TimeUnit.MILLISECONDS); - conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 30, TimeUnit.SECONDS); + conf.setTimeDuration(HDDS_NODE_REPORT_INTERVAL, 1, TimeUnit.SECONDS); + conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, TimeUnit.SECONDS); + conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 6, TimeUnit.SECONDS); conf.set(OzoneConfigKeys.OZONE_SCM_CLOSE_CONTAINER_WAIT_DURATION, "2s"); conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_SCRUB_INTERVAL, "2s"); conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, "5s"); @@ -254,7 +261,6 @@ public void testContainerStateMachineCloseOnMissingPipeline() } @Test - @Flaky("HDDS-12215") public void testContainerStateMachineRestartWithDNChangePipeline() throws Exception { try (OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -304,7 +310,12 @@ public void testContainerStateMachineRestartWithDNChangePipeline() } } + // This test case is placed at the end because it resets the Ratis storage location. + // This causes pipelines to break. Those pipelines are closed passively + // via client-side retries rather than by the ScrubbingService. + // Running this test earlier would leave a dirty pipeline pool for subsequent tests. @Test + @Order(Integer.MAX_VALUE) public void testContainerStateMachineFailures() throws Exception { OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -532,7 +543,6 @@ public void testApplyTransactionFailure() throws Exception { } @Test - @Flaky("HDDS-6115") void testApplyTransactionIdempotencyWithClosedContainer() throws Exception { OzoneOutputStream key = @@ -618,7 +628,6 @@ void testApplyTransactionIdempotencyWithClosedContainer() // not be marked unhealthy and pipeline should not fail if container gets // closed here. @Test - @Flaky("HDDS-13482") void testWriteStateMachineDataIdempotencyWithClosedContainer() throws Exception { OzoneOutputStream key = @@ -683,7 +692,7 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() }; Runnable r2 = () -> { try { - ByteString data = ByteString.copyFromUtf8("hello"); + ByteString data = ByteString.copyFromUtf8("ratis"); ContainerProtos.ContainerCommandRequestProto.Builder writeChunkRequest = ContainerTestHelper.newWriteChunkRequestBuilder(pipeline, omKeyLocationInfo.getBlockID(), data.size()); @@ -698,7 +707,7 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() failCount.incrementAndGet(); } String message = e.getMessage(); - assertThat(message).doesNotContain("hello"); + assertThat(message).doesNotContain("ratis"); assertThat(message).contains(HddsUtils.REDACTED.toStringUtf8()); } }; @@ -745,7 +754,6 @@ void testWriteStateMachineDataIdempotencyWithClosedContainer() } @Test - @Flaky("HDDS-14101") void testContainerStateMachineSingleFailureRetry() throws Exception { try (OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) @@ -776,7 +784,6 @@ void testContainerStateMachineSingleFailureRetry() } @Test - @Flaky("HDDS-14101") void testContainerStateMachineDualFailureRetry() throws Exception { OzoneOutputStream key = From 6cdd22bf1f6d7392d9eb58c3e20dee514771e697 Mon Sep 17 00:00:00 2001 From: chungen0126 Date: Sun, 31 May 2026 15:02:40 +0800 Subject: [PATCH 2/6] fix testApplyTransactionIdempotencyWithClosedContainer --- .../ozone/client/rpc/TestContainerStateMachineFailures.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index 7254b306165..9785b4c8fd6 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -600,6 +600,8 @@ void testApplyTransactionIdempotencyWithClosedContainer() .getContainerState(), ContainerProtos.ContainerDataProto.State.CLOSED); assertTrue(stateMachine.isStateMachineHealthy()); + GenericTestUtils.waitFor(() -> stateMachine.getLastAppliedTermIndex().getIndex() != markIndex1 , + 1000, 30000); try { stateMachine.takeSnapshot(); } finally { From 9550d0819aef76d41bbe52859615251594c9af4e Mon Sep 17 00:00:00 2001 From: chungen0126 Date: Sun, 31 May 2026 15:30:23 +0800 Subject: [PATCH 3/6] fix checkstyle --- .../ozone/client/rpc/TestContainerStateMachineFailures.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index 9785b4c8fd6..9432bae3293 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -600,7 +600,7 @@ void testApplyTransactionIdempotencyWithClosedContainer() .getContainerState(), ContainerProtos.ContainerDataProto.State.CLOSED); assertTrue(stateMachine.isStateMachineHealthy()); - GenericTestUtils.waitFor(() -> stateMachine.getLastAppliedTermIndex().getIndex() != markIndex1 , + GenericTestUtils.waitFor(() -> stateMachine.getLastAppliedTermIndex().getIndex() != markIndex1, 1000, 30000); try { stateMachine.takeSnapshot(); From dc41163df55f0f05c301032e63a528bf379e6f6a Mon Sep 17 00:00:00 2001 From: chungen0126 Date: Mon, 1 Jun 2026 16:32:26 +0800 Subject: [PATCH 4/6] rewrite testApplyTransactionFailure --- .../rpc/TestContainerStateMachineFailures.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index 9432bae3293..ba3809c39d9 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -48,6 +48,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import java.util.Objects; import java.util.Set; import java.util.UUID; import java.util.concurrent.CountDownLatch; @@ -487,16 +488,16 @@ public void testApplyTransactionFailure() throws Exception { getHddsDatanodes().get(index), omKeyLocationInfo.getPipeline()); SimpleStateMachineStorage storage = (SimpleStateMachineStorage) stateMachine.getStateMachineStorage(); - stateMachine.takeSnapshot(); + long containerID = omKeyLocationInfo.getContainerID(); + // delete the container db file + FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); final FileInfo snapshot = getSnapshotFileInfo(storage); final Path parentPath = snapshot.getPath(); // Since the snapshot threshold is set to 1, since there are // applyTransactions, we should see snapshots - assertThat(parentPath.getParent().toFile().listFiles().length).isGreaterThan(0); + assertThat(Objects.requireNonNull(parentPath.getParent().toFile().listFiles()).length) + .isGreaterThan(0); assertNotNull(snapshot); - long containerID = omKeyLocationInfo.getContainerID(); - // delete the container db file - FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); Pipeline pipeline = cluster.getStorageContainerLocationClient() .getContainerWithPipeline(containerID).getPipeline(); XceiverClientSpi xceiverClient = @@ -517,12 +518,13 @@ public void testApplyTransactionFailure() throws Exception { xceiverClientManager.releaseClient(xceiverClient, false); } // Make sure the container is marked unhealthy - assertSame(dn.getDatanodeStateMachine() + GenericTestUtils.waitFor(() -> dn.getDatanodeStateMachine() .getContainer().getContainerSet().getContainer(containerID) - .getContainerState(), UNHEALTHY); + .getContainerState() == UNHEALTHY, 100, 5000); try { // try to take a new snapshot, ideally it should just fail stateMachine.takeSnapshot(); + fail("Should have thrown StateMachineException because it is UNHEALTHY"); } catch (IOException ioe) { assertInstanceOf(StateMachineException.class, ioe); } From 19765cbe9189ef2041631fba1d34948584ac107e Mon Sep 17 00:00:00 2001 From: chungen0126 Date: Mon, 1 Jun 2026 17:26:22 +0800 Subject: [PATCH 5/6] fix testApplyTransactionFailure --- .../ozone/client/rpc/TestContainerStateMachineFailures.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index ba3809c39d9..2b94b5575fd 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -534,8 +534,7 @@ public void testApplyTransactionFailure() throws Exception { try { final FileInfo latestSnapshot = getSnapshotFileInfo(storage); assertEquals(snapshot.getPath(), latestSnapshot.getPath()); - } catch (Throwable e) { - assertFalse(snapshot.getPath().toFile().exists()); + } catch (IOException ioe) { } } From a1e6bd9c0b2a131e72f382bd3e2b961d55b0d759 Mon Sep 17 00:00:00 2001 From: chungen0126 Date: Mon, 1 Jun 2026 17:28:32 +0800 Subject: [PATCH 6/6] fix checkstyle --- .../ozone/client/rpc/TestContainerStateMachineFailures.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index 2b94b5575fd..49337692b83 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -29,7 +29,6 @@ import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull;