diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java index f1deedc8d330..b6ce09407f7a 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/HddsVolume.java @@ -25,6 +25,7 @@ import jakarta.annotation.Nullable; import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.util.Iterator; import java.util.List; import java.util.concurrent.ConcurrentSkipListSet; @@ -304,24 +305,74 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) return checkDbHealth(dbFile); } + /** + * Verifies the per-volume RocksDB's global state files (CURRENT, MANIFEST, + * OPTIONS) by opening the DB in secondary mode. A successful open implies + * those files are readable and internally consistent and that the + * referenced SST file names match what RocksDB expects. + * + *
This check intentionally does not read or checksum SST file + * contents or any individual key/value. Per-block / per-key integrity is + * verified by the container data scanner, which scans containers (and + * their RocksDB rows) on its own schedule. + * + *
The volume is only marked {@link VolumeCheckResult#FAILED} once the + * configured threshold of failures is exceeded, matching the parent class's + * intermittent-error tolerance. Open failures whose underlying RocksDB + * status is {@code IOError(NoSpace)} are not counted: {@code openAsSecondary} + * writes its info LOG into the disk-check directory, so an out-of-space + * failure there is unrelated to DB integrity. Any other status — permission + * denied, missing path, corruption, generic IO error — is still counted as + * a real failure. + */ @VisibleForTesting public VolumeCheckResult checkDbHealth(File dbFile) throws InterruptedException { if (!(getDiskCheckEnabled() && getDatanodeConfig().isRocksDbDiskCheckEnabled())) { return VolumeCheckResult.HEALTHY; } + File secondaryDir = new File(getDiskCheckDir(), "rocksdb-secondary-" + Time.now()); + try { + Files.createDirectories(secondaryDir.toPath()); + } catch (IOException e) { + LOG.error("Failed to create secondary instance dir {} for volume {}", secondaryDir, getStorageDir(), e); + getIoTestSlidingWindow().add(); + return getIoTestSlidingWindow().isExceeded() + ? VolumeCheckResult.FAILED : VolumeCheckResult.HEALTHY; + } + try (ManagedOptions managedOptions = new ManagedOptions(); - ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions, dbFile.toString())) { + ManagedRocksDB ignored = + ManagedRocksDB.openAsSecondary(managedOptions, dbFile.toString(), secondaryDir.getPath())) { // Do nothing. Only check if rocksdb is accessible. LOG.debug("Successfully opened the database at \"{}\" for HDDS volume {}.", dbFile, getStorageDir()); } catch (Exception e) { if (Thread.currentThread().isInterrupted()) { throw new InterruptedException("Check of database for volume " + this + " interrupted."); } - LOG.warn("Could not open Volume DB located at {}", dbFile, e); - getIoTestSlidingWindow().add(); + + // openAsSecondary writes its info LOG into secondaryDir. 
If that write + // fails because the disk is full, RocksDB surfaces the failure as + // IOError(NoSpace) (mapped from ENOSPC). That is unrelated to DB + // integrity, so don't count it against the sliding window. Any other + // status (permission denied, missing path, corruption, generic IO + // error) is still treated as a real failure. + if (ManagedRocksDB.isNoSpaceFailure(e)) { + LOG.warn("Skipping RocksDB health-check failure accounting for volume {}: " + + "secondary open returned IOError(NoSpace) for {}.", this, secondaryDir, e); + } else { + LOG.error("Could not open Volume DB located at {}", dbFile, e); + getIoTestSlidingWindow().add(); + } + } finally { + try { + FileUtils.deleteDirectory(secondaryDir); + } catch (IOException e) { + LOG.warn("Failed to delete RocksDB secondary instance dir {}", secondaryDir, e); + } } + if (getIoTestSlidingWindow().isExceeded()) { LOG.error("Failed to open the database at \"{}\" for HDDS volume {}: " + "encountered more than the {} tolerated failures.", diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java index d9424b76a139..367ff8ed92b8 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java @@ -534,7 +534,6 @@ public File getTmpDir() { return this.tmpDir; } - @VisibleForTesting public File getDiskCheckDir() { return this.diskCheckDir; } diff --git a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java index 3401469f6824..3e19e95bca29 100644 --- 
a/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java +++ b/hadoop-hdds/managed-rocksdb/src/main/java/org/apache/hadoop/hdds/utils/db/managed/ManagedRocksDB.java @@ -31,6 +31,7 @@ import org.rocksdb.OptionsUtil; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; +import org.rocksdb.Status; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -75,6 +76,83 @@ public static ManagedRocksDB openReadOnly( ); } + /** + * Opens a RocksDB at {@code dbPath} as a secondary instance. + * It is safe to use a secondary instance while a primary writer + * is active on the same DB. + * + *
Secondary mode is RocksDB's supported way to attach an extra reader + * to a DB that has a live primary writer. If a DB is simultaneously opened + * by the primary writer and as a read-only instance, + * the behavior is undefined. It often succeeds if the read-only instance + * closes quickly, but the contract is unsafe. + * + *
Catch-up semantics. A secondary's view does not auto-refresh; it + * stays at the snapshot captured at open time. The only way to advance it + * is to call {@code tryCatchUpWithPrimary()}, a user-triggered operation + * that rebuilds the in-memory memtable from new MANIFEST / WAL entries and + * never writes anything to disk. + * + *
The secondary log directory. Secondary mode requires its own
+ * directory at {@code secondaryDbLogFilePath} for the RocksDB info
+ * {@code LOG} file. That directory is used only for log files. No
+ * important data lives there. The previous {@code LOG} file is rotated to
+ * {@code LOG.old.TIMESTAMP}. Callers wanting to consult the {@link Status} on a
+ * {@link RocksDBException} from outside this module would otherwise have
+ * to import {@code org.rocksdb.Status} directly, which is restricted by
+ * the project's {@code banned-rocksdb-imports} enforcer rule. Use this
+ * helper instead.
+ *
+ * @param t the throwable to inspect; the entire cause chain is walked.
+ * @return {@code true} iff a {@code RocksDBException} with status
+ * {@code IOError(NoSpace)} is found.
+ */
+ public static boolean isNoSpaceFailure(Throwable t) {
+   // Follow getCause() links from t until the chain ends.
+   Throwable link = t;
+   while (link != null) {
+     // Only a RocksDBException carries a Status; every other link yields null here.
+     final Status st = link instanceof RocksDBException ? ((RocksDBException) link).getStatus() : null;
+     if (st != null && Status.Code.IOError == st.getCode() && Status.SubCode.NoSpace == st.getSubCode()) {
+       return true;
+     }
+     link = link.getCause();
+   }
+   return false;
+ }
+
public static ManagedRocksDB open(
final DBOptions options, final String path,
final List