From 37a0bc31663422874db0a38bb756373c88bdc426 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 11 Jun 2026 11:42:49 -0400 Subject: [PATCH 1/3] fixmissingfilesizes --- doc/release-notes/fix-missing-filesizes.md | 7 ++ doc/sphinx-guides/source/api/native-api.rst | 17 ++++ .../iq/dataverse/DataFileServiceBean.java | 98 +++++++++++++++++++ .../edu/harvard/iq/dataverse/api/Admin.java | 34 +++++++ .../iq/dataverse/dataaccess/DataAccess.java | 19 ++++ .../dataverse/dataaccess/DataAccessTest.java | 15 +++ 6 files changed, 190 insertions(+) create mode 100644 doc/release-notes/fix-missing-filesizes.md diff --git a/doc/release-notes/fix-missing-filesizes.md b/doc/release-notes/fix-missing-filesizes.md new file mode 100644 index 00000000000..ead117c4fbe --- /dev/null +++ b/doc/release-notes/fix-missing-filesizes.md @@ -0,0 +1,7 @@ +### Admin API call to fix missing file sizes + +A new superuser-only Admin API endpoint has been added to allow administrators to scan for and fix missing file size entries in the database. This is useful for files that were uploaded but whose sizes were not correctly recorded. + +The endpoint will attempt to retrieve the file size from the underlying storage and update the database record. It only processes files in storage drivers that are configured as Dataverse-accessible (where Dataverse can read the files). + +`GET /api/admin/datafiles/integrity/fixmissingfilesizes?limit=N` diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index e6d5f4681b2..ffa873b472c 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -6194,6 +6194,23 @@ with limit parameter: Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the saved originals that don't have them in the database yet. Otherwise it will do so for the first N such datafiles. 
+Repair Missing File Sizes +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following superuser-only API will scan the database for datafiles with missing filesizes (i.e. where the size is not yet recorded in the database) and will attempt to retrieve the sizes from the underlying storage media and update the database records: + +.. code-block:: bash + + curl "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes" + +with limit parameter: + +.. code-block:: bash + + curl "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes?limit=100" + +Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the saved originals that don't have them in the database yet. Otherwise it will do so for the first N such datafiles. + By default, the admin API calls are blocked and can only be called from localhost. See more details in :ref:`:BlockedApiEndpoints <:BlockedApiEndpoints>` and :ref:`:BlockedApiPolicy <:BlockedApiPolicy>` settings in :doc:`/installation/config`. 
Get File External Tool URL diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 7af16372f40..7893bb8696f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -33,6 +33,7 @@ import java.util.logging.Logger; import java.util.stream.Collectors; +import jakarta.ejb.Asynchronous; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.ejb.TransactionAttribute; @@ -56,6 +57,8 @@ public class DataFileServiceBean implements java.io.Serializable { @EJB DvObjectServiceBean dvObjectService; @EJB + DataFileServiceBean self; + @EJB PermissionServiceBean permissionService; @EJB UserServiceBean userService; @@ -1336,6 +1339,101 @@ public List selectFilesWithMissingOriginalSizes() { } } + public List selectFilesWithMissingSizes(List accessibleDriverIds) { + if (accessibleDriverIds == null || accessibleDriverIds.isEmpty()) { + return new ArrayList<>(); + } + + StringBuilder queryStr = new StringBuilder(); + queryStr.append("SELECT f.id FROM datafile f WHERE (f.filesize IS NULL OR f.filesize <= 0) AND ("); + + for (int i = 0; i < accessibleDriverIds.size(); i++) { + String driverId = accessibleDriverIds.get(i); + if (i > 0) { + queryStr.append(" OR "); + } + queryStr.append("("); + if (driverId.equals(DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER)) { + queryStr.append("f.storageidentifier NOT LIKE '%").append(DataAccess.SEPARATOR).append("%' OR "); + } + queryStr.append("f.storageidentifier LIKE '").append(driverId).append(DataAccess.SEPARATOR).append("%')"); + } + queryStr.append(") ORDER BY f.id"); + + Query query = em.createNativeQuery(queryStr.toString()); + + try { + return (List) query.getResultList().stream().map(o -> ((Number) o).longValue()).collect(Collectors.toList()); + } catch (Exception ex) { + return new ArrayList<>(); + } + } + + + @Asynchronous + public void 
fixMissingFileSizes(List datafileIds) { + List failedIds = new ArrayList<>(); + int batchSize = 25; + for (int i = 0; i < datafileIds.size(); i += batchSize) { + List batch = datafileIds.subList(i, Math.min(i + batchSize, datafileIds.size())); + try { + failedIds.addAll(self.fixMissingFileSizesBatch(batch)); + } catch (Exception e) { + logger.severe("Batch processing failed unexpectedly: " + e.getMessage()); + failedIds.addAll(batch); + } + } + if (failedIds.isEmpty()) { + logger.info("Finished repairing data files that were missing file sizes."); + } else { + logger.info("Finished repairing data files that were missing file sizes. Failed IDs: " + failedIds); + } + } + + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + public List fixMissingFileSizesBatch(List datafileIds) { + List failedIds = new ArrayList<>(); + for (Long fileId : datafileIds) { + try { + fixMissingFileSizeInCurrentTransaction(fileId); + } catch (Exception e) { + logger.warning("Failed to fix missing file size for datafile id=" + fileId + ": " + e.getMessage()); + failedIds.add(fileId); + } + } + return failedIds; + } + + private void fixMissingFileSizeInCurrentTransaction(long fileId) throws IOException { + DataFile dataFile = find(fileId); + + if (dataFile != null) { + String driverId = DataAccess.getStorageDriverFromIdentifier(dataFile.getStorageIdentifier()); + if (StorageIO.isDataverseAccessible(driverId)) { + StorageIO storageIO = null; + try { + storageIO = dataFile.getStorageIO(); + storageIO.open(); + long size = storageIO.getSize(); + if (size >= 0) { + dataFile.setFilesize(size); + save(dataFile); + logger.fine("Fixed filesize for datafile id=" + fileId + ": " + size + " bytes."); + } else { + throw new IOException("storageIO.getSize() returned negative size: " + size); + } + } finally { + if (storageIO != null) { + storageIO.closeInputStream(); + } + } + } else { + logger.warning("Skipping datafile id=" + fileId + " because storage driver " + driverId + " is not 
Dataverse accessible."); + } + } else { + logger.warning("DataFile id=" + fileId + ": No such DataFile!"); + } + } public void finalizeFileDelete(Long dataFileId, String storageLocation) throws IOException { // Verify that the DataFile no longer exists: diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java index 919fa7f67f9..620a6ab5493 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java @@ -1539,6 +1539,40 @@ public Response fixMissingOriginalSizes(@QueryParam("limit") Integer limit) { return ok(info); } + @Path("datafiles/integrity/fixmissingfilesizes") + @AuthRequired + @GET + public Response fixMissingFileSizes(@Context final ContainerRequestContext crc, @QueryParam("limit") Integer limit) { + + User u = getRequestUser(crc); + if (!u.isSuperuser()) { + return error(Status.FORBIDDEN, BundleUtil.getStringFromBundle("admin.api.auth.mustBeSuperUser")); + } + JsonObjectBuilder info = Json.createObjectBuilder(); + + List accessibleDriverIds = DataAccess.getIdsForStorageDriversWithReadableFiles(); + List affectedFileIds = fileService.selectFilesWithMissingSizes(accessibleDriverIds); + + if (affectedFileIds.isEmpty()) { + info.add("message", + "No datafiles found with missing filesizes for accessible storage drivers; exiting."); + } else { + int howmany = affectedFileIds.size(); + String message = "Found " + howmany + " datafiles with missing filesizes. 
"; + + if (limit != null && howmany > limit) { + affectedFileIds = affectedFileIds.subList(0, limit); + message = message.concat(" Kicking off an async job that will repair the " + limit + " files in the background."); + } else { + message = message.concat(" Kicking off an async job that will repair the files in the background."); + } + info.add("message", message); + fileService.fixMissingFileSizes(affectedFileIds); + } + + return ok(info); + } + /** * This method is used in API tests, called from UtilIt.java. */ diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java index bc4c69390cf..efab7be8eac 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java @@ -25,7 +25,9 @@ import edu.harvard.iq.dataverse.util.FileUtil; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Properties; import java.util.logging.Logger; @@ -331,6 +333,23 @@ public static String getStorageDriverLabelFor(String storageDriverId) { } return label; } + + public static List getIdsForStorageDriversWithReadableFiles() { + List driverIds = new ArrayList<>(); + + // Other configured drivers + final String DATAVERSE_DRIVER_PREFIX = "dataverse.files."; + final String DATAVERSE_DRIVER_TYPE = ".type"; + for (String property : System.getProperties().stringPropertyNames()) { + if (property.startsWith(DATAVERSE_DRIVER_PREFIX) && property.endsWith(DATAVERSE_DRIVER_TYPE)) { + String driverId = property.substring(DATAVERSE_DRIVER_PREFIX.length(), property.length() - DATAVERSE_DRIVER_TYPE.length()); + if (StorageIO.isDataverseAccessible(driverId)) { + driverIds.add(driverId); + } + } + } + return driverIds; + } /** * This method checks to see if an overlay store is being used and, if so, diff --git 
a/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java b/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java index f7ce061fb24..9f78e3f5877 100644 --- a/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java @@ -79,4 +79,19 @@ void testGetStorageIdFromLocation() { assertEquals("s3://18b39722140-50eb7d3c5ece", DataAccess.getStorageIdFromLocation("s3://bucketname:10.5072/FK2/ABCDEF/18b39722140-50eb7d3c5ece")); } + + @Test + void testGetAccessibleStorageDriverIds() { + System.setProperty("dataverse.files.s3.type", "s3"); + System.setProperty("dataverse.files.s3.label", "S3 Storage"); + + System.setProperty("dataverse.files.remote.type", "remote"); + System.setProperty("dataverse.files.remote.files-not-accessible-by-dataverse", "true"); + + java.util.List accessibleDrivers = DataAccess.getIdsForStorageDriversWithReadableFiles(); + + assertTrue(accessibleDrivers.contains("file"), "Default 'file' driver should be accessible by default"); + assertTrue(accessibleDrivers.contains("s3"), "S3 driver should be accessible"); + assertFalse(accessibleDrivers.contains("remote"), "Remote driver with files-not-accessible-by-dataverse=true should not be accessible"); + } } From 8d1de452b8cd5705651a539455ee9b683a7ea07d Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 11 Jun 2026 11:52:15 -0400 Subject: [PATCH 2/3] use POST, i18n --- doc/release-notes/fix-missing-filesizes.md | 2 +- doc/sphinx-guides/source/api/native-api.rst | 4 ++-- src/main/java/edu/harvard/iq/dataverse/api/Admin.java | 10 +++++----- src/main/java/propertyFiles/Bundle.properties | 4 ++++ 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/release-notes/fix-missing-filesizes.md b/doc/release-notes/fix-missing-filesizes.md index ead117c4fbe..b2be800a72e 100644 --- a/doc/release-notes/fix-missing-filesizes.md +++ b/doc/release-notes/fix-missing-filesizes.md @@ -4,4 +4,4 @@ A 
new superuser-only Admin API endpoint has been added to allow administrators t The endpoint will attempt to retrieve the file size from the underlying storage and update the database record. It only processes files in storage drivers that are configured as Dataverse-accessible (where Dataverse can read the files). -`GET /api/admin/datafiles/integrity/fixmissingfilesizes?limit=N` +`POST /api/admin/datafiles/integrity/fixmissingfilesizes?limit=N` diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index ffa873b472c..a00f955b46d 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -6201,13 +6201,13 @@ The following superuser-only API will scan the database for datafiles with missi .. code-block:: bash - curl "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes" + curl -H "X-Dataverse-key:$API_TOKEN" -X POST "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes" with limit parameter: .. code-block:: bash - curl "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes?limit=100" + curl -H "X-Dataverse-key:$API_TOKEN" -X POST "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes?limit=100" -Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the saved originals that don't have them in the database yet. Otherwise it will do so for the first N such datafiles. +Note the optional "limit" parameter. Without it, the API will attempt to retrieve and record the sizes for all the datafiles that don't have them in the database yet. Otherwise it will do so for the first N such datafiles. 
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java index 620a6ab5493..fdf152d36c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java @@ -1541,7 +1541,7 @@ public Response fixMissingOriginalSizes(@QueryParam("limit") Integer limit) { @Path("datafiles/integrity/fixmissingfilesizes") @AuthRequired - @GET + @POST public Response fixMissingFileSizes(@Context final ContainerRequestContext crc, @QueryParam("limit") Integer limit) { User u = getRequestUser(crc); @@ -1555,16 +1555,16 @@ public Response fixMissingFileSizes(@Context final ContainerRequestContext crc, if (affectedFileIds.isEmpty()) { info.add("message", - "No datafiles found with missing filesizes for accessible storage drivers; exiting."); + BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.noFilesFound")); } else { int howmany = affectedFileIds.size(); - String message = "Found " + howmany + " datafiles with missing filesizes. 
"; + String message = BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.found", Arrays.asList(String.valueOf(howmany))); if (limit != null && howmany > limit) { affectedFileIds = affectedFileIds.subList(0, limit); - message = message.concat(" Kicking off an async job that will repair the " + limit + " files in the background."); + message = message.concat(BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.kickingOffWithLimit", Arrays.asList(String.valueOf(limit)))); } else { - message = message.concat(" Kicking off an async job that will repair the files in the background."); + message = message.concat(BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.kickingOff")); } info.add("message", message); fileService.fixMissingFileSizes(affectedFileIds); diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index 9a8c97fe429..b72be90eaf2 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -2836,6 +2836,10 @@ admin.api.deleteUser.failure.roleAssignments=the user is associated with role as admin.api.deleteUser.failure.versionUser=the user has contributed to dataset version(s) admin.api.deleteUser.failure.savedSearches=the user has created saved searches admin.api.deleteUser.success=Authenticated User {0} deleted. +admin.api.datafiles.integrity.fixMissingFileSizes.noFilesFound=No datafiles found with missing filesizes for accessible storage drivers; exiting. +admin.api.datafiles.integrity.fixMissingFileSizes.found=Found {0} datafiles with missing filesizes. +admin.api.datafiles.integrity.fixMissingFileSizes.kickingOff=Kicking off an async job that will repair the files in the background. +admin.api.datafiles.integrity.fixMissingFileSizes.kickingOffWithLimit=Kicking off an async job that will repair the {0} files in the background. 
#Files.java files.api.metadata.update.duplicateFile=Filename already exists at {0} From 0eb448a9142066aa465dd15423671914c6db6cf7 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 11 Jun 2026 13:20:44 -0400 Subject: [PATCH 3/3] fix query --- .../java/edu/harvard/iq/dataverse/DataFileServiceBean.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 7893bb8696f..3d3e3113774 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -1345,7 +1345,7 @@ public List selectFilesWithMissingSizes(List accessibleDriverIds) } StringBuilder queryStr = new StringBuilder(); - queryStr.append("SELECT f.id FROM datafile f WHERE (f.filesize IS NULL OR f.filesize <= 0) AND ("); + queryStr.append("SELECT f.id FROM datafile f, dvobject o WHERE f.id = o.id AND (f.filesize IS NULL OR f.filesize = -1) AND ("); for (int i = 0; i < accessibleDriverIds.size(); i++) { String driverId = accessibleDriverIds.get(i); @@ -1353,10 +1353,11 @@ public List selectFilesWithMissingSizes(List accessibleDriverIds) queryStr.append(" OR "); } queryStr.append("("); + //ToDo - are there any systems where entries for the default store don't have the store id/separator? If not, this can be dropped. If so, perhaps we update via flyway? if (driverId.equals(DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER)) { - queryStr.append("f.storageidentifier NOT LIKE '%").append(DataAccess.SEPARATOR).append("%' OR "); + queryStr.append("o.storageidentifier NOT LIKE '%").append(DataAccess.SEPARATOR).append("%' OR "); } - queryStr.append("f.storageidentifier LIKE '").append(driverId).append(DataAccess.SEPARATOR).append("%')"); + queryStr.append("o.storageidentifier LIKE '").append(driverId).append(DataAccess.SEPARATOR).append("%')"); } queryStr.append(") ORDER BY f.id");