diff --git a/doc/release-notes/fix-missing-filesizes.md b/doc/release-notes/fix-missing-filesizes.md new file mode 100644 index 00000000000..b2be800a72e --- /dev/null +++ b/doc/release-notes/fix-missing-filesizes.md @@ -0,0 +1,7 @@ +### Admin API call to fix missing file sizes + +A new superuser-only Admin API endpoint has been added to allow administrators to scan for and fix missing file size entries in the database. This is useful for files that were uploaded but whose sizes were not correctly recorded. + +The endpoint will attempt to retrieve the file size from the underlying storage and update the database record. It only processes files in storage drivers that are configured as Dataverse-accessible (where Dataverse can read the files). + +`POST /api/admin/datafiles/integrity/fixmissingfilesizes?limit=N` diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index e6d5f4681b2..a00f955b46d 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -6194,6 +6194,23 @@ with limit parameter: Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the saved originals that don't have them in the database yet. Otherwise it will do so for the first N such datafiles. +Repair Missing File Sizes +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following superuser-only API will scan the database for datafiles with missing filesizes (i.e. where the size is not yet recorded in the database) and will attempt to retrieve the sizes from the underlying storage media and update the database records: + +.. code-block:: bash + + curl -X POST "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes" + +with limit parameter: + +.. code-block:: bash + + curl -X POST "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes?limit=100" + +Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the saved originals that don't have them in the database yet. Otherwise it will do so for the first N such datafiles. + By default, the admin API calls are blocked and can only be called from localhost. See more details in :ref:`:BlockedApiEndpoints <:BlockedApiEndpoints>` and :ref:`:BlockedApiPolicy <:BlockedApiPolicy>` settings in :doc:`/installation/config`. Get File External Tool URL diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 7af16372f40..3d3e3113774 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -33,6 +33,7 @@ import java.util.logging.Logger; import java.util.stream.Collectors; +import jakarta.ejb.Asynchronous; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.ejb.TransactionAttribute; @@ -56,6 +57,8 @@ public class DataFileServiceBean implements java.io.Serializable { @EJB DvObjectServiceBean dvObjectService; @EJB + DataFileServiceBean self; + @EJB PermissionServiceBean permissionService; @EJB UserServiceBean userService; @@ -1336,6 +1339,102 @@ public List selectFilesWithMissingOriginalSizes() { } } + public List selectFilesWithMissingSizes(List accessibleDriverIds) { + if (accessibleDriverIds == null || accessibleDriverIds.isEmpty()) { + return new ArrayList<>(); + } + + StringBuilder queryStr = new StringBuilder(); + queryStr.append("SELECT f.id FROM datafile f, dvobject o WHERE f.id = o.id AND (f.filesize IS NULL OR f.filesize = -1) AND ("); + + for (int i = 0; i < accessibleDriverIds.size(); i++) { + String driverId = accessibleDriverIds.get(i); + if (i > 0) { + queryStr.append(" OR "); + } + queryStr.append("("); + //ToDo - are there any systems where entries for the default store don't have the store id/separator? If not, this can be dropped. If so, perhaps we update via flyway? + if (driverId.equals(DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER)) { + queryStr.append("o.storageidentifier NOT LIKE '%").append(DataAccess.SEPARATOR).append("%' OR "); + } + queryStr.append("o.storageidentifier LIKE '").append(driverId).append(DataAccess.SEPARATOR).append("%')"); + } + queryStr.append(") ORDER BY f.id"); + + Query query = em.createNativeQuery(queryStr.toString()); + + try { + return (List) query.getResultList().stream().map(o -> ((Number) o).longValue()).collect(Collectors.toList()); + } catch (Exception ex) { + return new ArrayList<>(); + } + } + + + @Asynchronous + public void fixMissingFileSizes(List datafileIds) { + List failedIds = new ArrayList<>(); + int batchSize = 25; + for (int i = 0; i < datafileIds.size(); i += batchSize) { + List batch = datafileIds.subList(i, Math.min(i + batchSize, datafileIds.size())); + try { + failedIds.addAll(self.fixMissingFileSizesBatch(batch)); + } catch (Exception e) { + logger.severe("Batch processing failed unexpectedly: " + e.getMessage()); + failedIds.addAll(batch); + } + } + if (failedIds.isEmpty()) { + logger.info("Finished repairing data files that were missing file sizes."); + } else { + logger.info("Finished repairing data files that were missing file sizes. Failed IDs: " + failedIds); + } + } + + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + public List fixMissingFileSizesBatch(List datafileIds) { + List failedIds = new ArrayList<>(); + for (Long fileId : datafileIds) { + try { + fixMissingFileSizeInCurrentTransaction(fileId); + } catch (Exception e) { + logger.warning("Failed to fix missing file size for datafile id=" + fileId + ": " + e.getMessage()); + failedIds.add(fileId); + } + } + return failedIds; + } + + private void fixMissingFileSizeInCurrentTransaction(long fileId) throws IOException { + DataFile dataFile = find(fileId); + + if (dataFile != null) { + String driverId = DataAccess.getStorageDriverFromIdentifier(dataFile.getStorageIdentifier()); + if (StorageIO.isDataverseAccessible(driverId)) { + StorageIO storageIO = null; + try { + storageIO = dataFile.getStorageIO(); + storageIO.open(); + long size = storageIO.getSize(); + if (size >= 0) { + dataFile.setFilesize(size); + save(dataFile); + logger.fine("Fixed filesize for datafile id=" + fileId + ": " + size + " bytes."); + } else { + throw new IOException("storageIO.getSize() returned negative size: " + size); + } + } finally { + if (storageIO != null) { + storageIO.closeInputStream(); + } + } + } else { + logger.warning("Skipping datafile id=" + fileId + " because storage driver " + driverId + " is not Dataverse accessible."); + } + } else { + logger.warning("DataFile id=" + fileId + ": No such DataFile!"); + } + } public void finalizeFileDelete(Long dataFileId, String storageLocation) throws IOException { // Verify that the DataFile no longer exists: diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java index 919fa7f67f9..fdf152d36c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Admin.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Admin.java @@ -1539,6 +1539,40 @@ public Response fixMissingOriginalSizes(@QueryParam("limit") Integer limit) { return ok(info); } + @Path("datafiles/integrity/fixmissingfilesizes") + @AuthRequired + @POST + public Response fixMissingFileSizes(@Context final ContainerRequestContext crc, @QueryParam("limit") Integer limit) { + + User u = getRequestUser(crc); + if (!u.isSuperuser()) { + return error(Status.FORBIDDEN, BundleUtil.getStringFromBundle("admin.api.auth.mustBeSuperUser")); + } + JsonObjectBuilder info = Json.createObjectBuilder(); + + List accessibleDriverIds = DataAccess.getIdsForStorageDriversWithReadableFiles(); + List affectedFileIds = fileService.selectFilesWithMissingSizes(accessibleDriverIds); + + if (affectedFileIds.isEmpty()) { + info.add("message", + BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.noFilesFound")); + } else { + int howmany = affectedFileIds.size(); + String message = BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.found", Arrays.asList(String.valueOf(howmany))); + + if (limit != null && howmany > limit) { + affectedFileIds = affectedFileIds.subList(0, limit); + message = message.concat(BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.kickingOffWithLimit", Arrays.asList(String.valueOf(limit)))); + } else { + message = message.concat(BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.kickingOff")); + } + info.add("message", message); + fileService.fixMissingFileSizes(affectedFileIds); + } + + return ok(info); + } + /** * This method is used in API tests, called from UtilIt.java. */ diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java index bc4c69390cf..efab7be8eac 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java @@ -25,7 +25,9 @@ import edu.harvard.iq.dataverse.util.FileUtil; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Properties; import java.util.logging.Logger; @@ -331,6 +333,23 @@ public static String getStorageDriverLabelFor(String storageDriverId) { } return label; } + + public static List getIdsForStorageDriversWithReadableFiles() { + List driverIds = new ArrayList<>(); + + // Other configured drivers + final String DATAVERSE_DRIVER_PREFIX = "dataverse.files."; + final String DATAVERSE_DRIVER_TYPE = ".type"; + for (String property : System.getProperties().stringPropertyNames()) { + if (property.startsWith(DATAVERSE_DRIVER_PREFIX) && property.endsWith(DATAVERSE_DRIVER_TYPE)) { + String driverId = property.substring(DATAVERSE_DRIVER_PREFIX.length(), property.length() - DATAVERSE_DRIVER_TYPE.length()); + if (StorageIO.isDataverseAccessible(driverId)) { + driverIds.add(driverId); + } + } + } + return driverIds; + } /** * This method checks to see if an overlay store is being used and, if so, diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index 9a8c97fe429..b72be90eaf2 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -2836,6 +2836,10 @@ admin.api.deleteUser.failure.roleAssignments=the user is associated with role as admin.api.deleteUser.failure.versionUser=the user has contributed to dataset version(s) admin.api.deleteUser.failure.savedSearches=the user has created saved searches admin.api.deleteUser.success=Authenticated User {0} deleted. +admin.api.datafiles.integrity.fixMissingFileSizes.noFilesFound=No datafiles found with missing filesizes for accessible storage drivers; exiting. +admin.api.datafiles.integrity.fixMissingFileSizes.found=Found {0} datafiles with missing filesizes. +admin.api.datafiles.integrity.fixMissingFileSizes.kickingOff=Kicking off an async job that will repair the files in the background. +admin.api.datafiles.integrity.fixMissingFileSizes.kickingOffWithLimit=Kicking off an async job that will repair the {0} files in the background. #Files.java files.api.metadata.update.duplicateFile=Filename already exists at {0} diff --git a/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java b/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java index f7ce061fb24..9f78e3f5877 100644 --- a/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/dataaccess/DataAccessTest.java @@ -79,4 +79,19 @@ void testGetStorageIdFromLocation() { assertEquals("s3://18b39722140-50eb7d3c5ece", DataAccess.getStorageIdFromLocation("s3://bucketname:10.5072/FK2/ABCDEF/18b39722140-50eb7d3c5ece")); } + + @Test + void testGetAccessibleStorageDriverIds() { + System.setProperty("dataverse.files.s3.type", "s3"); + System.setProperty("dataverse.files.s3.label", "S3 Storage"); + + System.setProperty("dataverse.files.remote.type", "remote"); + System.setProperty("dataverse.files.remote.files-not-accessible-by-dataverse", "true"); + + java.util.List accessibleDrivers = DataAccess.getIdsForStorageDriversWithReadableFiles(); + + assertTrue(accessibleDrivers.contains("file"), "Default 'file' driver should be accessible by default"); + assertTrue(accessibleDrivers.contains("s3"), "S3 driver should be accessible"); + assertFalse(accessibleDrivers.contains("remote"), "Remote driver with files-not-accessible-by-dataverse=true should not be accessible"); + } }