Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/release-notes/fix-missing-filesizes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
### Admin API call to fix missing file sizes

A new superuser-only Admin API endpoint has been added to allow administrators to scan for and fix missing file size entries in the database. This is useful for files that were uploaded but whose sizes were not correctly recorded.

The endpoint will attempt to retrieve the file size from the underlying storage and update the database record. It only processes files in storage drivers that are configured as Dataverse-accessible (where Dataverse can read the files).

`POST /api/admin/datafiles/integrity/fixmissingfilesizes?limit=N`
17 changes: 17 additions & 0 deletions doc/sphinx-guides/source/api/native-api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6194,6 +6194,23 @@ with limit parameter:

Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the saved originals that don't have them in the database yet. Otherwise it will do so for the first N such datafiles.

Repair Missing File Sizes
~~~~~~~~~~~~~~~~~~~~~~~~~

The following superuser-only API will scan the database for datafiles with missing filesizes (i.e. where the size is not yet recorded in the database) and will attempt to retrieve the sizes from the underlying storage media and update the database records:

.. code-block:: bash

curl -X POST "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes"

with limit parameter:

.. code-block:: bash

curl -X POST "$SERVER_URL/api/admin/datafiles/integrity/fixmissingfilesizes?limit=100"

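Note the optional "limit" parameter. Without it, the API will attempt to populate the sizes for all the datafiles that don't have them in the database yet. Otherwise it will do so for the first N such datafiles. Only files in storage drivers that are accessible to Dataverse are processed, and the repair itself runs as a background job.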

By default, the admin API calls are blocked and can only be called from localhost. See more details in :ref:`:BlockedApiEndpoints <:BlockedApiEndpoints>` and :ref:`:BlockedApiPolicy <:BlockedApiPolicy>` settings in :doc:`/installation/config`.

Get File External Tool URL
Expand Down
99 changes: 99 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.util.logging.Logger;
import java.util.stream.Collectors;

import jakarta.ejb.Asynchronous;
import jakarta.ejb.EJB;
import jakarta.ejb.Stateless;
import jakarta.ejb.TransactionAttribute;
Expand All @@ -56,6 +57,8 @@ public class DataFileServiceBean implements java.io.Serializable {
@EJB
DvObjectServiceBean dvObjectService;
@EJB
DataFileServiceBean self;
@EJB
PermissionServiceBean permissionService;
@EJB
UserServiceBean userService;
Expand Down Expand Up @@ -1336,6 +1339,102 @@ public List<Long> selectFilesWithMissingOriginalSizes() {
}
}

/**
 * Finds the ids of datafiles whose size is not recorded in the database
 * (filesize is NULL or -1) and whose storage identifier starts with one of
 * the given storage driver ids followed by the storage separator.
 *
 * @param accessibleDriverIds ids of the storage drivers to consider
 * @return the matching datafile ids ordered by id; an empty list when no
 *         driver ids are supplied or when running the query fails
 */
public List<Long> selectFilesWithMissingSizes(List<String> accessibleDriverIds) {
    if (accessibleDriverIds == null || accessibleDriverIds.isEmpty()) {
        return new ArrayList<>();
    }

    StringBuilder queryStr = new StringBuilder();
    queryStr.append("SELECT f.id FROM datafile f, dvobject o WHERE f.id = o.id AND (f.filesize IS NULL OR f.filesize = -1) AND (");

    for (int i = 0; i < accessibleDriverIds.size(); i++) {
        String driverId = accessibleDriverIds.get(i);
        if (i > 0) {
            queryStr.append(" OR ");
        }
        queryStr.append("(");
        //ToDo - are there any systems where entries for the default store don't have the store id/separator? If not, this can be dropped. If so, perhaps we update via flyway?
        if (DataAccess.DEFAULT_STORAGE_DRIVER_IDENTIFIER.equals(driverId)) {
            // Only constants are concatenated here; no caller-supplied text.
            queryStr.append("o.storageidentifier NOT LIKE '%").append(DataAccess.SEPARATOR).append("%' OR ");
        }
        // The driver id is bound as positional parameter (i + 1) below
        // rather than being spliced into the SQL text.
        queryStr.append("o.storageidentifier LIKE ?").append(i + 1).append(")");
    }
    queryStr.append(") ORDER BY f.id");

    Query query = em.createNativeQuery(queryStr.toString());
    for (int i = 0; i < accessibleDriverIds.size(); i++) {
        query.setParameter(i + 1, accessibleDriverIds.get(i) + DataAccess.SEPARATOR + "%");
    }

    try {
        List<Long> fileIds = new ArrayList<>();
        for (Object row : query.getResultList()) {
            fileIds.add(((Number) row).longValue());
        }
        return fileIds;
    } catch (Exception ex) {
        // Callers cannot tell an empty result from a failed lookup, so at
        // least leave a trace of the failure in the log.
        logger.warning("Failed to select datafiles with missing file sizes: " + ex.getMessage());
        return new ArrayList<>();
    }
}


/**
 * Repairs the recorded file size of the given datafiles, working through
 * the ids in batches of 25 and logging a summary when done.
 *
 * Each batch is invoked through the injected {@code self} reference
 * (presumably so that the container applies the transaction attribute
 * declared on {@code fixMissingFileSizesBatch} - confirm). A batch that
 * throws is logged and all of its ids are counted as failed.
 *
 * @param datafileIds ids of the datafiles to repair
 */
@Asynchronous
public void fixMissingFileSizes(List<Long> datafileIds) {
    final int batchSize = 25;
    List<Long> unrepairedIds = new ArrayList<>();

    int start = 0;
    while (start < datafileIds.size()) {
        int end = Math.min(start + batchSize, datafileIds.size());
        List<Long> chunk = datafileIds.subList(start, end);
        try {
            List<Long> chunkFailures = self.fixMissingFileSizesBatch(chunk);
            unrepairedIds.addAll(chunkFailures);
        } catch (Exception e) {
            logger.severe("Batch processing failed unexpectedly: " + e.getMessage());
            unrepairedIds.addAll(chunk);
        }
        start += batchSize;
    }

    String summary = "Finished repairing data files that were missing file sizes.";
    if (!unrepairedIds.isEmpty()) {
        summary = summary + " Failed IDs: " + unrepairedIds;
    }
    logger.info(summary);
}

/**
 * Repairs the recorded file size for every datafile in one batch.
 * A failure on an individual file is logged as a warning and does not
 * stop the remaining files in the batch from being processed.
 *
 * @param datafileIds ids of the datafiles in this batch
 * @return the ids whose repair attempt threw an exception
 */
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public List<Long> fixMissingFileSizesBatch(List<Long> datafileIds) {
    List<Long> unrepaired = new ArrayList<>();
    datafileIds.forEach(id -> {
        try {
            fixMissingFileSizeInCurrentTransaction(id);
        } catch (Exception e) {
            logger.warning("Failed to fix missing file size for datafile id=" + id + ": " + e.getMessage());
            unrepaired.add(id);
        }
    });
    return unrepaired;
}

/**
 * Reads the size of a single datafile from its storage and saves it on the
 * datafile record.
 *
 * Nothing is changed (a warning is logged instead) when the datafile does
 * not exist or when its storage driver is not Dataverse accessible.
 *
 * @param fileId id of the datafile to repair
 * @throws IOException if the storage reports a negative size, or if an
 *         I/O error propagates from the storage access calls
 */
private void fixMissingFileSizeInCurrentTransaction(long fileId) throws IOException {
    DataFile dataFile = find(fileId);
    if (dataFile == null) {
        logger.warning("DataFile id=" + fileId + ": No such DataFile!");
        return;
    }

    String driverId = DataAccess.getStorageDriverFromIdentifier(dataFile.getStorageIdentifier());
    if (!StorageIO.isDataverseAccessible(driverId)) {
        logger.warning("Skipping datafile id=" + fileId + " because storage driver " + driverId + " is not Dataverse accessible.");
        return;
    }

    StorageIO<DataFile> storageIO = null;
    try {
        storageIO = dataFile.getStorageIO();
        storageIO.open();
        long size = storageIO.getSize();
        if (size < 0) {
            throw new IOException("storageIO.getSize() returned negative size: " + size);
        }
        dataFile.setFilesize(size);
        save(dataFile);
        logger.fine("Fixed filesize for datafile id=" + fileId + ": " + size + " bytes.");
    } finally {
        // storageIO stays null when getStorageIO() itself failed.
        if (storageIO != null) {
            storageIO.closeInputStream();
        }
    }
}

public void finalizeFileDelete(Long dataFileId, String storageLocation) throws IOException {
// Verify that the DataFile no longer exists:
Expand Down
34 changes: 34 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Admin.java
Original file line number Diff line number Diff line change
Expand Up @@ -1539,6 +1539,40 @@ public Response fixMissingOriginalSizes(@QueryParam("limit") Integer limit) {
return ok(info);
}

/**
 * Superuser-only endpoint that looks up datafiles with missing file sizes
 * in Dataverse-accessible storage drivers and hands them to the file
 * service for repair.
 *
 * @param crc   request context used to resolve the calling user
 * @param limit optional maximum number of datafiles to repair; when absent
 *              every affected datafile is submitted
 * @return 403 for non-superusers, 400 for a negative limit, otherwise 200
 *         with a message describing what was found and submitted
 */
@Path("datafiles/integrity/fixmissingfilesizes")
@AuthRequired
@POST
public Response fixMissingFileSizes(@Context final ContainerRequestContext crc, @QueryParam("limit") Integer limit) {

    User u = getRequestUser(crc);
    if (!u.isSuperuser()) {
        return error(Status.FORBIDDEN, BundleUtil.getStringFromBundle("admin.api.auth.mustBeSuperUser"));
    }
    // A negative limit would otherwise reach subList(0, limit) and surface
    // as an internal server error.
    if (limit != null && limit < 0) {
        return error(Status.BAD_REQUEST, "The limit parameter must not be negative.");
    }
    JsonObjectBuilder info = Json.createObjectBuilder();

    List<String> accessibleDriverIds = DataAccess.getIdsForStorageDriversWithReadableFiles();
    List<Long> affectedFileIds = fileService.selectFilesWithMissingSizes(accessibleDriverIds);

    if (affectedFileIds.isEmpty()) {
        info.add("message",
                BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.noFilesFound"));
    } else {
        int howmany = affectedFileIds.size();
        String foundMessage = BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.found", Arrays.asList(String.valueOf(howmany)));

        String kickOffMessage;
        if (limit != null && howmany > limit) {
            affectedFileIds = affectedFileIds.subList(0, limit);
            kickOffMessage = BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.kickingOffWithLimit", Arrays.asList(String.valueOf(limit)));
        } else {
            kickOffMessage = BundleUtil.getStringFromBundle("admin.api.datafiles.integrity.fixMissingFileSizes.kickingOff");
        }
        // Join the two sentences with exactly one space regardless of
        // whether the bundle values carry surrounding whitespace.
        info.add("message", foundMessage.trim() + " " + kickOffMessage.trim());
        fileService.fixMissingFileSizes(affectedFileIds);
    }

    return ok(info);
}

/**
* This method is used in API tests, called from UtilIt.java.
*/
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/dataaccess/DataAccess.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import edu.harvard.iq.dataverse.util.FileUtil;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import java.util.logging.Logger;

Expand Down Expand Up @@ -331,6 +333,23 @@ public static String getStorageDriverLabelFor(String storageDriverId) {
}
return label;
}

/**
 * Lists the ids of the storage drivers whose files Dataverse can read.
 *
 * Driver ids are discovered from system properties named
 * {@code dataverse.files.<id>.type}; each discovered id is kept only when
 * {@link StorageIO#isDataverseAccessible(String)} accepts it.
 *
 * @return the accessible driver ids, in no particular order; never null
 */
public static List<String> getIdsForStorageDriversWithReadableFiles() {
    final String DATAVERSE_DRIVER_PREFIX = "dataverse.files.";
    final String DATAVERSE_DRIVER_TYPE = ".type";
    // The prefix ends with '.' and the suffix starts with '.', so a name such
    // as "dataverse.files.type" passes both startsWith and endsWith although
    // it leaves no room for an id. Require at least one character in between
    // so substring() cannot be called with begin > end or yield an empty id.
    final int minPropertyLength = DATAVERSE_DRIVER_PREFIX.length() + DATAVERSE_DRIVER_TYPE.length() + 1;

    List<String> driverIds = new ArrayList<>();
    for (String property : System.getProperties().stringPropertyNames()) {
        if (property.length() >= minPropertyLength
                && property.startsWith(DATAVERSE_DRIVER_PREFIX)
                && property.endsWith(DATAVERSE_DRIVER_TYPE)) {
            String driverId = property.substring(DATAVERSE_DRIVER_PREFIX.length(), property.length() - DATAVERSE_DRIVER_TYPE.length());
            if (StorageIO.isDataverseAccessible(driverId)) {
                driverIds.add(driverId);
            }
        }
    }
    return driverIds;
}

/**
* This method checks to see if an overlay store is being used and, if so,
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/propertyFiles/Bundle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2836,6 +2836,10 @@ admin.api.deleteUser.failure.roleAssignments=the user is associated with role as
admin.api.deleteUser.failure.versionUser=the user has contributed to dataset version(s)
admin.api.deleteUser.failure.savedSearches=the user has created saved searches
admin.api.deleteUser.success=Authenticated User {0} deleted.
admin.api.datafiles.integrity.fixMissingFileSizes.noFilesFound=No datafiles found with missing filesizes for accessible storage drivers; exiting.
admin.api.datafiles.integrity.fixMissingFileSizes.found=Found {0} datafiles with missing filesizes.
admin.api.datafiles.integrity.fixMissingFileSizes.kickingOff=Kicking off an async job that will repair the files in the background.
admin.api.datafiles.integrity.fixMissingFileSizes.kickingOffWithLimit=Kicking off an async job that will repair the {0} files in the background.

#Files.java
files.api.metadata.update.duplicateFile=Filename already exists at {0}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,19 @@ void testGetStorageIdFromLocation() {
assertEquals("s3://18b39722140-50eb7d3c5ece",
DataAccess.getStorageIdFromLocation("s3://bucketname:10.5072/FK2/ABCDEF/18b39722140-50eb7d3c5ece"));
}

/**
 * Verifies that only stores readable by Dataverse are reported.
 * Every store is configured explicitly here (including the "file" store)
 * and the previous system property values are restored afterwards, so the
 * test neither depends on nor leaks global state.
 */
@Test
void testGetAccessibleStorageDriverIds() {
    java.util.Map<String, String> testProperties = new java.util.LinkedHashMap<>();
    testProperties.put("dataverse.files.file.type", "file");
    testProperties.put("dataverse.files.s3.type", "s3");
    testProperties.put("dataverse.files.s3.label", "S3 Storage");
    testProperties.put("dataverse.files.remote.type", "remote");
    testProperties.put("dataverse.files.remote.files-not-accessible-by-dataverse", "true");

    // Remember what was set before so the finally block can put it back
    // (a null value means the property was absent).
    java.util.Map<String, String> previousValues = new java.util.HashMap<>();
    for (String key : testProperties.keySet()) {
        previousValues.put(key, System.getProperty(key));
    }

    try {
        testProperties.forEach(System::setProperty);

        java.util.List<String> accessibleDrivers = DataAccess.getIdsForStorageDriversWithReadableFiles();

        assertTrue(accessibleDrivers.contains("file"), "Default 'file' driver should be accessible by default");
        assertTrue(accessibleDrivers.contains("s3"), "S3 driver should be accessible");
        assertFalse(accessibleDrivers.contains("remote"), "Remote driver with files-not-accessible-by-dataverse=true should not be accessible");
    } finally {
        previousValues.forEach((key, value) -> {
            if (value == null) {
                System.clearProperty(key);
            } else {
                System.setProperty(key, value);
            }
        });
    }
}
}
Loading