from aiohttp.web_exceptions import HTTPForbidden
from h5json.hdf5dtype import getItemSize
from h5json.hdf5dtype import createDataType
-from h5json.array_util import getNumElements, bytesToArray
+from h5json.array_util import getNumElements, bytesToArray, bytesArrayToList
from h5json.objid import isValidUuid, isSchema2Id, getS3Key, isS3ObjKey
from h5json.objid import getObjId, isValidChunkId, getCollectionForId
from h5json.filters import getFilters
-from h5json.shape_util import getShapeDims
+from h5json.shape_util import getShapeDims, getDataSize
from h5json.dset_util import getDatasetLayoutClass, getDatasetLayout, getChunkDims
from h5json.time_util import getNow

-from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator
+from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator, getChunkIndex, getChunkIds
from .util.dsetUtil import getHyperslabSelection
from .util.storUtil import getStorKeys, putStorJSONObj, getStorJSONObj
from .util.storUtil import deleteStorObj, getStorBytes, isStorObj
@@ -77,6 +77,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None):
        msg += f"{dset_id}"
        log.warn(msg)
        return
+
    type_json = dset_json["type"]
    item_size = getItemSize(type_json)
    if not getDatasetLayout(dset_json):
@@ -112,7 +113,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None):
    if layout_class == "H5D_CONTIGUOUS_REF":
        # In H5D_CONTIGUOUS_REF a non-compressed part of the HDF5 is divided
        # into equal size chunks, so we can just compute link bytes and num
-        # chunks based on the size of the coniguous dataset
+        # chunks based on the size of the contiguous dataset
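+        # Worked example (hypothetical numbers, for illustration only):
+        # a [4000, 1000] dataset with layout dims [1000, 1000] is covered
+        # by ceil(4000/1000) * ceil(1000/1000) = 4 equal size chunks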
        layout_dims = getChunkDims(dset_json)
        num_chunks = getNumChunks(selection, layout_dims)
        chunk_size = item_size
@@ -268,20 +269,26 @@ def scanRootCallback(app, s3keys):
    results = app["scanRoot_results"]
    scanRoot_keyset = app["scanRoot_keyset"]
    checksums = results["checksums"]
+
    for s3key in s3keys.keys():

        if not isS3ObjKey(s3key):
-            log.info(f"not s3obj key, ignoring: {s3key}")
+            log.info(f"scanRoot - not s3obj key, ignoring: {s3key}")
            continue
        if s3key in scanRoot_keyset:
            log.warn(f"scanRoot - dejavu for key: {s3key}")
            continue
        scanRoot_keyset.add(s3key)
-        msg = f"scanRoot adding key: {s3key} to keyset, "
+        msg = f"scanRoot - adding key: {s3key} to keyset, "
        msg += f"{len(scanRoot_keyset)} keys"
        log.debug(msg)

        objid = getObjId(s3key)
287+
288+ if objid in app ["deleted_ids" ]:
289+ log .debug (f"scanRoot - skipping deleted id: { objid } " )
290+ continue
291+
285292 etag = None
286293 obj_size = None
287294 lastModified = None
@@ -306,8 +313,15 @@ def scanRootCallback(app, s3keys):
            is_chunk = True
            results["num_chunks"] += 1
            results["allocated_bytes"] += obj_size
+            chunk_index = getChunkIndex(objid)
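+            # getChunkIndex is assumed to return the chunk's per-dimension
+            # index tuple, e.g. a chunk id like "c-<uuid>_0_0" -> (0, 0),
+            # so max(chunk_index) == 0 selects only a dataset's first chunk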
+            if max(chunk_index) == 0:
+                # save the first chunk if present - it will be used
+                # later to read dataset values for small datasets
+                results["obj_ids"].add(objid)
        else:
            results["metadata_bytes"] += obj_size
+            results["obj_ids"].add(objid)

        if is_chunk or getCollectionForId(objid) == "datasets":
            if is_chunk:
@@ -345,6 +359,144 @@ def scanRootCallback(app, s3keys):
        log.error(msg)


+async def _getDatasetValueJson(app, dset_id, dset_json, obj_ids, size_limit=None, bucket=None):
+    """ If the dataset size is less than size_limit and the chunk id for the dataset is
+    available, return a JSON representation of the dataset values. Otherwise, return None """
+
+    dims = getShapeDims(dset_json)
+    if dims is None:
+        return None  # null dataspace
+    if "type" not in dset_json:
+        msg = f"_getDatasetValueJson - expected to find type in dset_json for {dset_id}"
+        log.warn(msg)
+        return None
+    type_json = dset_json["type"]
+    item_size = getItemSize(type_json)
+    if item_size == "H5T_VARIABLE":
+        item_size = 1024  # make a guess for variable length types
+    dataset_size = getDataSize(dims, item_size)
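+    # e.g. dims of [10, 20] with an 8-byte item size gives
+    # dataset_size = 10 * 20 * 8 = 1600 bytes (illustrative numbers)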
+    if size_limit is not None and dataset_size > size_limit:
+        log.debug(f"_getDatasetValueJson - dataset size {dataset_size} exceeds limit {size_limit}")
+        return None
+
+    chunk_dims = getChunkDims(dset_json)
+    if not chunk_dims:
+        log.warn(f"_getDatasetValueJson - no layout found for dataset: {dset_id}")
+        return None
+    if chunk_dims != dims:
+        msg = f"_getDatasetValueJson - dataset layout {chunk_dims} does not match dims {dims} "
+        msg += f"for dataset: {dset_id}, ignoring"
+        log.warn(msg)
+        return None
+    select_all = getHyperslabSelection(dims)  # select the entire dataspace
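+    # getHyperslabSelection with just the dims is assumed to return a
+    # full-extent selection, e.g. (slice(0, 100, 1),) for dims of [100]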
+    chunk_ids = getChunkIds(dset_id, select_all, dims)
+    if len(chunk_ids) == 0:
+        log.debug(f"_getDatasetValueJson - no chunk ids found for dataset: {dset_id}")
+        return None
+    if len(chunk_ids) > 1:
+        log.debug(f"_getDatasetValueJson - more than one chunk id found for dataset: {dset_id}")
+        return None
+    chunk_id = chunk_ids[0]
+    if chunk_id not in obj_ids:
+        log.debug(f"_getDatasetValueJson - chunk id {chunk_id} not in scanned obj_ids")
+        return None
+    log.debug(f"_getDatasetValueJson - using chunk: {chunk_id} to get dataset value for {dset_id}")
+
+    # fetch the chunk - using getStorBytes since this will not be used with
+    # the chunk cache or chunk crawlers
+    # TBD: need parameters for s3path, s3offset, s3size for ref layouts
+    # regular store read
+
+    filters = getFilters(dset_json)
+    dt = createDataType(type_json)
+    filter_ops = getFilterOps(app, dset_id, filters, dtype=dt, chunk_shape=chunk_dims)
+
+    kwargs = {
+        "filter_ops": filter_ops,
+        "offset": None,
+        "length": None,
+        "bucket": bucket
+    }
+    s3key = getS3Key(chunk_id)
+
+    try:
+        chunk_bytes = await getStorBytes(app, s3key, **kwargs)
+    except HTTPNotFound:
+        log.warn(f"_getDatasetValueJson - HTTPNotFound for chunk {chunk_id} bucket: {bucket}")
+        return None
+    except HTTPForbidden:
+        log.warn(f"_getDatasetValueJson - HTTPForbidden for chunk {chunk_id} bucket: {bucket}")
+        return None
+    except HTTPInternalServerError:
+        msg = "_getDatasetValueJson - "
+        msg += f"HTTPInternalServerError for chunk {chunk_id} bucket: {bucket}"
+        log.warn(msg)
+        return None
+
+    if chunk_bytes is None:
+        msg = f"_getDatasetValueJson - read {chunk_id} bucket: {bucket} returned None"
+        log.warn(msg)
+        return None
+
+    arr = bytesToArray(chunk_bytes, dt, chunk_dims)
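+    # bytesToArray is assumed to return a numpy ndarray of shape chunk_dims;
+    # bytesArrayToList then converts it to JSON-serializable nested lists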
+
+    json_value = bytesArrayToList(arr)
+    log.debug(f"_getDatasetValueJson - returning {json_value}")
+
+    return json_value
+
+
+async def getConsolidatedMetaData(app, obj_ids, bucket=None):
+    # create a consolidated metadata summary for all objects in the domain
+    # return a dict of obj_ids to their metadata summaries
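+    # Illustrative shape of the result (made-up ids and values):
+    # {
+    #     "g-1234...": {"links": {...}, "attributes": {...}},
+    #     "d-5678...": {"type": {...}, "shape": {...}, "value": [[1, 2]]}
+    # }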
+    log.info("getConsolidatedMetaData - creating consolidated metadata summary")
+    consolidated_metadata = {}
+    for obj_id in obj_ids:
+        if isValidChunkId(obj_id):
+            # skip chunks - we may use the chunk later when processing its dataset object
+            continue
+        s3_key = getS3Key(obj_id)
+        try:
+            obj_json = await getStorJSONObj(app, s3_key, bucket=bucket)
+        except HTTPNotFound:
+            log.warn(f"HTTPNotFound for {s3_key} bucket: {bucket}")
+            continue
+        except HTTPForbidden:
+            log.warn(f"HTTPForbidden error for {s3_key} bucket: {bucket}")
+            continue
+        except HTTPInternalServerError:
+            msg = f"HTTPInternalServerError error for {s3_key} bucket: {bucket}"
+            log.warn(msg)
+            continue
+        log.debug(f"getConsolidatedMetaData - got json for obj_id: {obj_id}: {obj_json}")
+        # extract the relevant metadata
+        metadata_summary = {}
+        if "type" in obj_json:
+            metadata_summary["type"] = obj_json["type"]
+        if "shape" in obj_json:
+            metadata_summary["shape"] = obj_json["shape"]
+        if "attributes" in obj_json:
+            metadata_summary["attributes"] = obj_json["attributes"]
+        if "links" in obj_json:
+            metadata_summary["links"] = obj_json["links"]
+        if "creationProperties" in obj_json:
+            metadata_summary["creationProperties"] = obj_json["creationProperties"]
+        if getCollectionForId(obj_id) == "datasets":
+            log.debug("getConsolidatedMetaData - got dataset")
+            size_limit = 4096  # TBD - make this a config
+            kwargs = {"size_limit": size_limit, "bucket": bucket}
+            json_value = await _getDatasetValueJson(app, obj_id, obj_json, obj_ids, **kwargs)
+            if json_value is not None:
+                log.debug(f"adding dataset value to metadata summary for dataset: {obj_id}")
+                metadata_summary["value"] = json_value
+        else:
+            log.debug("getConsolidatedMetaData - not a dataset")
+
+        consolidated_metadata[obj_id] = metadata_summary
+    log.info("getConsolidatedMetaData - done creating consolidated metadata summary")
+    return consolidated_metadata
+
+
async def scanRoot(app, rootid, update=False, bucket=None):

    # iterate through all s3 keys under the given root.
@@ -386,7 +538,8 @@ async def scanRoot(app, rootid, update=False, bucket=None):
    results["num_linked_chunks"] = 0
    results["linked_bytes"] = 0
    results["logical_bytes"] = 0
-    results["checksums"] = {}  # map of objid to checksums
+    results["obj_ids"] = set()  # set of ids scanned (and the first chunk id for datasets)
+    results["checksums"] = {}  # map of objid to checksums
    results["bucket"] = bucket
    results["scan_start"] = getNow(app=app)
@@ -405,6 +558,9 @@ async def scanRoot(app, rootid, update=False, bucket=None):
    num_objects += len(results["datasets"])
    num_objects += results["num_chunks"]
    log.info(f"scanRoot - got {num_objects} keys for rootid: {rootid}")
+    obj_ids = results["obj_ids"]
+    log.info(f"scanRoot - got {len(obj_ids)} unique object ids")
+    log.debug(f"scanRoot - obj_ids: {obj_ids}")

    dataset_results = results["datasets"]
    for dsetid in dataset_results:
@@ -445,13 +601,29 @@ async def scanRoot(app, rootid, update=False, bucket=None):

    results["scan_complete"] = getNow(app=app)

+    # extract the obj_ids set - it won't go into .info.json
+    obj_ids = results["obj_ids"]
+    del results["obj_ids"]
+    log.debug(f"obj_ids set: {obj_ids}")
+
    if update:
        # write .info object back to S3
        info_key = root_prefix + ".info.json"
        msg = f"scanRoot - updating info key: {info_key} with results: "
        msg += f"{results}"
        log.info(msg)
        await putStorJSONObj(app, info_key, results, bucket=bucket)
+
+        # create a json summary of the objects in this domain
+        log.debug(f"Creating consolidated metadata summary for root {rootid}")
+        summary_key = root_prefix + ".summary.json"
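+        # with a root_prefix like "db/<root-uuid>/" (key layout assumed),
+        # the summary would be written to "db/<root-uuid>/.summary.json"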
+        summary_data = await getConsolidatedMetaData(app, obj_ids, bucket=bucket)
+        if summary_data:
+            log.info(f"Got consolidated metadata summary for root {rootid}")
+            log.debug(f"Summary data: {summary_data}")
+            await putStorJSONObj(app, summary_key, summary_data, bucket=bucket)
+        else:
+            log.info(f"No consolidated metadata summary for root {rootid}")
    return results
457629