Commit 5694c18

Merge pull request #248 from markspec/241-nonreg
Support for Seismic Data Ingestion Without Dimension-Specific Indexing
2 parents (2b57c70 + 3e401e2), commit 5694c18

9 files changed: 1938 additions & 1297 deletions

File tree

poetry.lock

Lines changed: 1329 additions & 1193 deletions (generated file; diff not rendered by default)

src/mdio/commands/segy.py

Lines changed: 54 additions & 8 deletions
@@ -39,7 +39,7 @@
     \b
     --header-names inline,crossline
     --header-locations 189,193
-    --header-lengths 4,4
+    --header-types int32,int32

     \b
     Our recommended chunk sizes are:
@@ -212,8 +212,8 @@ def segy_import(
     index byte locations (starts from 1) are the minimum amount
     of information needed to index the file. However, we suggest
     giving names to the index dimensions, and if needed providing
-    the header lengths if they are not standard. By default, all header
-    entries are assumed to be 4-byte long.
+    the header types if they are not standard. By default, all header
+    entries are assumed to be 4 bytes long (int32).

     The chunk size depends on the data type, however, it can be
     chosen to accommodate any workflow's access patterns. See examples
@@ -267,7 +267,7 @@ def segy_import(
     Chunks: 8 shots x 2 cables x 256 channels x 512 samples
     --header-locations 9,213,13
     --header-names shot,cable,chan
-    --header-lengths 4,2,4
+    --header-types int32,int16,int32
     --chunk-size 8,2,256,512

     We can override the dataset grid by the `grid_overrides` parameter.
@@ -278,8 +278,8 @@ def segy_import(
     a cable number and channel numbers are sequential (i.e. each cable
     doesn't start with channel number 1); we can tell MDIO to ingest
     this with the correct geometry by calculating cable numbers and
-    wrapped channel numbers. Note the missing byte location and word
-    length for the "cable" index.
+    wrapped channel numbers. Note the missing byte location and type
+    for the "cable" index.


     Usage:
@@ -289,7 +289,7 @@ def segy_import(
     Chunks: 8 shots x 2 cables x 256 channels x 512 samples
     --header-locations 9,None,13
     --header-names shot,cable,chan
-    --header-lengths 4,None,4
+    --header-types int32,None,int32
     --chunk-size 8,2,256,512
     --grid-overrides '{"ChannelWrap": True, "ChannelsPerCable": 800,
                        "CalculateCable": True}'
@@ -299,7 +299,7 @@ def segy_import(
     sequential (aka. unwrapped), we can still ingest it like this.
     --header-locations 9,213,13
     --header-names shot,cable,chan
-    --header-lengths 4,2,4
+    --header-types int32,int16,int32
     --chunk-size 8,2,256,512
     --grid-overrides '{"ChannelWrap": True, "ChannelsPerCable": 800}'
     \b
@@ -309,6 +309,52 @@ def segy_import(
     In cases where the user does not know if the input has unwrapped
     channels but desires to store with wrapped channel index use:
     --grid-overrides '{"AutoChannelWrap": True}'
+
+    \b
+    For cases with no well-defined trace header for indexing, a NonBinned
+    grid override is provided. This creates the index and assigns an
+    incrementing integer to each trace, first in, first out. For example,
+    a CDP- and offset-keyed file might carry real-world offsets in the
+    offset header, which would produce a very sparsely populated index.
+    Instead, the following override creates a new index from 1 to N, where
+    N is the number of offsets within a CDP ensemble. The auto-generated
+    index is called "trace". Note the required "chunksize" parameter in
+    the grid override: the non-binned dimension's chunk size is
+    independent of the index dimensions' chunk sizes, so it must be
+    specified in the grid override itself. Note also the lack of offset:
+    we index only CDP, provide the CDP header type, and give chunk sizes
+    only for the CDP and sample dimensions. The configuration below
+    yields 1MB chunks.
+    \b
+    --header-locations 21
+    --header-names cdp
+    --header-types int32
+    --chunk-size 4,1024
+    --grid-overrides '{"NonBinned": True, "chunksize": 64}'
+
+    \b
+    A more complicated case: a 5D dataset that is not binned in the
+    offset and azimuth directions can be ingested as below. The offset
+    and azimuth dimensions will be combined into a "trace" dimension.
+    The configuration below yields 1MB chunks.
+    \b
+    --header-locations 189,193
+    --header-names inline,crossline
+    --header-types int32,int32
+    --chunk-size 4,4,1024
+    --grid-overrides '{"NonBinned": True, "chunksize": 16}'
+
+    \b
+    For datasets with expected duplicate traces, we have the following
+    parameterization. This uses the same logic as NonBinned with a fixed
+    chunksize of 1. The other keys are still important. The example below
+    allows multiple traces per receiver (i.e. a reshoot).
+    \b
+    --header-locations 9,213,13
+    --header-names shot,cable,chan
+    --header-types int32,int16,int32
+    --chunk-size 8,2,256,512
+    --grid-overrides '{"HasDuplicates": True}'
     """
     mdio.segy_to_mdio(
         segy_path=input_segy_path,
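The 1MB figures quoted in the docstring are easy to verify: a chunk's byte size is the product of the per-dimension chunk lengths (including the grid-override "chunksize" for the non-binned "trace" dimension) times the bytes per sample. A minimal sketch, assuming 4-byte samples; the helper name is illustrative, not part of MDIO:

```python
from math import prod


def chunk_nbytes(chunks: tuple[int, ...], itemsize: int = 4) -> int:
    """Bytes per chunk: product of chunk lengths times bytes per sample."""
    return prod(chunks) * itemsize


# NonBinned CDP example: trace chunksize 64 plus --chunk-size 4,1024
print(chunk_nbytes((64, 4, 1024)))  # 1048576 bytes = 1 MiB

# 5D NonBinned example: trace chunksize 16 plus --chunk-size 4,4,1024
print(chunk_nbytes((16, 4, 4, 1024)))  # 1048576 bytes = 1 MiB
```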

src/mdio/converters/segy.py

Lines changed: 82 additions & 24 deletions
@@ -258,22 +258,71 @@ def segy_to_mdio(
     channels but desires to store with wrapped channel index use:
     >>> grid_overrides={"AutoChannelWrap": True,
                         "AutoChannelTraceQC": 1000000}
-    """
-    num_index = len(index_bytes)
-
-    if chunksize is None:
-        if num_index == 1:
-            chunksize = (512,) * 2
-
-        elif num_index == 2:
-            chunksize = (64,) * 3
-
-        else:
-            msg = (
-                f"Default chunking for {num_index + 1}-D seismic data is "
-                "not implemented yet. Please explicity define chunk sizes."
-            )
-            raise NotImplementedError(msg)
+
+    For cases with no well-defined trace header for indexing, a NonBinned
+    grid override is provided. This creates the index and assigns an
+    incrementing integer to each trace, first in, first out. For example,
+    a CDP- and offset-keyed file might carry real-world offsets in the
+    offset header, which would produce a very sparsely populated index.
+    Instead, the following override creates a new index from 1 to N, where
+    N is the number of offsets within a CDP ensemble. The auto-generated
+    index is called "trace". Note the required "chunksize" parameter in
+    the grid override: the non-binned dimension's chunk size is independent
+    of the index dimensions' chunk sizes, so it must be specified in the
+    grid override itself. Note also the lack of offset: we index only CDP,
+    provide the CDP header type, and give chunk sizes only for the CDP and
+    sample dimensions. The configuration below yields 1MB chunks:
+
+    >>> segy_to_mdio(
+    ...     segy_path="prefix/cdp_offset_file.segy",
+    ...     mdio_path_or_buffer="s3://bucket/cdp_offset_file.mdio",
+    ...     index_bytes=(21,),
+    ...     index_types=("int32",),
+    ...     index_names=("cdp",),
+    ...     chunksize=(4, 1024),
+    ...     grid_overrides={"NonBinned": True, "chunksize": 64},
+    ... )
+
+    A more complicated case: a 5D dataset that is not binned in the offset
+    and azimuth directions can be ingested as below. The offset and azimuth
+    dimensions will be combined into a "trace" dimension. The configuration
+    below yields 1MB chunks.
+
+    >>> segy_to_mdio(
+    ...     segy_path="prefix/cdp_offset_file.segy",
+    ...     mdio_path_or_buffer="s3://bucket/cdp_offset_file.mdio",
+    ...     index_bytes=(189, 193),
+    ...     index_types=("int32", "int32"),
+    ...     index_names=("inline", "crossline"),
+    ...     chunksize=(4, 4, 1024),
+    ...     grid_overrides={"NonBinned": True, "chunksize": 16},
+    ... )
+
+    For datasets with expected duplicate traces, we have the following
+    parameterization. This uses the same logic as NonBinned with a fixed
+    chunksize of 1. The other keys are still important. The example below
+    allows multiple traces per receiver (i.e. a reshoot).
+
+    >>> segy_to_mdio(
+    ...     segy_path="prefix/cdp_offset_file.segy",
+    ...     mdio_path_or_buffer="s3://bucket/cdp_offset_file.mdio",
+    ...     index_bytes=(9, 213, 13),
+    ...     index_types=("int32", "int16", "int32"),
+    ...     index_names=("shot", "cable", "chan"),
+    ...     chunksize=(8, 2, 256, 512),
+    ...     grid_overrides={"HasDuplicates": True},
+    ... )
+    """
+    num_index = len(index_bytes)
+
+    if chunksize is not None:
+        if len(chunksize) != len(index_bytes) + 1:
+            message = (
+                f"Length of chunks={len(chunksize)} must be "
+                f"equal to array dimensions={len(index_bytes) + 1}"
+            )
+            raise ValueError(message)

     if storage_options is None:
         storage_options = {}
@@ -296,14 +345,15 @@ def segy_to_mdio(

     index_types = parse_index_types(index_types, num_index)

-    dimensions, index_headers = get_grid_plan(
+    dimensions, chunksize, index_headers = get_grid_plan(
         segy_path=segy_path,
         segy_endian=endian,
         index_bytes=index_bytes,
         index_names=index_names,
         index_types=index_types,
         binary_header=binary_header,
         return_headers=True,
+        chunksize=chunksize,
         grid_overrides=grid_overrides,
     )

@@ -316,6 +366,10 @@ def segy_to_mdio(

     # Check grid validity by comparing trace numbers
     if np.sum(grid.live_mask) != num_traces:
+        for dim_name in grid.dim_names:
+            dim_min, dim_max = grid.get_min(dim_name), grid.get_max(dim_name)
+            logger.warning(f"{dim_name} min: {dim_min} max: {dim_max}")
+        logger.warning(f"Ingestion grid shape: {grid.shape}.")
         raise GridTraceCountError(np.sum(grid.live_mask), num_traces)

     zarr_root = create_zarr_hierarchy(
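The new warnings above fire when the built grid cannot account for every trace in the file. A toy sketch of the underlying check, with a NumPy boolean array standing in for `grid.live_mask` (the shape and counts here are made up for illustration):

```python
import numpy as np

# Toy live mask: True where a trace landed on the ingestion grid.
live_mask = np.zeros((4, 2, 10), dtype=bool)  # e.g. shot x cable x channel
live_mask[:, :, :9] = True  # 4 * 2 * 9 = 72 traces placed on the grid

num_traces = 80  # trace count implied by the SEG-Y file itself

if live_mask.sum() != num_traces:
    # The real converter logs each dimension's min/max and the grid shape,
    # then raises GridTraceCountError(np.sum(grid.live_mask), num_traces).
    print(f"grid holds {live_mask.sum()} traces, file has {num_traces}")
```

A mismatch like this usually means a wrong header byte location or a grid override that is needed but missing.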
@@ -358,20 +412,24 @@ def segy_to_mdio(
     )

     if chunksize is None:
-        suffix = [str(x) for x in range(len(index_bytes) + 1)]
-        suffix = "".join(suffix)
+        dim_count = len(index_headers) + 1
+        if dim_count == 2:
+            chunksize = (512,) * 2

-    else:
-        if len(chunksize) != len(index_bytes) + 1:
-            message = (
-                f"Length of chunks={len(chunksize)} must be ",
-                f"equal to array dimensions={len(index_bytes) + 1}",
+        elif dim_count == 3:
+            chunksize = (64,) * 3
+
+        else:
+            msg = (
+                f"Default chunking for {dim_count}-D seismic data is "
+                "not implemented yet. Please explicitly define chunk sizes."
             )
-            raise ValueError(message)
+            raise NotImplementedError(msg)

-        suffix = [
-            dim_chunksize if dim_chunksize > 0 else None for dim_chunksize in chunksize
-        ]
+        suffix = [str(x) for x in range(dim_count)]
+        suffix = "".join(suffix)
+    else:
+        suffix = [dim_chunks if dim_chunks > 0 else None for dim_chunks in chunksize]
         suffix = [str(idx) for idx, value in enumerate(suffix) if value is not None]
         suffix = "".join(suffix)
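The suffix juggling in this hunk encodes which dimensions are actually chunked into an access-pattern label: dimensions with a positive chunk size keep their index, and non-positive ones are dropped. A standalone sketch of that logic (the function name is illustrative, not MDIO's):

```python
def access_pattern_suffix(chunksize: tuple[int, ...]) -> str:
    """Indices of dimensions with chunk size > 0, concatenated into a label."""
    kept = [dim if dim > 0 else None for dim in chunksize]
    return "".join(str(idx) for idx, value in enumerate(kept) if value is not None)


print(access_pattern_suffix((8, 2, 256, 512)))  # "0123": all four dims chunked
print(access_pattern_suffix((8, 0, 256, 512)))  # "023": second dim dropped
```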
