@@ -258,22 +258,71 @@ def segy_to_mdio(
258258 channels but desires to store with wrapped channel index use:
259259 >>> grid_overrides={"AutoChannelWrap": True,
260260 "AutoChannelTraceQC": 1000000}
261- """
262- num_index = len(index_bytes)
263261
264- if chunksize is None:
265- if num_index == 1:
266- chunksize = (512,) * 2
262+ For cases with no well-defined trace header for indexing, a NonBinned
263+ grid override is provided. This creates the index by assigning an
264+ incrementing integer to each trace on a first-in, first-out basis.
265+ For example, a CDP- and Offset-keyed file might carry real-world
266+ offsets in the offset header, which would result in a very sparsely
267+ populated index. Instead, the following override creates a new index
268+ from 1 to N, where N is the number of offsets within a CDP ensemble.
269+ The auto-generated index is called "trace". Note the required
270+ "chunksize" parameter in the grid override: the non-binned ensemble
271+ chunksize is independent of the index dimension chunksizes, so it has
272+ to be specified in the grid override itself. Also note the lack of
273+ offset: only CDP is indexed (with its header type given), and chunksize
274+ covers only the CDP and sample dimensions. The configuration below
275+ yields 1 MB chunks (4 x 64 x 1024 samples x 4 bytes):
267276
268- elif num_index == 2:
269- chunksize = (64,) * 3
277+ >>> segy_to_mdio(
278+ ... segy_path="prefix/cdp_offset_file.segy",
279+ ... mdio_path_or_buffer="s3://bucket/cdp_offset_file.mdio",
280+ ... index_bytes=(21,),
281+ ... index_types=("int32",),
282+ ... index_names=("cdp",),
283+ ... chunksize=(4, 1024),
284+ ... grid_overrides={"NonBinned": True, "chunksize": 64},
285+ ... )
270286
271- else:
272- msg = (
273- f"Default chunking for {num_index + 1}-D seismic data is "
274- "not implemented yet. Please explicity define chunk sizes."
287+ A more complicated case, a 5D dataset that is not binned in the Offset
288+ and Azimuth directions, can be ingested as shown below. Note, however,
289+ that the Offset and Azimuth dimensions will be combined into a single
290+ "trace" dimension. The configuration below yields 4 MB chunks (4 x 4 x 64 x 1024 x 4 bytes):
291+
292+ >>> segy_to_mdio(
293+ ... segy_path="prefix/cdp_offset_file.segy",
294+ ... mdio_path_or_buffer="s3://bucket/cdp_offset_file.mdio",
295+ ... index_bytes=(189, 193),
296+ ... index_types=("int32", "int32"),
297+ ... index_names=("inline", "crossline"),
298+ ... chunksize=(4, 4, 1024),
299+ ... grid_overrides={"NonBinned": True, "chunksize": 64},
300+ ... )
301+
302+ For datasets with expected duplicate traces, use the following
303+ parameterization. It applies the same logic as NonBinned, but with a
304+ fixed chunksize of 1. The other keys are still important. The example
305+ below allows multiple traces per receiver (e.g. a reshoot).
306+
307+ >>> segy_to_mdio(
308+ ... segy_path="prefix/cdp_offset_file.segy",
309+ ... mdio_path_or_buffer="s3://bucket/cdp_offset_file.mdio",
310+ ... index_bytes=(9, 213, 13),
311+ ... index_types=("int32", "int16", "int32"),
312+ ... index_names=("shot", "cable", "chan"),
313+ ... chunksize=(8, 2, 256, 512),
314+ ... grid_overrides={"HasDuplicates": True},
315+ ... )
316+ """
317+ num_index = len(index_bytes)
318+
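# The user-provided chunksize is expected to have one entry per index
# dimension plus one for the trailing sample dimension.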
319+ if chunksize is not None:
320+ if len(chunksize) != len(index_bytes) + 1:
321+ message = (
322+ f"Length of chunks={len(chunksize)} must be ",
323+ f"equal to array dimensions={len(index_bytes) + 1}",
275324 )
276- raise NotImplementedError(msg)
325+ raise ValueError(message)
277326
278327 if storage_options is None:
279328 storage_options = {}
@@ -296,14 +345,15 @@ def segy_to_mdio(
296345
297346 index_types = parse_index_types(index_types, num_index)
298347
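# chunksize is both passed in and returned here so that grid overrides
# (e.g. NonBinned) can extend it with the chunksize of any auto-generated
# dimension.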
299- dimensions, index_headers = get_grid_plan(
348+ dimensions, chunksize, index_headers = get_grid_plan(
300349 segy_path=segy_path,
301350 segy_endian=endian,
302351 index_bytes=index_bytes,
303352 index_names=index_names,
304353 index_types=index_types,
305354 binary_header=binary_header,
306355 return_headers=True,
356+ chunksize=chunksize,
307357 grid_overrides=grid_overrides,
308358 )
309359
@@ -316,6 +366,10 @@ def segy_to_mdio(
316366
317367 # Check grid validity by comparing trace numbers
318368 if np.sum(grid.live_mask) != num_traces:
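# Log each dimension's range and the overall grid shape to help diagnose
# which header produced the unexpected grid before raising.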
369+ for dim_name in grid.dim_names:
370+ dim_min, dim_max = grid.get_min(dim_name), grid.get_max(dim_name)
371+ logger.warning(f"{dim_name} min: {dim_min} max: {dim_max}")
372+ logger.warning(f"Ingestion grid shape: {grid.shape}.")
319373 raise GridTraceCountError(np.sum(grid.live_mask), num_traces)
320374
321375 zarr_root = create_zarr_hierarchy(
@@ -358,20 +412,24 @@ def segy_to_mdio(
358412 )
359413
360414 if chunksize is None:
361- suffix = [str(x) for x in range(len(index_bytes) + 1)]
362- suffix = "".join(suffix)
415+ dim_count = len(index_headers) + 1
416+ if dim_count == 2:
417+ chunksize = (512,) * 2
363418
364- else:
365- if len(chunksize) != len(index_bytes) + 1:
366- message = (
367- f"Length of chunks={len(chunksize)} must be ",
368- f"equal to array dimensions={len(index_bytes) + 1}",
419+ elif dim_count == 3:
420+ chunksize = (64,) * 3
421+
422+ else:
423+ msg = (
424+ f"Default chunking for {dim_count}-D seismic data is "
425+ "not implemented yet. Please explicitly define chunk sizes."
369426 )
370- raise ValueError(message)
427+ raise NotImplementedError(msg)
371428
372- suffix = [
373- dim_chunksize if dim_chunksize > 0 else None for dim_chunksize in chunksize
374- ]
429+ suffix = [str(x) for x in range(dim_count)]
430+ suffix = "".join(suffix)
431+ else:
432+ suffix = [dim_chunks if dim_chunks > 0 else None for dim_chunks in chunksize]
375433 suffix = [str(idx) for idx, value in enumerate(suffix) if value is not None]
376434 suffix = "".join(suffix)
377435