@@ -12,12 +12,10 @@
 from modelarrayio.cli.parser_utils import (
     add_backend_arg,
     add_cohort_arg,
-    add_output_hdf5_arg,
-    add_output_tiledb_arg,
+    add_output_arg,
     add_s3_workers_arg,
     add_scalar_columns_arg,
     add_storage_args,
-    add_tiledb_storage_args,
 )
 from modelarrayio.storage import h5_storage, tiledb_storage
 from modelarrayio.utils.cifti import (
@@ -34,68 +32,53 @@
 def cifti_to_h5(
     cohort_file,
     backend='hdf5',
-    output_hdf5='fixeldb.h5',
-    output_tiledb='arraydb.tdb',
+    output='fixelarray.h5',
     storage_dtype='float32',
     compression='gzip',
     compression_level=4,
     shuffle=True,
     chunk_voxels=0,
     target_chunk_mb=2.0,
-    tdb_compression='zstd',
-    tdb_compression_level=5,
-    tdb_shuffle=True,
-    tdb_tile_voxels=0,
-    tdb_target_tile_mb=2.0,
-    tdb_workers=None,
+    workers=None,
     scalar_columns=None,
     s3_workers=1,
 ):
-    """Load all CIFTI data and write to an HDF5 file with configurable storage.
+    """Load all CIFTI data and write to an HDF5 or TileDB file.
 
     Parameters
     ----------
     cohort_file : :obj:`str`
         Path to a csv with demographic info and paths to data
     backend : :obj:`str`
-        Backend to use for storage
-    output_hdf5 : :obj:`str`
-        Path to a new .h5 file to be written
-    output_tiledb : :obj:`str`
-        Path to a new .tdb file to be written
+        Backend to use for storage (``'hdf5'`` or ``'tiledb'``)
+    output : :obj:`str`
+        Output path. For the hdf5 backend, path to an .h5 file;
+        for the tiledb backend, path to a .tdb directory.
     storage_dtype : :obj:`str`
         Floating type to store values
     compression : :obj:`str`
-        HDF5 compression filter
+        Compression filter. ``gzip`` works for both backends;
+        ``lzf`` is HDF5-only; ``zstd`` is TileDB-only.
     compression_level : :obj:`int`
-        Gzip compression level (0-9)
+        Compression level (codec-dependent)
     shuffle : :obj:`bool`
-        Enable HDF5 shuffle filter
+        Enable shuffle filter
     chunk_voxels : :obj:`int`
-        Chunk size along the voxel axis
+        Chunk/tile size along the greyordinate axis (0 = auto)
     target_chunk_mb : :obj:`float`
-        Target chunk size in MiB when auto-computing chunk_voxels
-    tdb_compression : :obj:`str`
-        TileDB compression filter
-    tdb_compression_level : :obj:`int`
-        TileDB compression level
-    tdb_shuffle : :obj:`bool`
-        Enable TileDB shuffle filter
-    tdb_tile_voxels : :obj:`int`
-        Tile size along the voxel axis
-    tdb_target_tile_mb : :obj:`float`
-        Target tile size in MiB when auto-computing tdb_tile_voxels
-    tdb_workers : :obj:`int`
-        Number of workers to use for parallel loading
+        Target chunk/tile size in MiB when auto-computing the spatial axis length
+    workers : :obj:`int`
+        Maximum number of parallel TileDB write workers (``None`` = auto).
+        Has no effect when ``backend='hdf5'``.
     scalar_columns : :obj:`list`
         List of scalar columns to use
     s3_workers : :obj:`int`
-        Number of workers to use for parallel loading
+        Number of workers for parallel S3 downloads
 
     Returns
     -------
     status : :obj:`int`
-        Status of the operation. 0 if successful, 1 if failed.
+        0 if successful, 1 if failed.
     """
     cohort_df = pd.read_csv(cohort_file)
     cohort_long = _cohort_to_long_dataframe(cohort_df, scalar_columns=scalar_columns)
@@ -108,7 +91,7 @@ def cifti_to_h5(
     if backend == 'hdf5':
         scalars, last_brain_names = _load_cohort_cifti(cohort_long, s3_workers)
 
-        f = h5py.File(output_hdf5, 'w')
+        f = h5py.File(output, 'w')
 
         greyordinate_table, structure_names = brain_names_to_dataframe(last_brain_names)
         greyordinatesh5 = f.create_dataset(
@@ -136,9 +119,9 @@ def cifti_to_h5(
 
             h5_storage.write_rows_in_column_stripes(dset, scalars[scalar_name])
         f.close()
-        return int(not os.path.exists(output_hdf5))
+        return int(not os.path.exists(output))
     else:
-        os.makedirs(output_tiledb, exist_ok=True)
+        os.makedirs(output, exist_ok=True)
         if not scalar_sources:
             return 0
 
@@ -161,26 +144,26 @@ def _process_scalar_job(scalar_name, source_files):
                 return scalar_name
             num_items = rows[0].shape[0]
             tiledb_storage.create_empty_scalar_matrix_array(
-                output_tiledb,
+                output,
                 dataset_path,
                 num_subjects,
                 num_items,
                 storage_dtype=storage_dtype,
-                compression=tdb_compression,
-                compression_level=tdb_compression_level,
-                shuffle=tdb_shuffle,
-                tile_voxels=tdb_tile_voxels,
-                target_tile_mb=tdb_target_tile_mb,
+                compression=compression,
+                compression_level=compression_level,
+                shuffle=shuffle,
+                tile_voxels=chunk_voxels,
+                target_tile_mb=target_chunk_mb,
                 sources_list=source_files,
             )
             # write column names array for ModelArray compatibility
-            tiledb_storage.write_column_names(output_tiledb, scalar_name, source_files)
-            uri = os.path.join(output_tiledb, dataset_path)
+            tiledb_storage.write_column_names(output, scalar_name, source_files)
+            uri = os.path.join(output, dataset_path)
             tiledb_storage.write_rows_in_column_stripes(uri, rows)
             return scalar_name
 
         scalar_names = list(scalar_sources.keys())
-        worker_count = tdb_workers if isinstance(tdb_workers, int) and tdb_workers > 0 else None
+        worker_count = workers if isinstance(workers, int) and workers > 0 else None
         if worker_count is None:
             cpu_count = os.cpu_count() or 1
             worker_count = min(len(scalar_names), max(1, cpu_count))
@@ -207,20 +190,14 @@ def _process_scalar_job(scalar_name, source_files):
 def cifti_to_h5_main(
     cohort_file,
     backend='hdf5',
-    output_hdf5='fixelarray.h5',
-    output_tiledb='arraydb.tdb',
+    output='fixelarray.h5',
     storage_dtype='float32',
     compression='gzip',
     compression_level=4,
     shuffle=True,
     chunk_voxels=0,
     target_chunk_mb=2.0,
-    tdb_compression='zstd',
-    tdb_compression_level=5,
-    tdb_shuffle=True,
-    tdb_tile_voxels=0,
-    tdb_target_tile_mb=2.0,
-    tdb_workers=None,
+    workers=None,
     scalar_columns=None,
     s3_workers=1,
     log_level='INFO',
@@ -233,44 +210,37 @@ def cifti_to_h5_main(
     return cifti_to_h5(
         cohort_file=cohort_file,
         backend=backend,
-        output_hdf5=output_hdf5,
-        output_tiledb=output_tiledb,
+        output=output,
         storage_dtype=storage_dtype,
         compression=compression,
         compression_level=compression_level,
         shuffle=shuffle,
         chunk_voxels=chunk_voxels,
         target_chunk_mb=target_chunk_mb,
-        tdb_compression=tdb_compression,
-        tdb_compression_level=tdb_compression_level,
-        tdb_shuffle=tdb_shuffle,
-        tdb_tile_voxels=tdb_tile_voxels,
-        tdb_target_tile_mb=tdb_target_tile_mb,
-        tdb_workers=tdb_workers,
+        workers=workers,
         scalar_columns=scalar_columns,
         s3_workers=s3_workers,
     )
 
 
 def _parse_cifti_to_h5():
     parser = argparse.ArgumentParser(
-        description='Create a hdf5 file of CIDTI2 dscalar data',
+        description='Create a hdf5 file of CIFTI2 dscalar data',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
     add_cohort_arg(parser)
     add_scalar_columns_arg(parser)
-    add_output_hdf5_arg(parser, default_name='fixelarray.h5')
-    add_output_tiledb_arg(parser, default_name='arraydb.tdb')
+    add_output_arg(parser, default_name='fixelarray.h5')
     add_backend_arg(parser)
     add_storage_args(parser)
-    add_tiledb_storage_args(parser)
     parser.add_argument(
-        '--tdb-workers',
-        '--tdb_workers',
+        '--workers',
         type=int,
         help=(
-            'Maximum number of TileDB write workers. Default 0 (auto, uses CPU count). '
-            'Set to 1 to disable parallel writes.'
+            'Maximum number of parallel TileDB write workers. '
+            'Default 0 (auto, uses CPU count). '
+            'Set to 1 to disable parallel writes. '
+            'Has no effect when --backend=hdf5.'
         ),
         default=0,
     )
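
For reference, a minimal usage sketch of the consolidated interface after this change. Only the keyword names come from the signature in the diff; the import path, cohort CSV, and output paths are assumptions for illustration.

# Usage sketch (not part of the diff).
from modelarrayio.cli.cifti import cifti_to_h5_main  # hypothetical module path

# HDF5 backend: 'output' is a single .h5 file; 'workers' has no effect here.
status = cifti_to_h5_main(
    cohort_file='cohort.csv',      # placeholder cohort CSV
    backend='hdf5',
    output='fixelarray.h5',
    compression='gzip',
    compression_level=4,
)

# TileDB backend: 'output' is a .tdb directory; workers=None lets the worker
# count default to min(number of scalars, CPU count).
status = cifti_to_h5_main(
    cohort_file='cohort.csv',
    backend='tiledb',
    output='arraydb.tdb',
    compression='zstd',
    compression_level=5,
    workers=None,
)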