Skip to content

Commit 862a2d1

Browse files
authored
Consolidate TileDB- and HDF5-related parameters (#28)
1 parent d2cb8df commit 862a2d1

10 files changed

Lines changed: 145 additions & 289 deletions

File tree

docs/examples/plot_fixel_workflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@
102102
# --index-file /home/username/myProject/data/FD/index.mif \
103103
# --directions-file /home/username/myProject/data/FD/directions.mif \
104104
# --cohort-file /home/username/myProject/data/cohort_FD.csv \
105-
# --output-hdf5 /home/username/myProject/data/FD.h5
105+
# --output /home/username/myProject/data/FD.h5
106106
#
107107
# This produces ``FD.h5`` in ``/home/username/myProject/data``. You can then use
108108
# `ModelArray <https://pennlinc.github.io/ModelArray/>`_ to run statistical analyses on it.

docs/examples/plot_voxel_workflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@
107107
# modelarrayio nifti-to-h5 \
108108
# --group-mask-file /home/username/myProject/data/group_mask.nii.gz \
109109
# --cohort-file /home/username/myProject/data/cohort_FA.csv \
110-
# --output-hdf5 /home/username/myProject/data/FA.h5
110+
# --output /home/username/myProject/data/FA.h5
111111
#
112112
# This produces ``FA.h5`` in ``/home/username/myProject/data``. You can then use
113113
# `ModelArray <https://pennlinc.github.io/ModelArray/>`_ to run statistical analyses on it.

src/modelarrayio/cli/cifti_to_h5.py

Lines changed: 42 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,10 @@
1212
from modelarrayio.cli.parser_utils import (
1313
add_backend_arg,
1414
add_cohort_arg,
15-
add_output_hdf5_arg,
16-
add_output_tiledb_arg,
15+
add_output_arg,
1716
add_s3_workers_arg,
1817
add_scalar_columns_arg,
1918
add_storage_args,
20-
add_tiledb_storage_args,
2119
)
2220
from modelarrayio.storage import h5_storage, tiledb_storage
2321
from modelarrayio.utils.cifti import (
@@ -34,68 +32,53 @@
3432
def cifti_to_h5(
3533
cohort_file,
3634
backend='hdf5',
37-
output_hdf5='fixeldb.h5',
38-
output_tiledb='arraydb.tdb',
35+
output='fixelarray.h5',
3936
storage_dtype='float32',
4037
compression='gzip',
4138
compression_level=4,
4239
shuffle=True,
4340
chunk_voxels=0,
4441
target_chunk_mb=2.0,
45-
tdb_compression='zstd',
46-
tdb_compression_level=5,
47-
tdb_shuffle=True,
48-
tdb_tile_voxels=0,
49-
tdb_target_tile_mb=2.0,
50-
tdb_workers=None,
42+
workers=None,
5143
scalar_columns=None,
5244
s3_workers=1,
5345
):
54-
"""Load all CIFTI data and write to an HDF5 file with configurable storage.
46+
"""Load all CIFTI data and write to an HDF5 or TileDB file.
5547
5648
Parameters
5749
----------
5850
cohort_file : :obj:`str`
5951
Path to a csv with demographic info and paths to data
6052
backend : :obj:`str`
61-
Backend to use for storage
62-
output_hdf5 : :obj:`str`
63-
Path to a new .h5 file to be written
64-
output_tiledb : :obj:`str`
65-
Path to a new .tdb file to be written
53+
Backend to use for storage (``'hdf5'`` or ``'tiledb'``)
54+
output : :obj:`str`
55+
Output path. For the hdf5 backend, path to an .h5 file;
56+
for the tiledb backend, path to a .tdb directory.
6657
storage_dtype : :obj:`str`
6758
Floating type to store values
6859
compression : :obj:`str`
69-
HDF5 compression filter
60+
Compression filter. ``gzip`` works for both backends;
61+
``lzf`` is HDF5-only; ``zstd`` is TileDB-only.
7062
compression_level : :obj:`int`
71-
Gzip compression level (0-9)
63+
Compression level (codec-dependent)
7264
shuffle : :obj:`bool`
73-
Enable HDF5 shuffle filter
65+
Enable shuffle filter
7466
chunk_voxels : :obj:`int`
75-
Chunk size along the voxel axis
67+
Chunk/tile size along the greyordinate axis (0 = auto)
7668
target_chunk_mb : :obj:`float`
77-
Target chunk size in MiB when auto-computing chunk_voxels
78-
tdb_compression : :obj:`str`
79-
TileDB compression filter
80-
tdb_compression_level : :obj:`int`
81-
TileDB compression level
82-
tdb_shuffle : :obj:`bool`
83-
Enable TileDB shuffle filter
84-
tdb_tile_voxels : :obj:`int`
85-
Tile size along the voxel axis
86-
tdb_target_tile_mb : :obj:`float`
87-
Target tile size in MiB when auto-computing tdb_tile_voxels
88-
tdb_workers : :obj:`int`
89-
Number of workers to use for parallel loading
69+
Target chunk/tile size in MiB when auto-computing the spatial axis length
70+
workers : :obj:`int`
71+
Maximum number of parallel TileDB write workers (``None`` = auto).
72+
Has no effect when ``backend='hdf5'``.
9073
scalar_columns : :obj:`list`
9174
List of scalar columns to use
9275
s3_workers : :obj:`int`
93-
Number of workers to use for parallel loading
76+
Number of workers for parallel S3 downloads
9477
9578
Returns
9679
-------
9780
status : :obj:`int`
98-
Status of the operation. 0 if successful, 1 if failed.
81+
0 if successful, 1 if failed.
9982
"""
10083
cohort_df = pd.read_csv(cohort_file)
10184
cohort_long = _cohort_to_long_dataframe(cohort_df, scalar_columns=scalar_columns)
@@ -108,7 +91,7 @@ def cifti_to_h5(
10891
if backend == 'hdf5':
10992
scalars, last_brain_names = _load_cohort_cifti(cohort_long, s3_workers)
11093

111-
f = h5py.File(output_hdf5, 'w')
94+
f = h5py.File(output, 'w')
11295

11396
greyordinate_table, structure_names = brain_names_to_dataframe(last_brain_names)
11497
greyordinatesh5 = f.create_dataset(
@@ -136,9 +119,9 @@ def cifti_to_h5(
136119

137120
h5_storage.write_rows_in_column_stripes(dset, scalars[scalar_name])
138121
f.close()
139-
return int(not os.path.exists(output_hdf5))
122+
return int(not os.path.exists(output))
140123
else:
141-
os.makedirs(output_tiledb, exist_ok=True)
124+
os.makedirs(output, exist_ok=True)
142125
if not scalar_sources:
143126
return 0
144127

@@ -161,26 +144,26 @@ def _process_scalar_job(scalar_name, source_files):
161144
return scalar_name
162145
num_items = rows[0].shape[0]
163146
tiledb_storage.create_empty_scalar_matrix_array(
164-
output_tiledb,
147+
output,
165148
dataset_path,
166149
num_subjects,
167150
num_items,
168151
storage_dtype=storage_dtype,
169-
compression=tdb_compression,
170-
compression_level=tdb_compression_level,
171-
shuffle=tdb_shuffle,
172-
tile_voxels=tdb_tile_voxels,
173-
target_tile_mb=tdb_target_tile_mb,
152+
compression=compression,
153+
compression_level=compression_level,
154+
shuffle=shuffle,
155+
tile_voxels=chunk_voxels,
156+
target_tile_mb=target_chunk_mb,
174157
sources_list=source_files,
175158
)
176159
# write column names array for ModelArray compatibility
177-
tiledb_storage.write_column_names(output_tiledb, scalar_name, source_files)
178-
uri = os.path.join(output_tiledb, dataset_path)
160+
tiledb_storage.write_column_names(output, scalar_name, source_files)
161+
uri = os.path.join(output, dataset_path)
179162
tiledb_storage.write_rows_in_column_stripes(uri, rows)
180163
return scalar_name
181164

182165
scalar_names = list(scalar_sources.keys())
183-
worker_count = tdb_workers if isinstance(tdb_workers, int) and tdb_workers > 0 else None
166+
worker_count = workers if isinstance(workers, int) and workers > 0 else None
184167
if worker_count is None:
185168
cpu_count = os.cpu_count() or 1
186169
worker_count = min(len(scalar_names), max(1, cpu_count))
@@ -207,20 +190,14 @@ def _process_scalar_job(scalar_name, source_files):
207190
def cifti_to_h5_main(
208191
cohort_file,
209192
backend='hdf5',
210-
output_hdf5='fixelarray.h5',
211-
output_tiledb='arraydb.tdb',
193+
output='fixelarray.h5',
212194
storage_dtype='float32',
213195
compression='gzip',
214196
compression_level=4,
215197
shuffle=True,
216198
chunk_voxels=0,
217199
target_chunk_mb=2.0,
218-
tdb_compression='zstd',
219-
tdb_compression_level=5,
220-
tdb_shuffle=True,
221-
tdb_tile_voxels=0,
222-
tdb_target_tile_mb=2.0,
223-
tdb_workers=None,
200+
workers=None,
224201
scalar_columns=None,
225202
s3_workers=1,
226203
log_level='INFO',
@@ -233,44 +210,37 @@ def cifti_to_h5_main(
233210
return cifti_to_h5(
234211
cohort_file=cohort_file,
235212
backend=backend,
236-
output_hdf5=output_hdf5,
237-
output_tiledb=output_tiledb,
213+
output=output,
238214
storage_dtype=storage_dtype,
239215
compression=compression,
240216
compression_level=compression_level,
241217
shuffle=shuffle,
242218
chunk_voxels=chunk_voxels,
243219
target_chunk_mb=target_chunk_mb,
244-
tdb_compression=tdb_compression,
245-
tdb_compression_level=tdb_compression_level,
246-
tdb_shuffle=tdb_shuffle,
247-
tdb_tile_voxels=tdb_tile_voxels,
248-
tdb_target_tile_mb=tdb_target_tile_mb,
249-
tdb_workers=tdb_workers,
220+
workers=workers,
250221
scalar_columns=scalar_columns,
251222
s3_workers=s3_workers,
252223
)
253224

254225

255226
def _parse_cifti_to_h5():
256227
parser = argparse.ArgumentParser(
257-
description='Create a hdf5 file of CIDTI2 dscalar data',
228+
description='Create a hdf5 file of CIFTI2 dscalar data',
258229
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
259230
)
260231
add_cohort_arg(parser)
261232
add_scalar_columns_arg(parser)
262-
add_output_hdf5_arg(parser, default_name='fixelarray.h5')
263-
add_output_tiledb_arg(parser, default_name='arraydb.tdb')
233+
add_output_arg(parser, default_name='fixelarray.h5')
264234
add_backend_arg(parser)
265235
add_storage_args(parser)
266-
add_tiledb_storage_args(parser)
267236
parser.add_argument(
268-
'--tdb-workers',
269-
'--tdb_workers',
237+
'--workers',
270238
type=int,
271239
help=(
272-
'Maximum number of TileDB write workers. Default 0 (auto, uses CPU count). '
273-
'Set to 1 to disable parallel writes.'
240+
'Maximum number of parallel TileDB write workers. '
241+
'Default 0 (auto, uses CPU count). '
242+
'Set to 1 to disable parallel writes. '
243+
'Has no effect when --backend=hdf5.'
274244
),
275245
default=0,
276246
)

0 commit comments

Comments (0)