Skip to content

Commit 375e994

Browse files
committed
Update tiledb parallel
1 parent 544e782 commit 375e994

5 files changed

Lines changed: 313 additions & 3 deletions

File tree

.github/workflows/ci.yml

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,32 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
pull_request:
6+
7+
jobs:
8+
test:
9+
runs-on: ubuntu-latest
10+
strategy:
11+
fail-fast: false
12+
matrix:
13+
python-version: ["3.11", "3.12"]
14+
steps:
15+
- name: Checkout
16+
uses: actions/checkout@v4
17+
18+
- name: Set up Python
19+
uses: actions/setup-python@v5
20+
with:
21+
python-version: ${{ matrix.python-version }}
22+
23+
- name: Install dependencies
24+
run: |
25+
python -m pip install --upgrade pip
26+
pip install -e .
27+
pip install pytest
28+
29+
- name: Run tests
30+
run: pytest -q
31+
32+

modelarrayio/cifti.py

Lines changed: 34 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import os
33
from collections import defaultdict
44
import os.path as op
5+
from concurrent.futures import ThreadPoolExecutor, as_completed
56
import numpy as np
67
import nibabel as nb
78
import pandas as pd
@@ -101,7 +102,8 @@ def write_storage(cohort_file, backend='hdf5', output_h5='fixeldb.h5', output_td
101102
tdb_compression_level=5,
102103
tdb_shuffle=True,
103104
tdb_tile_voxels=0,
104-
tdb_target_tile_mb=2.0):
105+
tdb_target_tile_mb=2.0,
106+
tdb_workers=None):
105107
"""
106108
Load all fixeldb data.
107109
Parameters
@@ -165,7 +167,8 @@ def write_storage(cohort_file, backend='hdf5', output_h5='fixeldb.h5', output_td
165167
else:
166168
base_uri = op.join(relative_root, output_tdb)
167169
os.makedirs(base_uri, exist_ok=True)
168-
for scalar_name in scalars.keys():
170+
scalar_names = list(scalars.keys())
171+
for scalar_name in scalar_names:
169172
num_subjects = len(scalars[scalar_name])
170173
num_items = scalars[scalar_name][0].shape[0] if num_subjects > 0 else 0
171174
dataset_path = f'scalars/{scalar_name}/values'
@@ -182,8 +185,35 @@ def write_storage(cohort_file, backend='hdf5', output_h5='fixeldb.h5', output_td
182185
target_tile_mb=tdb_target_tile_mb,
183186
sources_list=sources_lists[scalar_name],
184187
)
188+
189+
def _write_scalar_to_tdb(scalar_name):
190+
dataset_path = f'scalars/{scalar_name}/values'
185191
uri = op.join(base_uri, dataset_path)
186192
tdb_write_stripes(uri, scalars[scalar_name])
193+
194+
if not scalar_names:
195+
return 0
196+
197+
# Determine worker count: an explicit value takes precedence; otherwise fall back to the CPU count.
198+
worker_count = tdb_workers if isinstance(tdb_workers, int) and tdb_workers > 0 else None
199+
if worker_count is None:
200+
cpu_count = os.cpu_count() or 1
201+
worker_count = min(len(scalar_names), max(1, cpu_count))
202+
else:
203+
worker_count = min(len(scalar_names), worker_count)
204+
205+
if worker_count <= 1:
206+
for scalar_name in scalar_names:
207+
_write_scalar_to_tdb(scalar_name)
208+
else:
209+
desc = "TileDB scalars"
210+
with ThreadPoolExecutor(max_workers=worker_count) as executor:
211+
futures = {
212+
executor.submit(_write_scalar_to_tdb, scalar_name): scalar_name
213+
for scalar_name in scalar_names
214+
}
215+
for future in tqdm(as_completed(futures), total=len(futures), desc=desc):
216+
future.result()
187217
return 0
188218

189219

@@ -221,7 +251,8 @@ def main():
221251
tdb_compression_level=args.tdb_compression_level,
222252
tdb_shuffle=args.tdb_shuffle,
223253
tdb_tile_voxels=args.tdb_tile_voxels,
224-
tdb_target_tile_mb=args.tdb_target_tile_mb)
254+
tdb_target_tile_mb=args.tdb_target_tile_mb,
255+
tdb_workers=args.tdb_workers)
225256
return status
226257

227258

modelarrayio/parser.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,14 @@ def add_tiledb_storage_args(parser):
123123
type=float,
124124
help="Target tile size in MiB when auto-computing item tile length. Default 2.0",
125125
default=2.0)
126+
parser.add_argument(
127+
"--tdb-workers", "--tdb_workers",
128+
type=int,
129+
help=(
130+
"Maximum number of TileDB write workers. Default 0 (auto, uses CPU count). "
131+
"Set to 1 to disable parallel writes."
132+
),
133+
default=0)
126134
return parser
127135

128136

tests/test_cifti_cli.py

Lines changed: 103 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,103 @@
1+
import os
2+
import os.path as op
3+
import csv
4+
import subprocess
5+
import sys
6+
7+
import numpy as np
8+
import nibabel as nb
9+
from nibabel.cifti2.cifti2_axes import ScalarAxis, BrainModelAxis
10+
import h5py
11+
12+
13+
def _make_synthetic_cifti_dscalar(mask_bool: np.ndarray, values: np.ndarray) -> nb.Cifti2Image:
14+
# Build axes: single scalar and a brain model from a volumetric mask
15+
scalar_axis = ScalarAxis(["synthetic"]) # one scalar map
16+
brain_axis = BrainModelAxis.from_mask(mask_bool)
17+
header = nb.cifti2.Cifti2Header.from_axes((scalar_axis, brain_axis))
18+
# Data must be 2D: (nmaps, ngrayordinates)
19+
data_2d = values.reshape(1, -1).astype(np.float32)
20+
return nb.Cifti2Image(data_2d, header=header)
21+
22+
23+
def test_concifti_cli_creates_expected_hdf5(tmp_path):
24+
# Create a small volumetric mask for brain model axis
25+
vol_shape = (3, 3, 3)
26+
mask = np.zeros(vol_shape, dtype=bool)
27+
true_vox = [(0, 0, 0), (0, 1, 2), (1, 1, 1), (2, 2, 0), (2, 1, 2)]
28+
for ijk in true_vox:
29+
mask[ijk] = True
30+
n_grayordinates = int(mask.sum())
31+
32+
# Create two subjects with simple sequences
33+
subjects = []
34+
for sidx in range(2):
35+
vals = np.arange(n_grayordinates, dtype=np.float32) + sidx
36+
img = _make_synthetic_cifti_dscalar(mask, vals)
37+
path = tmp_path / f"sub-{sidx+1}.dscalar.nii"
38+
img.to_filename(path)
39+
subjects.append(str(path.name))
40+
41+
# Build cohort CSV
42+
cohort_csv = tmp_path / "cohort_cifti.csv"
43+
with cohort_csv.open("w", newline="") as f:
44+
writer = csv.DictWriter(f, fieldnames=["scalar_name", "source_file"])
45+
writer.writeheader()
46+
for sname in subjects:
47+
writer.writerow({
48+
"scalar_name": "THICK",
49+
"source_file": sname,
50+
})
51+
52+
out_h5 = tmp_path / "out_cifti.h5"
53+
cmd = [
54+
sys.executable,
55+
"-m",
56+
"modelarrayio.cifti",
57+
"--cohort-file", str(cohort_csv.name),
58+
"--relative-root", str(tmp_path),
59+
"--output-hdf5", str(out_h5.name),
60+
"--backend", "hdf5",
61+
"--dtype", "float32",
62+
"--compression", "gzip",
63+
"--compression-level", "1",
64+
"--shuffle", "True",
65+
"--chunk-voxels", "0",
66+
"--target-chunk-mb", "1.0",
67+
]
68+
env = os.environ.copy()
69+
proc = subprocess.run(cmd, cwd=str(tmp_path), env=env, capture_output=True, text=True)
70+
assert proc.returncode == 0, f"concifti failed: {proc.stdout}\n{proc.stderr}"
71+
assert op.exists(out_h5)
72+
73+
# Validate HDF5 contents
74+
with h5py.File(out_h5, "r") as h5:
75+
assert "greyordinates" in h5
76+
grey = np.array(h5["greyordinates"]) # stored as transposed table (2, N)
77+
assert grey.shape[0] == 2 # vertex_id, structure_id
78+
n = grey.shape[1]
79+
assert n == n_grayordinates
80+
81+
# structure_names present
82+
g = h5["greyordinates"]
83+
assert "structure_names" in g.attrs
84+
struct_names = g.attrs["structure_names"]
85+
assert len(struct_names) >= 1
86+
87+
# Scalars dataset
88+
dset = h5["scalars/THICK/values"]
89+
num_subjects, num_items = dset.shape
90+
assert num_subjects == 2
91+
assert num_items == n_grayordinates
92+
93+
# Column names exist and match subjects count
94+
grp = h5["scalars/THICK"]
95+
assert "column_names" in grp
96+
colnames = list(map(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), grp["column_names"][...]))
97+
assert len(colnames) == 2
98+
99+
# Spot-check a couple values
100+
assert np.isclose(float(dset[0, 0]), 0.0)
101+
assert np.isclose(float(dset[1, 0]), 1.0)
102+
103+

tests/test_voxels_cli.py

Lines changed: 136 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,136 @@
1+
import os
2+
import os.path as op
3+
import csv
4+
import subprocess
5+
import sys
6+
7+
import numpy as np
8+
import nibabel as nb
9+
import h5py
10+
11+
12+
def _make_nifti(data, affine=None):
13+
if affine is None:
14+
affine = np.eye(4)
15+
return nb.Nifti1Image(data.astype(np.float32), affine)
16+
17+
18+
def _ijk_value(i, j, k):
19+
return i * 100.0 + j * 10.0 + k * 1.0
20+
21+
22+
def test_convoxel_cli_creates_expected_hdf5(tmp_path):
23+
# Small synthetic volume
24+
shape = (5, 6, 7)
25+
group_mask = np.zeros(shape, dtype=bool)
26+
# Create a sparse pattern of true voxels
27+
true_coords = [(0, 1, 1), (1, 2, 3), (2, 4, 5), (3, 0, 0), (4, 5, 6), (1, 1, 4), (2, 2, 2)]
28+
for (i, j, k) in true_coords:
29+
group_mask[i, j, k] = True
30+
31+
# Save group mask
32+
group_mask_img = _make_nifti(group_mask.astype(np.uint8))
33+
group_mask_file = tmp_path / "group_mask.nii.gz"
34+
group_mask_img.to_filename(group_mask_file)
35+
36+
# Create two subjects with individual masks (one drops a voxel)
37+
subjects = []
38+
for sidx in range(2):
39+
# Scalar volume encodes f(i,j,k)
40+
scalar = np.zeros(shape, dtype=np.float32)
41+
for (i, j, k) in true_coords:
42+
scalar[i, j, k] = _ijk_value(i, j, k) + sidx # slight per-subject shift
43+
44+
# Individual mask: subject 1 omits one voxel
45+
indiv_mask = group_mask.copy()
46+
if sidx == 1:
47+
omit = true_coords[1]
48+
indiv_mask[omit] = False
49+
50+
scalar_img = _make_nifti(scalar)
51+
mask_img = _make_nifti(indiv_mask.astype(np.uint8))
52+
53+
scalar_path = tmp_path / f"sub-{sidx+1}_scalar.nii.gz"
54+
mask_path = tmp_path / f"sub-{sidx+1}_mask.nii.gz"
55+
scalar_img.to_filename(scalar_path)
56+
mask_img.to_filename(mask_path)
57+
subjects.append((str(scalar_path.name), str(mask_path.name)))
58+
59+
# Build cohort CSV (relative paths)
60+
cohort_csv = tmp_path / "cohort.csv"
61+
with cohort_csv.open("w", newline="") as f:
62+
writer = csv.DictWriter(f, fieldnames=["scalar_name", "source_file", "source_mask_file"])
63+
writer.writeheader()
64+
for sidx, (scalar_name, mask_name) in enumerate(subjects):
65+
writer.writerow({
66+
"scalar_name": "FA",
67+
"source_file": scalar_name,
68+
"source_mask_file": mask_name,
69+
})
70+
71+
# Run CLI using module to avoid PATH issues
72+
out_h5 = tmp_path / "out.h5"
73+
cmd = [
74+
sys.executable,
75+
"-m",
76+
"modelarrayio.voxels",
77+
"--group-mask-file", str(group_mask_file.name),
78+
"--cohort-file", str(cohort_csv.name),
79+
"--relative-root", str(tmp_path),
80+
"--output-hdf5", str(out_h5.name),
81+
"--backend", "hdf5",
82+
"--dtype", "float32",
83+
"--compression", "gzip",
84+
"--compression-level", "1",
85+
"--shuffle", "True",
86+
"--chunk-voxels", "0",
87+
"--target-chunk-mb", "1.0",
88+
]
89+
env = os.environ.copy()
90+
proc = subprocess.run(cmd, cwd=str(tmp_path), env=env, capture_output=True, text=True)
91+
assert proc.returncode == 0, f"convoxel failed: {proc.stdout}\n{proc.stderr}"
92+
assert op.exists(out_h5)
93+
94+
# Validate HDF5 contents
95+
with h5py.File(out_h5, "r") as h5:
96+
assert "voxels" in h5
97+
vox = np.array(h5["voxels"]) # stored as transposed table (3, N)
98+
assert vox.shape[0] == 3
99+
ijk = np.vstack(np.nonzero(group_mask)) # (3, N) ordered by i, then j, then k
100+
assert vox.shape[1] == ijk.shape[1]
101+
102+
# Check ordering matches nonzero order (allow exact match)
103+
assert np.array_equal(vox, ijk)
104+
105+
# Scalars dataset
106+
dset = h5["scalars/FA/values"]
107+
num_subjects, num_voxels = dset.shape
108+
assert num_subjects == 2
109+
assert num_voxels == ijk.shape[1]
110+
111+
# Column names exist and match subjects count
112+
grp = h5["scalars/FA"]
113+
assert "column_names" in grp
114+
colnames = list(map(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), grp["column_names"][...]))
115+
assert len(colnames) == 2
116+
117+
# Spot-check a voxel mapping (pick the third voxel)
118+
vidx = 2
119+
i, j, k = int(ijk[0, vidx]), int(ijk[1, vidx]), int(ijk[2, vidx])
120+
expected_s0 = _ijk_value(i, j, k) + 0
121+
expected_s1 = _ijk_value(i, j, k) + 1
122+
# If subject 1 omitted that voxel, it should be NaN (masked out becomes NaN on flatten)
123+
v0 = float(dset[0, vidx])
124+
v1 = float(dset[1, vidx])
125+
assert np.isclose(v0, expected_s0, equal_nan=True)
126+
# Determine whether subject 1 omitted this voxel
127+
omitted = False
128+
omit = true_coords[1]
129+
if (i, j, k) == omit:
130+
omitted = True
131+
if omitted:
132+
assert np.isnan(v1)
133+
else:
134+
assert np.isclose(v1, expected_s1, equal_nan=True)
135+
136+

0 commit comments

Comments (0)