diff --git a/src/thunder/config/dataset/starc9.yaml b/src/thunder/config/dataset/starc9.yaml new file mode 100644 index 0000000..762fc09 --- /dev/null +++ b/src/thunder/config/dataset/starc9.yaml @@ -0,0 +1,42 @@ +dataset_name: starc9 +nb_classes: 9 +base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/ +compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"] +nb_train_samples: 630000 +nb_val_samples: 18000 +nb_test_samples: 54000 +md5sum: "3010519777b46827fdb16e656ed74975" +image_sizes: [[256, 256]] +mpp: 0.5 +cancer_type: colorectal +classes: ["ADI", "LYM", "MUC", "MUS", "NCS", "NOR", "BLD", "FCT", "TUM"] +class_to_id: + ADI: 0 + LYM: 1 + MUC: 2 + MUS: 3 + NCS: 4 + NOR: 5 + BLD: 6 + FCT: 7 + TUM: 8 +id_to_class: + 0: ADI + 1: LYM + 2: MUC + 3: MUS + 4: NCS + 5: NOR + 6: BLD + 7: FCT + 8: TUM +id_to_classname: + 0: adipose tissue + 1: lymphoid tissue + 2: mucin + 3: muscle + 4: necrosis + 5: normal mucosa + 6: blood + 7: fibroconnective tissue + 8: tumor diff --git a/src/thunder/datasets/__init__.py b/src/thunder/datasets/__init__.py index b8a1839..04bc8a6 100644 --- a/src/thunder/datasets/__init__.py +++ b/src/thunder/datasets/__init__.py @@ -16,6 +16,7 @@ spider_colorectal, spider_skin, spider_thorax, + starc9, tcga_crc_msi, tcga_tils, tcga_uniform, diff --git a/src/thunder/datasets/data_splits.py b/src/thunder/datasets/data_splits.py index e770205..2009bfa 100644 --- a/src/thunder/datasets/data_splits.py +++ b/src/thunder/datasets/data_splits.py @@ -39,6 +39,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None: "spider_colorectal", "spider_skin", "spider_thorax", + "starc9", ] elif datasets[0] == "classification": datasets = [ @@ -58,6 +59,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None: "spider_colorectal", "spider_skin", "spider_thorax", + "starc9", ] elif datasets[0] == 
"""STARC-9 dataset support: Hugging Face download and data-split generation.

Split mapping used by this module:
    - train: Training_data_normalized
    - val:   Validation_data/STANFORD-CRC-HE-VAL-SMALL
    - test:  Validation_data/STANFORD-CRC-HE-VAL-LARGE

The CURATED-TCGA portion of the repository is intentionally ignored.
"""

from typing import List, Tuple

# Fixed 9-class label mapping; must stay in sync with
# src/thunder/config/dataset/starc9.yaml (class_to_id).
CLASS_TO_ID = {
    "ADI": 0,
    "LYM": 1,
    "MUC": 2,
    "MUS": 3,
    "NCS": 4,
    "NOR": 5,
    "BLD": 6,
    "FCT": 7,
    "TUM": 8,
}

# Image file extensions accepted when scanning class folders.
VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}


def download_starc9(root_folder: str) -> None:
    """
    Download the STARC-9 dataset from Hugging Face and extract all zip files.

    :param root_folder: destination folder for the dataset snapshot.
    """
    # Imported lazily so that merely importing this module does not require
    # huggingface_hub to be installed.
    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id="Path2AI/STARC-9",
        repo_type="dataset",
        local_dir=root_folder,
        # Deprecated (ignored) on recent huggingface_hub releases; kept for
        # compatibility with older versions that still honor it.
        local_dir_use_symlinks=False,
    )

    extract_all_zips(root_folder)


def extract_all_zips(root_dir: str) -> None:
    """
    Recursively extract every .zip found under root_dir, in place.

    :param root_dir: folder to scan recursively for zip archives.
    """
    import os

    from ..utils import unzip_file

    for current_root, _, files in os.walk(root_dir):
        for file_name in files:
            if not file_name.lower().endswith(".zip"):
                continue

            unzip_file(os.path.join(current_root, file_name), current_root)

            # STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip extracts to a folder
            # named "NORMALIZED"; rename it to the name the split generation
            # expects. Guarded so re-running the download is idempotent
            # (the unguarded rename raised on a second run).
            if file_name == "STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip":
                src = os.path.join(current_root, "NORMALIZED")
                dst = os.path.join(current_root, "STANFORD-CRC-HE-VAL-LARGE")
                if os.path.isdir(src) and not os.path.exists(dst):
                    os.rename(src, dst)


def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int]]:
    """
    Read all images from a directory structured like::

        class_root/
            ADI/
            LYM/
            ...

    Class folders are scanned in CLASS_TO_ID order, files within each folder
    in sorted order, so the output is deterministic.

    :param class_root: folder containing one sub-folder per class.
    :return: (absolute image paths, matching integer labels).
    :raises FileNotFoundError: if class_root or any expected class folder
        is missing.
    """
    from pathlib import Path

    images: List[str] = []
    labels: List[int] = []

    class_root_path = Path(class_root)
    if not class_root_path.exists():
        raise FileNotFoundError(f"Class root does not exist: {class_root}")

    missing_classes = [c for c in CLASS_TO_ID if not (class_root_path / c).exists()]
    if missing_classes:
        raise FileNotFoundError(
            f"Missing expected class folders under {class_root}: {missing_classes}"
        )

    for class_name, class_id in CLASS_TO_ID.items():
        class_dir = class_root_path / class_name
        for img_path in sorted(class_dir.rglob("*")):
            if img_path.is_file() and img_path.suffix.lower() in VALID_EXTS:
                images.append(str(img_path.resolve()))
                labels.append(class_id)

    return images, labels


def create_splits_starc9(base_folder: str, dataset_cfg: dict) -> None:
    """
    Generating data splits for the STARC-9 dataset.

    :param base_folder: path to the main folder storing datasets.
    :param dataset_cfg: dataset-specific config.
    """
    import os

    # Imported lazily to avoid circular imports with ..data_splits, which
    # itself imports this module's create function.
    from ...utils.constants import UtilsConstants
    from ...utils.utils import set_seed
    from ..data_splits import (
        check_dataset,
        create_few_shot_training_data,
        init_dict,
        save_dict,
    )

    # Fixed seed so few-shot sampling is reproducible.
    set_seed(UtilsConstants.DEFAULT_SEED.value)

    # Initializing dict
    starc9_data_splits = init_dict()

    # Getting folder paths
    dataset_root = os.path.join(base_folder, "starc9")
    train_root = os.path.join(dataset_root, "Training_data_normalized")
    val_root = os.path.join(
        dataset_root,
        "Validation_data",
        "STANFORD-CRC-HE-VAL-SMALL",
    )
    test_root = os.path.join(
        dataset_root,
        "Validation_data",
        "STANFORD-CRC-HE-VAL-LARGE",
    )

    # Collecting data
    train_images, train_labels = collect_images_from_class_root(train_root)
    val_images, val_labels = collect_images_from_class_root(val_root)
    test_images, test_labels = collect_images_from_class_root(test_root)

    # Updating dict
    starc9_data_splits["train"]["images"] = train_images
    starc9_data_splits["train"]["labels"] = train_labels
    starc9_data_splits["val"]["images"] = val_images
    starc9_data_splits["val"]["labels"] = val_labels
    starc9_data_splits["test"]["images"] = test_images
    starc9_data_splits["test"]["labels"] = test_labels

    # Few-shot training data
    starc9_data_splits = create_few_shot_training_data(starc9_data_splits)

    # Checking dataset characteristics (sample counts, md5, ...) against
    # the yaml config before saving.
    check_dataset(
        starc9_data_splits,
        dataset_cfg,
        base_folder,
    )

    # Saving dict
    save_dict(
        starc9_data_splits, os.path.join(base_folder, "data_splits", "starc9.json")
    )
"tcga_tils", "tcga_uniform",