From c1d2e8225c24e73f1e83d6ae8257d1cb87c36162 Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Wed, 15 Oct 2025 10:56:59 +0200 Subject: [PATCH 1/9] enh add dataset download from kaggle --- datasets/bsd500_bsd20.py | 20 ++++++++++++++------ datasets/bsd500_cbsd68.py | 17 +++++++++++++---- datasets/bsd500_imnet100.py | 15 +++++++++++---- datasets/cbsd68_set3c.py | 9 ++++++--- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/datasets/bsd500_bsd20.py b/datasets/bsd500_bsd20.py index 26150bf..e6102ae 100644 --- a/datasets/bsd500_bsd20.py +++ b/datasets/bsd500_bsd20.py @@ -1,4 +1,6 @@ from benchopt import BaseDataset, safe_import_context, config +from pathlib import Path +import os with safe_import_context() as import_ctx: import deepinv as dinv @@ -7,6 +9,7 @@ from benchmark_utils.image_dataset import ImageDataset from deepinv.physics import Downsampling, Denoising, GaussianNoise from deepinv.physics.generator import MotionBlurGenerator + import kagglehub class Dataset(BaseDataset): @@ -21,7 +24,7 @@ class Dataset(BaseDataset): 'img_size': [256], } - requirements = ["datasets"] + requirements = ["kagglehub"] def get_data(self): # TODO: Remove @@ -74,24 +77,29 @@ def get_data(self): transforms.ToTensor() ]) + path = Path(kagglehub.dataset_download( + "balraj98/berkeley-segmentation-dataset-500-bsds500")) + path = path / "images" + train_dataset = ImageDataset( - config.get_data_path("BSD500") / "train", + path / "train", transform=transform ) test_dataset = ImageDataset( - config.get_data_path("BSD500") / "val", + path / "val", transform=transform, num_images=20 ) + data_path = Path(os.path.dirname(os.path.abspath(__file__))) + data_path = data_path.parent / "data" + dinv_dataset_path = dinv.datasets.generate_dataset( train_dataset=train_dataset, test_dataset=test_dataset, physics=physics, - save_dir=config.get_data_path( - key="generated_datasets" - ) / "bsd500_bsd20", + save_dir=data_path / "bsd500_bsd20", dataset_filename=self.task, device=device ) diff --git a/datasets/bsd500_cbsd68.py b/datasets/bsd500_cbsd68.py index ccfab17..485859b 100644 --- a/datasets/bsd500_cbsd68.py +++ b/datasets/bsd500_cbsd68.py @@ -1,4 +1,6 @@ from benchopt import BaseDataset, safe_import_context, config +from pathlib import Path +import os with safe_import_context() as import_ctx: import deepinv as dinv @@ -11,6 +13,7 @@ ) from deepinv.physics import Denoising, GaussianNoise, Downsampling from deepinv.physics.generator import MotionBlurGenerator + import kagglehub class Dataset(BaseDataset): @@ -77,8 +80,13 @@ def get_data(self): transforms.ToTensor() ]) + path = Path(kagglehub.dataset_download( + "balraj98/berkeley-segmentation-dataset-500-bsds500")) + path = path / "images" + train_dataset = ImageDataset( - config.get_data_path("BSD500") / "train", transform=transform + path / "train", + transform=transform ) dataset_cbsd68 = load_dataset("deepinv/CBSD68") @@ -86,13 +94,14 @@ def get_data(self): dataset_cbsd68["train"], key="png", transform=transform ) + data_path = Path(os.path.dirname(os.path.abspath(__file__))) + data_path = data_path.parent / "data" + dinv_dataset_path = dinv.datasets.generate_dataset( train_dataset=train_dataset, test_dataset=test_dataset, physics=physics, - save_dir=config.get_data_path( - key="generated_datasets" - ) / "bsd500_cbsd68", + save_dir=data_path / "bsd500_cbsd68", dataset_filename=self.task, device=device ) diff --git a/datasets/bsd500_imnet100.py b/datasets/bsd500_imnet100.py index 3b277fc..da6ff5a 100644 --- a/datasets/bsd500_imnet100.py +++ b/datasets/bsd500_imnet100.py @@ -1,4 +1,6 @@ from benchopt import BaseDataset, safe_import_context, config +from pathlib import Path +import os with safe_import_context() as import_ctx: import deepinv as dinv @@ -11,6 +13,7 @@ from deepinv.physics import Downsampling, Denoising, GaussianNoise from deepinv.physics.generator import MotionBlurGenerator from datasets import load_dataset + import kagglehub class Dataset(BaseDataset): @@ -77,8 +80,11 @@ def get_data(self): transforms.ToTensor() ]) + path = Path(kagglehub.dataset_download( + "balraj98/berkeley-segmentation-dataset-500-bsds500")) + train_dataset = ImageDataset( - config.get_data_path("BSD500") / "train", + path / "train", transform=transform ) @@ -89,13 +95,14 @@ def get_data(self): transform=transform ) + data_path = Path(os.path.dirname(os.path.abspath(__file__))) + data_path = data_path.parent / "data" + dinv_dataset_path = dinv.datasets.generate_dataset( train_dataset=train_dataset, test_dataset=test_dataset, physics=physics, - save_dir=config.get_data_path( - key="generated_datasets" - ) / "bsd500_imnet100", + save_dir=data_path / "bsd500_imnet100", dataset_filename=self.task, device=device ) diff --git a/datasets/cbsd68_set3c.py b/datasets/cbsd68_set3c.py index 325c160..ca426ba 100644 --- a/datasets/cbsd68_set3c.py +++ b/datasets/cbsd68_set3c.py @@ -1,4 +1,6 @@ from benchopt import BaseDataset, safe_import_context, config +import os +from pathlib import Path with safe_import_context() as import_ctx: import deepinv as dinv @@ -87,13 +89,14 @@ def get_data(self): dataset_Set3c["train"], key="image", transform=transform ) + data_path = Path(os.path.dirname(os.path.abspath(__file__))) + data_path = data_path.parent / "data" + dinv_dataset_path = dinv.datasets.generate_dataset( train_dataset=train_dataset, test_dataset=test_dataset, physics=physics, - save_dir=config.get_data_path( - key="generated_datasets" - ) / "sbsd68_set3c", + save_dir=data_path / "sbsd68_set3c", dataset_filename=self.task, device=device ) From ba4f5737a0dc135c3969915b010d32420fb03b41 Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Wed, 15 Oct 2025 11:02:43 +0200 Subject: [PATCH 2/9] CLN rm config --- config.yml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 config.yml diff --git a/config.yml b/config.yml deleted file mode 100644 index e16d5c7..0000000 --- a/config.yml +++ /dev/null @@ -1,5 +0,0 @@ -data_home: /Users/melvinenargeot/Data/benchmark_inverse_problems -data_paths: - generated_datasets: generated_datasets - generated_trainings: generated_training - BSD500: BSD500/BSR/BSDS500/data/images \ No newline at end of file From dfa56f909e39c48c67c18abe22d6232e177d76da Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Wed, 15 Oct 2025 11:11:23 +0200 Subject: [PATCH 3/9] FIX add requirements --- datasets/bsd500_cbsd68.py | 2 +- datasets/bsd500_imnet100.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/datasets/bsd500_cbsd68.py b/datasets/bsd500_cbsd68.py index 485859b..19e3325 100644 --- a/datasets/bsd500_cbsd68.py +++ b/datasets/bsd500_cbsd68.py @@ -28,7 +28,7 @@ class Dataset(BaseDataset): 'img_size': [256], } - requirements = ["datasets"] + requirements = ["datasets", "kagglehub"] def get_data(self): # TODO: Remove diff --git a/datasets/bsd500_imnet100.py b/datasets/bsd500_imnet100.py index da6ff5a..ad4320d 100644 --- a/datasets/bsd500_imnet100.py +++ b/datasets/bsd500_imnet100.py @@ -28,7 +28,7 @@ class Dataset(BaseDataset): 'img_size': [256], } - requirements = ["datasets"] + requirements = ["datasets", "kagglehub"] def get_data(self): # TODO: Remove @@ -82,6 +82,7 @@ def get_data(self): path = Path(kagglehub.dataset_download( "balraj98/berkeley-segmentation-dataset-500-bsds500")) + path = path / "images" train_dataset = ImageDataset( path / "train", From 15738d05543a053b811ef3caf7f4a6a636054989 Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Wed, 15 Oct 2025 11:16:25 +0200 Subject: [PATCH 4/9] FIX linting --- datasets/bsd500_bsd20.py | 2 +- datasets/bsd500_cbsd68.py | 2 +- datasets/bsd500_imnet100.py | 2 +- datasets/cbsd68_set3c.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datasets/bsd500_bsd20.py b/datasets/bsd500_bsd20.py index e6102ae..3beb009 100644 --- a/datasets/bsd500_bsd20.py +++ b/datasets/bsd500_bsd20.py @@ -1,4 +1,4 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, safe_import_context from pathlib import Path import os diff --git a/datasets/bsd500_cbsd68.py b/datasets/bsd500_cbsd68.py index 19e3325..7bcc8e5 100644 --- a/datasets/bsd500_cbsd68.py +++ b/datasets/bsd500_cbsd68.py @@ -1,4 +1,4 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, safe_import_context from pathlib import Path import os diff --git a/datasets/bsd500_imnet100.py b/datasets/bsd500_imnet100.py index ad4320d..7db3724 100644 --- a/datasets/bsd500_imnet100.py +++ b/datasets/bsd500_imnet100.py @@ -1,4 +1,4 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, safe_import_context from pathlib import Path import os diff --git a/datasets/cbsd68_set3c.py b/datasets/cbsd68_set3c.py index ca426ba..05b4da0 100644 --- a/datasets/cbsd68_set3c.py +++ b/datasets/cbsd68_set3c.py @@ -1,4 +1,4 @@ -from benchopt import BaseDataset, safe_import_context, config +from benchopt import BaseDataset, safe_import_context import os from pathlib import Path From be6c568c6410b696aa0976bdd83b7eb4899ffd9c Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Thu, 16 Oct 2025 17:23:58 +0200 Subject: [PATCH 5/9] FIX: download bsd500 from deepinv --- benchmark_utils/image_dataset.py | 33 -------- datasets/bsd500_bsd20.py | 126 ------------------------------- datasets/bsd500_cbsd68.py | 22 ++---- datasets/bsd500_imnet100.py | 22 ++---- 4 files changed, 12 insertions(+), 191 deletions(-) delete mode 100644 benchmark_utils/image_dataset.py delete mode 100644 datasets/bsd500_bsd20.py diff --git a/benchmark_utils/image_dataset.py b/benchmark_utils/image_dataset.py deleted file mode 100644 index 524ac74..0000000 --- a/benchmark_utils/image_dataset.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import random - -from torch.utils.data import Dataset -from typing import Callable -from PIL import Image - - -class ImageDataset(Dataset): - def __init__(self, - folder: str, - transform: Callable = None, - num_images=None): - self.folder = folder - self.transform = transform - self.files = [f for f in os.listdir(folder) if f.endswith(( - '.png', '.jpg', '.jpeg'))] - - if num_images is not None: - self.files.sort() - self.files = random.sample(self.files, num_images) - - def __len__(self): - return len(self.files) - - def __getitem__(self, idx): - img_name = os.path.join(self.folder, self.files[idx]) - image = Image.open(img_name) - - if self.transform: - image = self.transform(image) - - return image diff --git a/datasets/bsd500_bsd20.py b/datasets/bsd500_bsd20.py deleted file mode 100644 index 3beb009..0000000 --- a/datasets/bsd500_bsd20.py +++ /dev/null @@ -1,126 +0,0 @@ -from benchopt import BaseDataset, safe_import_context -from pathlib import Path -import os - -with safe_import_context() as import_ctx: - import deepinv as dinv - import torch - from torchvision import transforms - from benchmark_utils.image_dataset import ImageDataset - from deepinv.physics import Downsampling, Denoising, GaussianNoise - from deepinv.physics.generator import MotionBlurGenerator - import kagglehub - - -class Dataset(BaseDataset): - - name = "BSD500_BSD20" - - parameters = { - 'task': ['denoising', - 'gaussian-debluring', - 'motion-debluring', - 'SRx4'], - 'img_size': [256], - } - - requirements = ["kagglehub"] - - def get_data(self): - # TODO: Remove - device = ( - dinv.utils.get_freer_gpu()) if torch.cuda.is_available() else "cpu" - - n_channels = 3 - - if self.task == "denoising": - noise_level_img = 0.03 - physics = Denoising(GaussianNoise(sigma=noise_level_img)) - elif self.task == "gaussian-debluring": - filter_torch = dinv.physics.blur.gaussian_blur(sigma=(3, 3)) - noise_level_img = 0.03 - n_channels = 3 - - physics = dinv.physics.BlurFFT( - img_size=(n_channels, self.img_size, self.img_size), - filter=filter_torch, - noise_model=dinv.physics.GaussianNoise(sigma=noise_level_img), - device=device - ) - elif self.task == "motion-debluring": - psf_size = 31 - n_channels = 3 - motion_generator = MotionBlurGenerator( - (psf_size, psf_size), - device=device - ) - - filters = motion_generator.step(batch_size=1) - - physics = dinv.physics.BlurFFT( - img_size=(n_channels, self.img_size, self.img_size), - filter=filters["filter"], - device=device - ) - elif self.task == "SRx4": - physics = Downsampling(img_size=(n_channels, - self.img_size, - self.img_size), - filter="bicubic", - factor=4, - device=device) - else: - raise Exception("Unknown task") - - transform = transforms.Compose([ - transforms.Resize((self.img_size, self.img_size)), - transforms.ToTensor() - ]) - - path = Path(kagglehub.dataset_download( - "balraj98/berkeley-segmentation-dataset-500-bsds500")) - path = path / "images" - - train_dataset = ImageDataset( - path / "train", - transform=transform - ) - - test_dataset = ImageDataset( - path / "val", - transform=transform, - num_images=20 - ) - - data_path = Path(os.path.dirname(os.path.abspath(__file__))) - data_path = data_path.parent / "data" - - dinv_dataset_path = dinv.datasets.generate_dataset( - train_dataset=train_dataset, - test_dataset=test_dataset, - physics=physics, - save_dir=data_path / "bsd500_bsd20", - dataset_filename=self.task, - device=device - ) - - train_dataset = dinv.datasets.HDF5Dataset( - path=dinv_dataset_path, train=True - ) - test_dataset = dinv.datasets.HDF5Dataset( - path=dinv_dataset_path, train=False - ) - - x, y = train_dataset[0] - dinv.utils.plot([x.unsqueeze(0), y.unsqueeze(0)]) - - x, y = test_dataset[0] - dinv.utils.plot([x.unsqueeze(0), y.unsqueeze(0)]) - - return dict( - train_dataset=train_dataset, - test_dataset=test_dataset, - physics=physics, - dataset_name="BSD68", - task_name=self.task - ) diff --git a/datasets/bsd500_cbsd68.py b/datasets/bsd500_cbsd68.py index 7bcc8e5..c77fcc2 100644 --- a/datasets/bsd500_cbsd68.py +++ b/datasets/bsd500_cbsd68.py @@ -1,19 +1,16 @@ from benchopt import BaseDataset, safe_import_context -from pathlib import Path -import os +from benchopt.config import get_data_path with safe_import_context() as import_ctx: import deepinv as dinv import torch from torchvision import transforms from datasets import load_dataset - from benchmark_utils.image_dataset import ImageDataset from benchmark_utils.hugging_face_torch_dataset import ( HuggingFaceTorchDataset ) from deepinv.physics import Denoising, GaussianNoise, Downsampling from deepinv.physics.generator import MotionBlurGenerator - import kagglehub class Dataset(BaseDataset): @@ -28,7 +25,7 @@ class Dataset(BaseDataset): 'img_size': [256], } - requirements = ["datasets", "kagglehub"] + requirements = ["datasets"] def get_data(self): # TODO: Remove @@ -80,13 +77,9 @@ def get_data(self): transforms.ToTensor() ]) - path = Path(kagglehub.dataset_download( - "balraj98/berkeley-segmentation-dataset-500-bsds500")) - path = path / "images" - - train_dataset = ImageDataset( - path / "train", - transform=transform + path = get_data_path("BSD500") + train_dataset = dinv.datasets.BSDS500( + path, download=True, splits='train', transform=transform ) dataset_cbsd68 = load_dataset("deepinv/CBSD68") @@ -94,14 +87,11 @@ def get_data(self): dataset_cbsd68["train"], key="png", transform=transform ) - data_path = Path(os.path.dirname(os.path.abspath(__file__))) - data_path = data_path.parent / "data" - dinv_dataset_path = dinv.datasets.generate_dataset( train_dataset=train_dataset, test_dataset=test_dataset, physics=physics, - save_dir=data_path / "bsd500_cbsd68", + save_dir=get_data_path("bsd500_cbsd68"), dataset_filename=self.task, device=device ) diff --git a/datasets/bsd500_imnet100.py b/datasets/bsd500_imnet100.py index 7db3724..2b797e1 100644 --- a/datasets/bsd500_imnet100.py +++ b/datasets/bsd500_imnet100.py @@ -1,19 +1,16 @@ from benchopt import BaseDataset, safe_import_context -from pathlib import Path -import os +from benchopt.config import get_data_path with safe_import_context() as import_ctx: import deepinv as dinv import torch from torchvision import transforms - from benchmark_utils.image_dataset import ImageDataset from benchmark_utils.hugging_face_torch_dataset import ( HuggingFaceTorchDataset ) from deepinv.physics import Downsampling, Denoising, GaussianNoise from deepinv.physics.generator import MotionBlurGenerator from datasets import load_dataset - import kagglehub class Dataset(BaseDataset): @@ -28,7 +25,7 @@ class Dataset(BaseDataset): 'img_size': [256], } - requirements = ["datasets", "kagglehub"] + requirements = ["datasets"] def get_data(self): # TODO: Remove @@ -80,13 +77,9 @@ def get_data(self): transforms.ToTensor() ]) - path = Path(kagglehub.dataset_download( - "balraj98/berkeley-segmentation-dataset-500-bsds500")) - path = path / "images" - - train_dataset = ImageDataset( - path / "train", - transform=transform + path = get_data_path("BSD500") + train_dataset = dinv.datasets.BSDS500( + path, download=True, splits='train', transform=transform ) dataset_miniImnet100 = load_dataset("mterris/miniImnet100") @@ -96,14 +89,11 @@ def get_data(self): transform=transform ) - data_path = Path(os.path.dirname(os.path.abspath(__file__))) - data_path = data_path.parent / "data" - dinv_dataset_path = dinv.datasets.generate_dataset( train_dataset=train_dataset, test_dataset=test_dataset, physics=physics, - save_dir=data_path / "bsd500_imnet100", + save_dir=get_data_path("bsd500_imnet100"), dataset_filename=self.task, device=device ) From fb7608835dd21edd3d5a3c6145c34776e97229ee Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Thu, 16 Oct 2025 17:26:21 +0200 Subject: [PATCH 6/9] FIX use get_data_path for all datasets --- datasets/cbsd68_set3c.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datasets/cbsd68_set3c.py b/datasets/cbsd68_set3c.py index 05b4da0..fadd716 100644 --- a/datasets/cbsd68_set3c.py +++ b/datasets/cbsd68_set3c.py @@ -1,6 +1,5 @@ from benchopt import BaseDataset, safe_import_context -import os -from pathlib import Path +from benchopt.config import get_data_path with safe_import_context() as import_ctx: import deepinv as dinv @@ -89,14 +88,11 @@ def get_data(self): dataset_Set3c["train"], key="image", transform=transform ) - data_path = Path(os.path.dirname(os.path.abspath(__file__))) - data_path = data_path.parent / "data" - dinv_dataset_path = dinv.datasets.generate_dataset( train_dataset=train_dataset, test_dataset=test_dataset, physics=physics, - save_dir=data_path / "sbsd68_set3c", + save_dir=get_data_path("cbsd68_set3c"), dataset_filename=self.task, device=device ) From 9e23a5acb5156ade4dfd733bd06742f5ce55379d Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Thu, 16 Oct 2025 17:41:57 +0200 Subject: [PATCH 7/9] TST skip get_data on mac --- test_config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_config.py b/test_config.py index ba1b960..faf841d 100644 --- a/test_config.py +++ b/test_config.py @@ -11,3 +11,10 @@ def check_test_solver_install(solver_class): detecting the situation. """ pass + + +def check_test_dataset_get_data(dataset_class): + if sys.platform == "darwin": + pytest.skip( + "Skipping test_dataset_get_data on MacOS." + ) From 5974a8542045d062c46face1c13a9ee3a0203c23 Mon Sep 17 00:00:00 2001 From: Hippolyte Verninas Date: Fri, 17 Oct 2025 09:50:38 +0200 Subject: [PATCH 8/9] FIX use full BSD500 dataset --- datasets/bsd500_cbsd68.py | 2 +- datasets/bsd500_imnet100.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/bsd500_cbsd68.py b/datasets/bsd500_cbsd68.py index c77fcc2..bdc5e8a 100644 --- a/datasets/bsd500_cbsd68.py +++ b/datasets/bsd500_cbsd68.py @@ -79,7 +79,7 @@ def get_data(self): path = get_data_path("BSD500") train_dataset = dinv.datasets.BSDS500( - path, download=True, splits='train', transform=transform + path, download=True, transform=transform ) dataset_cbsd68 = load_dataset("deepinv/CBSD68") diff --git a/datasets/bsd500_imnet100.py b/datasets/bsd500_imnet100.py index 2b797e1..d0f6959 100644 --- a/datasets/bsd500_imnet100.py +++ b/datasets/bsd500_imnet100.py @@ -79,7 +79,7 @@ def get_data(self): path = get_data_path("BSD500") train_dataset = dinv.datasets.BSDS500( - path, download=True, splits='train', transform=transform + path, download=True, transform=transform ) dataset_miniImnet100 = load_dataset("mterris/miniImnet100") From a82f66cbdda6b11b898c62f9abae072a36975c4d Mon Sep 17 00:00:00 2001 From: Thomas Moreau Date: Fri, 17 Oct 2025 10:19:40 +0200 Subject: [PATCH 9/9] Update test_config.py --- test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_config.py b/test_config.py index faf841d..89e5a0f 100644 --- a/test_config.py +++ b/test_config.py @@ -13,7 +13,7 @@ def check_test_solver_install(solver_class): pass -def check_test_dataset_get_data(dataset_class): +def check_test_dataset_get_data(benchmark, dataset_class): if sys.platform == "darwin": pytest.skip( "Skipping test_dataset_get_data on MacOS."