
Commit 071796d

Merge pull request #22 from ClimateBenchPress/fix_data
Prepare data sources for paper
2 parents 18a0b66 + 4757a53 commit 071796d

5 files changed: 54 additions & 25 deletions


src/climatebenchpress/data_loader/datasets/cams.py

Lines changed: 4 additions & 0 deletions
@@ -34,6 +34,10 @@ def download(download_path: Path, progress: bool = True):
     @staticmethod
     def open(download_path: Path) -> xr.Dataset:
         ds = xr.open_dataset(download_path / Path(NO2_FILE).name)
+
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.longitude.attrs["axis"] = "X"
         ds.latitude.attrs["axis"] = "Y"
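
Note (not part of the diff): the single-day selection plus .chunk(-1) pattern above recurs in several files in this commit. A minimal runnable sketch of its effect on a synthetic dataset, using illustrative names rather than the real CAMS NO2 variables:

import numpy as np
import pandas as pd
import xarray as xr

# Hourly toy data spanning three days.
times = pd.date_range("2023-06-14", "2023-06-16", freq="h")
ds = xr.Dataset(
    {"no2": (("valid_time", "latitude"), np.random.rand(times.size, 10))},
    coords={"valid_time": times, "latitude": np.linspace(-90, 90, 10)},
).chunk({"valid_time": 6})

# Partial-string slicing keeps every timestamp that falls on 2023-06-15,
# and chunk(-1) merges each dimension back into a single dask chunk.
day = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1)
print(day.chunks)  # one chunk per dimension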

src/climatebenchpress/data_loader/datasets/cmip6/abc.py

Lines changed: 5 additions & 0 deletions
@@ -48,8 +48,13 @@ def download_with(
         zstore = zstore.replace("gs://", "https://storage.googleapis.com/")

         ds = xr.open_zarr(fsspec.get_mapper(zstore), consolidated=True)
+        # Only select the year 2020 for the dataset. The exact choice of this
+        # year is arbitrary.
+        # .chunk(-1) ensures that we only use a single chunk for the entire dataset.
+        ds = ds.sel(time=slice("2020", "2020")).chunk(-1)
         if variable_selector is not None:
             ds = ds[variable_selector]
+
         with monitor.progress_bar(progress):
             ds.to_zarr(downloadfile, mode="w", encoding=dict(), compute=False).compute()
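
Note (not part of the diff): the write at the end of this hunk uses xarray's delayed Zarr output. A minimal sketch of that pattern, with a plain dask progress bar standing in for the repository's monitor helper and an illustrative output path and variable name:

import numpy as np
import xarray as xr
from dask.diagnostics import ProgressBar

ds = xr.Dataset({"tas": ("time", np.random.rand(365))}).chunk(-1)

# compute=False returns a delayed write; .compute() performs it, so a progress
# bar wrapped around the compute call tracks the actual data transfer.
delayed_write = ds.to_zarr("example.zarr", mode="w", compute=False)
with ProgressBar():
    delayed_write.compute()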

src/climatebenchpress/data_loader/datasets/era5.py

Lines changed: 4 additions & 2 deletions
@@ -26,13 +26,15 @@ def download(download_path: Path, progress: bool = True):

         era5 = xr.open_zarr(ERA5_GCP_PATH, chunks={"time": 48}, consolidated=True)

-        ds = era5.sel(time=slice("2020-03-01", "2020-03-07"))[
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = era5.sel(time=slice("2020-03-01", "2020-03-01"))[
             [
                 "mean_sea_level_pressure",
                 "10m_u_component_of_wind",
                 "10m_v_component_of_wind",
             ]
-        ]
+        ].chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.time.attrs["standard_name"] = "time"
         ds.longitude.attrs["axis"] = "X"
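
Note (not part of the diff): indexing a Dataset with a list of names, as in the hunk above, keeps only those variables; .chunk(-1) then collapses each of them into a single chunk. A short sketch with toy data (the extra variable is illustrative only):

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "mean_sea_level_pressure": ("time", np.random.rand(24)),
        "10m_u_component_of_wind": ("time", np.random.rand(24)),
        "2m_temperature": ("time", np.random.rand(24)),
    }
)
subset = ds[["mean_sea_level_pressure", "10m_u_component_of_wind"]].chunk(-1)
print(list(subset.data_vars))  # only the two requested variables remain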

src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py

Lines changed: 32 additions & 17 deletions
@@ -13,29 +13,28 @@
 from .abc import Dataset

 NUM_RETRIES = 3
-# Bounding box for an area in mainland France
-FRANCE_BBOX = {"T": slice(0, 1), "X": slice(202531, 207531), "Y": slice(35469, 40469)}
+
+# Define rough bounding box coordinates for mainland France.
+# Format: [min_longitude, min_latitude, max_longitude, max_latitude].
+FRANCE_BBOX = [-5.5, 42.3, 9.6, 51.1]
+
+# Biomass estimate for the year 2020.
+BIOMASS_URL = "https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-2020-fv5.01.nc"


 class EsaBiomassCciDataset(Dataset):
     name = "esa-biomass-cci"

     @staticmethod
     def download(download_path: Path, progress: bool = True):
-        urls = [
-            f"https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-{year}-fv5.01.nc"
-            # Restrict to 2 years for now for smaller download.
-            for year in [2010, 2015]
-        ]
-        for url in urls:
-            output_path = download_path / Path(url).name
-            for _ in range(NUM_RETRIES):
-                success = _download_netcdf(url, output_path, progress)
-                if success:
-                    break
-            if not success:
-                logging.info(f"Failed to download {url}")
-                return
+        output_path = download_path / Path(BIOMASS_URL).name
+        for _ in range(NUM_RETRIES):
+            success = _download_netcdf(BIOMASS_URL, output_path, progress)
+            if success:
+                break
+        if not success:
+            logging.info(f"Failed to download {BIOMASS_URL}")
+            return

     @staticmethod
     def open(download_path: Path) -> xr.Dataset:

@@ -44,12 +43,28 @@ def open(download_path: Path) -> xr.Dataset:
         # Needed to make the dataset CF-compliant.
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"
+        # We are constraining the dataset to mainland France to reduce its overall size.
+        # The global snapshot would be around 20 GB, which is too large for our use case.
+        # We chose France because it should have a fairly diverse set of biomass estimates
+        # but the choice is overall somewhat arbitrary.
+        ds = ds.sel(
+            lon=slice(FRANCE_BBOX[0], FRANCE_BBOX[2]),
+            lat=slice(FRANCE_BBOX[3], FRANCE_BBOX[1]),
+        ).chunk(-1)
         return ds[["agb"]]


 if __name__ == "__main__":
     ds = open_downloaded_canonicalized_dataset(EsaBiomassCciDataset)
-    open_downloaded_tiny_canonicalized_dataset(EsaBiomassCciDataset, slices=FRANCE_BBOX)
+    num_lon, num_lat = ds.lon.size, ds.lat.size
+    open_downloaded_tiny_canonicalized_dataset(
+        EsaBiomassCciDataset,
+        # Use a smaller spatial subset for the tiny dataset.
+        slices={
+            "X": slice(num_lon // 2, (num_lon // 2) + 500),
+            "Y": slice(num_lat // 2, (num_lat // 2) + 500),
+        },
+    )

     for v, da in ds.items():
         print(f"- {v}: {da.dims}")

src/climatebenchpress/data_loader/datasets/nextgems.py

Lines changed: 9 additions & 6 deletions
@@ -44,16 +44,19 @@ def download(download_path: Path, progress: bool = True):
             zoom=ZOOM, time=TIME_RESOLUTION, chunks=dict()
         ).to_dask()

-        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-07"))
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-01"))
         # Regrid the data to 0.125 degree resolution.
-        # NOTE: This is using nearest neighbour interpolation. We need to do some
-        # quality checks to ensure we don't get any significant aliasing
-        # artifacts as the result of interpolation. For more details:
-        # https://easy.gems.dkrz.de/Processing/healpix/lonlat_remap.html.
+        # NOTE:
+        # This is using nearest neighbour interpolation. Different interpolation methods
+        # should not have a drastic effect on the intercomparison of different compressors.
+        # However, this should be studied in more detail because re-gridding can often
+        # have unforeseen consequences.
         idx = _get_nn_lon_lat_index(
             2**ZOOM, np.linspace(-180, 180, NUM_LON), np.linspace(-90, 90, NUM_LAT)
         )
-        ds = ds.isel(cell=idx).chunk({"time": 1, "lat": NUM_LAT, "lon": NUM_LON})
+        ds = ds.isel(cell=idx).chunk(-1)
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"
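
Note (not part of the diff): _get_nn_lon_lat_index is a helper defined elsewhere in this repository and its implementation is not shown here. As a rough, assumed sketch, a nearest-neighbour lookup table from a HEALPix grid to a regular lon/lat grid can be built with healpy along the lines of the easy.gems recipe linked in the removed comment, assuming the nested pixel ordering used by the nextGEMS ICON output:

import healpy as hp
import numpy as np
import xarray as xr


def nn_lon_lat_index(nside: int, lons: np.ndarray, lats: np.ndarray) -> xr.DataArray:
    """For each target (lat, lon) point, return the index of the nearest HEALPix cell."""
    lons2d, lats2d = np.meshgrid(lons, lats)
    pix = hp.ang2pix(nside, lons2d, lats2d, nest=True, lonlat=True)
    return xr.DataArray(pix, coords=[("lat", lats), ("lon", lons)])


# Selecting with such an index array, e.g. ds.isel(cell=nn_lon_lat_index(2**zoom, lons, lats)),
# remaps the unstructured "cell" dimension onto ("lat", "lon"), which is what the hunk
# above relies on.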
