
Commit 071796d

Merge pull request #22 from ClimateBenchPress/fix_data
Prepare data sources for paper
2 parents 18a0b66 + 4757a53 commit 071796d

5 files changed: 54 additions & 25 deletions


src/climatebenchpress/data_loader/datasets/cams.py

Lines changed: 4 additions & 0 deletions
@@ -34,6 +34,10 @@ def download(download_path: Path, progress: bool = True):
     @staticmethod
     def open(download_path: Path) -> xr.Dataset:
         ds = xr.open_dataset(download_path / Path(NO2_FILE).name)
+
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.longitude.attrs["axis"] = "X"
         ds.latitude.attrs["axis"] = "Y"
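
Note (not part of the diff): the single-day selection plus .chunk(-1) pattern above recurs in several files in this commit. A minimal runnable sketch of its effect on a synthetic dataset, using illustrative names rather than the real CAMS NO2 variables:

import numpy as np
import pandas as pd
import xarray as xr

# Hourly toy data spanning three days.
times = pd.date_range("2023-06-14", "2023-06-16", freq="h")
ds = xr.Dataset(
    {"no2": (("valid_time", "latitude"), np.random.rand(times.size, 10))},
    coords={"valid_time": times, "latitude": np.linspace(-90, 90, 10)},
).chunk({"valid_time": 6})

# Partial-string slicing keeps every timestamp that falls on 2023-06-15,
# and chunk(-1) merges each dimension back into a single dask chunk.
day = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1)
print(day.chunks)  # one chunk per dimension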

src/climatebenchpress/data_loader/datasets/cmip6/abc.py

Lines changed: 5 additions & 0 deletions
@@ -48,8 +48,13 @@ def download_with(
         zstore = zstore.replace("gs://", "https://storage.googleapis.com/")

         ds = xr.open_zarr(fsspec.get_mapper(zstore), consolidated=True)
+        # Only select the year 2020 for the dataset. The exact choice of this
+        # year is arbitrary.
+        # .chunk(-1) ensures that we only use a single chunk for the entire dataset.
+        ds = ds.sel(time=slice("2020", "2020")).chunk(-1)
         if variable_selector is not None:
             ds = ds[variable_selector]
+
         with monitor.progress_bar(progress):
             ds.to_zarr(downloadfile, mode="w", encoding=dict(), compute=False).compute()
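
Note (not part of the diff): the write at the end of this hunk uses xarray's delayed Zarr output. A minimal sketch of that pattern, with a plain dask progress bar standing in for the repository's monitor helper and an illustrative output path and variable name:

import numpy as np
import xarray as xr
from dask.diagnostics import ProgressBar

ds = xr.Dataset({"tas": ("time", np.random.rand(365))}).chunk(-1)

# compute=False returns a delayed write; .compute() performs it, so a progress
# bar wrapped around the compute call tracks the actual data transfer.
delayed_write = ds.to_zarr("example.zarr", mode="w", compute=False)
with ProgressBar():
    delayed_write.compute()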

src/climatebenchpress/data_loader/datasets/era5.py

Lines changed: 4 additions & 2 deletions
@@ -26,13 +26,15 @@ def download(download_path: Path, progress: bool = True):

         era5 = xr.open_zarr(ERA5_GCP_PATH, chunks={"time": 48}, consolidated=True)

-        ds = era5.sel(time=slice("2020-03-01", "2020-03-07"))[
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = era5.sel(time=slice("2020-03-01", "2020-03-01"))[
             [
                 "mean_sea_level_pressure",
                 "10m_u_component_of_wind",
                 "10m_v_component_of_wind",
             ]
-        ]
+        ].chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.time.attrs["standard_name"] = "time"
         ds.longitude.attrs["axis"] = "X"
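
Note (not part of the diff): indexing a Dataset with a list of names, as in the hunk above, keeps only those variables; .chunk(-1) then collapses each of them into a single chunk. A short sketch with toy data (the extra variable is illustrative only):

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "mean_sea_level_pressure": ("time", np.random.rand(24)),
        "10m_u_component_of_wind": ("time", np.random.rand(24)),
        "2m_temperature": ("time", np.random.rand(24)),
    }
)
subset = ds[["mean_sea_level_pressure", "10m_u_component_of_wind"]].chunk(-1)
print(list(subset.data_vars))  # only the two requested variables remain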

src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py

Lines changed: 32 additions & 17 deletions
@@ -13,29 +13,28 @@
 from .abc import Dataset

 NUM_RETRIES = 3
-# Bounding box for an area in mainland France
-FRANCE_BBOX = {"T": slice(0, 1), "X": slice(202531, 207531), "Y": slice(35469, 40469)}
+
+# Define rough bounding box coordinates for mainland France.
+# Format: [min_longitude, min_latitude, max_longitude, max_latitude].
+FRANCE_BBOX = [-5.5, 42.3, 9.6, 51.1]
+
+# Biomass estimate for the year 2020.
+BIOMASS_URL = "https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-2020-fv5.01.nc"


 class EsaBiomassCciDataset(Dataset):
     name = "esa-biomass-cci"

     @staticmethod
     def download(download_path: Path, progress: bool = True):
-        urls = [
-            f"https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-{year}-fv5.01.nc"
-            # Restrict to 2 years for now for smaller download.
-            for year in [2010, 2015]
-        ]
-        for url in urls:
-            output_path = download_path / Path(url).name
-            for _ in range(NUM_RETRIES):
-                success = _download_netcdf(url, output_path, progress)
-                if success:
-                    break
-            if not success:
-                logging.info(f"Failed to download {url}")
-                return
+        output_path = download_path / Path(BIOMASS_URL).name
+        for _ in range(NUM_RETRIES):
+            success = _download_netcdf(BIOMASS_URL, output_path, progress)
+            if success:
+                break
+        if not success:
+            logging.info(f"Failed to download {BIOMASS_URL}")
+            return

     @staticmethod
     def open(download_path: Path) -> xr.Dataset:

@@ -44,12 +43,28 @@ def open(download_path: Path) -> xr.Dataset:
         # Needed to make the dataset CF-compliant.
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"
+        # We are constraining the dataset to mainland France to reduce its overall size.
+        # The global snapshot would be around 20 GB, which is too large for our use case.
+        # We chose France because it should have a fairly diverse set of biomass estimates
+        # but the choice is overall somewhat arbitrary.
+        ds = ds.sel(
+            lon=slice(FRANCE_BBOX[0], FRANCE_BBOX[2]),
+            lat=slice(FRANCE_BBOX[3], FRANCE_BBOX[1]),
+        ).chunk(-1)
         return ds[["agb"]]


 if __name__ == "__main__":
     ds = open_downloaded_canonicalized_dataset(EsaBiomassCciDataset)
-    open_downloaded_tiny_canonicalized_dataset(EsaBiomassCciDataset, slices=FRANCE_BBOX)
+    num_lon, num_lat = ds.lon.size, ds.lat.size
+    open_downloaded_tiny_canonicalized_dataset(
+        EsaBiomassCciDataset,
+        # Use a smaller spatial subset for the tiny dataset.
+        slices={
+            "X": slice(num_lon // 2, (num_lon // 2) + 500),
+            "Y": slice(num_lat // 2, (num_lat // 2) + 500),
+        },
+    )

     for v, da in ds.items():
         print(f"- {v}: {da.dims}")

src/climatebenchpress/data_loader/datasets/nextgems.py

Lines changed: 9 additions & 6 deletions
@@ -44,16 +44,19 @@ def download(download_path: Path, progress: bool = True):
             zoom=ZOOM, time=TIME_RESOLUTION, chunks=dict()
         ).to_dask()

-        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-07"))
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-01"))
         # Regrid the data to 0.125 degree resolution.
-        # NOTE: This is using nearest neighbour interpolation. We need to do some
-        # quality checks to ensure we don't get any significant aliasing
-        # artifacts as the result of interpolation. For more details:
-        # https://easy.gems.dkrz.de/Processing/healpix/lonlat_remap.html.
+        # NOTE:
+        # This is using nearest neighbour interpolation. Different interpolation methods
+        # should not have a drastic effect on the intercomparison of different compressors.
+        # However, this should be studied in more detail because re-gridding can often
+        # have unforeseen consequences.
         idx = _get_nn_lon_lat_index(
             2**ZOOM, np.linspace(-180, 180, NUM_LON), np.linspace(-90, 90, NUM_LAT)
         )
-        ds = ds.isel(cell=idx).chunk({"time": 1, "lat": NUM_LAT, "lon": NUM_LON})
+        ds = ds.isel(cell=idx).chunk(-1)
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"
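
Note (not part of the diff): _get_nn_lon_lat_index is a helper defined elsewhere in this repository and its implementation is not shown here. As a rough, assumed sketch, a nearest-neighbour lookup table from a HEALPix grid to a regular lon/lat grid can be built with healpy along the lines of the easy.gems recipe linked in the removed comment, assuming the nested pixel ordering used by the nextGEMS ICON output:

import healpy as hp
import numpy as np
import xarray as xr


def nn_lon_lat_index(nside: int, lons: np.ndarray, lats: np.ndarray) -> xr.DataArray:
    """For each target (lat, lon) point, return the index of the nearest HEALPix cell."""
    lons2d, lats2d = np.meshgrid(lons, lats)
    pix = hp.ang2pix(nside, lons2d, lats2d, nest=True, lonlat=True)
    return xr.DataArray(pix, coords=[("lat", lats), ("lon", lons)])


# Selecting with such an index array, e.g. ds.isel(cell=nn_lon_lat_index(2**zoom, lons, lats)),
# remaps the unstructured "cell" dimension onto ("lat", "lon"), which is what the hunk
# above relies on.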
