Skip to content

Commit 960d5dd

Browse files
authored
Add IFS uncompressed data (#26)
* Add IFS uncompressed data * Pin earthkit-regrid version * Refresh uv cache before installation * Use permalink
1 parent 6f18bfd commit 960d5dd

5 files changed

Lines changed: 178 additions & 3 deletions

File tree

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ jobs:
4747
run: uv python install
4848

4949
- name: Install the package
50-
run: uv sync --all-extras --all-groups && uv pip install .
50+
run: uv sync --refresh --all-extras --all-groups && uv pip install .
5151

5252
- name: Run tests
5353
run: uv run pytest

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ To download all the data used for the benchmark run the following commands:
1818
```bash
1919
uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci
2020
uv run python -m climatebenchpress.data_loader.datasets.cams
21-
uv run python -m climatebenchpress.data_loader.datasets.era5
21+
uv run python -m climatebenchpress.data_loader.datasets.ifs_uncompressed
2222
uv run python -m climatebenchpress.data_loader.datasets.nextgems
2323
uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ta
2424
uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_tos

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ dependencies = [
88
"cf-xarray~=0.10.0",
99
"cftime~=1.6.0",
1010
"dask>=2024.12.0,<2025.4",
11+
"earthkit-regrid~=0.5.0",
1112
"fsspec>=2024.10.0,<2025.4",
13+
"gribscan~=0.0.14",
1214
"healpy~=1.18.0",
1315
# These versions need to be pinned to be compatible with the NextGEMS
1416
# catalog at https://data.nextgems-h2020.eu/online.yaml.
@@ -52,5 +54,5 @@ where = ["src"]
5254
addopts = ["--import-mode=importlib"]
5355

5456
[[tool.mypy.overrides]]
55-
module = ["fsspec.*", "intake.*", "healpy.*"]
57+
module = ["fsspec.*", "intake.*", "healpy.*", "earthkit.*"]
5658
follow_untyped_imports = true

src/climatebenchpress/data_loader/datasets/all.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
from .cmip6.all import *
55
from .era5 import *
66
from .esa_biomass_cci import *
7+
from .ifs_uncompressed import *
78
from .nextgems import *
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
__all__ = ["IFSUncompressedDataset"]
2+
3+
import argparse
4+
from pathlib import Path
5+
6+
import earthkit.regrid
7+
import numpy as np
8+
import requests
9+
import xarray as xr
10+
11+
from .. import (
12+
monitor,
13+
open_downloaded_canonicalized_dataset,
14+
open_downloaded_tiny_canonicalized_dataset,
15+
)
16+
from .abc import Dataset
17+
18+
BASE_URL = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket"
19+
20+
21+
class IFSUncompressedDataset(Dataset):
    """Dataset for IFS uncompressed data.

    Holds output of the [hplp](https://apps.ecmwf.int/ifs-experiments/rd/hplp/)
    experiment of the Integrated Forecasting System (IFS) model. The key
    property of this dataset is that it stores uncompressed 64-bit
    floating point values.
    """

    name = "ifs-uncompressed"

    @staticmethod
    def download(download_path: Path, progress: bool = True):
        # Fetch the surface-level fields on their native reduced Gaussian
        # grid and keep only mean sea-level pressure and the two 10m wind
        # components.
        raw = load_hplp_data(leveltype="sfc", gridtype="reduced_gg")
        selected = raw[["msl", "10u", "10v"]]

        # Interpolate from the "O400" input grid onto a regular
        # 0.25 x 0.25 degree latitude/longitude grid.
        regridded = regrid_to_regular(
            selected,
            in_grid={"grid": "O400"},
            out_grid={"grid": [0.25, 0.25]},
        )

        target = download_path / "ifs_uncompressed.zarr"
        with monitor.progress_bar(progress):
            # Build the (lazy) zarr write first, then force it inside the
            # progress-bar context so the write is monitored.
            pending = regridded.to_zarr(
                target, mode="w", encoding=dict(), compute=False
            )
            pending.compute()

    @staticmethod
    def open(download_path: Path) -> xr.Dataset:
        ds = xr.open_dataset(download_path / "ifs_uncompressed.zarr")

        # Needed to make the dataset CF-compliant.
        ds.longitude.attrs["axis"] = "X"
        ds.latitude.attrs["axis"] = "Y"
        ds.time.attrs["standard_name"] = "time"
        return ds
55+
56+
57+
def load_hplp_data(leveltype=None, gridtype=None, step=None, remap=False):
    """Load one slice of the IFS hplp experiment output as an xarray dataset.

    Function adapted from: https://github.com/climet-eu/compression-lab-notebooks/blob/d297ee98be916359fde16ab36f0f9e0681662df8/04-example-datasets/01-hplp.ipynb.

    Parameters
    ----------
    leveltype : str
        One of "pl", "ml", "sfc", or "wave".
    gridtype : str, optional
        Grid the data is stored on (e.g. "reduced_gg" or "sh"). Required for
        leveltypes "ml" and "pl".
    step : int, optional
        Forecast step; only used to pick between the two parts of the
        ml/reduced_gg data.
    remap : bool
        Load the O400-remapped variant; only valid for "sh" fields.

    Returns
    -------
    xr.Dataset
        Dataset opened lazily through an fsspec reference filesystem
        ("reference://" + a remote .ref index file).

    Raises
    ------
    ValueError
        If the leveltype/gridtype/step/remap combination is invalid.
    requests.HTTPError
        If the reference index file cannot be downloaded.
    """
    if leveltype not in {"pl", "ml", "sfc", "wave"}:
        raise ValueError(
            f"Invalid leveltype: '{leveltype}'. Available leveltypes: pl, ml, sfc, wave"
        )

    if leveltype in {"ml", "pl"} and not gridtype:
        raise ValueError(
            f"Gridtype is required for leveltype '{leveltype}'. Available: reduced_gg, sh"
        )

    if remap and gridtype != "sh":
        raise ValueError("Only 'sh' fields can be remapped.")

    if leveltype == "wave" and gridtype != "reduced_ll":
        print("Warning: Wave model data are stored on a reduced_ll grid.")

    if leveltype == "sfc" and gridtype != "reduced_gg":
        print("Warning: Surface level data are stored on a reduced_gg grid.")

    if step and not (leveltype == "ml" and gridtype == "reduced_gg"):
        print(
            "Warning: Specifying 'step' is unnecessary for this configuration and will be ignored."
        )

    if leveltype in {"sfc", "wave"}:
        url = f"{BASE_URL}/hplp/hplp_{leveltype}.grib"
    elif leveltype == "ml" and gridtype == "reduced_gg":
        if step is None:
            raise ValueError(
                "The ml reduced_gg data are split into two parts:\n"
                " - Steps: 0, 12, 24, 36, 48, 60, 72, 84, 96, 108, 120 (2020-07-21T00:00:00 to 2020-07-26T00:00:00)\n"
                " - Steps: 132, 144, 156, 168, 180, 192, 204, 216, 228, 240 (2020-07-26T12:00:00 to 2020-07-31T00:00:00)\n"
                "Specify a step smaller than 120 for accessing the first part, \n"
                "and a step greater or equal to 132 for accessing the second part."
            )
        if step <= 120:
            url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_0_120.grib"
        else:
            url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_132_240.grib"
    else:
        url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}" + (
            "_O400.grib" if remap else ".grib"
        )

    # Fix: fetch the reference index with a timeout (a stalled connection would
    # otherwise block forever) and fail loudly on HTTP errors instead of
    # attempting to JSON-decode an error page.
    response = requests.get(f"{url}.ref", timeout=60)
    response.raise_for_status()
    ref = response.json()

    print(f"Loading dataset {url}")

    return xr.open_dataset(
        "reference://",
        engine="zarr",
        backend_kwargs=dict(storage_options=dict(fo=ref, asynchronous=False)),
        consolidated=False,
    )
112+
113+
114+
def regrid_to_regular(ds, in_grid, out_grid):
    """Regrid dataset to a regular lat-lon grid.

    Parameters
    ----------
    ds : xr.Dataset
        The input dataset to regrid. Must have a "time" coordinate; each
        variable is interpolated one timestep at a time.
    in_grid : dict
        The input grid specification for earthkit.regrid.interpolate.
    out_grid : dict
        The output grid specification for earthkit.regrid.interpolate. Is assumed to be
        a regular lat-lon grid with equal spacing in latitude and longitude, e.g. {"grid": [0.25, 0.25]}.

    Returns
    -------
    xr.Dataset
        Dataset on the regular (time, latitude, longitude) grid.

    Raises
    ------
    ValueError
        If the output grid spacing differs between latitude and longitude.
    """
    # Fix: validate the output grid *before* running the expensive
    # per-variable, per-timestep interpolation loop (previously the check ran
    # after all the work was done), and raise ValueError instead of using
    # `assert`, which is silently stripped when Python runs with -O.
    if out_grid["grid"][0] != out_grid["grid"][1]:
        raise ValueError(
            "Only grids with equal latitude and longitude spacing are supported."
        )
    dx = out_grid["grid"][0]

    # Interpolate each variable timestep-by-timestep; earthkit works on plain
    # numpy arrays, so we collect one 2D field per timestep.
    out_data = {var: [] for var in ds.data_vars}
    for var in ds.data_vars:
        for time in ds.time:
            r = earthkit.regrid.interpolate(
                ds[var].sel(time=time).values,
                in_grid=in_grid,
                out_grid=out_grid,
                method="linear",
            )
            out_data[var].append(r)

    # Target coordinates: latitudes from 90 down to -90 inclusive, longitudes
    # covering [0, 360) with spacing dx.
    lats = np.linspace(90, -90, int(180 / dx) + 1)
    lons = np.linspace(0, 360 - dx, int(360 / dx))
    coords = {
        "time": ds.time,
        "latitude": lats,
        "longitude": lons,
    }
    out_ds = xr.Dataset(
        {
            var: (("time", "latitude", "longitude"), out_data[var])
            for var in ds.data_vars
        },
        coords=coords,
    )
    return out_ds
157+
158+
159+
if __name__ == "__main__":
    # Script entry point: open the downloaded dataset in canonicalized form
    # (full and tiny variants) and print each variable's dimensions.
    cli = argparse.ArgumentParser()
    cli.add_argument("--basepath", type=Path, default=Path())
    options = cli.parse_args()

    ds = open_downloaded_canonicalized_dataset(
        IFSUncompressedDataset, basepath=options.basepath
    )
    open_downloaded_tiny_canonicalized_dataset(
        IFSUncompressedDataset, basepath=options.basepath
    )

    for v, da in ds.items():
        print(f"- {v}: {da.dims}")

0 commit comments

Comments
 (0)