Skip to content

Commit dd50a9b

Browse files
authored
Merge pull request #938 from davidhassell/kerchunk-read
Read Kerchunk and file-like object datasets
2 parents 79a3b0f + 1174168 commit dd50a9b

10 files changed

Lines changed: 171 additions & 73 deletions

File tree

Changelog.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ Version NEXTVERSION
33

44
**2026-??-??**
55

6+
* Read Kerchunk datasets with `cf.read`
7+
(https://github.com/NCAS-CMS/cf-python/issues/936)
8+
* Read open file handle datasets with `cf.read`
9+
(https://github.com/NCAS-CMS/cf-python/issues/937)
610
* Write UGRID datasets with `cf.write`
711
(https://github.com/NCAS-CMS/cf-python/issues/697)
812
* Support for HEALPix grids

cf/read_write/read.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
class read(cfdm.read):
2121
"""Read field or domain constructs from files.
2222
23-
The following file formats are supported: netCDF, CDL, Zarr, PP,
24-
and UM fields file.
23+
The following file formats are supported: netCDF, CDL, Zarr,
24+
Kerchunk, PP, and UM fields file.
2525
2626
NetCDF and Zarr datasets may be on local disk, on an OPeNDAP
2727
server, or in an S3 object store.
@@ -144,7 +144,7 @@ class read(cfdm.read):
144144
145145
:Parameters:
146146
147-
{{read datasets: (arbitrarily nested sequence of) `str`}}
147+
{{read datasets:}}
148148
149149
{{read recursive: `bool`, optional}}
150150
@@ -162,6 +162,7 @@ class read(cfdm.read):
162162
``'netCDF'`` A netCDF-3 or netCDF-4 dataset
163163
``'CDL'`` A text CDL file of a netCDF dataset
164164
``'Zarr'`` A Zarr v2 (xarray) or Zarr v3 dataset
165+
``'Kerchunk'`` A Kerchunked dataset
165166
``'UM'`` A UM fields file or PP dataset
166167
============== ==========================================
167168

cf/read_write/um/umread.py

Lines changed: 47 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -3549,6 +3549,13 @@ def read(
35493549
"(only Field constructs)"
35503550
)
35513551

3552+
representation = self.dataset_representation(dataset)
3553+
if representation != "path":
3554+
raise NotImplementedError(
3555+
"Can't yet read Field constructs from a UM or PP "
3556+
f"{representation!r} dataset: {dataset!r}"
3557+
)
3558+
35523559
if not _stash2standard_name:
35533560
# --------------------------------------------------------
35543561
# Create the STASH code to standard_name conversion
@@ -3605,75 +3612,17 @@ def read(
36053612
# Return now if there are valid file types
36063613
return []
36073614

3608-
# Parse the 'storage_options' keyword parameter
3609-
if storage_options is None:
3610-
storage_options = {}
3611-
elif filesystem is not None:
3612-
raise ValueError(
3613-
"Can't set both storage_options and filesystem keywords"
3615+
if storage_options is not None:
3616+
raise NotImplementedError(
3617+
"Can't yet open PP/UM files with file system storage options"
36143618
)
36153619

3616-
storage_protocol = None
3617-
36183620
if filesystem is not None:
3619-
# --------------------------------------------------------
3620-
# A pre-authenticated filesystem was provided: open the
3621-
# dataset as a file-like object and pass it to the backend.
3622-
# --------------------------------------------------------
36233621
raise NotImplementedError(
3624-
"Can't yet open PP/UM files from a remote file system"
3622+
"Can't yet open PP/UM files from a pre-defined file system"
36253623
)
36263624

3627-
try:
3628-
dataset = filesystem.open(dataset, "rb")
3629-
except AttributeError:
3630-
raise AttributeError(
3631-
f"The 'filesystem' object {filesystem!r} does not have "
3632-
"an 'open' method. Please provide a valid filesystem "
3633-
"object (e.g. an fsspec filesystem instance)."
3634-
)
3635-
except Exception as exc:
3636-
raise OSError(
3637-
f"Failed to open {dataset!r} using the provided "
3638-
f"'filesystem' object {filesystem!r}: {exc}"
3639-
) from exc
3640-
3641-
else:
3642-
from uritools import urisplit
3643-
3644-
u = urisplit(dataset)
3645-
if u.scheme == "s3":
3646-
# ----------------------------------------------------
3647-
# Dataset is an s3://... string.
3648-
# ----------------------------------------------------
3649-
raise NotImplementedError(
3650-
"Can't yet open PP/UM files from an s3 object store"
3651-
)
3652-
3653-
import fsspec
3654-
3655-
client_kwargs = storage_options.get("client_kwargs", {})
3656-
if (
3657-
"endpoint_url" not in storage_options
3658-
and "endpoint_url" not in client_kwargs
3659-
):
3660-
authority = u.authority
3661-
if not authority:
3662-
authority = ""
3663-
3664-
storage_options["endpoint_url"] = f"https://{authority}"
3665-
3666-
filesystem = fsspec.filesystem(
3667-
protocol=u.scheme, **storage_options
3668-
)
3669-
dataset = filesystem.open(u.path[1:], "rb")
3670-
3671-
if not storage_options:
3672-
storage_options = None
3673-
3674-
if filesystem is not None:
3675-
storage_protocol = filesystem.protocol
3676-
storage_options = filesystem.storage_options
3625+
storage_protocol = None
36773626

36783627
f = self.dataset_open(dataset, parse=True)
36793628

@@ -3835,6 +3784,41 @@ def dataset_open(self, filename, parse=True):
38353784
parse=parse,
38363785
)
38373786

3787+
@classmethod
3788+
def dataset_representation(cls, dataset):
3789+
"""Return the logical representation type of the input dataset.
3790+
3791+
.. versionadded:: NEXTVERSION
3792+
3793+
:Parameters:
3794+
3795+
dataset:
3796+
The dataset. May be a string-valued path or a
3797+
file-like object.
3798+
3799+
:Returns:
3800+
3801+
`str`
3802+
The dataset representation:
3803+
3804+
* ``'path'``: A string-valued path.
3805+
3806+
* ``'file_handle'``: An open file handle (such as
3807+
returned by `fsspec.filesystem.open`).
3808+
3809+
* ``'unknown'``: Anything else.
3810+
3811+
"""
3812+
# Strings (Paths)
3813+
if isinstance(dataset, str):
3814+
return "path"
3815+
3816+
# Check for a "binary stream" (file handle)
3817+
if hasattr(dataset, "read") and hasattr(dataset, "seek"):
3818+
return "file_handle"
3819+
3820+
return "unknown"
3821+
38383822

38393823
"""
38403824
Problems:

cf/test/example_field_0.kerchunk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"version":1,"refs":{"lat\/0":"base64:eF5jYMABDjQwNBwIcmNwCzpwgMHBAQAxqAWx","lat_bnds\/0.0":"base64:eF5jYCAVNMAZYX5+dkDg5xd2AAQcQAAAZ+II3Q==","lon\/0":"base64:eF5jYCAONDxQ2BDwwUPCLCAmOTOvqNQBCgB9ngjU","lon_bnds\/0.0":"base64:eF5jYKAyaACCBw8aGhQUHjzYsKGBwc0tLCwhIS0tJ6egoLi4jMEBFQAA\/noSOQ==","q\/0.0":["example_field_0.nc",17755,100],"q\/0.1":"base64:eF6z7v4lGfX4l4\/QxJO77ufUsaQsrTNvflvm6f88K8x6dlbngZp9PL2\/9\/DdP7LH8ZT9fKbpK0t0\/RVKEhaaSmnnfZy8qXfD5KeucfrtkxbNmLt8U8WGbbvWttkjAQAjCS4Q","q\/1.0":"base64:eF6TzNB+88v69hsGIGCZ2B5Yd\/9nIIgd9lZsR5Z\/3Q4Qm6\/mptye3iw5EFvX\/lxryfQ9rSD2JtPvryfnlbwGseeuW7yxYtHkhfOAwB4K3IEAAKyvI6g=","q\/1.1":"base64:eF6TtLaeY909x5IBCFju3z9yP+fIFhA7zN9fwf+5wgkQm6+3d0Pv7w1lILbu9OlF01cW7QOxN+XlZeR9zJgPYq\/d1DRx0qKFXfOAwB4K3IEAAGAXIr4=","time\/0":"\u0000\u0000\u0000\u0000\u0000\u0000?@",".zgroup":"{\"zarr_format\":2}",".zattrs":"{\"Conventions\":\"CF-1.12\"}","lat\/.zarray":"{\"shape\":[5],\"chunks\":[5],\"dtype\":\"<f8\",\"fill_value\":null,\"order\":\"C\",\"filters\":[{\"id\":\"shuffle\",\"elementsize\":8},{\"id\":\"zlib\",\"level\":4}],\"dimension_separator\":\".\",\"compressor\":null,\"zarr_format\":2}","lat\/.zattrs":"{\"_ARRAY_DIMENSIONS\":[\"lat\"],\"units\":\"degrees_north\",\"standard_name\":\"latitude\",\"bounds\":\"lat_bnds\"}","lat_bnds\/.zarray":"{\"shape\":[5,2],\"chunks\":[5,2],\"dtype\":\"<f8\",\"fill_value\":null,\"order\":\"C\",\"filters\":[{\"id\":\"shuffle\",\"elementsize\":8},{\"id\":\"zlib\",\"level\":4}],\"dimension_separator\":\".\",\"compressor\":null,\"zarr_format\":2}","lat_bnds\/.zattrs":"{\"_ARRAY_DIMENSIONS\":[\"lat\",\"bounds2\"]}","lon\/.zarray":"{\"shape\":[8],\"chunks\":[8],\"dtype\":\"<f8\",\"fill_value\":null,\"order\":\"C\",\"filters\":[{\"id\":\"shuffle\",\"elementsize\":8},{\"id\":\"zlib\",\"level\":4}],\"dimension_separator\":\".\",\"compressor\":null,\"zarr_format\":2}","lon\/.zattrs":"{\"_ARRAY_DIMENSIONS\":[\"lon\"],\"units\":\"degrees_east\",\"standard_name\":\"longitude\",\"bounds\":\"lon_bnds\"}","lon_bn
ds\/.zarray":"{\"shape\":[8,2],\"chunks\":[8,2],\"dtype\":\"<f8\",\"fill_value\":null,\"order\":\"C\",\"filters\":[{\"id\":\"shuffle\",\"elementsize\":8},{\"id\":\"zlib\",\"level\":4}],\"dimension_separator\":\".\",\"compressor\":null,\"zarr_format\":2}","lon_bnds\/.zattrs":"{\"_ARRAY_DIMENSIONS\":[\"lon\",\"bounds2\"]}","q\/.zarray":"{\"shape\":[5,8],\"chunks\":[3,4],\"dtype\":\"<f8\",\"fill_value\":null,\"order\":\"C\",\"filters\":[{\"id\":\"shuffle\",\"elementsize\":8},{\"id\":\"zlib\",\"level\":4}],\"dimension_separator\":\".\",\"compressor\":null,\"zarr_format\":2}","q\/.zattrs":"{\"_ARRAY_DIMENSIONS\":[\"lat\",\"lon\"],\"project\":\"research\",\"standard_name\":\"specific_humidity\",\"units\":\"1\",\"coordinates\":\"time\",\"cell_methods\":\"area: mean\"}","time\/.zarray":"{\"shape\":[],\"chunks\":[],\"dtype\":\"<f8\",\"fill_value\":null,\"order\":\"C\",\"filters\":null,\"dimension_separator\":\".\",\"compressor\":null,\"zarr_format\":2}","time\/.zattrs":"{\"_ARRAY_DIMENSIONS\":[],\"units\":\"days since 2018-12-01\",\"standard_name\":\"time\"}"}}

cf/test/test_kerchunk.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import datetime
2+
import faulthandler
3+
import json
4+
import os
5+
import unittest
6+
7+
import fsspec
8+
9+
faulthandler.enable() # to debug seg faults and timeouts
10+
11+
12+
import cf
13+
14+
warnings = False
15+
16+
17+
kerchunk_file = os.path.join(
18+
os.path.dirname(os.path.abspath(__file__)), "example_field_0.kerchunk"
19+
)
20+
21+
fs = fsspec.filesystem("reference", fo=kerchunk_file)
22+
kerchunk_mapper = fs.get_mapper()
23+
24+
25+
class read_writeTest(unittest.TestCase):
26+
"""Test the reading of Kerchunk datasets with cf.read."""
27+
28+
netcdf = os.path.join(
29+
os.path.dirname(os.path.abspath(__file__)), "example_field_0.nc"
30+
)
31+
kerchunk = kerchunk_mapper
32+
33+
def setUp(self):
34+
"""Preparations called immediately before each test method."""
35+
# Disable log messages to silence expected warnings
36+
cf.LOG_LEVEL("DISABLE")
37+
# Note: to enable all messages for given methods, lines or
38+
# calls (those without a 'verbose' option to do the same)
39+
# e.g. to debug them, wrap them (for methods, start-to-end
40+
# internally) as follows: cf.LOG_LEVEL('DEBUG')
41+
#
42+
# < ... test code ... >
43+
# cf.log_level('DISABLE')
44+
45+
def test_kerchunk_read(self):
46+
"""Test cf.read with Kerchunk."""
47+
f = cf.read(self.netcdf)[0]
48+
49+
k = cf.read(self.kerchunk, dask_chunks=3)
50+
self.assertEqual(len(k), 1)
51+
self.assertTrue(k[0].equals(f))
52+
self.assertGreater(k[0].data.npartitions, 1)
53+
54+
k = cf.read([self.kerchunk, self.kerchunk], dask_chunks=3)
55+
self.assertEqual(len(k), 2)
56+
self.assertTrue(k[0].equals(k[-1]))
57+
58+
k = cf.read([self.kerchunk, self.kerchunk, self.netcdf], dask_chunks=3)
59+
self.assertEqual(len(k), 3)
60+
self.assertTrue(k[0].equals(k[-1]))
61+
self.assertTrue(k[1].equals(k[-1]))
62+
63+
def test_kerchunk_original_filenames(self):
64+
"""Test original_filenames with Kerchunk."""
65+
k = cf.read(self.kerchunk)[0]
66+
self.assertEqual(k.get_original_filenames(), set())
67+
68+
def test_read_dict(self):
69+
"""Test cf.read with a Kerchunk dictionary."""
70+
with open(kerchunk_file, "r") as fh:
71+
d = json.load(fh)
72+
73+
with self.assertRaises(ValueError):
74+
cf.read(d)
75+
76+
fs = fsspec.filesystem("reference", fo=d)
77+
kerchunk = fs.get_mapper()
78+
self.assertEqual(len(cf.read(kerchunk)), 1)
79+
80+
def test_read_bytes(self):
81+
"""Test cf.read with a Kerchunk raw bytes representation."""
82+
with open(kerchunk_file, "r") as fh:
83+
d = json.load(fh)
84+
85+
b = json.dumps(d).encode("utf-8")
86+
with self.assertRaises(ValueError):
87+
cf.read(b)
88+
89+
d = json.loads(b)
90+
fs = fsspec.filesystem("reference", fo=d)
91+
kerchunk = fs.get_mapper()
92+
self.assertEqual(len(cf.read(kerchunk)), 1)
93+
94+
95+
if __name__ == "__main__":
96+
print("Run date:", datetime.datetime.now())
97+
cf.environment()
98+
print("")
99+
unittest.main(verbosity=2)

cf/test/test_pp.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,15 @@ def test_PP_um_version(self):
145145
f = cf.read(self.ppfile, um={"version": "6.6.3"})[0]
146146
self.assertEqual(f.get_property("um_version"), "6.6.3")
147147

148+
def test_PP_file_object(self):
149+
# Can't yet read PP/UM from file-like objects
150+
with open(self.ppfile, "rb") as fh:
151+
with self.assertRaises(NotImplementedError):
152+
cf.read(fh)
153+
154+
# Check that the file has been rewound
155+
self.assertEqual(fh.tell(), 0)
156+
148157

149158
if __name__ == "__main__":
150159
print("Run date:", datetime.datetime.now())

docs/source/introduction.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ may nonetheless be modified in memory.
6767
The `cf` package can:
6868

6969
* read :term:`field constructs <field construct>` and :term:`domain
70-
constructs <domain construct>` from netCDF, CDL, Zarr, PP and UM
71-
datasets with a choice of netCDF backends,
70+
constructs <domain construct>` from netCDF, CDL, Zarr, Kerchunk, PP
71+
and UM datasets with a choice of netCDF backends,
7272

7373
* read files from OPeNDAP servers and S3 object stores,
7474

docs/source/tutorial.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,15 +139,15 @@ The following file types can be read:
139139

140140
..
141141
142-
* `CFA-netCDF
143-
<https://github.com/NCAS-CMS/cfa-conventions/blob/master/source/cfa.md>`_
144-
files at version 0.6 or later.
142+
* Datasets in `Kerchunk <https://fsspec.github.io/kerchunk>`_ format.
145143

146144
..
147145
148146
* :ref:`PP and UM fields files <PP-and-UM-fields-files>`, whose
149147
contents are mapped into field constructs.
150148

149+
..
150+
151151
Note that when reading netCDF4 files that contain :ref:`hierarchical
152152
groups <Hierarchical-groups>`, the group structure is saved via the
153153
:ref:`netCDF interface <NetCDF-interface>` so that it may be re-used,

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
netCDF4==1.7.2
1+
netCDF4>=1.7.2
22
cftime>=1.6.4
33
numpy>=2.0.0
44
cfdm>=1.13.0.0, <1.13.1.0

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def compile():
178178
179179
The ``cf`` package can:
180180
181-
* read field and domain constructs from netCDF, CDL, Zarr, PP and UM datasets,
181+
* read field and domain constructs from netCDF, CDL, Zarr, Kerchunk, PP and UM datasets,
182182
183183
* be fully flexible with respect to dataset storage chunking,
184184

0 commit comments

Comments
 (0)