Skip to content

Commit 5fadaf0

Browse files
committed
zarr3: Fix DecodeChunk and EncodeChunk for void access
The codec chain is prepared for the original dtype and chunk shape (without the extra bytes dimension).

For void access:

DecodeChunk:
- Strip the bytes dimension from the grid's chunk_shape to get the original shape
- Decode using the original codec shape
- Reinterpret the decoded bytes as [chunk_shape..., bytes_per_elem]

EncodeChunk:
- Input has shape [chunk_shape..., bytes_per_elem] of byte_t
- Create a view with the original chunk shape and element_size
- Encode using the original codec

This follows the pattern from zarr v2 (PR google#272), where the void metadata has the chunk_layout computed to match the encoded/decoded layouts.
1 parent 0db22e4 commit 5fadaf0

1 file changed

Lines changed: 53 additions & 3 deletions

File tree

tensorstore/driver/zarr3/chunk_cache.cc

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,38 @@ ZarrLeafChunkCache::DecodeChunk(span<const Index> chunk_indices,
158158
const size_t num_fields = dtype_.fields.size();
159159
absl::InlinedVector<SharedArray<const void>, 1> field_arrays(num_fields);
160160

161-
// Special case: void access - return raw bytes directly
161+
// Special case: void access - decode using original codec shape, then
162+
// reinterpret as bytes with extra dimension.
163+
//
164+
// The codec was prepared for the original dtype and chunk_shape (without
165+
// bytes dimension). We decode to that shape, then view the raw bytes with
166+
// an extra dimension representing the bytes per element.
162167
if (open_as_void_) {
168+
// The grid's chunk_shape for void has extra bytes dimension - strip it
169+
// to get the original codec shape.
170+
const auto& void_chunk_shape = grid().chunk_shape;
171+
std::vector<Index> original_chunk_shape(
172+
void_chunk_shape.begin(),
173+
void_chunk_shape.end() - 1); // Strip bytes dimension
174+
175+
// Decode using original codec shape
163176
TENSORSTORE_ASSIGN_OR_RETURN(
164-
field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(),
165-
std::move(data)));
177+
auto decoded_array,
178+
codec_state_->DecodeArray(original_chunk_shape, std::move(data)));
179+
180+
// Reinterpret the decoded array's bytes as [chunk_shape..., bytes_per_elem]
181+
// This allocates a new byte array with the extra dim; the data is copied below
182+
const auto& void_component_shape = grid().components[0].shape();
183+
auto byte_array = AllocateArray(
184+
void_component_shape, c_order, default_init,
185+
dtype_v<tensorstore::dtypes::byte_t>);
186+
187+
// Copy decoded data to byte array (flat memcpy; assumes a contiguous C-order layout)
188+
std::memcpy(byte_array.data(), decoded_array.data(),
189+
decoded_array.num_elements() *
190+
decoded_array.dtype().size());
191+
192+
field_arrays[0] = std::move(byte_array);
166193
return field_arrays;
167194
}
168195

@@ -214,6 +241,29 @@ Result<absl::Cord> ZarrLeafChunkCache::EncodeChunk(
214241
span<const Index> chunk_indices,
215242
span<const SharedArray<const void>> component_arrays) {
216243
assert(component_arrays.size() == 1);
244+
245+
// Special case: void access - reinterpret byte array back to original
246+
// dtype shape before encoding.
247+
//
248+
// The input has shape [chunk_shape..., bytes_per_elem] of byte_t.
249+
// The codec expects [chunk_shape] of the original dtype.
250+
if (open_as_void_) {
251+
const auto& byte_array = component_arrays[0];
252+
const Index bytes_per_element = dtype_.bytes_per_outer_element;
253+
254+
// Build original chunk shape by stripping the bytes dimension
255+
const auto& void_shape = byte_array.shape();
256+
std::vector<Index> original_shape(void_shape.begin(), void_shape.end() - 1);
257+
258+
// Create a view over the byte data with original layout
259+
// The codec expects the original dtype's element size for stride calculation
260+
auto encoded_array = SharedArray<const void>(
261+
byte_array.element_pointer(),
262+
StridedLayout<>(c_order, bytes_per_element, original_shape));
263+
264+
return codec_state_->EncodeArray(encoded_array);
265+
}
266+
217267
return codec_state_->EncodeArray(component_arrays[0]);
218268
}
219269

0 commit comments

Comments
 (0)