From cdea6c6fda508f4523aa4d3ed87d2668f179ced5 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 10:43:59 -0700 Subject: [PATCH 01/13] initial port of old indirection support Signed-off-by: Rob Johnson --- Makefile | 5 + include/splinterdb/data.h | 8 + src/allocator.h | 4 + src/blob.c | 308 +++++++++++++++++++++++++++++ src/blob.h | 92 +++++++++ src/btree.c | 225 ++++++++++++++------- src/btree.h | 9 +- src/btree_private.h | 16 +- src/data_blob_build.c | 65 ++++++ src/data_blob_build.h | 33 ++++ src/data_internal.c | 7 + src/data_internal.h | 203 +++++++++++++++++-- src/mini_allocator.c | 216 +++++++++++++++++--- src/mini_allocator.h | 44 ++++- src/shard_log.c | 79 +++++++- src/shard_log.h | 9 +- src/splinterdb.c | 24 ++- tests/functional/log_test.c | 67 +++++++ tests/unit/btree_test.c | 13 +- tests/unit/splinterdb_quick_test.c | 42 +++- 20 files changed, 1314 insertions(+), 155 deletions(-) create mode 100644 src/blob.c create mode 100644 src/blob.h create mode 100644 src/data_blob_build.c create mode 100644 src/data_blob_build.h diff --git a/Makefile b/Makefile index cb6e7c5f..b8dde00f 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,8 @@ INCLUDE = -I $(INCDIR) -I $(SRCDIR) -I $(SRCDIR)/platform_$(PLATFORM) -I $(TESTS # use += here, so that extra flags can be provided via the environment +LD = $(CC) + CFLAGS += -D_GNU_SOURCE -ggdb -Wall -pthread -Wfatal-errors -Werror -Wvla CFLAGS += -DXXH_STATIC_LINKING_ONLY -fPIC CFLAGS += -DSPLINTERDB_PLATFORM_DIR=$(PLATFORM_DIR) @@ -408,6 +410,9 @@ CLOCKCACHE_SYS = $(OBJDIR)/$(SRCDIR)/clockcache.o \ $(PLATFORM_IO_SYS) BTREE_SYS = $(OBJDIR)/$(SRCDIR)/btree.o \ + $(OBJDIR)/$(SRCDIR)/blob.o \ + $(OBJDIR)/$(SRCDIR)/blob_build.o \ + $(OBJDIR)/$(SRCDIR)/data_blob_build.o \ $(OBJDIR)/$(SRCDIR)/data_internal.o \ $(OBJDIR)/$(SRCDIR)/mini_allocator.o \ $(CLOCKCACHE_SYS) diff --git a/include/splinterdb/data.h b/include/splinterdb/data.h index 920759bd..37ba8b8a 100644 --- a/include/splinterdb/data.h +++ b/include/splinterdb/data.h @@ -49,6 +49,8 @@ typedef enum message_type { */ typedef struct message { message_type type; + // Internal: non-NULL means data is a blob descriptor. + void *cc; slice data; } message; @@ -70,6 +72,12 @@ message_length(message msg) return slice_length(msg.data); } +static inline _Bool +message_isblob(message msg) +{ + return msg.cc != NULL; +} + static inline const void * message_data(message msg) { diff --git a/src/allocator.h b/src/allocator.h index 97a71e47..40ea970a 100644 --- a/src/allocator.h +++ b/src/allocator.h @@ -40,6 +40,8 @@ typedef uint64 allocator_root_id; * * - PAGE_TYPE_LOG : struct shard_log_hdr{} + computed offsets * + * - PAGE_TYPE_BLOB : raw user data (see blob.c) + * * - PAGE_TYPE_SUPERBLOCK : struct trunk_super_block{} * ---------------------------------------------------------------------------- */ @@ -51,6 +53,7 @@ typedef enum page_type { PAGE_TYPE_MEMTABLE, PAGE_TYPE_FILTER, PAGE_TYPE_LOG, + PAGE_TYPE_BLOB, PAGE_TYPE_SUPERBLOCK, PAGE_TYPE_MISC, // Used mainly as a testing hook, for cache access testing. NUM_PAGE_TYPES, @@ -62,6 +65,7 @@ static const char *const page_type_str[] = {"invalid", "memtable", "filter", "log", + "blob", "superblock", "misc"}; diff --git a/src/blob.c b/src/blob.c new file mode 100644 index 00000000..44b04772 --- /dev/null +++ b/src/blob.c @@ -0,0 +1,308 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +#include "blob.h" +#include "platform_sleep.h" +#include "poison.h" + +#define MIN_LIVE_PERCENTAGE (90ULL) + +bool +can_round_up(uint64 rounded_size, uint64 length) +{ + if (length == 0) { + return TRUE; + } + uint64 covering_rounded_count = (length + rounded_size - 1) / rounded_size; + uint64 covering_space_efficiency = + 100ULL * length / (covering_rounded_count * rounded_size); + return MIN_LIVE_PERCENTAGE <= covering_space_efficiency; +} + +void +parse_blob(uint64 extent_size, uint64 page_size, const blob *blobby, parsed_blob *pblobby) +{ + pblobby->base = blobby; + uint64 remainder = blobby->length; + + if (can_round_up(extent_size, remainder)) { + pblobby->num_extents = (remainder + extent_size - 1) / extent_size; + pblobby->leftovers[0].length = 0; + return; + } + + pblobby->num_extents = remainder / extent_size; + remainder -= pblobby->num_extents * extent_size; + + memset(pblobby->leftovers, 0, sizeof(pblobby->leftovers)); + uint64 entry = pblobby->num_extents; + uint64 leftovers_entry = 0; + while (page_size <= remainder) { + pblobby->leftovers[leftovers_entry].addr = blobby->addrs[entry]; + uint64 max_length = extent_size - (blobby->addrs[entry] % extent_size); + if (max_length <= remainder) { + pblobby->leftovers[leftovers_entry].length = max_length; + } else if (can_round_up(page_size, blobby->length)) { + pblobby->leftovers[leftovers_entry].length = remainder; + } else { + pblobby->leftovers[leftovers_entry].length = + remainder - (remainder % page_size); + } + + remainder -= pblobby->leftovers[leftovers_entry].length; + entry++; + leftovers_entry++; + } + + if (remainder) { + pblobby->leftovers[leftovers_entry].addr = blobby->addrs[entry]; + pblobby->leftovers[leftovers_entry].length = remainder; + } +} + +uint64 +blob_length(slice sblobby) +{ + const blob *blobby = slice_data(sblobby); + debug_assert(sizeof(*blobby) <= slice_length(sblobby)); + return blobby->length; +} + +static void +fragment_for_offset(uint64 extent_size, + uint64 page_size, + const parsed_blob *pblobby, + uint64 offset, + page_fragment *fragment) +{ + uint64 byte_addr = 0; + uint64 entry_remainder = 0; + debug_assert(offset < pblobby->base->length); + + if (offset / extent_size < pblobby->num_extents) { + byte_addr = + pblobby->base->addrs[offset / extent_size] + (offset % extent_size); + entry_remainder = MIN(pblobby->base->length - offset, + extent_size - (offset % extent_size)); + } else { + offset -= pblobby->num_extents * extent_size; + int i; + for (i = 0; i < ARRAY_SIZE(pblobby->leftovers) + && 0 < pblobby->leftovers[i].length; + i++) + { + if (offset < pblobby->leftovers[i].length) { + byte_addr = pblobby->leftovers[i].addr + offset; + entry_remainder = pblobby->leftovers[i].length - offset; + break; + } + offset -= pblobby->leftovers[i].length; + } + platform_assert(i < ARRAY_SIZE(pblobby->leftovers)); + } + + fragment->offset = byte_addr % page_size; + fragment->addr = byte_addr - fragment->offset; + fragment->length = MIN(entry_remainder, page_size - fragment->offset); +} + +static void +maybe_do_prefetch(blob_page_iterator *iter) +{ + uint64 curr_extent_num = iter->offset / iter->extent_size; + if (iter->mode == BLOB_PAGE_ITERATOR_MODE_PREFETCH + && curr_extent_num + 1 < iter->pblob.num_extents) + { + cache_prefetch(iter->cc, + iter->pblob.base->addrs[curr_extent_num + 1], + PAGE_TYPE_BLOB); + } +} + +platform_status +blob_page_iterator_init(cache *cc, + blob_page_iterator *iter, + slice sblobby, + uint64 offset, + blob_page_iterator_mode mode) +{ + debug_assert(mode == BLOB_PAGE_ITERATOR_MODE_PREFETCH + || mode == BLOB_PAGE_ITERATOR_MODE_NO_PREFETCH + || mode == BLOB_PAGE_ITERATOR_MODE_ALLOC); + + iter->cc = cc; + iter->mode = mode; + iter->extent_size = cache_extent_size(cc); + iter->page_size = cache_page_size(cc); + iter->offset = offset; + iter->page = NULL; + + parse_blob( + iter->extent_size, iter->page_size, slice_data(sblobby), &iter->pblob); + + debug_assert(offset <= iter->pblob.base->length); + + if (offset < iter->pblob.base->length) { + fragment_for_offset(iter->extent_size, + iter->page_size, + &iter->pblob, + iter->offset, + &iter->fragment); + maybe_do_prefetch(iter); + } + + return STATUS_OK; +} + +static bool +should_alloc(blob_page_iterator *iter) +{ + return iter->mode == BLOB_PAGE_ITERATOR_MODE_ALLOC + && iter->fragment.offset == 0 + && (iter->page_size <= iter->fragment.length + || can_round_up(iter->page_size, iter->pblob.base->length)); +} + +static void +blob_page_iterator_release_page(blob_page_iterator *iter) +{ + if (iter->page) { + if (iter->mode == BLOB_PAGE_ITERATOR_MODE_ALLOC) { + cache_unlock(iter->cc, iter->page); + cache_unclaim(iter->cc, iter->page); + } + cache_unget(iter->cc, iter->page); + iter->page = NULL; + } +} + +void +blob_page_iterator_deinit(blob_page_iterator *iter) +{ + blob_page_iterator_release_page(iter); +} + +platform_status +blob_page_iterator_get_curr(blob_page_iterator *iter, + uint64 *offset, + slice *result) +{ + if (iter->page == NULL) { + if (should_alloc(iter)) { + iter->page = cache_alloc(iter->cc, iter->fragment.addr, PAGE_TYPE_BLOB); + } else { + iter->page = cache_get(iter->cc, iter->fragment.addr, TRUE, PAGE_TYPE_BLOB); + if (iter->mode == BLOB_PAGE_ITERATOR_MODE_ALLOC) { + uint64 wait = 1; + while (!cache_try_claim(iter->cc, iter->page)) { + cache_unget(iter->cc, iter->page); + platform_sleep_ns(wait); + wait = MIN(2 * wait, 2048); + iter->page = + cache_get(iter->cc, iter->fragment.addr, TRUE, PAGE_TYPE_BLOB); + } + cache_lock(iter->cc, iter->page); + cache_mark_dirty(iter->cc, iter->page); + } + } + } + + *offset = iter->offset; + *result = slice_create(iter->fragment.length, + iter->page->data + iter->fragment.offset); + return STATUS_OK; +} + +bool +blob_page_iterator_at_end(blob_page_iterator *iter) +{ + return iter->pblob.base->length <= iter->offset; +} + +void +blob_page_iterator_advance_bytes(blob_page_iterator *iter, uint64 num_bytes) +{ + blob_page_iterator_release_page(iter); + + iter->offset += num_bytes; + if (iter->offset < iter->pblob.base->length) { + fragment_for_offset(iter->extent_size, + iter->page_size, + &iter->pblob, + iter->offset, + &iter->fragment); + maybe_do_prefetch(iter); + } +} + +void +blob_page_iterator_advance_page(blob_page_iterator *iter) +{ + blob_page_iterator_advance_bytes(iter, iter->fragment.length); +} + +platform_status +blob_materialize(cache *cc, slice sblobby, uint64 start, uint64 end, writable_buffer *result) +{ + const blob *blobby = slice_data(sblobby); + + if (end < start || blobby->length < end) { + return STATUS_BAD_PARAM; + } + + platform_status rc = writable_buffer_resize(result, end - start); + if (!SUCCESS(rc)) { + return rc; + } + + blob_page_iterator iter; + rc = blob_page_iterator_init( + cc, &iter, sblobby, start, BLOB_PAGE_ITERATOR_MODE_PREFETCH); + if (!SUCCESS(rc)) { + return rc; + } + + while (!blob_page_iterator_at_end(&iter) && iter.offset < end) { + uint64 offset; + slice data; + rc = blob_page_iterator_get_curr(&iter, &offset, &data); + if (!SUCCESS(rc)) { + goto out; + } + + uint64 length = MIN(end - offset, slice_length(data)); + memcpy((char *)writable_buffer_data(result) + (offset - start), + slice_data(data), + length); + blob_page_iterator_advance_page(&iter); + } + +out: + blob_page_iterator_deinit(&iter); + return rc; +} + +platform_status +blob_sync(cache *cc, slice sblob) +{ + blob_page_iterator itor; + platform_status rc = blob_page_iterator_init( + cc, &itor, sblob, 0, BLOB_PAGE_ITERATOR_MODE_NO_PREFETCH); + if (!SUCCESS(rc)) { + return rc; + } + + while (!blob_page_iterator_at_end(&itor)) { + uint64 offset; + slice result; + rc = blob_page_iterator_get_curr(&itor, &offset, &result); + if (!SUCCESS(rc)) { + break; + } + cache_page_sync(cc, itor.page, FALSE, PAGE_TYPE_BLOB); + blob_page_iterator_advance_page(&itor); + } + + blob_page_iterator_deinit(&itor); + return rc; +} diff --git a/src/blob.h b/src/blob.h new file mode 100644 index 00000000..088e7f69 --- /dev/null +++ b/src/blob.h @@ -0,0 +1,92 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "cache.h" +#include "util.h" + +typedef struct ONDISK blob { + uint64 length; + uint64 addrs[]; +} blob; + +typedef struct parsed_blob_entry { + uint64 addr; + uint64 length; +} parsed_blob_entry; + +typedef struct parsed_blob { + const blob *base; + uint64 num_extents; + parsed_blob_entry leftovers[3]; +} parsed_blob; + +typedef struct page_fragment { + uint64 addr; + uint64 offset; + uint64 length; +} page_fragment; + +typedef enum blob_page_iterator_mode { + BLOB_PAGE_ITERATOR_MODE_PREFETCH, + BLOB_PAGE_ITERATOR_MODE_NO_PREFETCH, + BLOB_PAGE_ITERATOR_MODE_ALLOC, +} blob_page_iterator_mode; + +typedef struct blob_page_iterator { + cache *cc; + blob_page_iterator_mode mode; + uint64 extent_size; + uint64 page_size; + parsed_blob pblob; + + uint64 offset; + page_fragment fragment; + page_handle *page; +} blob_page_iterator; + +bool +can_round_up(uint64 rounded_size, uint64 length); + +void +parse_blob(uint64 extent_size, uint64 page_size, const blob *blobby, parsed_blob *pblobby); + +uint64 +blob_length(slice sblob); + +platform_status +blob_page_iterator_init(cache *cc, + blob_page_iterator *iter, + slice sblobby, + uint64 offset, + blob_page_iterator_mode mode); + +void +blob_page_iterator_deinit(blob_page_iterator *iter); + +platform_status +blob_page_iterator_get_curr(blob_page_iterator *iter, + uint64 *offset, + slice *result); + +bool +blob_page_iterator_at_end(blob_page_iterator *iter); + +void +blob_page_iterator_advance_bytes(blob_page_iterator *iter, uint64 num_bytes); + +void +blob_page_iterator_advance_page(blob_page_iterator *iter); + +platform_status +blob_materialize(cache *cc, slice sblob, uint64 start, uint64 end, writable_buffer *result); + +static inline platform_status +blob_materialize_full(cache *cc, slice sblob, writable_buffer *result) +{ + return blob_materialize(cc, sblob, 0, blob_length(sblob), result); +} + +platform_status +blob_sync(cache *cc, slice sblob); diff --git a/src/btree.c b/src/btree.c index 31c6366d..3169e4c4 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "btree_private.h" +#include "data_blob_build.h" #include "platform_sleep.h" #include "poison.h" /* @@ -105,6 +106,13 @@ static const btree_pivot_stats BTREE_PIVOT_STATS_UNKNOWN = { BTREE_UNKNOWN_COUNTER, BTREE_UNKNOWN_COUNTER}; +static const blob_build_config btree_blob_cfg = { + .extent_batch = 0, + .page_batch = 1, + .subpage_batch = 2, + .alignment = 0, +}; + static inline uint8 btree_height(const btree_hdr *hdr) @@ -161,7 +169,13 @@ leaf_entry_key_size(const leaf_entry *entry) static inline uint64 leaf_entry_message_size(const leaf_entry *entry) { - return entry->message_length; + if (!ondisk_tuple_message_isblob(entry)) { + return entry->message_length; + } + slice sblob = + slice_create(entry->message_length, + entry->key_and_message + entry->key_length); + return blob_length(sblob); } /********************************************************* @@ -392,6 +406,19 @@ btree_set_leaf_entry(const btree_config *cfg, return TRUE; } +static bool32 +btree_copy_leaf_entry(const btree_config *cfg, + btree_hdr *hdr, + table_index k, + const leaf_entry *entry) +{ + key entry_key = ondisk_tuple_key(entry); + message entry_msg = + ondisk_tuple_message(ondisk_tuple_message_isblob(entry) ? (cache *)1 : NULL, + entry); + return btree_set_leaf_entry(cfg, hdr, k, entry_key, entry_msg); +} + static inline bool32 btree_insert_leaf_entry(const btree_config *cfg, btree_hdr *hdr, @@ -565,15 +592,16 @@ btree_merge_tuples(const btree_config *cfg, static message spec_message(const leaf_incorporate_spec *spec) { - if (spec->old_entry_state == ENTRY_DID_NOT_EXIST) { - return spec->msg.new_message; + if (spec->use_msg) { + return spec->msg.msg; } else { - return merge_accumulator_to_message(&spec->msg.merged_message); + return merge_accumulator_to_message(&spec->msg.ma); } } static inline platform_status btree_record_old_result(const btree_config *cfg, + cache *cc, const btree_hdr *hdr, const leaf_incorporate_spec *spec, lookup_result *old_result) @@ -584,50 +612,73 @@ btree_record_old_result(const btree_config *cfg, leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); return lookup_result_update( - old_result, leaf_entry_key(entry), leaf_entry_message(entry)); + old_result, leaf_entry_key(entry), leaf_entry_message(cc, entry)); } platform_status btree_create_leaf_incorporate_spec(const btree_config *cfg, + cache *cc, + mini_allocator *mini, platform_heap_id heap_id, btree_hdr *hdr, key tuple_key, message msg, leaf_incorporate_spec *spec) { - spec->tuple_key = tuple_key; + platform_status rc = STATUS_OK; + spec->tuple_key = tuple_key; bool32 found; spec->idx = btree_find_tuple(cfg, hdr, tuple_key, &found); spec->old_entry_state = found ? ENTRY_STILL_EXISTS : ENTRY_DID_NOT_EXIST; + if (!found) { - spec->msg.new_message = msg; spec->idx++; - return STATUS_OK; + if (message_isblob(msg)) { + spec->use_msg = FALSE; + merge_accumulator_init(&spec->msg.ma, heap_id); + rc = message_clone(&btree_blob_cfg, cc, mini, msg, &spec->msg.ma); + } else if (MAX_INLINE_MESSAGE_SIZE(btree_page_size(cfg)) + < message_length(msg)) + { + spec->use_msg = FALSE; + merge_accumulator_init(&spec->msg.ma, heap_id); + rc = message_to_blob(&btree_blob_cfg, cc, mini, msg, &spec->msg.ma); + } else { + spec->use_msg = TRUE; + spec->msg.msg = msg; + } } else { leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); - message oldmessage = leaf_entry_message(entry); + message oldmessage = leaf_entry_message(cc, entry); bool32 success; + spec->use_msg = FALSE; success = merge_accumulator_init_from_message( - &spec->msg.merged_message, heap_id, msg); + &spec->msg.ma, heap_id, msg); if (!success) { return STATUS_NO_MEMORY; } - if (btree_merge_tuples( - cfg, tuple_key, oldmessage, &spec->msg.merged_message)) - { - merge_accumulator_deinit(&spec->msg.merged_message); + if (btree_merge_tuples(cfg, tuple_key, oldmessage, &spec->msg.ma)) { + merge_accumulator_deinit(&spec->msg.ma); return STATUS_NO_MEMORY; - } else { - return STATUS_OK; + } else if (MAX_INLINE_MESSAGE_SIZE(btree_page_size(cfg)) + < merge_accumulator_length(&spec->msg.ma)) + { + rc = merge_accumulator_convert_to_blob( + &btree_blob_cfg, cc, mini, &spec->msg.ma); } } + + if (!SUCCESS(rc) && !spec->use_msg) { + merge_accumulator_deinit(&spec->msg.ma); + } + return rc; } void destroy_leaf_incorporate_spec(leaf_incorporate_spec *spec) { - if (spec->old_entry_state != ENTRY_DID_NOT_EXIST) { - merge_accumulator_deinit(&spec->msg.merged_message); + if (!spec->use_msg) { + merge_accumulator_deinit(&spec->msg.ma); } } @@ -641,16 +692,14 @@ btree_can_perform_leaf_incorporate_spec(const btree_config *cfg, hdr, btree_num_entries(hdr), spec->tuple_key, - spec->msg.new_message); + spec_message(spec)); } else if (spec->old_entry_state == ENTRY_STILL_EXISTS) { - message merged = merge_accumulator_to_message(&spec->msg.merged_message); return btree_can_set_leaf_entry( - cfg, hdr, spec->idx, spec->tuple_key, merged); + cfg, hdr, spec->idx, spec->tuple_key, spec_message(spec)); } else { debug_assert(spec->old_entry_state == ENTRY_HAS_BEEN_REMOVED); - message merged = merge_accumulator_to_message(&spec->msg.merged_message); return btree_can_set_leaf_entry( - cfg, hdr, btree_num_entries(hdr), spec->tuple_key, merged); + cfg, hdr, btree_num_entries(hdr), spec->tuple_key, spec_message(spec)); } } @@ -661,25 +710,22 @@ btree_try_perform_leaf_incorporate_spec(const btree_config *cfg, uint64 *generation) { bool32 success; + message msg = spec_message(spec); switch (spec->old_entry_state) { case ENTRY_DID_NOT_EXIST: - success = btree_insert_leaf_entry( - cfg, hdr, spec->idx, spec->tuple_key, spec->msg.new_message); + success = + btree_insert_leaf_entry(cfg, hdr, spec->idx, spec->tuple_key, msg); break; case ENTRY_STILL_EXISTS: { - message merged = - merge_accumulator_to_message(&spec->msg.merged_message); success = - btree_set_leaf_entry(cfg, hdr, spec->idx, spec->tuple_key, merged); + btree_set_leaf_entry(cfg, hdr, spec->idx, spec->tuple_key, msg); break; } case ENTRY_HAS_BEEN_REMOVED: { - message merged = - merge_accumulator_to_message(&spec->msg.merged_message); - success = btree_insert_leaf_entry( - cfg, hdr, spec->idx, spec->tuple_key, merged); + success = + btree_insert_leaf_entry(cfg, hdr, spec->idx, spec->tuple_key, msg); break; } default: @@ -722,11 +768,7 @@ btree_defragment_leaf(const btree_config *cfg, // IN } else { leaf_entry *entry = btree_get_leaf_entry(cfg, scratch_hdr, i); debug_only bool32 success = - btree_set_leaf_entry(cfg, - hdr, - dst_idx++, - leaf_entry_key(entry), - leaf_entry_message(entry)); + btree_copy_leaf_entry(cfg, hdr, dst_idx++, entry); debug_assert(success); } } @@ -913,11 +955,7 @@ btree_split_leaf_build_right_node(const btree_config *cfg, // IN spec->old_entry_state = ENTRY_HAS_BEEN_REMOVED; } else { leaf_entry *entry = btree_get_leaf_entry(cfg, left_hdr, i); - btree_set_leaf_entry(cfg, - right_hdr, - dst_idx, - leaf_entry_key(entry), - leaf_entry_message(entry)); + btree_copy_leaf_entry(cfg, right_hdr, dst_idx, entry); dst_idx++; } } @@ -1084,7 +1122,8 @@ btree_alloc(cache *cc, page_type type, btree_node *node) { - node->addr = mini_alloc(mini, height, next_extent); + (void)alloc_key; + node->addr = mini_alloc_page(mini, NUM_BLOB_BATCHES + height, next_extent); debug_assert(node->addr != 0); node->page = cache_alloc(cc, node->addr, type); @@ -1189,6 +1228,19 @@ btree_root_to_meta_addr(const btree_config *cfg, * Creating and destroying B-trees. *---------------------------------------------------------- */ + +#define BTREE_NUM_MINI_BATCHES (NUM_BLOB_BATCHES + BTREE_MAX_HEIGHT) + +static const page_type memtable_page_type_table[BTREE_NUM_MINI_BATCHES] = { + [0 ... NUM_BLOB_BATCHES - 1] = PAGE_TYPE_BLOB, + [NUM_BLOB_BATCHES ... BTREE_NUM_MINI_BATCHES - 1] = PAGE_TYPE_MEMTABLE, +}; + +static const page_type branch_page_type_table[BTREE_NUM_MINI_BATCHES] = { + [0 ... NUM_BLOB_BATCHES - 1] = PAGE_TYPE_BLOB, + [NUM_BLOB_BATCHES ... BTREE_NUM_MINI_BATCHES - 1] = PAGE_TYPE_BRANCH, +}; + uint64 btree_create(cache *cc, const btree_config *cfg, @@ -1226,8 +1278,14 @@ btree_create(cache *cc, cache_unget(cc, root_page); // set up the mini allocator - mini_init( - mini, cc, root.addr + btree_page_size(cfg), 0, BTREE_MAX_HEIGHT, type); + mini_init_with_types(mini, + cc, + root.addr + btree_page_size(cfg), + 0, + BTREE_NUM_MINI_BATCHES, + type, + type == PAGE_TYPE_BRANCH ? branch_page_type_table + : memtable_page_type_table); return root.addr; } @@ -1360,7 +1418,7 @@ btree_split_child_leaf(cache *cc, /* p: write, c: write, rc: write, cn: write if exists */ platform_status rc = - btree_record_old_result(cfg, child->hdr, spec, old_result); + btree_record_old_result(cfg, cc, child->hdr, spec, old_result); if (!SUCCESS(rc)) { if (child_next.addr != 0) { btree_node_full_unlock(cc, cfg, &child_next); @@ -1455,7 +1513,7 @@ btree_defragment_or_split_child_leaf(cache *cc, btree_node_unget(cc, cfg, parent); btree_node_lock(cc, cfg, child); platform_status rc = - btree_record_old_result(cfg, child->hdr, spec, old_result); + btree_record_old_result(cfg, cc, child->hdr, spec, old_result); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, child); return rc; @@ -1744,10 +1802,6 @@ btree_insert(cache *cc, // IN return STATUS_BAD_PARAM; } - if (MAX_INLINE_MESSAGE_SIZE(btree_page_size(cfg)) < message_length(msg)) { - return STATUS_BAD_PARAM; - } - if (message_is_invalid_user_type(msg)) { return STATUS_BAD_PARAM; } @@ -1767,7 +1821,7 @@ btree_insert(cache *cc, // IN if (btree_height(root_node.hdr) == 0) { rc = btree_create_leaf_incorporate_spec( - cfg, heap_id, root_node.hdr, tuple_key, msg, &spec); + cfg, cc, mini, heap_id, root_node.hdr, tuple_key, msg, &spec); if (!SUCCESS(rc)) { btree_node_unget(cc, cfg, &root_node); return rc; @@ -1779,7 +1833,7 @@ btree_insert(cache *cc, // IN } btree_node_lock(cc, cfg, &root_node); if (btree_can_perform_leaf_incorporate_spec(cfg, root_node.hdr, &spec)) { - rc = btree_record_old_result(cfg, root_node.hdr, &spec, old_result); + rc = btree_record_old_result(cfg, cc, root_node.hdr, &spec, old_result); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &root_node); destroy_leaf_incorporate_spec(&spec); @@ -1973,7 +2027,7 @@ btree_insert(cache *cc, // IN * - height of parent == 1 */ rc = btree_create_leaf_incorporate_spec( - cfg, heap_id, child_node.hdr, tuple_key, msg, &spec); + cfg, cc, mini, heap_id, child_node.hdr, tuple_key, msg, &spec); if (!SUCCESS(rc)) { btree_node_unget(cc, cfg, &parent_node); btree_node_unget(cc, cfg, &child_node); @@ -1992,7 +2046,7 @@ btree_insert(cache *cc, // IN goto start_over; } btree_node_lock(cc, cfg, &child_node); - rc = btree_record_old_result(cfg, child_node.hdr, &spec, old_result); + rc = btree_record_old_result(cfg, cc, child_node.hdr, &spec, old_result); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &child_node); destroy_leaf_incorporate_spec(&spec); @@ -2027,7 +2081,7 @@ btree_insert(cache *cc, // IN */ destroy_leaf_incorporate_spec(&spec); rc = btree_create_leaf_incorporate_spec( - cfg, heap_id, child_node.hdr, tuple_key, msg, &spec); + cfg, cc, mini, heap_id, child_node.hdr, tuple_key, msg, &spec); if (!SUCCESS(rc)) { btree_node_unget(cc, cfg, &parent_node); btree_node_unclaim(cc, cfg, &child_node); @@ -2245,7 +2299,7 @@ btree_lookup_with_ref_async(btree_lookup_async_state *state, uint64 depth) leaf_entry *entry = btree_get_leaf_entry(state->cfg, state->node.hdr, idx); state->found_key = leaf_entry_key(entry); - state->msg = leaf_entry_message(entry); + state->msg = leaf_entry_message(state->cc, entry); } else { btree_node_unget(state->cc, state->cfg, &state->node); } @@ -2272,7 +2326,7 @@ btree_lookup_with_ref(cache *cc, // IN if (found) { leaf_entry *entry = btree_get_leaf_entry(cfg, node->hdr, idx); *found_key = leaf_entry_key(entry); - *msg = leaf_entry_message(entry); + *msg = leaf_entry_message(cc, entry); } else { btree_node_unget(cc, cfg, node); } @@ -2438,7 +2492,8 @@ btree_iterator_curr(iterator *base_itor, key *curr_key, message *data) cache_validate_page(itor->cc, itor->curr.page, itor->curr.addr); if (itor->curr.hdr->height == 0) { *curr_key = btree_get_tuple_key(itor->cfg, itor->curr.hdr, itor->idx); - *data = btree_get_tuple_message(itor->cfg, itor->curr.hdr, itor->idx); + *data = + btree_get_tuple_message(itor->cfg, itor->cc, itor->curr.hdr, itor->idx); log_trace_key(*curr_key, "btree_iterator_get_curr"); } else { index_entry *entry = @@ -3508,12 +3563,29 @@ btree_pack_loop(btree_pack_req *req, // IN/OUT key tuple_key, // IN message msg) // IN { + platform_status rc; log_trace_key(tuple_key, "btree_pack_loop"); if (message_is_invalid_user_type(msg)) { return STATUS_INVALID_STATE; } + if (message_isblob(msg)) { + rc = message_clone(&btree_blob_cfg, req->cc, &req->mini, msg, &req->ma); + if (!SUCCESS(rc)) { + return rc; + } + msg = merge_accumulator_to_message(&req->ma); + } else if (MAX_INLINE_MESSAGE_SIZE(btree_page_size(req->cfg)) + < message_length(msg)) + { + rc = message_to_blob(&btree_blob_cfg, req->cc, &req->mini, msg, &req->ma); + if (!SUCCESS(rc)) { + return rc; + } + msg = merge_accumulator_to_message(&req->ma); + } + btree_node *leaf = btree_pack_get_current_node(req, 0); if (!leaf @@ -3529,7 +3601,7 @@ btree_pack_loop(btree_pack_req *req, // IN/OUT btree_pivot_stats *leaf_stats = btree_pack_get_current_node_stats(req, 0); leaf_stats->num_kvs++; leaf_stats->key_bytes += key_length(tuple_key); - leaf_stats->message_bytes += message_length(msg); + leaf_stats->message_bytes += message_materialized_length(msg); log_trace_key(tuple_key, "btree_pack_loop (bottom)"); @@ -3541,7 +3613,7 @@ btree_pack_loop(btree_pack_req *req, // IN/OUT req->num_tuples++; req->key_bytes += key_length(tuple_key); - req->message_bytes += message_length(msg); + req->message_bytes += message_materialized_length(msg); return STATUS_OK; } @@ -3856,6 +3928,7 @@ btree_print_index_node(platform_log_handle *log_handle, static void btree_print_leaf_entry(platform_log_handle *log_handle, const btree_config *cfg, + cache *cc, leaf_entry *entry, uint64 entry_num) { @@ -3864,12 +3937,13 @@ btree_print_leaf_entry(platform_log_handle *log_handle, "[%2lu]: %s -- %s\n", entry_num, key_string(dcfg, leaf_entry_key(entry)), - message_string(dcfg, leaf_entry_message(entry))); + message_string(dcfg, leaf_entry_message(cc, entry))); } static void btree_print_leaf_node(platform_log_handle *log_handle, const btree_config *cfg, + cache *cc, uint64 addr, btree_hdr *hdr, page_type type) @@ -3893,7 +3967,7 @@ btree_print_leaf_node(platform_log_handle *log_handle, log_handle, "Array of %d index leaf entries:\n", btree_num_entries(hdr)); for (uint64 i = 0; i < btree_num_entries(hdr); i++) { leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, i); - btree_print_leaf_entry(log_handle, cfg, entry, i); + btree_print_leaf_entry(log_handle, cfg, cc, entry, i); } platform_log(log_handle, "-------------------\n"); platform_log(log_handle, "\n"); @@ -3910,6 +3984,7 @@ btree_print_leaf_node(platform_log_handle *log_handle, void btree_print_locked_node(platform_log_handle *log_handle, const btree_config *cfg, + cache *cc, uint64 addr, btree_hdr *hdr, page_type type) @@ -3919,7 +3994,7 @@ btree_print_locked_node(platform_log_handle *log_handle, if (btree_height(hdr) > 0) { btree_print_index_node(log_handle, cfg, addr, hdr, type); } else { - btree_print_leaf_node(log_handle, cfg, addr, hdr, type); + btree_print_leaf_node(log_handle, cfg, cc, addr, hdr, type); } platform_log(log_handle, "} -- End BTree node at addr=%lu\n", addr); } @@ -3940,7 +4015,7 @@ btree_print_node(platform_log_handle *log_handle, return; } btree_node_get(cc, cfg, node, type); - btree_print_locked_node(log_handle, cfg, node->addr, node->hdr, type); + btree_print_locked_node(log_handle, cfg, cc, node->addr, node->hdr, type); btree_node_unget(cc, cfg, node); } @@ -4139,9 +4214,13 @@ btree_verify_node(cache *cc, platform_error_log("addr: %lu idx %u\n", node.addr, idx); platform_error_log("child addr: %lu idx %u\n", child.addr, idx); btree_print_locked_node( - Platform_error_log_handle, cfg, node.addr, node.hdr, type); - btree_print_locked_node( - Platform_error_log_handle, cfg, child.addr, child.hdr, type); + Platform_error_log_handle, cfg, cc, node.addr, node.hdr, type); + btree_print_locked_node(Platform_error_log_handle, + cfg, + cc, + child.addr, + child.hdr, + type); platform_assert(0); btree_node_unget(cc, cfg, &child); btree_node_unget(cc, cfg, &node); @@ -4161,9 +4240,13 @@ btree_verify_node(cache *cc, platform_error_log("addr: %lu idx %u\n", node.addr, idx); platform_error_log("child addr: %lu idx %u\n", child.addr, idx); btree_print_locked_node( - Platform_error_log_handle, cfg, node.addr, node.hdr, type); - btree_print_locked_node( - Platform_error_log_handle, cfg, child.addr, child.hdr, type); + Platform_error_log_handle, cfg, cc, node.addr, node.hdr, type); + btree_print_locked_node(Platform_error_log_handle, + cfg, + cc, + child.addr, + child.hdr, + type); platform_assert(0); btree_node_unget(cc, cfg, &child); btree_node_unget(cc, cfg, &node); diff --git a/src/btree.h b/src/btree.h index dbfc56fc..3018ebb5 100644 --- a/src/btree.h +++ b/src/btree.h @@ -12,6 +12,7 @@ #include "platform_hash.h" #include "platform_typed_alloc.h" #include "async.h" +#include "blob_build.h" #include "mini_allocator.h" #include "iterator.h" #include "lookup_result.h" @@ -36,8 +37,8 @@ * Therefore, the max # of mini-batches that the mini-allocator can track * is limited by the max height of the BTree. */ -_Static_assert(BTREE_MAX_HEIGHT == MINI_MAX_BATCHES, - "BTREE_MAX_HEIGHT has to be == MINI_MAX_BATCHES"); +_Static_assert(BTREE_MAX_HEIGHT + NUM_BLOB_BATCHES <= MINI_MAX_BATCHES, + "mini allocator needs room for blob and btree batches"); /* * Acceptable upper-bound on amount of space to waste when deciding whether @@ -166,6 +167,7 @@ typedef struct btree_pack_req { btree_node edge[BTREE_MAX_HEIGHT][MAX_PAGES_PER_EXTENT]; btree_pivot_stats edge_stats[BTREE_MAX_HEIGHT][MAX_PAGES_PER_EXTENT]; uint32 num_edges[BTREE_MAX_HEIGHT]; + merge_accumulator ma; mini_allocator mini; @@ -342,6 +344,7 @@ btree_pack_req_init(btree_pack_req *req, req->itor = itor; req->max_tuples = max_tuples; req->seed = seed; + merge_accumulator_init(&req->ma, hid); if (cfg->data_cfg->key_hash != NULL && max_tuples > 0) { req->fingerprint_arr = TYPED_ARRAY_ZALLOC(hid, req->fingerprint_arr, max_tuples); @@ -361,6 +364,7 @@ btree_pack_req_init(btree_pack_req *req, static inline void btree_pack_req_deinit(btree_pack_req *req, platform_heap_id hid) { + merge_accumulator_deinit(&req->ma); if (req->fingerprint_arr) { platform_free(hid, req->fingerprint_arr); } @@ -401,6 +405,7 @@ btree_print_tree(platform_log_handle *log_handle, void btree_print_locked_node(platform_log_handle *log_handle, const btree_config *cfg, + cache *cc, uint64 addr, btree_hdr *hdr, page_type type); diff --git a/src/btree_private.h b/src/btree_private.h index 99f6fb90..c3ff81ac 100644 --- a/src/btree_private.h +++ b/src/btree_private.h @@ -67,15 +67,18 @@ typedef struct leaf_incorporate_spec { ENTRY_STILL_EXISTS, ENTRY_HAS_BEEN_REMOVED } old_entry_state; + bool32 use_msg; union { - /* "old_entry_state" is the tag on this union. */ - message new_message; // old_entry_state == ENTRY_DID_NOT_EXIST - merge_accumulator merged_message; // otherwise + /* use_msg is the tag on this union. */ + message msg; + merge_accumulator ma; } msg; } leaf_incorporate_spec; platform_status btree_create_leaf_incorporate_spec(const btree_config *cfg, + cache *cc, + mini_allocator *mini, platform_heap_id heap_id, btree_hdr *hdr, key tuple_key, @@ -201,9 +204,9 @@ leaf_entry_key(leaf_entry *entry) } static inline message -leaf_entry_message(leaf_entry *entry) +leaf_entry_message(cache *cc, leaf_entry *entry) { - return ondisk_tuple_message(entry); + return ondisk_tuple_message(cc, entry); } static inline message_type @@ -240,10 +243,11 @@ btree_get_tuple_key(const btree_config *cfg, static inline message btree_get_tuple_message(const btree_config *cfg, + cache *cc, const btree_hdr *hdr, table_index k) { - return leaf_entry_message(btree_get_leaf_entry(cfg, hdr, k)); + return leaf_entry_message(cc, btree_get_leaf_entry(cfg, hdr, k)); } static inline message_type diff --git a/src/data_blob_build.c b/src/data_blob_build.c new file mode 100644 index 00000000..9a8d2f38 --- /dev/null +++ b/src/data_blob_build.c @@ -0,0 +1,65 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +/* + * data_blob_build.c -- + * + * Functions for building blob-backed messages. + */ + +#include "data_blob_build.h" +#include "poison.h" + +platform_status +message_to_blob(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + message msg, + merge_accumulator *ma) +{ + platform_assert(!message_isblob(msg)); + ma->type = message_class(msg); + ma->cc = cc; + return blob_build(cfg, cc, mini, message_slice(msg), &ma->data); +} + +platform_status +message_clone(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + message msg, + merge_accumulator *result) +{ + platform_assert(message_isblob(msg)); + result->type = message_class(msg); + result->cc = cc; + return blob_clone(cfg, cc, mini, message_slice(msg), &result->data); +} + +platform_status +merge_accumulator_convert_to_blob(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + merge_accumulator *ma) +{ + if (merge_accumulator_isblob(ma)) { + return STATUS_OK; + } + + writable_buffer blob; + writable_buffer_init(&blob, ma->data.heap_id); + platform_status rc = + blob_build(cfg, cc, mini, writable_buffer_to_slice(&ma->data), &blob); + if (!SUCCESS(rc)) { + writable_buffer_deinit(&blob); + return rc; + } + + rc = writable_buffer_copy_slice(&ma->data, writable_buffer_to_slice(&blob)); + writable_buffer_deinit(&blob); + if (!SUCCESS(rc)) { + return rc; + } + ma->cc = cc; + return rc; +} diff --git a/src/data_blob_build.h b/src/data_blob_build.h new file mode 100644 index 00000000..7bd9fddf --- /dev/null +++ b/src/data_blob_build.h @@ -0,0 +1,33 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +/* + * data_blob_build.h -- + * + * Functions for building blob-backed messages. + */ + +#pragma once + +#include "blob_build.h" +#include "data_internal.h" + +platform_status +message_to_blob(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + message msg, + merge_accumulator *ma); + +platform_status +message_clone(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + message msg, + merge_accumulator *result); + +platform_status +merge_accumulator_convert_to_blob(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + merge_accumulator *ma); diff --git a/src/data_internal.c b/src/data_internal.c index d64c1fc5..dbba619d 100644 --- a/src/data_internal.c +++ b/src/data_internal.c @@ -38,6 +38,7 @@ merge_accumulator_copy_message(merge_accumulator *ma, message msg) return FALSE; } ma->type = message_class(msg); + ma->cc = msg.cc; return TRUE; } @@ -45,6 +46,9 @@ _Bool merge_accumulator_resize(merge_accumulator *ma, uint64 newsize) { platform_status rc = writable_buffer_resize(&ma->data, newsize); + if (SUCCESS(rc) && newsize == 0) { + ma->cc = NULL; + } return SUCCESS(rc); } @@ -52,4 +56,7 @@ void merge_accumulator_set_class(merge_accumulator *ma, message_type type) { ma->type = type; + if (type == MESSAGE_TYPE_INVALID || type == MESSAGE_TYPE_DELETE) { + ma->cc = NULL; + } } diff --git a/src/data_internal.h b/src/data_internal.h index 9ffd1adc..4c37a89e 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -10,6 +10,8 @@ #pragma once #include "splinterdb/data.h" +#include "blob.h" +#include "cache.h" #include "util.h" /* @@ -349,7 +351,13 @@ message_type_string(message_type type) static inline message message_create(message_type type, slice data) { - return (message){.type = type, .data = data}; + return (message){.type = type, .cc = NULL, .data = data}; +} + +static inline message +message_create_blob(message_type type, cache *cc, slice data) +{ + return (message){.type = type, .cc = cc, .data = data}; } static inline bool32 @@ -373,18 +381,46 @@ message_is_invalid_user_type(message msg) || msg.type > MESSAGE_TYPE_MAX_VALID_USER_TYPE; } -/* Define an arbitrary ordering on messages. In practice, all we care - * about is equality, but this is written to follow the same - * comparison interface as for ordered types. */ -static inline int -message_lex_cmp(message a, message b) +static inline uint64 +message_materialized_length(message msg) { - if (a.type < b.type) { - return -1; - } else if (b.type < a.type) { - return 1; + if (message_isblob(msg)) { + return blob_length(message_slice(msg)); + } else { + return message_length(msg); + } +} + +static inline platform_status +message_materialize(message msg, merge_accumulator *tmp); + +static inline platform_status +message_materialize_if_needed(platform_heap_id heap_id, + message msg, + writable_buffer *tmp, + message *result) +{ + if (message_isblob(msg)) { + writable_buffer_init(tmp, heap_id); + platform_status rc = + blob_materialize_full(msg.cc, message_slice(msg), tmp); + if (!SUCCESS(rc)) { + writable_buffer_deinit(tmp); + return rc; + } + *result = + message_create(message_class(msg), writable_buffer_to_slice(tmp)); } else { - return slice_lex_cmp(a.data, b.data); + *result = msg; + } + return STATUS_OK; +} + +static inline void +message_dematerialize_if_needed(message msg, writable_buffer *tmp) +{ + if (message_isblob(msg)) { + writable_buffer_deinit(tmp); } } @@ -420,6 +456,7 @@ _Static_assert(MESSAGE_TYPE_MAX_VALID_USER_TYPE < (1ULL << ONDISK_MESSAGE_TYPE_BITS), "ONDISK_MESSAGE_TYPE_BITS is too small"); #define ONDISK_MESSAGE_TYPE_MASK ((0x1 << ONDISK_MESSAGE_TYPE_BITS) - 1) +#define ONDISK_MESSAGE_ISBLOB (0x4) /* Size of the data part of an existing ondisk_tuple */ static inline uint64 @@ -451,12 +488,20 @@ ondisk_tuple_message_class(const ondisk_tuple *odt) return odt->flags & ONDISK_MESSAGE_TYPE_MASK; } +static inline bool32 +ondisk_tuple_message_isblob(const ondisk_tuple *odt) +{ + return (odt->flags & ONDISK_MESSAGE_ISBLOB) != 0; +} + static inline message -ondisk_tuple_message(const ondisk_tuple *odt) +ondisk_tuple_message(cache *cc, const ondisk_tuple *odt) { - return message_create(ondisk_tuple_message_class(odt), - slice_create(odt->message_length, - odt->key_and_message + odt->key_length)); + slice data = + slice_create(odt->message_length, odt->key_and_message + odt->key_length); + return message_create_blob(ondisk_tuple_message_class(odt), + ondisk_tuple_message_isblob(odt) ? cc : NULL, + data); } static inline void @@ -464,6 +509,7 @@ copy_message_to_ondisk_tuple(ondisk_tuple *odt, message msg) { odt->message_length = message_length(msg); odt->flags = message_class(msg); + odt->flags |= message_isblob(msg) ? ONDISK_MESSAGE_ISBLOB : 0; memcpy(odt->key_and_message + odt->key_length, message_data(msg), message_length(msg)); @@ -474,7 +520,7 @@ copy_tuple_to_ondisk_tuple(ondisk_tuple *odt, key k, message msg) { debug_assert(key_is_user_key(k)); odt->key_length = key_length(k); - odt->flags = ONDISK_KEY_DEFAULT_FLAGS; + odt->key_flags = ONDISK_KEY_DEFAULT_FLAGS; memcpy(odt->key_and_message, key_data(k), key_length(k)); copy_message_to_ondisk_tuple(odt, msg); } @@ -487,6 +533,7 @@ copy_tuple_to_ondisk_tuple(ondisk_tuple *odt, key k, message msg) struct merge_accumulator { message_type type; + cache *cc; writable_buffer data; }; @@ -495,6 +542,7 @@ merge_accumulator_init(merge_accumulator *ma, platform_heap_id heap_id) { writable_buffer_init(&ma->data, heap_id); ma->type = MESSAGE_TYPE_INVALID; + ma->cc = NULL; } static inline void @@ -509,6 +557,7 @@ merge_accumulator_init_with_buffer(merge_accumulator *ma, writable_buffer_init_with_buffer( &ma->data, heap_id, allocation_size, data, logical_size); ma->type = type; + ma->cc = NULL; } static inline void @@ -516,6 +565,7 @@ merge_accumulator_deinit(merge_accumulator *ma) { writable_buffer_deinit(&ma->data); ma->type = MESSAGE_TYPE_INVALID; + ma->cc = NULL; } static inline bool32 @@ -524,16 +574,25 @@ merge_accumulator_is_definitive(const merge_accumulator *ma) return ma->type == MESSAGE_TYPE_INSERT || ma->type == MESSAGE_TYPE_DELETE; } +static inline bool32 +merge_accumulator_isblob(const merge_accumulator *ma) +{ + return ma->cc != NULL; +} + static inline message merge_accumulator_to_message(const merge_accumulator *ma) { - return message_create(ma->type, writable_buffer_to_slice(&ma->data)); + return message_create_blob(ma->type, + ma->cc, + writable_buffer_to_slice(&ma->data)); } static inline slice merge_accumulator_to_value(const merge_accumulator *ma) { debug_assert(ma->type == MESSAGE_TYPE_INSERT); + debug_assert(ma->cc == NULL); return writable_buffer_to_slice(&ma->data); } @@ -551,6 +610,7 @@ static inline void merge_accumulator_set_to_null(merge_accumulator *ma) { ma->type = MESSAGE_TYPE_INVALID; + ma->cc = NULL; writable_buffer_set_to_null(&ma->data); } @@ -559,9 +619,43 @@ merge_accumulator_is_null(const merge_accumulator *ma) { bool32 r = writable_buffer_is_null(&ma->data); debug_assert(IMPLIES(r, ma->type == MESSAGE_TYPE_INVALID)); + debug_assert(IMPLIES(r, ma->cc == NULL)); return r; } +static inline platform_status +message_materialize(message msg, merge_accumulator *tmp) +{ + debug_assert(message_isblob(msg)); + tmp->type = message_class(msg); + tmp->cc = NULL; + return blob_materialize_full(msg.cc, message_slice(msg), &tmp->data); +} + +static inline platform_status +merge_accumulator_ensure_materialized(merge_accumulator *ma) +{ + if (!merge_accumulator_isblob(ma)) { + return STATUS_OK; + } + + writable_buffer materialized; + writable_buffer_init(&materialized, ma->data.heap_id); + platform_status rc = blob_materialize_full(ma->cc, + writable_buffer_to_slice(&ma->data), + &materialized); + if (!SUCCESS(rc)) { + writable_buffer_deinit(&materialized); + return rc; + } + + writable_buffer old_data = ma->data; + ma->data = materialized; + ma->cc = NULL; + writable_buffer_deinit(&old_data); + return STATUS_OK; +} + /* * USER CALLBACK WRAPPERS * @@ -596,6 +690,11 @@ data_key_hash(const data_config *cfg, key k, uint32 seed) } } +static inline int +data_merge_tuples_final(const data_config *cfg, + key tuple_key, + merge_accumulator *oldest_message); + static inline int data_merge_tuples(const data_config *cfg, key tuple_key, @@ -611,12 +710,29 @@ data_merge_tuples(const data_config *cfg, message_type oldclass = message_class(old_raw_message); if (oldclass == MESSAGE_TYPE_DELETE) { - return cfg->merge_tuples_final(cfg, tuple_key.user_slice, new_message); + return data_merge_tuples_final(cfg, tuple_key, new_message); } // new class is UPDATE and old class is INSERT or UPDATE + platform_status rc = merge_accumulator_ensure_materialized(new_message); + if (!SUCCESS(rc)) { + return -1; + } + + writable_buffer old_tmp; + message old_materialized_message; + rc = message_materialize_if_needed(new_message->data.heap_id, + old_raw_message, + &old_tmp, + &old_materialized_message); + if (!SUCCESS(rc)) { + return -1; + } + int result = cfg->merge_tuples( - cfg, tuple_key.user_slice, old_raw_message, new_message); + cfg, tuple_key.user_slice, old_materialized_message, new_message); + message_dematerialize_if_needed(old_raw_message, &old_tmp); + if (result && merge_accumulator_message_class(new_message) == MESSAGE_TYPE_DELETE) { @@ -636,6 +752,12 @@ data_merge_tuples_final(const data_config *cfg, if (merge_accumulator_is_definitive(oldest_message)) { return 0; } + + platform_status rc = merge_accumulator_ensure_materialized(oldest_message); + if (!SUCCESS(rc)) { + return -1; + } + int result = cfg->merge_tuples_final(cfg, tuple_key.user_slice, oldest_message); if (result @@ -666,7 +788,48 @@ data_message_to_string(const data_config *cfg, char *str, size_t size) { - cfg->message_to_string(cfg, msg, str, size); + writable_buffer tmp; + message materialized; + platform_status rc = message_materialize_if_needed( + platform_get_heap_id(), msg, &tmp, &materialized); + if (!SUCCESS(rc)) { + if (size != 0) { + str[0] = '\0'; + } + return; + } + cfg->message_to_string(cfg, materialized, str, size); + message_dematerialize_if_needed(msg, &tmp); +} + +/* Define an arbitrary ordering on messages. In practice, all we care + * about is equality, but this follows the same interface as ordered types. */ +static inline int +message_lex_cmp(message a, message b) +{ + if (a.type < b.type) { + return -1; + } else if (b.type < a.type) { + return 1; + } else { + writable_buffer a_tmp; + message a_materialized; + writable_buffer b_tmp; + message b_materialized; + platform_heap_id hid = platform_get_heap_id(); + + platform_status rc = + message_materialize_if_needed(hid, a, &a_tmp, &a_materialized); + platform_assert_status_ok(rc); + rc = message_materialize_if_needed(hid, b, &b_tmp, &b_materialized); + platform_assert_status_ok(rc); + + int result = slice_lex_cmp(message_slice(a_materialized), + message_slice(b_materialized)); + message_dematerialize_if_needed(a, &a_tmp); + message_dematerialize_if_needed(b, &b_tmp); + return result; + } } #define key_string(cfg, key_to_print) \ diff --git a/src/mini_allocator.c b/src/mini_allocator.c index 9220519d..df4f46b3 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -50,6 +50,7 @@ typedef struct ONDISK mini_meta_hdr { */ typedef struct ONDISK meta_entry { uint64 extent_addr; + uint8 type; } meta_entry; static meta_entry * @@ -113,7 +114,7 @@ mini_full_lock_meta_tail(mini_allocator *mini) uint64 wait = 1; while (1) { uint64 meta_tail = mini->meta_tail; - meta_page = cache_get(mini->cc, meta_tail, TRUE, mini->type); + meta_page = cache_get(mini->cc, meta_tail, TRUE, mini->meta_type); if (meta_tail == mini->meta_tail && cache_try_claim(mini->cc, meta_page)) { break; @@ -155,9 +156,9 @@ mini_full_unlock_meta_page(mini_allocator *mini, page_handle *meta_page) * update our bookkeeping. */ static platform_status -mini_allocator_get_new_extent(mini_allocator *mini, uint64 *addr) +mini_allocator_get_new_extent(mini_allocator *mini, page_type type, uint64 *addr) { - platform_status rc = allocator_alloc(mini->al, addr, mini->type); + platform_status rc = allocator_alloc(mini->al, addr, type); if (SUCCESS(rc)) { __sync_fetch_and_add(&mini->num_extents, 1); } @@ -191,11 +192,29 @@ mini_init(mini_allocator *mini, uint64 meta_tail, uint64 num_batches, page_type type) +{ + page_type types[MINI_MAX_BATCHES]; + for (uint64 batch = 0; batch < num_batches; batch++) { + types[batch] = type; + } + return mini_init_with_types( + mini, cc, meta_head, meta_tail, num_batches, type, types); +} + +uint64 +mini_init_with_types(mini_allocator *mini, + cache *cc, + uint64 meta_head, + uint64 meta_tail, + uint64 num_batches, + page_type meta_type, + const page_type *types) { platform_assert(num_batches <= MINI_MAX_BATCHES); platform_assert(num_batches != 0); platform_assert(mini != NULL); platform_assert(cc != NULL); + platform_assert(types != NULL); ZERO_CONTENTS(mini); mini->cc = cc; @@ -203,14 +222,15 @@ mini_init(mini_allocator *mini, mini->meta_head = meta_head; mini->num_extents = 1; // for the meta page mini->num_batches = num_batches; - mini->type = type; - mini->pinned = (type == PAGE_TYPE_MEMTABLE); + mini->meta_type = meta_type; + mini->pinned = (meta_type == PAGE_TYPE_MEMTABLE); + memcpy(mini->types, types, num_batches * sizeof(*types)); page_handle *meta_page; if (meta_tail == 0) { // new mini allocator mini->meta_tail = meta_head; - meta_page = cache_alloc(cc, mini->meta_head, type); + meta_page = cache_alloc(cc, mini->meta_head, meta_type); mini_init_meta_page(mini, meta_page); // meta_page gets an extra ref @@ -230,12 +250,12 @@ mini_init(mini_allocator *mini, for (uint64 batch = 0; batch < num_batches; batch++) { // because we recover ref counts from the mini allocators on recovery, we // don't need to store these in the mini allocator until we consume them. - platform_status rc = - mini_allocator_get_new_extent(mini, &mini->next_extent[batch]); + platform_status rc = mini_allocator_get_new_extent( + mini, mini->types[batch], &mini->next_extent[batch]); platform_assert_status_ok(rc); } - return mini->next_extent[0]; + return mini_next_addr(mini, 0); } /* @@ -261,7 +281,8 @@ entry_fits_in_page(uint64 page_size, uint64 start, uint64 entry_size) static bool32 mini_append_entry_to_page(mini_allocator *mini, page_handle *meta_page, - uint64 extent_addr) + uint64 extent_addr, + page_type type) { uint64 page_size = cache_page_size(mini->cc); debug_assert(extent_addr != 0); @@ -275,6 +296,7 @@ mini_append_entry_to_page(mini_allocator *mini, meta_entry *new_entry = pointer_byte_offset(hdr, hdr->pos); new_entry->extent_addr = extent_addr; + new_entry->type = type; hdr->pos += sizeof(meta_entry); hdr->num_entries++; @@ -362,26 +384,28 @@ mini_append_entry(mini_allocator *mini, uint64 batch, uint64 next_addr) { page_handle *meta_page = mini_full_lock_meta_tail(mini); bool32 success; - success = mini_append_entry_to_page(mini, meta_page, next_addr); + success = + mini_append_entry_to_page(mini, meta_page, next_addr, mini->types[batch]); if (!success) { // need to allocate a new meta page uint64 new_meta_tail = mini->meta_tail + cache_page_size(mini->cc); if (new_meta_tail % cache_extent_size(mini->cc) == 0) { // need to allocate the next meta extent - platform_status rc = - mini_allocator_get_new_extent(mini, &new_meta_tail); + platform_status rc = mini_allocator_get_new_extent( + mini, mini->meta_type, &new_meta_tail); platform_assert_status_ok(rc); } mini_set_next_meta_addr(mini, meta_page, new_meta_tail); page_handle *last_meta_page = meta_page; - meta_page = cache_alloc(mini->cc, new_meta_tail, mini->type); + meta_page = cache_alloc(mini->cc, new_meta_tail, mini->meta_type); mini->meta_tail = new_meta_tail; mini_full_unlock_meta_page(mini, last_meta_page); mini_init_meta_page(mini, meta_page); - success = mini_append_entry_to_page(mini, meta_page, next_addr); + success = + mini_append_entry_to_page(mini, meta_page, next_addr, mini->types[batch]); if (mini->pinned) { cache_pin(mini->cc, meta_page); @@ -419,8 +443,8 @@ mini_alloc(mini_allocator *mini, uint64 batch, uint64 *next_extent) // need to allocate the next extent uint64 extent_addr = mini->next_extent[batch]; - platform_status rc = - mini_allocator_get_new_extent(mini, &mini->next_extent[batch]); + platform_status rc = mini_allocator_get_new_extent( + mini, mini->types[batch], &mini->next_extent[batch]); platform_assert_status_ok(rc); next_addr = extent_addr; @@ -437,6 +461,138 @@ mini_alloc(mini_allocator *mini, uint64 batch, uint64 *next_extent) return next_addr; } +platform_status +mini_alloc_bytes(mini_allocator *mini, + uint64 batch, + uint64 num_bytes, + uint64 alignment, + uint64 boundary, + uint64 addrs[2], + uint64 *next_extent) +{ + uint64 extent_size = cache_extent_size(mini->cc); + if (alignment == 0) { + alignment = 1; + } + debug_assert(batch < mini->num_batches); + debug_assert(num_bytes <= extent_size); + debug_assert(boundary % alignment == 0); + debug_assert(boundary < num_bytes || extent_size % boundary == 0); + + addrs[0] = 0; + addrs[1] = 0; + + uint64 next_addr = mini_lock_batch_get_next_addr(mini, batch); + + if (next_addr % alignment) { + next_addr += alignment - (next_addr % alignment); + } + + if (num_bytes <= boundary + && next_addr / boundary != (next_addr + num_bytes - 1) / boundary) + { + next_addr += boundary - (next_addr % boundary); + } + + uint64 num_allocs = 0; + uint64 remainder = num_bytes; + while (remainder) { + if (next_addr % extent_size == 0) { + uint64 extent_addr = mini->next_extent[batch]; + platform_status rc = mini_allocator_get_new_extent( + mini, mini->types[batch], &mini->next_extent[batch]); + if (!SUCCESS(rc)) { + mini_unlock_batch_set_next_addr(mini, batch, next_addr); + return rc; + } + next_addr = extent_addr; + + bool32 success = mini_append_entry(mini, batch, next_addr); + platform_assert(success); + } + + uint64 this_alloc_size = + MIN(remainder, extent_size - (next_addr % extent_size)); + + debug_assert(num_allocs < 2); + addrs[num_allocs] = next_addr; + remainder -= this_alloc_size; + num_allocs++; + next_addr += this_alloc_size; + } + + if (next_extent) { + *next_extent = mini->next_extent[batch]; + } + + debug_assert(mini->saved_next_addr[batch] == 0); + mini->saved_next_addr[batch] = next_addr; + return STATUS_OK; +} + +void +mini_alloc_bytes_finish(mini_allocator *mini, uint64 batch) +{ + uint64 saved_next_addr = mini->saved_next_addr[batch]; + mini->saved_next_addr[batch] = 0; + mini_unlock_batch_set_next_addr(mini, batch, saved_next_addr); +} + +uint64 +mini_alloc_page(mini_allocator *mini, uint64 batch, uint64 *next_extent) +{ + uint64 addrs[2] = {0, 0}; + uint64 page_size = cache_page_size(mini->cc); + platform_status rc = mini_alloc_bytes( + mini, batch, page_size, page_size, 0, addrs, next_extent); + if (!SUCCESS(rc)) { + return 0; + } + debug_assert(addrs[1] == 0); + mini_alloc_bytes_finish(mini, batch); + return addrs[0]; +} + +uint64 +mini_alloc_extent(mini_allocator *mini, uint64 batch, uint64 *next_extent) +{ + uint64 addrs[2] = {0, 0}; + uint64 extent_size = cache_extent_size(mini->cc); + platform_status rc = mini_alloc_bytes( + mini, batch, extent_size, extent_size, 0, addrs, next_extent); + if (!SUCCESS(rc)) { + return 0; + } + debug_assert(addrs[1] == 0); + mini_alloc_bytes_finish(mini, batch); + return addrs[0]; +} + +platform_status +mini_attach_extent(mini_allocator *mini, uint64 batch, uint64 addr) +{ + debug_assert(batch < mini->num_batches); + debug_assert(addr % cache_extent_size(mini->cc) == 0); + + uint64 old_next_addr = mini_lock_batch_get_next_addr(mini, batch); + allocator_inc_ref(mini->al, addr); + bool32 success = mini_append_entry(mini, batch, addr); + platform_assert(success); + mini_unlock_batch_set_next_addr(mini, batch, old_next_addr); + return STATUS_OK; +} + +uint64 +mini_next_addr(mini_allocator *mini, uint64 batch) +{ + debug_assert(batch < mini->num_batches); + if ((mini->next_addr[batch] % cache_extent_size(mini->cc)) != 0) { + return mini->next_addr[batch]; + } else { + return mini->next_extent[batch]; + } +} + /* *----------------------------------------------------------------------------- * mini_release -- @@ -459,9 +615,13 @@ mini_release(mini_allocator *mini) for (uint64 batch = 0; batch < mini->num_batches; batch++) { // Dealloc the next extent refcount ref = - allocator_dec_ref(mini->al, mini->next_extent[batch], mini->type); + allocator_dec_ref(mini->al, + mini->next_extent[batch], + mini->types[batch]); platform_assert(ref == AL_NO_REFS); - ref = allocator_dec_ref(mini->al, mini->next_extent[batch], mini->type); + ref = allocator_dec_ref(mini->al, + mini->next_extent[batch], + mini->types[batch]); platform_assert(ref == AL_FREE); } } @@ -567,7 +727,7 @@ mini_for_each_meta_page_func(cache *cc, uint64 num_meta_entries = mini_num_entries(meta_page); meta_entry *entry = first_entry(meta_page); for (uint64 i = 0; i < num_meta_entries; i++) { - fef->func(cc, type, entry->extent_addr, fef->arg); + fef->func(cc, entry->type, entry->extent_addr, fef->arg); entry = next_entry(entry); } } @@ -614,10 +774,13 @@ mini_dealloc_extent(cache *cc, page_type type, uint64 base_addr, void *out) { allocator *al = cache_get_allocator(cc); refcount ref = allocator_dec_ref(al, base_addr, type); - platform_assert(ref == AL_NO_REFS); - cache_extent_discard(cc, base_addr, type); - ref = allocator_dec_ref(al, base_addr, type); - platform_assert(ref == AL_FREE); + if (ref == AL_NO_REFS) { + cache_extent_discard(cc, base_addr, type); + ref = allocator_dec_ref(al, base_addr, type); + platform_assert(ref == AL_FREE); + } else { + platform_assert(ref != AL_FREE); + } } refcount @@ -724,7 +887,10 @@ mini_print(cache *cc, uint64 meta_head, page_type type) uint64 num_entries = mini_num_entries(meta_page); meta_entry *entry = first_entry(meta_page); for (uint64 i = 0; i < num_entries; i++) { - platform_default_log("| %3lu | %35lu |\n", i, entry->extent_addr); + platform_default_log("| %3lu | %35lu | %s\n", + i, + entry->extent_addr, + page_type_str[entry->type]); entry = next_entry(entry); } platform_default_log("|-------------------------------------------|\n"); diff --git a/src/mini_allocator.h b/src/mini_allocator.h index 9a2ac6bd..f130ca78 100644 --- a/src/mini_allocator.h +++ b/src/mini_allocator.h @@ -9,9 +9,9 @@ * * The purpose of the mini allocator is to allocate pages from extents * and to maintain a list of allocated extents for future bulk operations, - * such as reference counting and deallocation. Keyed mini allocators - * further associate a key range to each extent, so that these bulk - * operations can be restricted to given key ranges. + * such as reference counting and deallocation. A single mini allocator can + * manage batches with different page types; the metadata records each + * extent's page type so bulk operations can deallocate mixed trees/blobs. */ #pragma once @@ -26,7 +26,7 @@ * extents. This batch-size is somewhat of an artificial limit to manage this * contiguity. */ -#define MINI_MAX_BATCHES 8 +#define MINI_MAX_BATCHES 16 /* * mini_allocator: Mini-allocator context. @@ -37,11 +37,13 @@ typedef struct mini_allocator { bool32 pinned; uint64 meta_head; volatile uint64 meta_tail; - page_type type; + page_type meta_type; + page_type types[MINI_MAX_BATCHES]; uint64 num_extents; uint64 num_batches; volatile uint64 next_addr[MINI_MAX_BATCHES]; + uint64 saved_next_addr[MINI_MAX_BATCHES]; uint64 next_extent[MINI_MAX_BATCHES]; } mini_allocator; @@ -52,12 +54,44 @@ mini_init(mini_allocator *mini, uint64 meta_tail, uint64 num_batches, page_type type); + +uint64 +mini_init_with_types(mini_allocator *mini, + cache *cc, + uint64 meta_head, + uint64 meta_tail, + uint64 num_batches, + page_type meta_type, + const page_type *types); void mini_release(mini_allocator *mini); uint64 mini_alloc(mini_allocator *mini, uint64 batch, uint64 *next_extent); +platform_status +mini_alloc_bytes(mini_allocator *mini, + uint64 batch, + uint64 num_bytes, + uint64 alignment, + uint64 boundary, + uint64 addrs[2], + uint64 *next_extent); + +void +mini_alloc_bytes_finish(mini_allocator *mini, uint64 batch); + +uint64 +mini_alloc_page(mini_allocator *mini, uint64 batch, uint64 *next_extent); + +uint64 +mini_alloc_extent(mini_allocator *mini, uint64 batch, uint64 *next_extent); + +platform_status +mini_attach_extent(mini_allocator *mini, uint64 batch, uint64 addr); + +uint64 +mini_next_addr(mini_allocator *mini, uint64 batch); refcount mini_inc_ref(cache *cc, uint64 meta_head); diff --git a/src/shard_log.c b/src/shard_log.c index 2193dec0..3fa9abc1 100644 --- a/src/shard_log.c +++ b/src/shard_log.c @@ -10,6 +10,7 @@ */ #include "shard_log.h" +#include "data_blob_build.h" #include "data_internal.h" #include "platform_sleep.h" #include "platform_hash.h" @@ -58,6 +59,11 @@ const static iterator_ops shard_log_iterator_ops = { .print = NULL, }; +static const page_type shard_log_page_type_table[NUM_BLOB_BATCHES + 1] = { + PAGE_TYPE_LOG, + [1 ... NUM_BLOB_BATCHES] = PAGE_TYPE_BLOB, +}; + static inline uint64 shard_log_page_size(shard_log_config *cfg) { @@ -86,7 +92,7 @@ shard_log_get_thread_data(shard_log *log, threadid thr_id) page_handle * shard_log_alloc(shard_log *log, uint64 *next_extent) { - uint64 addr = mini_alloc(&log->mini, 0, next_extent); + uint64 addr = mini_alloc_page(&log->mini, 0, next_extent); return cache_alloc(log->cc, addr, PAGE_TYPE_LOG); } @@ -112,7 +118,13 @@ shard_log_init(shard_log *log, cache *cc, shard_log_config *cfg) thread_data->offset = 0; } - log->addr = mini_init(&log->mini, cc, log->meta_head, 0, 1, PAGE_TYPE_LOG); + log->addr = mini_init_with_types(&log->mini, + cc, + log->meta_head, + 0, + NUM_BLOB_BATCHES + 1, + PAGE_TYPE_LOG, + shard_log_page_type_table); // platform_default_log("addr: %lu meta_head: %lu\n", log->addr, // log->meta_head); @@ -152,10 +164,16 @@ log_entry_key(log_entry *le) return ondisk_tuple_key(&le->tuple); } +static bool32 +log_entry_message_isblob(log_entry *le) +{ + return ondisk_tuple_message_isblob(&le->tuple); +} + static message -log_entry_message(log_entry *le) +log_entry_message(cache *cc, log_entry *le) { - return ondisk_tuple_message(&le->tuple); + return ondisk_tuple_message(cc, &le->tuple); } static uint64 @@ -215,12 +233,40 @@ shard_log_write(log_handle *logh, key tuple_key, message msg, uint64 generation) shard_log *log = (shard_log *)logh; cache *cc = log->cc; + merge_accumulator log_blob; + bool32 log_blob_inited = FALSE; + + uint64 max_entry_size = + shard_log_page_size(log->cfg) - sizeof(shard_log_hdr); + if (message_isblob(msg) + || max_entry_size < log_entry_required_capacity(tuple_key, msg)) + { + merge_accumulator_init(&log_blob, platform_get_heap_id()); + platform_status rc; + if (message_isblob(msg)) { + rc = message_clone( + &log->cfg->blob_cfg, cc, &log->mini, msg, &log_blob); + } else { + rc = message_to_blob( + &log->cfg->blob_cfg, cc, &log->mini, msg, &log_blob); + } + if (!SUCCESS(rc)) { + merge_accumulator_deinit(&log_blob); + return rc.r; + } + msg = merge_accumulator_to_message(&log_blob); + log_blob_inited = TRUE; + } + shard_log_thread_data *thread_data = shard_log_get_thread_data(log, platform_get_tid()); page_handle *page; if (thread_data->addr == SHARD_UNMAPPED) { if (get_new_page_for_thread(log, thread_data, &page)) { + if (log_blob_inited) { + merge_accumulator_deinit(&log_blob); + } return -1; } } else { @@ -254,6 +300,9 @@ shard_log_write(log_handle *logh, key tuple_key, message msg, uint64 generation) cache_unget(cc, page); if (get_new_page_for_thread(log, thread_data, &page)) { + if (log_blob_inited) { + merge_accumulator_deinit(&log_blob); + } return -1; } cursor = (log_entry *)(page->data + thread_data->offset); @@ -272,6 +321,14 @@ shard_log_write(log_handle *logh, key tuple_key, message msg, uint64 generation) cache_unclaim(cc, page); cache_unget(cc, page); + if (log_blob_inited) { + platform_status rc = blob_sync(cc, message_slice(msg)); + merge_accumulator_deinit(&log_blob); + if (!SUCCESS(rc)) { + return rc.r; + } + } + return 0; } @@ -348,6 +405,7 @@ shard_log_iterator_init(cache *cc, memset(itor, 0, sizeof(shard_log_iterator)); itor->super.ops = &shard_log_iterator_ops; + itor->cc = cc; itor->cfg = cfg; allocator *al = cache_get_allocator(cc); @@ -433,7 +491,7 @@ shard_log_iterator_curr(iterator *itorh, key *curr_key, message *msg) { shard_log_iterator *itor = (shard_log_iterator *)itorh; *curr_key = log_entry_key(itor->entries[itor->pos]); - *msg = log_entry_message(itor->entries[itor->pos]); + *msg = log_entry_message(itor->cc, itor->entries[itor->pos]); } bool32 @@ -474,6 +532,12 @@ shard_log_config_init(shard_log_config *log_cfg, log_cfg->cache_cfg = cache_cfg; log_cfg->data_cfg = data_cfg; log_cfg->seed = HASH_SEED; + log_cfg->blob_cfg = (blob_build_config){ + .extent_batch = 1, + .page_batch = 2, + .subpage_batch = 3, + .alignment = cache_config_page_size(cache_cfg), + }; } void @@ -499,9 +563,10 @@ shard_log_print(shard_log *log) !terminal_log_entry(cfg, page->data, le); le = log_entry_next(le)) { - platform_default_log("%s -- %s : %lu\n", + platform_default_log("%s -- %s%s : %lu\n", key_string(dcfg, log_entry_key(le)), - message_string(dcfg, log_entry_message(le)), + log_entry_message_isblob(le) ? "(blob) " : "", + message_string(dcfg, log_entry_message(cc, le)), le->generation); } } diff --git a/src/shard_log.h b/src/shard_log.h index 0129e9d2..6459d78d 100644 --- a/src/shard_log.h +++ b/src/shard_log.h @@ -15,15 +15,17 @@ #include "cache.h" #include "iterator.h" #include "splinterdb/data.h" +#include "blob_build.h" #include "mini_allocator.h" /* * Configuration structure to set up the sharded log sub-system. */ typedef struct shard_log_config { - cache_config *cache_cfg; - data_config *data_cfg; - uint64 seed; + cache_config *cache_cfg; + data_config *data_cfg; + uint64 seed; + blob_build_config blob_cfg; // data config of point message tree } shard_log_config; @@ -50,6 +52,7 @@ typedef struct log_entry log_entry; typedef struct shard_log_iterator { iterator super; + cache *cc; shard_log_config *cfg; char *contents; log_entry **entries; diff --git a/src/splinterdb.c b/src/splinterdb.c index 9df80d85..9d88020e 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -535,8 +535,15 @@ splinterdb_lookup_result_value(const splinterdb_lookup_result *result, // IN return EINVAL; } - *value = - merge_accumulator_to_value(lookup_result_const_accumulator(_result)); + lookup_result *mutable_result = + lookup_result_from_splinterdb((splinterdb_lookup_result *)result); + platform_status status = + merge_accumulator_ensure_materialized(&mutable_result->value); + if (!SUCCESS(status)) { + return platform_status_to_int(status); + } + + *value = merge_accumulator_to_value(lookup_result_accumulator(mutable_result)); return 0; } @@ -651,6 +658,7 @@ struct splinterdb_iterator { core_range_iterator sri; platform_status last_rc; const splinterdb *parent; + merge_accumulator materialized_message; }; static inline bool32 @@ -710,6 +718,7 @@ splinterdb_iterator_init_with_bounds(splinterdb *kvs, // IN return platform_status_to_int(STATUS_NO_MEMORY); } it->last_rc = STATUS_OK; + merge_accumulator_init(&it->materialized_message, kvs->spl.heap_id); core_range_iterator *range_itor = &(it->sri); key min_key; @@ -743,6 +752,7 @@ splinterdb_iterator_init_with_bounds(splinterdb *kvs, // IN start_key, UINT64_MAX); if (!SUCCESS(rc)) { + merge_accumulator_deinit(&it->materialized_message); platform_free(kvs->spl.heap_id, it); return platform_status_to_int(rc); } @@ -759,6 +769,7 @@ splinterdb_iterator_deinit(splinterdb_iterator *iter) core_range_iterator *range_itor = &(iter->sri); core_range_iterator_deinit(range_itor); + merge_accumulator_deinit(&iter->materialized_message); core_handle *spl = range_itor->spl; platform_free(spl->heap_id, range_itor); @@ -837,8 +848,15 @@ splinterdb_iterator_get_current(splinterdb_iterator *iter, // IN iterator *itor = &(iter->sri.super); iterator_curr(itor, &result_key, &msg); - *value = message_slice(msg); *outkey = key_slice(result_key); + if (message_isblob(msg)) { + iter->last_rc = message_materialize(msg, &iter->materialized_message); + if (SUCCESS(iter->last_rc)) { + *value = merge_accumulator_to_value(&iter->materialized_message); + } + } else { + *value = message_slice(msg); + } } void diff --git a/tests/functional/log_test.c b/tests/functional/log_test.c index ee8ad0ad..912963f9 100644 --- a/tests/functional/log_test.c +++ b/tests/functional/log_test.c @@ -117,6 +117,70 @@ test_log_crash(clockcache *cc, return 0; } +static int +test_log_large_message(cache *cc, + shard_log_config *cfg, + shard_log *log, + platform_heap_id hid) +{ + platform_status rc; + shard_log_iterator itor; + iterator *itorh = (iterator *)&itor; + merge_accumulator msg; + key returned_key; + message returned_message; + char key_data[] = "large-log-key"; + key skey = + key_create(FALSE, sizeof(key_data) - 1, key_data); + uint64 value_len = 3 * cache_page_size(cc) + 123; + + rc = shard_log_init(log, cc, cfg); + platform_assert_status_ok(rc); + + merge_accumulator_init(&msg, hid); + bool32 success = merge_accumulator_resize(&msg, value_len); + platform_assert(success); + merge_accumulator_set_class(&msg, MESSAGE_TYPE_INSERT); + memset(merge_accumulator_data(&msg), 'L', value_len); + + int log_rc = log_write((log_handle *)log, + skey, + merge_accumulator_to_message(&msg), + 0); + platform_assert(log_rc == 0); + + merge_accumulator filler; + merge_accumulator_init(&filler, hid); + success = merge_accumulator_resize(&filler, cache_page_size(cc) / 4); + platform_assert(success); + merge_accumulator_set_class(&filler, MESSAGE_TYPE_INSERT); + memset(merge_accumulator_data(&filler), 'f', merge_accumulator_length(&filler)); + for (uint64 i = 1; i < 16; i++) { + log_rc = log_write((log_handle *)log, + skey, + merge_accumulator_to_message(&filler), + i); + platform_assert(log_rc == 0); + } + merge_accumulator_deinit(&filler); + + rc = shard_log_iterator_init( + cc, cfg, hid, log_addr((log_handle *)log), log_magic((log_handle *)log), &itor); + platform_assert_status_ok(rc); + platform_assert(iterator_can_curr(itorh)); + + iterator_curr(itorh, &returned_key, &returned_message); + platform_assert(data_key_compare(cfg->data_cfg, skey, returned_key) == 0); + platform_assert(message_lex_cmp(merge_accumulator_to_message(&msg), + returned_message) + == 0); + + shard_log_iterator_deinit(hid, &itor); + merge_accumulator_deinit(&msg); + shard_log_zap(log); + return 0; +} + typedef struct test_log_thread_params { shard_log *log; platform_thread thread; @@ -317,6 +381,9 @@ log_test(int argc, char *argv[]) shard_log *log = TYPED_MALLOC(hid, log); platform_assert(log != NULL); + rc = test_log_large_message((cache *)cc, &system_cfg.log_cfg, log, hid); + platform_assert(rc == 0); + if (run_perf_test) { ret = test_log_perf( (cache *)cc, &system_cfg.log_cfg, log, 200000000, &gen, 16, &ts, hid); diff --git a/tests/unit/btree_test.c b/tests/unit/btree_test.c index 14db5c5c..3c082f94 100644 --- a/tests/unit/btree_test.c +++ b/tests/unit/btree_test.c @@ -51,7 +51,8 @@ btree_leaf_incorporate_tuple(const btree_config *cfg, uint64 *generation) { platform_status rc = - btree_create_leaf_incorporate_spec(cfg, hid, hdr, tuple_key, msg, spec); + btree_create_leaf_incorporate_spec( + cfg, NULL, NULL, hid, hdr, tuple_key, msg, spec); ASSERT_TRUE(SUCCESS(rc)); return btree_try_perform_leaf_incorporate_spec(cfg, hdr, spec, generation); } @@ -206,7 +207,7 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) int cmp_rv = 0; for (uint32 i = 0; i < nkvs; i++) { key tuple_key = btree_get_tuple_key(cfg, hdr, i); - message msg = btree_get_tuple_message(cfg, hdr, i); + message msg = btree_get_tuple_message(cfg, NULL, hdr, i); cmp_rv = data_key_compare( cfg->data_cfg, key_create(FALSE, i % sizeof(i), &i), tuple_key); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte key %d\n", i); @@ -231,7 +232,7 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) cmp_rv = 0; for (uint64 i = 0; i < nkvs; i++) { key tuple_key = btree_get_tuple_key(cfg, hdr, i); - message msg = btree_get_tuple_message(cfg, hdr, i); + message msg = btree_get_tuple_message(cfg, NULL, hdr, i); cmp_rv = data_key_compare( cfg->data_cfg, key_create(FALSE, i % sizeof(i), &i), tuple_key); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte key %d\n", i); @@ -247,7 +248,7 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) for (uint64 i = 0; i < nkvs; i++) { key tuple_key = btree_get_tuple_key(cfg, hdr, i); - message msg = btree_get_tuple_message(cfg, hdr, i); + message msg = btree_get_tuple_message(cfg, NULL, hdr, i); cmp_rv = data_key_compare( cfg->data_cfg, key_create(FALSE, i % sizeof(i), &i), tuple_key); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte key %d\n", i); @@ -447,7 +448,7 @@ leaf_split_tests(btree_config *cfg, cfg, hid, hdr, tuple_key, bigger_msg, &spec, &generation); if (success) { btree_print_locked_node( - Platform_error_log_handle, cfg, 0, hdr, PAGE_TYPE_MEMTABLE); + Platform_error_log_handle, cfg, NULL, 0, hdr, PAGE_TYPE_MEMTABLE); ASSERT_FALSE(success, "Weird. An incorporate that was supposed to fail " "actually succeeded (nkvs=%d, realnkvs=%d, i=%d).\n", @@ -455,7 +456,7 @@ leaf_split_tests(btree_config *cfg, realnkvs, i); btree_print_locked_node( - Platform_error_log_handle, cfg, 0, hdr, PAGE_TYPE_MEMTABLE); + Platform_error_log_handle, cfg, NULL, 0, hdr, PAGE_TYPE_MEMTABLE); ASSERT_FALSE(success); } leaf_splitting_plan plan = diff --git a/tests/unit/splinterdb_quick_test.c b/tests/unit/splinterdb_quick_test.c index 3711a149..22c5cdea 100644 --- a/tests/unit/splinterdb_quick_test.c +++ b/tests/unit/splinterdb_quick_test.c @@ -337,15 +337,11 @@ CTEST2(splinterdb_quick, test_key_size_gt_max_key_size) } /* - * Test case to verify core interfaces when value-size is > max value-size. - * Here, we basically exercise the insert interface, which will trip up - * if very large values are supplied. (Once insert fails, there is - * no further need to verify the other interfaces for very-large-values.) + * Test case to verify core interfaces when value-size is larger than a page. */ CTEST2(splinterdb_quick, test_value_size_gt_max_value_size) { - size_t too_large_value_len = - MAX_INLINE_MESSAGE_SIZE(IO_DEFAULT_PAGE_SIZE) + 1; + size_t too_large_value_len = 3 * IO_DEFAULT_PAGE_SIZE + 123; char *too_large_value_data; too_large_value_data = TYPED_ARRAY_MALLOC( data->cfg.heap_id, too_large_value_data, too_large_value_len); @@ -355,8 +351,40 @@ CTEST2(splinterdb_quick, test_value_size_gt_max_value_size) int rc = splinterdb_insert( data->kvsb, slice_create(sizeof("foo"), "foo"), too_large_value, NULL); + ASSERT_EQUAL(0, rc); - ASSERT_EQUAL(EINVAL, rc); + splinterdb_lookup_result result; + splinterdb_lookup_result_init( + data->kvsb, &result, SPLINTERDB_LOOKUP_VALUE, 0, NULL); + + rc = splinterdb_lookup(data->kvsb, slice_create(sizeof("foo"), "foo"), &result); + ASSERT_EQUAL(0, rc); + ASSERT_TRUE(splinterdb_lookup_found(&result)); + + slice value; + rc = splinterdb_lookup_result_value(&result, &value); + ASSERT_EQUAL(0, rc); + ASSERT_EQUAL(too_large_value_len, slice_length(value)); + ASSERT_EQUAL(0, slice_lex_cmp(too_large_value, value)); + + splinterdb_lookup_result_deinit(&result); + + splinterdb_close(&data->kvsb); + rc = splinterdb_open(&data->cfg, &data->kvsb); + ASSERT_EQUAL(0, rc); + + splinterdb_lookup_result_init( + data->kvsb, &result, SPLINTERDB_LOOKUP_VALUE, 0, NULL); + rc = splinterdb_lookup(data->kvsb, slice_create(sizeof("foo"), "foo"), &result); + ASSERT_EQUAL(0, rc); + ASSERT_TRUE(splinterdb_lookup_found(&result)); + + rc = splinterdb_lookup_result_value(&result, &value); + ASSERT_EQUAL(0, rc); + ASSERT_EQUAL(too_large_value_len, slice_length(value)); + ASSERT_EQUAL(0, slice_lex_cmp(too_large_value, value)); + + splinterdb_lookup_result_deinit(&result); platform_free(data->cfg.heap_id, too_large_value_data); } From ca7c59a5d62407892bbb2bd9d85a1c53d6d590d3 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 10:44:47 -0700 Subject: [PATCH 02/13] initial port of old indirection support Signed-off-by: Rob Johnson --- src/blob_build.c | 290 +++++++++++++++++++++++++++++++++++++++++++++++ src/blob_build.h | 30 +++++ 2 files changed, 320 insertions(+) create mode 100644 src/blob_build.c create mode 100644 src/blob_build.h diff --git a/src/blob_build.c b/src/blob_build.c new file mode 100644 index 00000000..f1469839 --- /dev/null +++ b/src/blob_build.c @@ -0,0 +1,290 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +#include "blob_build.h" +#include "poison.h" + +static platform_status +allocate_leftover_entries(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + uint64 data_len, + uint64 remainder, + writable_buffer *result) +{ + uint64 page_size = cache_page_size(cc); + uint64 extent_size = cache_extent_size(cc); + + uint64 num_pages; + if (can_round_up(page_size, data_len)) { + num_pages = (remainder + page_size - 1) / page_size; + } else { + num_pages = remainder / page_size; + } + + uint64 alloced_pages[2]; + platform_status rc; + if (num_pages != 0) { + rc = mini_alloc_bytes(mini, + cfg->page_batch, + num_pages * page_size, + MAX(page_size, cfg->alignment), + 0, + alloced_pages, + NULL); + if (!SUCCESS(rc)) { + return rc; + } + mini_alloc_bytes_finish(mini, cfg->page_batch); + + if (alloced_pages[1] != 0) { + rc = writable_buffer_append( + result, sizeof(alloced_pages), &alloced_pages); + } else { + rc = writable_buffer_append( + result, sizeof(alloced_pages[0]), &alloced_pages[0]); + } + if (!SUCCESS(rc)) { + return rc; + } + + if (remainder < num_pages * page_size) { + remainder = 0; + } else { + remainder -= num_pages * page_size; + } + } + + if (remainder != 0) { + rc = mini_alloc_bytes(mini, + cfg->subpage_batch, + remainder, + cfg->alignment, + extent_size, + alloced_pages, + NULL); + if (!SUCCESS(rc)) { + return rc; + } + platform_assert(alloced_pages[1] == 0); + + if (alloced_pages[0] % page_size == 0) { + page_handle *page = cache_alloc(cc, alloced_pages[0], PAGE_TYPE_BLOB); + cache_unlock(cc, page); + cache_unclaim(cc, page); + cache_unget(cc, page); + } else if (alloced_pages[0] / page_size + != (alloced_pages[0] + remainder - 1) / page_size) + { + uint64 next_page_addr = + (alloced_pages[0] + remainder - 1) / page_size * page_size; + page_handle *page = cache_alloc(cc, next_page_addr, PAGE_TYPE_BLOB); + cache_unlock(cc, page); + cache_unclaim(cc, page); + cache_unget(cc, page); + } + + mini_alloc_bytes_finish(mini, cfg->subpage_batch); + rc = writable_buffer_append( + result, sizeof(alloced_pages[0]), &alloced_pages[0]); + if (!SUCCESS(rc)) { + return rc; + } + } + + return STATUS_OK; +} + +static platform_status +build_blob_table(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + uint64 data_len, + writable_buffer *result) +{ + uint64 extent_size = cache_extent_size(cc); + uint64 num_extents; + uint64 remainder; + + if (can_round_up(extent_size, data_len)) { + num_extents = (data_len + extent_size - 1) / extent_size; + remainder = 0; + } else { + num_extents = data_len / extent_size; + remainder = data_len - num_extents * extent_size; + } + + platform_status rc = + writable_buffer_resize(result, sizeof(blob) + num_extents * sizeof(uint64)); + if (!SUCCESS(rc)) { + return rc; + } + + blob *blobby = writable_buffer_data(result); + blobby->length = data_len; + + for (uint64 i = 0; i < num_extents; i++) { + uint64 alloced_page = mini_alloc_extent(mini, cfg->extent_batch, NULL); + debug_assert(alloced_page != 0); + debug_assert(alloced_page != 1); + blobby->addrs[i] = alloced_page; + } + + return allocate_leftover_entries(cfg, cc, mini, data_len, remainder, result); +} + +platform_status +blob_build(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + slice data, + writable_buffer *result) +{ + platform_status rc = + build_blob_table(cfg, cc, mini, slice_length(data), result); + if (!SUCCESS(rc)) { + return rc; + } + + blob_page_iterator iter; + rc = blob_page_iterator_init(cc, + &iter, + writable_buffer_to_slice(result), + 0, + BLOB_PAGE_ITERATOR_MODE_ALLOC); + if (!SUCCESS(rc)) { + return rc; + } + + const char *raw_data = slice_data(data); + while (!blob_page_iterator_at_end(&iter)) { + uint64 offset; + slice dst; + rc = blob_page_iterator_get_curr(&iter, &offset, &dst); + if (!SUCCESS(rc)) { + goto out; + } + + debug_assert(offset + slice_length(dst) <= slice_length(data)); + memcpy(iter.page->data + iter.fragment.offset, + raw_data + offset, + slice_length(dst)); + + blob_page_iterator_advance_page(&iter); + } + +out: + blob_page_iterator_deinit(&iter); + return rc; +} + +static platform_status +clone_blob_table(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + parsed_blob *pblobby, + writable_buffer *result) +{ + platform_status rc = writable_buffer_resize( + result, sizeof(blob) + pblobby->num_extents * sizeof(uint64)); + if (!SUCCESS(rc)) { + return rc; + } + + blob *blobby = writable_buffer_data(result); + blobby->length = pblobby->base->length; + + for (uint64 i = 0; i < pblobby->num_extents; i++) { + blobby->addrs[i] = pblobby->base->addrs[i]; + rc = mini_attach_extent(mini, cfg->extent_batch, blobby->addrs[i]); + if (!SUCCESS(rc)) { + return rc; + } + } + + uint64 extent_size = cache_extent_size(cc); + if (pblobby->num_extents * extent_size < pblobby->base->length) { + uint64 remainder = + pblobby->base->length - pblobby->num_extents * extent_size; + rc = allocate_leftover_entries( + cfg, cc, mini, pblobby->base->length, remainder, result); + } + + return rc; +} + +platform_status +blob_clone(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + slice sblob, + writable_buffer *result) +{ + uint64 extent_size = cache_extent_size(cc); + uint64 page_size = cache_page_size(cc); + const blob *blobby = slice_data(sblob); + parsed_blob pblobby; + + parse_blob(extent_size, page_size, blobby, &pblobby); + + platform_status rc = clone_blob_table(cfg, cc, mini, &pblobby, result); + if (!SUCCESS(rc)) { + return rc; + } + + if (pblobby.num_extents * extent_size < pblobby.base->length) { + uint64 start = pblobby.num_extents * extent_size; + + blob_page_iterator src_iter; + blob_page_iterator dst_iter; + rc = blob_page_iterator_init( + cc, &src_iter, sblob, start, BLOB_PAGE_ITERATOR_MODE_PREFETCH); + if (!SUCCESS(rc)) { + return rc; + } + rc = blob_page_iterator_init(cc, + &dst_iter, + writable_buffer_to_slice(result), + start, + BLOB_PAGE_ITERATOR_MODE_ALLOC); + if (!SUCCESS(rc)) { + blob_page_iterator_deinit(&src_iter); + return rc; + } + + while (!blob_page_iterator_at_end(&src_iter) + && !blob_page_iterator_at_end(&dst_iter)) + { + uint64 src_offset; + slice src_data; + uint64 dst_offset; + slice dst_data; + rc = blob_page_iterator_get_curr(&src_iter, &src_offset, &src_data); + if (!SUCCESS(rc)) { + goto out; + } + rc = blob_page_iterator_get_curr(&dst_iter, &dst_offset, &dst_data); + if (!SUCCESS(rc)) { + goto out; + } + + platform_assert(src_offset == dst_offset); + uint64 amount_to_copy = MIN(slice_length(src_data), slice_length(dst_data)); + memcpy(dst_iter.page->data + dst_iter.fragment.offset, + slice_data(src_data), + amount_to_copy); + + blob_page_iterator_advance_bytes(&src_iter, amount_to_copy); + blob_page_iterator_advance_bytes(&dst_iter, amount_to_copy); + } + + debug_assert(blob_page_iterator_at_end(&src_iter)); + debug_assert(blob_page_iterator_at_end(&dst_iter)); + + out: + blob_page_iterator_deinit(&src_iter); + blob_page_iterator_deinit(&dst_iter); + } + + return rc; +} diff --git a/src/blob_build.h b/src/blob_build.h new file mode 100644 index 00000000..ad21457a --- /dev/null +++ b/src/blob_build.h @@ -0,0 +1,30 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "blob.h" +#include "mini_allocator.h" + +#define NUM_BLOB_BATCHES (3) + +typedef struct blob_build_config { + uint64 extent_batch; + uint64 page_batch; + uint64 subpage_batch; + uint64 alignment; +} blob_build_config; + +platform_status +blob_build(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + slice data, + writable_buffer *result); + +platform_status +blob_clone(const blob_build_config *cfg, + cache *cc, + mini_allocator *mini, + slice sblob, + writable_buffer *result); From b32988ae4496b8954c01755d1f4450a37e6d3255 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 10:53:02 -0700 Subject: [PATCH 03/13] formatting Signed-off-by: Rob Johnson --- include/splinterdb/data.h | 4 +-- src/blob.c | 25 ++++++++----- src/blob.h | 15 +++++--- src/blob_build.c | 7 ++-- src/btree.c | 56 ++++++++++++++++-------------- src/data_internal.h | 10 +++--- src/mini_allocator.c | 25 +++++++------ src/shard_log.c | 35 ++++++++++--------- src/splinterdb.c | 3 +- tests/functional/log_test.c | 36 +++++++++---------- tests/unit/btree_test.c | 5 ++- tests/unit/splinterdb_quick_test.c | 8 +++-- 12 files changed, 125 insertions(+), 104 deletions(-) diff --git a/include/splinterdb/data.h b/include/splinterdb/data.h index 37ba8b8a..058b8d87 100644 --- a/include/splinterdb/data.h +++ b/include/splinterdb/data.h @@ -50,8 +50,8 @@ typedef enum message_type { typedef struct message { message_type type; // Internal: non-NULL means data is a blob descriptor. - void *cc; - slice data; + void *cc; + slice data; } message; static inline message_type diff --git a/src/blob.c b/src/blob.c index 44b04772..835a47ad 100644 --- a/src/blob.c +++ b/src/blob.c @@ -20,13 +20,16 @@ can_round_up(uint64 rounded_size, uint64 length) } void -parse_blob(uint64 extent_size, uint64 page_size, const blob *blobby, parsed_blob *pblobby) +parse_blob(uint64 extent_size, + uint64 page_size, + const blob *blobby, + parsed_blob *pblobby) { pblobby->base = blobby; uint64 remainder = blobby->length; if (can_round_up(extent_size, remainder)) { - pblobby->num_extents = (remainder + extent_size - 1) / extent_size; + pblobby->num_extents = (remainder + extent_size - 1) / extent_size; pblobby->leftovers[0].length = 0; return; } @@ -75,7 +78,7 @@ fragment_for_offset(uint64 extent_size, uint64 offset, page_fragment *fragment) { - uint64 byte_addr = 0; + uint64 byte_addr = 0; uint64 entry_remainder = 0; debug_assert(offset < pblobby->base->length); @@ -189,17 +192,19 @@ blob_page_iterator_get_curr(blob_page_iterator *iter, { if (iter->page == NULL) { if (should_alloc(iter)) { - iter->page = cache_alloc(iter->cc, iter->fragment.addr, PAGE_TYPE_BLOB); + iter->page = + cache_alloc(iter->cc, iter->fragment.addr, PAGE_TYPE_BLOB); } else { - iter->page = cache_get(iter->cc, iter->fragment.addr, TRUE, PAGE_TYPE_BLOB); + iter->page = + cache_get(iter->cc, iter->fragment.addr, TRUE, PAGE_TYPE_BLOB); if (iter->mode == BLOB_PAGE_ITERATOR_MODE_ALLOC) { uint64 wait = 1; while (!cache_try_claim(iter->cc, iter->page)) { cache_unget(iter->cc, iter->page); platform_sleep_ns(wait); wait = MIN(2 * wait, 2048); - iter->page = - cache_get(iter->cc, iter->fragment.addr, TRUE, PAGE_TYPE_BLOB); + iter->page = cache_get( + iter->cc, iter->fragment.addr, TRUE, PAGE_TYPE_BLOB); } cache_lock(iter->cc, iter->page); cache_mark_dirty(iter->cc, iter->page); @@ -242,7 +247,11 @@ blob_page_iterator_advance_page(blob_page_iterator *iter) } platform_status -blob_materialize(cache *cc, slice sblobby, uint64 start, uint64 end, writable_buffer *result) +blob_materialize(cache *cc, + slice sblobby, + uint64 start, + uint64 end, + writable_buffer *result) { const blob *blobby = slice_data(sblobby); diff --git a/src/blob.h b/src/blob.h index 088e7f69..7c737a74 100644 --- a/src/blob.h +++ b/src/blob.h @@ -17,8 +17,8 @@ typedef struct parsed_blob_entry { } parsed_blob_entry; typedef struct parsed_blob { - const blob *base; - uint64 num_extents; + const blob *base; + uint64 num_extents; parsed_blob_entry leftovers[3]; } parsed_blob; @@ -50,7 +50,10 @@ bool can_round_up(uint64 rounded_size, uint64 length); void -parse_blob(uint64 extent_size, uint64 page_size, const blob *blobby, parsed_blob *pblobby); +parse_blob(uint64 extent_size, + uint64 page_size, + const blob *blobby, + parsed_blob *pblobby); uint64 blob_length(slice sblob); @@ -80,7 +83,11 @@ void blob_page_iterator_advance_page(blob_page_iterator *iter); platform_status -blob_materialize(cache *cc, slice sblob, uint64 start, uint64 end, writable_buffer *result); +blob_materialize(cache *cc, + slice sblob, + uint64 start, + uint64 end, + writable_buffer *result); static inline platform_status blob_materialize_full(cache *cc, slice sblob, writable_buffer *result) diff --git a/src/blob_build.c b/src/blob_build.c index f1469839..c97aff82 100644 --- a/src/blob_build.c +++ b/src/blob_build.c @@ -114,8 +114,8 @@ build_blob_table(const blob_build_config *cfg, remainder = data_len - num_extents * extent_size; } - platform_status rc = - writable_buffer_resize(result, sizeof(blob) + num_extents * sizeof(uint64)); + platform_status rc = writable_buffer_resize( + result, sizeof(blob) + num_extents * sizeof(uint64)); if (!SUCCESS(rc)) { return rc; } @@ -269,7 +269,8 @@ blob_clone(const blob_build_config *cfg, } platform_assert(src_offset == dst_offset); - uint64 amount_to_copy = MIN(slice_length(src_data), slice_length(dst_data)); + uint64 amount_to_copy = + MIN(slice_length(src_data), slice_length(dst_data)); memcpy(dst_iter.page->data + dst_iter.fragment.offset, slice_data(src_data), amount_to_copy); diff --git a/src/btree.c b/src/btree.c index 3169e4c4..fbe66c02 100644 --- a/src/btree.c +++ b/src/btree.c @@ -172,9 +172,8 @@ leaf_entry_message_size(const leaf_entry *entry) if (!ondisk_tuple_message_isblob(entry)) { return entry->message_length; } - slice sblob = - slice_create(entry->message_length, - entry->key_and_message + entry->key_length); + slice sblob = slice_create(entry->message_length, + entry->key_and_message + entry->key_length); return blob_length(sblob); } @@ -412,10 +411,9 @@ btree_copy_leaf_entry(const btree_config *cfg, table_index k, const leaf_entry *entry) { - key entry_key = ondisk_tuple_key(entry); - message entry_msg = - ondisk_tuple_message(ondisk_tuple_message_isblob(entry) ? (cache *)1 : NULL, - entry); + key entry_key = ondisk_tuple_key(entry); + message entry_msg = ondisk_tuple_message( + ondisk_tuple_message_isblob(entry) ? (cache *)1 : NULL, entry); return btree_set_leaf_entry(cfg, hdr, k, entry_key, entry_msg); } @@ -652,8 +650,8 @@ btree_create_leaf_incorporate_spec(const btree_config *cfg, message oldmessage = leaf_entry_message(cc, entry); bool32 success; spec->use_msg = FALSE; - success = merge_accumulator_init_from_message( - &spec->msg.ma, heap_id, msg); + success = + merge_accumulator_init_from_message(&spec->msg.ma, heap_id, msg); if (!success) { return STATUS_NO_MEMORY; } @@ -688,11 +686,8 @@ btree_can_perform_leaf_incorporate_spec(const btree_config *cfg, const leaf_incorporate_spec *spec) { if (spec->old_entry_state == ENTRY_DID_NOT_EXIST) { - return btree_can_set_leaf_entry(cfg, - hdr, - btree_num_entries(hdr), - spec->tuple_key, - spec_message(spec)); + return btree_can_set_leaf_entry( + cfg, hdr, btree_num_entries(hdr), spec->tuple_key, spec_message(spec)); } else if (spec->old_entry_state == ENTRY_STILL_EXISTS) { return btree_can_set_leaf_entry( cfg, hdr, spec->idx, spec->tuple_key, spec_message(spec)); @@ -709,7 +704,7 @@ btree_try_perform_leaf_incorporate_spec(const btree_config *cfg, const leaf_incorporate_spec *spec, uint64 *generation) { - bool32 success; + bool32 success; message msg = spec_message(spec); switch (spec->old_entry_state) { case ENTRY_DID_NOT_EXIST: @@ -1232,13 +1227,13 @@ btree_root_to_meta_addr(const btree_config *cfg, #define BTREE_NUM_MINI_BATCHES (NUM_BLOB_BATCHES + BTREE_MAX_HEIGHT) static const page_type memtable_page_type_table[BTREE_NUM_MINI_BATCHES] = { - [0 ... NUM_BLOB_BATCHES - 1] = PAGE_TYPE_BLOB, - [NUM_BLOB_BATCHES ... BTREE_NUM_MINI_BATCHES - 1] = PAGE_TYPE_MEMTABLE, + [0 ... NUM_BLOB_BATCHES - 1] = PAGE_TYPE_BLOB, + [NUM_BLOB_BATCHES... BTREE_NUM_MINI_BATCHES - 1] = PAGE_TYPE_MEMTABLE, }; static const page_type branch_page_type_table[BTREE_NUM_MINI_BATCHES] = { - [0 ... NUM_BLOB_BATCHES - 1] = PAGE_TYPE_BLOB, - [NUM_BLOB_BATCHES ... BTREE_NUM_MINI_BATCHES - 1] = PAGE_TYPE_BRANCH, + [0 ... NUM_BLOB_BATCHES - 1] = PAGE_TYPE_BLOB, + [NUM_BLOB_BATCHES... BTREE_NUM_MINI_BATCHES - 1] = PAGE_TYPE_BRANCH, }; uint64 @@ -1833,7 +1828,8 @@ btree_insert(cache *cc, // IN } btree_node_lock(cc, cfg, &root_node); if (btree_can_perform_leaf_incorporate_spec(cfg, root_node.hdr, &spec)) { - rc = btree_record_old_result(cfg, cc, root_node.hdr, &spec, old_result); + rc = + btree_record_old_result(cfg, cc, root_node.hdr, &spec, old_result); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &root_node); destroy_leaf_incorporate_spec(&spec); @@ -2492,8 +2488,8 @@ btree_iterator_curr(iterator *base_itor, key *curr_key, message *data) cache_validate_page(itor->cc, itor->curr.page, itor->curr.addr); if (itor->curr.hdr->height == 0) { *curr_key = btree_get_tuple_key(itor->cfg, itor->curr.hdr, itor->idx); - *data = - btree_get_tuple_message(itor->cfg, itor->cc, itor->curr.hdr, itor->idx); + *data = btree_get_tuple_message( + itor->cfg, itor->cc, itor->curr.hdr, itor->idx); log_trace_key(*curr_key, "btree_iterator_get_curr"); } else { index_entry *entry = @@ -4213,8 +4209,12 @@ btree_verify_node(cache *cc, platform_error_log("child tuple larger than parent bound\n"); platform_error_log("addr: %lu idx %u\n", node.addr, idx); platform_error_log("child addr: %lu idx %u\n", child.addr, idx); - btree_print_locked_node( - Platform_error_log_handle, cfg, cc, node.addr, node.hdr, type); + btree_print_locked_node(Platform_error_log_handle, + cfg, + cc, + node.addr, + node.hdr, + type); btree_print_locked_node(Platform_error_log_handle, cfg, cc, @@ -4239,8 +4239,12 @@ btree_verify_node(cache *cc, platform_error_log("child pivot larger than parent bound\n"); platform_error_log("addr: %lu idx %u\n", node.addr, idx); platform_error_log("child addr: %lu idx %u\n", child.addr, idx); - btree_print_locked_node( - Platform_error_log_handle, cfg, cc, node.addr, node.hdr, type); + btree_print_locked_node(Platform_error_log_handle, + cfg, + cc, + node.addr, + node.hdr, + type); btree_print_locked_node(Platform_error_log_handle, cfg, cc, diff --git a/src/data_internal.h b/src/data_internal.h index 4c37a89e..1db32d57 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -583,9 +583,8 @@ merge_accumulator_isblob(const merge_accumulator *ma) static inline message merge_accumulator_to_message(const merge_accumulator *ma) { - return message_create_blob(ma->type, - ma->cc, - writable_buffer_to_slice(&ma->data)); + return message_create_blob( + ma->type, ma->cc, writable_buffer_to_slice(&ma->data)); } static inline slice @@ -641,9 +640,8 @@ merge_accumulator_ensure_materialized(merge_accumulator *ma) writable_buffer materialized; writable_buffer_init(&materialized, ma->data.heap_id); - platform_status rc = blob_materialize_full(ma->cc, - writable_buffer_to_slice(&ma->data), - &materialized); + platform_status rc = blob_materialize_full( + ma->cc, writable_buffer_to_slice(&ma->data), &materialized); if (!SUCCESS(rc)) { writable_buffer_deinit(&materialized); return rc; diff --git a/src/mini_allocator.c b/src/mini_allocator.c index df4f46b3..4a4a6170 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -156,7 +156,9 @@ mini_full_unlock_meta_page(mini_allocator *mini, page_handle *meta_page) * update our bookkeeping. */ static platform_status -mini_allocator_get_new_extent(mini_allocator *mini, page_type type, uint64 *addr) +mini_allocator_get_new_extent(mini_allocator *mini, + page_type type, + uint64 *addr) { platform_status rc = allocator_alloc(mini->al, addr, type); if (SUCCESS(rc)) { @@ -404,8 +406,8 @@ mini_append_entry(mini_allocator *mini, uint64 batch, uint64 next_addr) mini_full_unlock_meta_page(mini, last_meta_page); mini_init_meta_page(mini, meta_page); - success = - mini_append_entry_to_page(mini, meta_page, next_addr, mini->types[batch]); + success = mini_append_entry_to_page( + mini, meta_page, next_addr, mini->types[batch]); if (mini->pinned) { cache_pin(mini->cc, meta_page); @@ -443,7 +445,7 @@ mini_alloc(mini_allocator *mini, uint64 batch, uint64 *next_extent) // need to allocate the next extent uint64 extent_addr = mini->next_extent[batch]; - platform_status rc = mini_allocator_get_new_extent( + platform_status rc = mini_allocator_get_new_extent( mini, mini->types[batch], &mini->next_extent[batch]); platform_assert_status_ok(rc); next_addr = extent_addr; @@ -543,7 +545,7 @@ mini_alloc_page(mini_allocator *mini, uint64 batch, uint64 *next_extent) { uint64 addrs[2] = {0, 0}; uint64 page_size = cache_page_size(mini->cc); - platform_status rc = mini_alloc_bytes( + platform_status rc = mini_alloc_bytes( mini, batch, page_size, page_size, 0, addrs, next_extent); if (!SUCCESS(rc)) { return 0; @@ -558,7 +560,7 @@ mini_alloc_extent(mini_allocator *mini, uint64 batch, uint64 *next_extent) { uint64 addrs[2] = {0, 0}; uint64 extent_size = cache_extent_size(mini->cc); - platform_status rc = mini_alloc_bytes( + platform_status rc = mini_alloc_bytes( mini, batch, extent_size, extent_size, 0, addrs, next_extent); if (!SUCCESS(rc)) { return 0; @@ -614,14 +616,11 @@ mini_release(mini_allocator *mini) { for (uint64 batch = 0; batch < mini->num_batches; batch++) { // Dealloc the next extent - refcount ref = - allocator_dec_ref(mini->al, - mini->next_extent[batch], - mini->types[batch]); + refcount ref = allocator_dec_ref( + mini->al, mini->next_extent[batch], mini->types[batch]); platform_assert(ref == AL_NO_REFS); - ref = allocator_dec_ref(mini->al, - mini->next_extent[batch], - mini->types[batch]); + ref = allocator_dec_ref( + mini->al, mini->next_extent[batch], mini->types[batch]); platform_assert(ref == AL_FREE); } } diff --git a/src/shard_log.c b/src/shard_log.c index 3fa9abc1..35fb9700 100644 --- a/src/shard_log.c +++ b/src/shard_log.c @@ -231,10 +231,10 @@ shard_log_write(log_handle *logh, key tuple_key, message msg, uint64 generation) { debug_assert(key_is_user_key(tuple_key)); - shard_log *log = (shard_log *)logh; - cache *cc = log->cc; - merge_accumulator log_blob; - bool32 log_blob_inited = FALSE; + shard_log *log = (shard_log *)logh; + cache *cc = log->cc; + merge_accumulator log_blob; + bool32 log_blob_inited = FALSE; uint64 max_entry_size = shard_log_page_size(log->cfg) - sizeof(shard_log_hdr); @@ -244,8 +244,8 @@ shard_log_write(log_handle *logh, key tuple_key, message msg, uint64 generation) merge_accumulator_init(&log_blob, platform_get_heap_id()); platform_status rc; if (message_isblob(msg)) { - rc = message_clone( - &log->cfg->blob_cfg, cc, &log->mini, msg, &log_blob); + rc = + message_clone(&log->cfg->blob_cfg, cc, &log->mini, msg, &log_blob); } else { rc = message_to_blob( &log->cfg->blob_cfg, cc, &log->mini, msg, &log_blob); @@ -254,7 +254,7 @@ shard_log_write(log_handle *logh, key tuple_key, message msg, uint64 generation) merge_accumulator_deinit(&log_blob); return rc.r; } - msg = merge_accumulator_to_message(&log_blob); + msg = merge_accumulator_to_message(&log_blob); log_blob_inited = TRUE; } @@ -491,7 +491,7 @@ shard_log_iterator_curr(iterator *itorh, key *curr_key, message *msg) { shard_log_iterator *itor = (shard_log_iterator *)itorh; *curr_key = log_entry_key(itor->entries[itor->pos]); - *msg = log_entry_message(itor->cc, itor->entries[itor->pos]); + *msg = log_entry_message(itor->cc, itor->entries[itor->pos]); } bool32 @@ -533,10 +533,10 @@ shard_log_config_init(shard_log_config *log_cfg, log_cfg->data_cfg = data_cfg; log_cfg->seed = HASH_SEED; log_cfg->blob_cfg = (blob_build_config){ - .extent_batch = 1, - .page_batch = 2, - .subpage_batch = 3, - .alignment = cache_config_page_size(cache_cfg), + .extent_batch = 1, + .page_batch = 2, + .subpage_batch = 3, + .alignment = cache_config_page_size(cache_cfg), }; } @@ -563,11 +563,12 @@ shard_log_print(shard_log *log) !terminal_log_entry(cfg, page->data, le); le = log_entry_next(le)) { - platform_default_log("%s -- %s%s : %lu\n", - key_string(dcfg, log_entry_key(le)), - log_entry_message_isblob(le) ? "(blob) " : "", - message_string(dcfg, log_entry_message(cc, le)), - le->generation); + platform_default_log( + "%s -- %s%s : %lu\n", + key_string(dcfg, log_entry_key(le)), + log_entry_message_isblob(le) ? "(blob) " : "", + message_string(dcfg, log_entry_message(cc, le)), + le->generation); } } cache_unget(cc, page); diff --git a/src/splinterdb.c b/src/splinterdb.c index 9d88020e..ead7f855 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -543,7 +543,8 @@ splinterdb_lookup_result_value(const splinterdb_lookup_result *result, // IN return platform_status_to_int(status); } - *value = merge_accumulator_to_value(lookup_result_accumulator(mutable_result)); + *value = + merge_accumulator_to_value(lookup_result_accumulator(mutable_result)); return 0; } diff --git a/tests/functional/log_test.c b/tests/functional/log_test.c index 912963f9..4b2eed0e 100644 --- a/tests/functional/log_test.c +++ b/tests/functional/log_test.c @@ -118,7 +118,7 @@ test_log_crash(clockcache *cc, } static int -test_log_large_message(cache *cc, +test_log_large_message(cache *cc, shard_log_config *cfg, shard_log *log, platform_heap_id hid) @@ -130,9 +130,8 @@ test_log_large_message(cache *cc, key returned_key; message returned_message; char key_data[] = "large-log-key"; - key skey = - key_create(FALSE, sizeof(key_data) - 1, key_data); - uint64 value_len = 3 * cache_page_size(cc) + 123; + key skey = key_create(FALSE, sizeof(key_data) - 1, key_data); + uint64 value_len = 3 * cache_page_size(cc) + 123; rc = shard_log_init(log, cc, cfg); platform_assert_status_ok(rc); @@ -143,10 +142,8 @@ test_log_large_message(cache *cc, merge_accumulator_set_class(&msg, MESSAGE_TYPE_INSERT); memset(merge_accumulator_data(&msg), 'L', value_len); - int log_rc = log_write((log_handle *)log, - skey, - merge_accumulator_to_message(&msg), - 0); + int log_rc = + log_write((log_handle *)log, skey, merge_accumulator_to_message(&msg), 0); platform_assert(log_rc == 0); merge_accumulator filler; @@ -154,26 +151,29 @@ test_log_large_message(cache *cc, success = merge_accumulator_resize(&filler, cache_page_size(cc) / 4); platform_assert(success); merge_accumulator_set_class(&filler, MESSAGE_TYPE_INSERT); - memset(merge_accumulator_data(&filler), 'f', merge_accumulator_length(&filler)); + memset( + merge_accumulator_data(&filler), 'f', merge_accumulator_length(&filler)); for (uint64 i = 1; i < 16; i++) { - log_rc = log_write((log_handle *)log, - skey, - merge_accumulator_to_message(&filler), - i); + log_rc = log_write( + (log_handle *)log, skey, merge_accumulator_to_message(&filler), i); platform_assert(log_rc == 0); } merge_accumulator_deinit(&filler); - rc = shard_log_iterator_init( - cc, cfg, hid, log_addr((log_handle *)log), log_magic((log_handle *)log), &itor); + rc = shard_log_iterator_init(cc, + cfg, + hid, + log_addr((log_handle *)log), + log_magic((log_handle *)log), + &itor); platform_assert_status_ok(rc); platform_assert(iterator_can_curr(itorh)); iterator_curr(itorh, &returned_key, &returned_message); platform_assert(data_key_compare(cfg->data_cfg, skey, returned_key) == 0); - platform_assert(message_lex_cmp(merge_accumulator_to_message(&msg), - returned_message) - == 0); + platform_assert( + message_lex_cmp(merge_accumulator_to_message(&msg), returned_message) + == 0); shard_log_iterator_deinit(hid, &itor); merge_accumulator_deinit(&msg); diff --git a/tests/unit/btree_test.c b/tests/unit/btree_test.c index 3c082f94..5b94e9e9 100644 --- a/tests/unit/btree_test.c +++ b/tests/unit/btree_test.c @@ -50,9 +50,8 @@ btree_leaf_incorporate_tuple(const btree_config *cfg, leaf_incorporate_spec *spec, uint64 *generation) { - platform_status rc = - btree_create_leaf_incorporate_spec( - cfg, NULL, NULL, hid, hdr, tuple_key, msg, spec); + platform_status rc = btree_create_leaf_incorporate_spec( + cfg, NULL, NULL, hid, hdr, tuple_key, msg, spec); ASSERT_TRUE(SUCCESS(rc)); return btree_try_perform_leaf_incorporate_spec(cfg, hdr, spec, generation); } diff --git a/tests/unit/splinterdb_quick_test.c b/tests/unit/splinterdb_quick_test.c index 22c5cdea..245441ba 100644 --- a/tests/unit/splinterdb_quick_test.c +++ b/tests/unit/splinterdb_quick_test.c @@ -342,7 +342,7 @@ CTEST2(splinterdb_quick, test_key_size_gt_max_key_size) CTEST2(splinterdb_quick, test_value_size_gt_max_value_size) { size_t too_large_value_len = 3 * IO_DEFAULT_PAGE_SIZE + 123; - char *too_large_value_data; + char *too_large_value_data; too_large_value_data = TYPED_ARRAY_MALLOC( data->cfg.heap_id, too_large_value_data, too_large_value_len); memset(too_large_value_data, 'z', too_large_value_len); @@ -357,7 +357,8 @@ CTEST2(splinterdb_quick, test_value_size_gt_max_value_size) splinterdb_lookup_result_init( data->kvsb, &result, SPLINTERDB_LOOKUP_VALUE, 0, NULL); - rc = splinterdb_lookup(data->kvsb, slice_create(sizeof("foo"), "foo"), &result); + rc = splinterdb_lookup( + data->kvsb, slice_create(sizeof("foo"), "foo"), &result); ASSERT_EQUAL(0, rc); ASSERT_TRUE(splinterdb_lookup_found(&result)); @@ -375,7 +376,8 @@ CTEST2(splinterdb_quick, test_value_size_gt_max_value_size) splinterdb_lookup_result_init( data->kvsb, &result, SPLINTERDB_LOOKUP_VALUE, 0, NULL); - rc = splinterdb_lookup(data->kvsb, slice_create(sizeof("foo"), "foo"), &result); + rc = splinterdb_lookup( + data->kvsb, slice_create(sizeof("foo"), "foo"), &result); ASSERT_EQUAL(0, rc); ASSERT_TRUE(splinterdb_lookup_found(&result)); From 3001b61afb49ca485ff332d4ece0e5b296f5b9b6 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 10:56:54 -0700 Subject: [PATCH 04/13] some comments Signed-off-by: Rob Johnson --- src/blob.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/blob.c b/src/blob.c index 835a47ad..b6c8e084 100644 --- a/src/blob.c +++ b/src/blob.c @@ -7,6 +7,10 @@ #define MIN_LIVE_PERCENTAGE (90ULL) +/* If the data is large enough (or close enough to a whole number of + * rounded_size pieces), then we just put it entirely into + * rounded_size pieces, since this won't waste too much space. + */ bool can_round_up(uint64 rounded_size, uint64 length) { @@ -157,6 +161,21 @@ blob_page_iterator_init(cache *cc, return STATUS_OK; } +/* + * This function must be kept in sync with the code that decides + * whether to call cache_alloc in blob_build.c. + * + * The current policy is: The blob_build thread that gets the first + * byte of any page is reponsible for calling cache_alloc on that + * page. If the thread gets only part of the page, then it calls + * cache_alloc before calling mini_alloc_bytes_finish + * (i.e. immediately after getting the page from the mini_allocator). + * + * Thus we need to call cache_alloc now only if we got the entire + * page. Any partial page that we got will have already been + * cache_alloced by us (in blob_build) or by some other thread. + */ + static bool should_alloc(blob_page_iterator *iter) { From c3a4e66d84924133d1a1e46a402bb6353e6a3626 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 11:11:03 -0700 Subject: [PATCH 05/13] cleanup Signed-off-by: Rob Johnson --- src/btree.c | 1 + src/data_internal.h | 21 +++++++------------- src/splinterdb.c | 4 ++-- tests/functional/ycsb_test.c | 1 + tests/unit/btree_stress_test.c | 1 + tests/unit/btree_test.c | 36 ++++++++++++++++++++-------------- tests/unit/splinter_test.c | 1 + 7 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/btree.c b/src/btree.c index fbe66c02..0ab80801 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2497,6 +2497,7 @@ btree_iterator_curr(iterator *base_itor, key *curr_key, message *data) *curr_key = index_entry_key(entry); *data = message_create( MESSAGE_TYPE_PIVOT_DATA, + NULL, slice_create(sizeof(entry->pivot_data), &entry->pivot_data)); } } diff --git a/src/data_internal.h b/src/data_internal.h index 1db32d57..5f7a0912 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -349,13 +349,7 @@ message_type_string(message_type type) ((message){.type = MESSAGE_TYPE_DELETE, .data = NULL_SLICE}) static inline message -message_create(message_type type, slice data) -{ - return (message){.type = type, .cc = NULL, .data = data}; -} - -static inline message -message_create_blob(message_type type, cache *cc, slice data) +message_create(message_type type, cache *cc, slice data) { return (message){.type = type, .cc = cc, .data = data}; } @@ -408,8 +402,8 @@ message_materialize_if_needed(platform_heap_id heap_id, writable_buffer_deinit(tmp); return rc; } - *result = - message_create(message_class(msg), writable_buffer_to_slice(tmp)); + *result = message_create( + message_class(msg), NULL, writable_buffer_to_slice(tmp)); } else { *result = msg; } @@ -499,9 +493,9 @@ ondisk_tuple_message(cache *cc, const ondisk_tuple *odt) { slice data = slice_create(odt->message_length, odt->key_and_message + odt->key_length); - return message_create_blob(ondisk_tuple_message_class(odt), - ondisk_tuple_message_isblob(odt) ? cc : NULL, - data); + return message_create(ondisk_tuple_message_class(odt), + ondisk_tuple_message_isblob(odt) ? cc : NULL, + data); } static inline void @@ -583,8 +577,7 @@ merge_accumulator_isblob(const merge_accumulator *ma) static inline message merge_accumulator_to_message(const merge_accumulator *ma) { - return message_create_blob( - ma->type, ma->cc, writable_buffer_to_slice(&ma->data)); + return message_create(ma->type, ma->cc, writable_buffer_to_slice(&ma->data)); } static inline slice diff --git a/src/splinterdb.c b/src/splinterdb.c index ead7f855..7617646d 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -625,7 +625,7 @@ splinterdb_insert(splinterdb *kvsb, slice value, splinterdb_lookup_result *old_result) { - message msg = message_create(MESSAGE_TYPE_INSERT, value); + message msg = message_create(MESSAGE_TYPE_INSERT, NULL, value); lookup_result *_old_result = old_result == NULL ? NULL : lookup_result_from_splinterdb(old_result); return splinterdb_insert_message(kvsb, user_key, msg, _old_result); @@ -648,7 +648,7 @@ splinterdb_update(splinterdb *kvsb, slice update, splinterdb_lookup_result *old_result) { - message msg = message_create(MESSAGE_TYPE_UPDATE, update); + message msg = message_create(MESSAGE_TYPE_UPDATE, NULL, update); lookup_result *_old_result = old_result == NULL ? NULL : lookup_result_from_splinterdb(old_result); platform_assert(kvsb->data_cfg->merge_tuples); diff --git a/tests/functional/ycsb_test.c b/tests/functional/ycsb_test.c index 9b5596f4..6b93faac 100644 --- a/tests/functional/ycsb_test.c +++ b/tests/functional/ycsb_test.c @@ -358,6 +358,7 @@ ycsb_thread(void *arg) { message val = message_create(MESSAGE_TYPE_INSERT, + NULL, slice_create(YCSB_DATA_SIZE, ops->value)); rc = core_insert( spl, key_create(FALSE, YCSB_KEY_SIZE, ops->key), val, NULL); diff --git a/tests/unit/btree_stress_test.c b/tests/unit/btree_stress_test.c index 04faa897..90b6e1b2 100644 --- a/tests/unit/btree_stress_test.c +++ b/tests/unit/btree_stress_test.c @@ -574,6 +574,7 @@ gen_msg(btree_config *cfg, uint64 i, uint8 *buffer, size_t length) memset(dh->data, 0, datalen); memcpy(dh->data, &i, sizeof(i)); return message_create(MESSAGE_TYPE_INSERT, + NULL, slice_create(sizeof(data_handle) + datalen, buffer)); } diff --git a/tests/unit/btree_test.c b/tests/unit/btree_test.c index 5b94e9e9..bf3d0014 100644 --- a/tests/unit/btree_test.c +++ b/tests/unit/btree_test.c @@ -199,7 +199,8 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) hdr, i, key_create(FALSE, i % sizeof(i), &i), - message_create(MESSAGE_TYPE_INSERT, slice_create(i % sizeof(i), &i))); + message_create( + MESSAGE_TYPE_INSERT, NULL, slice_create(i % sizeof(i), &i))); ASSERT_TRUE(rv, "Failed to insert 4-byte %d\n", i); } @@ -211,9 +212,10 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) cfg->data_cfg, key_create(FALSE, i % sizeof(i), &i), tuple_key); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte key %d\n", i); - cmp_rv = message_lex_cmp( - message_create(MESSAGE_TYPE_INSERT, slice_create(i % sizeof(i), &i)), - msg); + cmp_rv = message_lex_cmp(message_create(MESSAGE_TYPE_INSERT, + NULL, + slice_create(i % sizeof(i), &i)), + msg); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte message %d\n", i); } @@ -224,7 +226,8 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) hdr, i, key_create(FALSE, i % sizeof(i), &i), - message_create(MESSAGE_TYPE_INSERT, slice_create(i % sizeof(i), &i))); + message_create( + MESSAGE_TYPE_INSERT, NULL, slice_create(i % sizeof(i), &i))); ASSERT_TRUE(rv, "Failed to insert 8-byte %ld\n", i); } @@ -236,9 +239,10 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) cfg->data_cfg, key_create(FALSE, i % sizeof(i), &i), tuple_key); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte key %d\n", i); - cmp_rv = message_lex_cmp( - message_create(MESSAGE_TYPE_INSERT, slice_create(i % sizeof(i), &i)), - msg); + cmp_rv = message_lex_cmp(message_create(MESSAGE_TYPE_INSERT, + NULL, + slice_create(i % sizeof(i), &i)), + msg); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte message %d\n", i); } @@ -252,9 +256,10 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) cfg->data_cfg, key_create(FALSE, i % sizeof(i), &i), tuple_key); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte key %d\n", i); - cmp_rv = message_lex_cmp( - message_create(MESSAGE_TYPE_INSERT, slice_create(i % sizeof(i), &i)), - msg); + cmp_rv = message_lex_cmp(message_create(MESSAGE_TYPE_INSERT, + NULL, + slice_create(i % sizeof(i), &i)), + msg); ASSERT_EQUAL(0, cmp_rv, "Bad 4-byte message %d\n", i); } @@ -280,8 +285,8 @@ leaf_hdr_search_tests(btree_config *cfg, platform_heap_id hid) messagebuf[0] = i; key tuple_key = key_create(FALSE, 1, &keybuf); - message msg = - message_create(MESSAGE_TYPE_INSERT, slice_create(i % 8, messagebuf)); + message msg = message_create( + MESSAGE_TYPE_INSERT, NULL, slice_create(i % 8, messagebuf)); leaf_incorporate_spec spec; bool32 result = btree_leaf_incorporate_tuple( @@ -419,10 +424,11 @@ leaf_split_tests(btree_config *cfg, btree_init_hdr(cfg, hdr); int msgsize = bt_page_size / (nkvs + 1); - message msg = - message_create(MESSAGE_TYPE_INSERT, slice_create(msgsize, msg_buffer)); + message msg = message_create( + MESSAGE_TYPE_INSERT, NULL, slice_create(msgsize, msg_buffer)); message bigger_msg = message_create( MESSAGE_TYPE_INSERT, + NULL, slice_create(msgsize + sizeof(table_entry) + 1, msg_buffer)); uint8 realnkvs = 0; diff --git a/tests/unit/splinter_test.c b/tests/unit/splinter_test.c index 2a9f9db9..6694727e 100644 --- a/tests/unit/splinter_test.c +++ b/tests/unit/splinter_test.c @@ -303,6 +303,7 @@ shadow_entry_value(const shadow_entry *entry, char *data) { return message_create( MESSAGE_TYPE_INSERT, + NULL, slice_create(entry->value_length, data + entry->key_offset + entry->key_length)); } From 81b15101225c6b52dd62594afefe0e4062196c70 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 12:03:37 -0700 Subject: [PATCH 06/13] fix lifetime issues in blob refs Signed-off-by: Rob Johnson --- src/btree.c | 132 ++++++++++++++++++++++------- src/btree.h | 31 ++++--- src/core.c | 41 +++++---- src/data_blob_build.c | 13 ++- src/data_internal.c | 35 ++++++++ src/data_internal.h | 51 +++++++++-- src/lookup_result.h | 14 ++- src/memtable.c | 73 ++++++++++++---- src/memtable.h | 15 +++- tests/unit/btree_stress_test.c | 4 + tests/unit/splinterdb_quick_test.c | 96 +++++++++++++++++++++ 11 files changed, 411 insertions(+), 94 deletions(-) diff --git a/src/btree.c b/src/btree.c index 0ab80801..a47a3101 100644 --- a/src/btree.c +++ b/src/btree.c @@ -602,15 +602,16 @@ btree_record_old_result(const btree_config *cfg, cache *cc, const btree_hdr *hdr, const leaf_incorporate_spec *spec, - lookup_result *old_result) + lookup_result *old_result, + const message_blob_ref *blob_ref) { if (old_result == NULL || spec->old_entry_state != ENTRY_STILL_EXISTS) { return STATUS_OK; } leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); - return lookup_result_update( - old_result, leaf_entry_key(entry), leaf_entry_message(cc, entry)); + return lookup_result_update_with_blob_ref( + old_result, leaf_entry_key(entry), leaf_entry_message(cc, entry), blob_ref); } platform_status @@ -1303,6 +1304,32 @@ btree_dec_ref(cache *cc, return ref == 0; } +static void +btree_blob_ref_inc(const message_blob_ref *ref) +{ + btree_inc_ref(ref->cc, ref->arg, ref->ref); +} + +static void +btree_blob_ref_dec(const message_blob_ref *ref) +{ + btree_dec_ref(ref->cc, ref->arg, ref->ref, ref->type); +} + +static message_blob_ref +btree_blob_ref(cache *cc, + const btree_config *cfg, + uint64 root_addr, + page_type type) +{ + return (message_blob_ref){.cc = cc, + .ref = root_addr, + .type = type, + .arg = cfg, + .inc = btree_blob_ref_inc, + .dec = btree_blob_ref_dec}; +} + /* * ********************************************************************* * The process of splitting a child leaf is divided into four steps in @@ -1354,6 +1381,7 @@ btree_split_child_leaf(cache *cc, btree_node *child, leaf_incorporate_spec *spec, lookup_result *old_result, + const message_blob_ref *old_result_blob_ref, uint64 *generation) // OUT { btree_node right_child; @@ -1413,7 +1441,8 @@ btree_split_child_leaf(cache *cc, /* p: write, c: write, rc: write, cn: write if exists */ platform_status rc = - btree_record_old_result(cfg, cc, child->hdr, spec, old_result); + btree_record_old_result( + cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); if (!SUCCESS(rc)) { if (child_next.addr != 0) { btree_node_full_unlock(cc, cfg, &child_next); @@ -1475,16 +1504,17 @@ btree_split_child_leaf(cache *cc, * - insertion is complete */ static inline platform_status -btree_defragment_or_split_child_leaf(cache *cc, - const btree_config *cfg, - mini_allocator *mini, - btree_scratch *scratch, - btree_node *parent, - uint64 index_of_child_in_parent, - btree_node *child, - leaf_incorporate_spec *spec, - lookup_result *old_result, - uint64 *generation) // OUT +btree_defragment_or_split_child_leaf(cache *cc, + const btree_config *cfg, + mini_allocator *mini, + btree_scratch *scratch, + btree_node *parent, + uint64 index_of_child_in_parent, + btree_node *child, + leaf_incorporate_spec *spec, + lookup_result *old_result, + const message_blob_ref *old_result_blob_ref, + uint64 *generation) // OUT { uint64 nentries = btree_num_entries(child->hdr); uint64 live_bytes = 0; @@ -1508,7 +1538,8 @@ btree_defragment_or_split_child_leaf(cache *cc, btree_node_unget(cc, cfg, parent); btree_node_lock(cc, cfg, child); platform_status rc = - btree_record_old_result(cfg, cc, child->hdr, spec, old_result); + btree_record_old_result( + cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, child); return rc; @@ -1528,6 +1559,7 @@ btree_defragment_or_split_child_leaf(cache *cc, child, spec, old_result, + old_result_blob_ref, generation); } @@ -1779,16 +1811,17 @@ btree_grow_root(cache *cc, // IN *----------------------------------------------------------------------------- */ platform_status -btree_insert(cache *cc, // IN - const btree_config *cfg, // IN - platform_heap_id heap_id, // IN - btree_scratch *scratch, // IN - uint64 root_addr, // IN - mini_allocator *mini, // IN - key tuple_key, // IN - message msg, // IN - lookup_result *old_result, // IN/OUT - uint64 *generation) // OUT +btree_insert(cache *cc, // IN + const btree_config *cfg, // IN + platform_heap_id heap_id, // IN + btree_scratch *scratch, // IN + uint64 root_addr, // IN + mini_allocator *mini, // IN + key tuple_key, // IN + message msg, // IN + lookup_result *old_result, // IN/OUT + const message_blob_ref *old_result_blob_ref, // IN + uint64 *generation) // OUT { platform_status rc; leaf_incorporate_spec spec; @@ -1828,8 +1861,12 @@ btree_insert(cache *cc, // IN } btree_node_lock(cc, cfg, &root_node); if (btree_can_perform_leaf_incorporate_spec(cfg, root_node.hdr, &spec)) { - rc = - btree_record_old_result(cfg, cc, root_node.hdr, &spec, old_result); + rc = btree_record_old_result(cfg, + cc, + root_node.hdr, + &spec, + old_result, + old_result_blob_ref); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &root_node); destroy_leaf_incorporate_spec(&spec); @@ -2042,7 +2079,12 @@ btree_insert(cache *cc, // IN goto start_over; } btree_node_lock(cc, cfg, &child_node); - rc = btree_record_old_result(cfg, cc, child_node.hdr, &spec, old_result); + rc = btree_record_old_result(cfg, + cc, + child_node.hdr, + &spec, + old_result, + old_result_blob_ref); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &child_node); destroy_leaf_incorporate_spec(&spec); @@ -2094,6 +2136,7 @@ btree_insert(cache *cc, // IN &child_node, &spec, old_result, + old_result_blob_ref, generation); destroy_leaf_incorporate_spec(&spec); if (STATUS_IS_EQ(rc, STATUS_BUSY)) { @@ -2350,11 +2393,26 @@ btree_lookup_and_merge(cache *cc, // IN key target, // IN lookup_result *result, // IN/OUT bool32 *local_found) // OUT +{ + return btree_lookup_and_merge_with_blob_ref( + cc, cfg, root_addr, type, target, result, NULL, local_found); +} + +platform_status +btree_lookup_and_merge_with_blob_ref(cache *cc, // IN + const btree_config *cfg, // IN + uint64 root_addr, // IN + page_type type, // IN + key target, // IN + lookup_result *result, // IN/OUT + const message_blob_ref *blob_ref, // IN + bool32 *local_found) // OUT { btree_node node; key found_key; message local_data; platform_status rc = STATUS_OK; + message_blob_ref default_blob_ref = MESSAGE_BLOB_REF_NULL; log_trace_key(target, "btree_lookup"); @@ -2368,7 +2426,12 @@ btree_lookup_and_merge(cache *cc, // IN if (local_found != NULL) { *local_found = TRUE; } - rc = lookup_result_update(result, found_key, local_data); + if (message_isblob(local_data) && message_blob_ref_is_null(blob_ref)) { + default_blob_ref = btree_blob_ref(cc, cfg, root_addr, type); + blob_ref = &default_blob_ref; + } + rc = lookup_result_update_with_blob_ref( + result, found_key, local_data, blob_ref); btree_node_unget(cc, cfg, &node); } return rc; @@ -2402,7 +2465,16 @@ btree_lookup_and_merge_async(btree_lookup_async_state *state) platform_status rc = STATUS_OK; if (!key_is_null(state->found_key)) { - rc = lookup_result_update(state->result, state->found_key, state->msg); + message_blob_ref blob_ref = MESSAGE_BLOB_REF_NULL; + if (message_isblob(state->msg)) { + blob_ref = btree_blob_ref( + state->cc, state->cfg, state->root_addr, state->type); + } + rc = lookup_result_update_with_blob_ref( + state->result, + state->found_key, + state->msg, + message_blob_ref_is_null(&blob_ref) ? NULL : &blob_ref); btree_node_unget(state->cc, state->cfg, &state->node); } async_return(state, rc); diff --git a/src/btree.h b/src/btree.h index 3018ebb5..bd8615ca 100644 --- a/src/btree.h +++ b/src/btree.h @@ -179,16 +179,17 @@ typedef struct btree_pack_req { } btree_pack_req; platform_status -btree_insert(cache *cc, // IN - const btree_config *cfg, // IN - platform_heap_id heap_id, // IN - btree_scratch *scratch, // IN - uint64 root_addr, // IN - mini_allocator *mini, // IN - key tuple_key, // IN - message data, // IN - lookup_result *old_result, // IN/OUT - uint64 *generation); // OUT +btree_insert(cache *cc, // IN + const btree_config *cfg, // IN + platform_heap_id heap_id, // IN + btree_scratch *scratch, // IN + uint64 root_addr, // IN + mini_allocator *mini, // IN + key tuple_key, // IN + message data, // IN + lookup_result *old_result, // IN/OUT + const message_blob_ref *old_result_blob_ref, // IN + uint64 *generation); // OUT uint64 btree_create(cache *cc, @@ -231,6 +232,16 @@ btree_lookup_and_merge(cache *cc, lookup_result *result, bool32 *local_found); +platform_status +btree_lookup_and_merge_with_blob_ref(cache *cc, + const btree_config *cfg, + uint64 root_addr, + page_type type, + key target, + lookup_result *result, + const message_blob_ref *blob_ref, + bool32 *local_found); + // clang-format off DEFINE_ASYNC_STATE(btree_lookup_async_state, 3, param, cache *, cc, diff --git a/src/core.c b/src/core.c index a341ef4b..d4044733 100644 --- a/src/core.c +++ b/src/core.c @@ -289,21 +289,16 @@ core_get_compacted_memtable(core_handle *spl, uint64 generation) } static inline void -core_memtable_inc_ref(core_handle *spl, uint64 mt_gen) +core_memtable_inc_ref(core_handle *spl, uint64 root_addr) { - memtable *mt = core_get_memtable(spl, mt_gen); - allocator_inc_ref(spl->al, mt->root_addr); + memtable_root_inc_ref(&spl->mt_ctxt, root_addr); } static void -core_memtable_dec_ref(core_handle *spl, uint64 generation) +core_memtable_dec_ref(core_handle *spl, uint64 root_addr) { - memtable *mt = core_get_memtable(spl, generation); - memtable_dec_ref_maybe_recycle(&spl->mt_ctxt, mt); - - // the branch in the compacted memtable is now in the tree, so don't zap it, - // we don't try to zero out the cmt because that would introduce a race. + memtable_root_dec_ref(&spl->mt_ctxt, root_addr); } @@ -325,7 +320,7 @@ core_memtable_iterator_init(core_handle *spl, bool32 inc_ref) { if (inc_ref) { - allocator_inc_ref(spl->al, root_addr); + core_memtable_inc_ref(spl, root_addr); } btree_iterator_init(spl->cc, spl->cfg.btree_cfg, @@ -345,12 +340,12 @@ core_memtable_iterator_init(core_handle *spl, static void core_memtable_iterator_deinit(core_handle *spl, btree_iterator *itor, - uint64 mt_gen, + uint64 root_addr, bool32 dec_ref) { btree_iterator_deinit(itor); if (dec_ref) { - core_memtable_dec_ref(spl, mt_gen); + core_memtable_dec_ref(spl, root_addr); } } @@ -604,11 +599,7 @@ core_memtable_incorporate(core_handle *spl, core_close_log_stream_if_enabled(spl, &stream); - /* - * Decrement the now-incorporated memtable ref count and recycle if no - * references - */ - memtable_dec_ref_maybe_recycle(&spl->mt_ctxt, mt); + memtable_recycle(&spl->mt_ctxt, mt); if (spl->cfg.use_stats) { const threadid tid = platform_get_tid(); @@ -728,8 +719,15 @@ core_memtable_lookup(core_handle *spl, page_type type = memtable_is_compacted ? PAGE_TYPE_BRANCH : PAGE_TYPE_MEMTABLE; - return btree_lookup_and_merge( - cc, cfg, root_addr, type, target, result, NULL); + if (memtable_is_compacted) { + return btree_lookup_and_merge( + cc, cfg, root_addr, type, target, result, NULL); + } + + message_blob_ref blob_ref; + memtable_blob_ref_init(&spl->mt_ctxt, root_addr, &blob_ref); + return btree_lookup_and_merge_with_blob_ref( + cc, cfg, root_addr, type, target, result, &blob_ref, NULL); } static platform_status @@ -1041,7 +1039,7 @@ core_range_iterator_init(core_handle *spl, if (compacted) { btree_inc_ref(spl->cc, spl->cfg.btree_cfg, root_addr); } else { - core_memtable_inc_ref(spl, mt_gen); + core_memtable_inc_ref(spl, root_addr); } range_itor->branch[range_itor->num_branches].addr = root_addr; @@ -1427,8 +1425,7 @@ core_range_iterator_deinit(core_range_iterator *range_itor) range_itor->branch[i].addr, PAGE_TYPE_BRANCH); } else { - uint64 mt_gen = range_itor->memtable_start_gen - i; - core_memtable_dec_ref(spl, mt_gen); + core_memtable_dec_ref(spl, range_itor->branch[i].addr); } } key_buffer_deinit(&range_itor->min_key); diff --git a/src/data_blob_build.c b/src/data_blob_build.c index 9a8d2f38..6cd1b64b 100644 --- a/src/data_blob_build.c +++ b/src/data_blob_build.c @@ -20,7 +20,11 @@ message_to_blob(const blob_build_config *cfg, platform_assert(!message_isblob(msg)); ma->type = message_class(msg); ma->cc = cc; - return blob_build(cfg, cc, mini, message_slice(msg), &ma->data); + platform_status rc = blob_build(cfg, cc, mini, message_slice(msg), &ma->data); + if (SUCCESS(rc)) { + merge_accumulator_release_blob_ref(ma); + } + return rc; } platform_status @@ -33,7 +37,11 @@ message_clone(const blob_build_config *cfg, platform_assert(message_isblob(msg)); result->type = message_class(msg); result->cc = cc; - return blob_clone(cfg, cc, mini, message_slice(msg), &result->data); + platform_status rc = blob_clone(cfg, cc, mini, message_slice(msg), &result->data); + if (SUCCESS(rc)) { + merge_accumulator_release_blob_ref(result); + } + return rc; } platform_status @@ -60,6 +68,7 @@ merge_accumulator_convert_to_blob(const blob_build_config *cfg, if (!SUCCESS(rc)) { return rc; } + merge_accumulator_release_blob_ref(ma); ma->cc = cc; return rc; } diff --git a/src/data_internal.c b/src/data_internal.c index dbba619d..9d939562 100644 --- a/src/data_internal.c +++ b/src/data_internal.c @@ -4,6 +4,16 @@ #include "data_internal.h" #include "poison.h" +static void +message_blob_ref_dec(message_blob_ref *blob_ref) +{ + if (!message_blob_ref_is_null(blob_ref)) { + message_blob_ref ref = *blob_ref; + *blob_ref = MESSAGE_BLOB_REF_NULL; + ref.dec(&ref); + } +} + message_type merge_accumulator_message_class(const merge_accumulator *ma) { @@ -28,15 +38,38 @@ merge_accumulator_to_slice(const merge_accumulator *ma) return writable_buffer_to_slice(&ma->data); } +void +merge_accumulator_release_blob_ref(merge_accumulator *ma) +{ + message_blob_ref_dec(&ma->blob_ref); +} + /* Copy a message into an already-initialized merge_accumulator. */ _Bool merge_accumulator_copy_message(merge_accumulator *ma, message msg) +{ + return merge_accumulator_copy_message_with_blob_ref(ma, msg, NULL); +} + +_Bool +merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, + message msg, + const message_blob_ref *blob_ref) { platform_status rc = writable_buffer_copy_slice(&ma->data, message_slice(msg)); if (!SUCCESS(rc)) { return FALSE; } + message_blob_ref old_ref = ma->blob_ref; + ma->blob_ref = MESSAGE_BLOB_REF_NULL; + + if (message_isblob(msg) && !message_blob_ref_is_null(blob_ref)) { + blob_ref->inc(blob_ref); + ma->blob_ref = *blob_ref; + } + message_blob_ref_dec(&old_ref); + ma->type = message_class(msg); ma->cc = msg.cc; return TRUE; @@ -47,6 +80,7 @@ merge_accumulator_resize(merge_accumulator *ma, uint64 newsize) { platform_status rc = writable_buffer_resize(&ma->data, newsize); if (SUCCESS(rc) && newsize == 0) { + merge_accumulator_release_blob_ref(ma); ma->cc = NULL; } return SUCCESS(rc); @@ -57,6 +91,7 @@ merge_accumulator_set_class(merge_accumulator *ma, message_type type) { ma->type = type; if (type == MESSAGE_TYPE_INVALID || type == MESSAGE_TYPE_DELETE) { + merge_accumulator_release_blob_ref(ma); ma->cc = NULL; } } diff --git a/src/data_internal.h b/src/data_internal.h index 5f7a0912..141a810c 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -525,18 +525,45 @@ copy_tuple_to_ondisk_tuple(ondisk_tuple *odt, key k, message msg) * Merge accumulators are basically the message version of a writable buffer. */ +typedef struct message_blob_ref { + cache *cc; + uint64 ref; + page_type type; + const void *arg; + void (*inc)(const struct message_blob_ref *ref); + void (*dec)(const struct message_blob_ref *ref); +} message_blob_ref; + +#define MESSAGE_BLOB_REF_NULL ((message_blob_ref){0}) + +static inline bool32 +message_blob_ref_is_null(const message_blob_ref *ref) +{ + return ref == NULL || ref->dec == NULL; +} + struct merge_accumulator { - message_type type; - cache *cc; - writable_buffer data; + message_type type; + cache *cc; + writable_buffer data; + message_blob_ref blob_ref; }; +void +merge_accumulator_release_blob_ref(merge_accumulator *ma); + +_Bool +merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, + message msg, + const message_blob_ref *blob_ref); + static inline void merge_accumulator_init(merge_accumulator *ma, platform_heap_id heap_id) { writable_buffer_init(&ma->data, heap_id); - ma->type = MESSAGE_TYPE_INVALID; - ma->cc = NULL; + ma->type = MESSAGE_TYPE_INVALID; + ma->cc = NULL; + ma->blob_ref = MESSAGE_BLOB_REF_NULL; } static inline void @@ -550,16 +577,19 @@ merge_accumulator_init_with_buffer(merge_accumulator *ma, { writable_buffer_init_with_buffer( &ma->data, heap_id, allocation_size, data, logical_size); - ma->type = type; - ma->cc = NULL; + ma->type = type; + ma->cc = NULL; + ma->blob_ref = MESSAGE_BLOB_REF_NULL; } static inline void merge_accumulator_deinit(merge_accumulator *ma) { + merge_accumulator_release_blob_ref(ma); writable_buffer_deinit(&ma->data); - ma->type = MESSAGE_TYPE_INVALID; - ma->cc = NULL; + ma->type = MESSAGE_TYPE_INVALID; + ma->cc = NULL; + ma->blob_ref = MESSAGE_BLOB_REF_NULL; } static inline bool32 @@ -601,6 +631,7 @@ merge_accumulator_init_from_message(merge_accumulator *ma, static inline void merge_accumulator_set_to_null(merge_accumulator *ma) { + merge_accumulator_release_blob_ref(ma); ma->type = MESSAGE_TYPE_INVALID; ma->cc = NULL; writable_buffer_set_to_null(&ma->data); @@ -612,6 +643,7 @@ merge_accumulator_is_null(const merge_accumulator *ma) bool32 r = writable_buffer_is_null(&ma->data); debug_assert(IMPLIES(r, ma->type == MESSAGE_TYPE_INVALID)); debug_assert(IMPLIES(r, ma->cc == NULL)); + debug_assert(IMPLIES(r, message_blob_ref_is_null(&ma->blob_ref))); return r; } @@ -642,6 +674,7 @@ merge_accumulator_ensure_materialized(merge_accumulator *ma) writable_buffer old_data = ma->data; ma->data = materialized; + merge_accumulator_release_blob_ref(ma); ma->cc = NULL; writable_buffer_deinit(&old_data); return STATUS_OK; diff --git a/src/lookup_result.h b/src/lookup_result.h index 6199cc09..3791d75c 100644 --- a/src/lookup_result.h +++ b/src/lookup_result.h @@ -100,7 +100,10 @@ lookup_result_found(const lookup_result *result) } static inline platform_status -lookup_result_update(lookup_result *result, key found_key, message msg) +lookup_result_update_with_blob_ref(lookup_result *result, + key found_key, + message msg, + const message_blob_ref *blob_ref) { if (lookup_result_is_existence(result)) { bool32 success = merge_accumulator_resize(&result->value, 0); @@ -112,7 +115,8 @@ lookup_result_update(lookup_result *result, key found_key, message msg) } if (merge_accumulator_is_null(&result->value)) { - bool32 success = merge_accumulator_copy_message(&result->value, msg); + bool32 success = merge_accumulator_copy_message_with_blob_ref( + &result->value, msg, blob_ref); return success ? STATUS_OK : STATUS_NO_MEMORY; } @@ -123,6 +127,12 @@ lookup_result_update(lookup_result *result, key found_key, message msg) : STATUS_NO_MEMORY; } +static inline platform_status +lookup_result_update(lookup_result *result, key found_key, message msg) +{ + return lookup_result_update_with_blob_ref(result, found_key, msg, NULL); +} + static inline bool32 lookup_result_should_continue(const lookup_result *result) { diff --git a/src/memtable.c b/src/memtable.c index 169960d4..66df9484 100644 --- a/src/memtable.c +++ b/src/memtable.c @@ -194,6 +194,44 @@ get_btree_scratch(memtable_context *ctxt, uint64 tid) + tid * ctxt->btree_scratch_sz); } +static void +memtable_blob_ref_inc(const message_blob_ref *blob_ref) +{ + btree_inc_ref(blob_ref->cc, blob_ref->arg, blob_ref->ref); +} + +static void +memtable_blob_ref_dec(const message_blob_ref *blob_ref) +{ + btree_dec_ref(blob_ref->cc, blob_ref->arg, blob_ref->ref, blob_ref->type); +} + +void +memtable_blob_ref_init(memtable_context *ctxt, + uint64 root_addr, + message_blob_ref *blob_ref) +{ + *blob_ref = (message_blob_ref){.cc = ctxt->cc, + .ref = root_addr, + .type = PAGE_TYPE_MEMTABLE, + .arg = ctxt->cfg.btree_cfg, + .inc = memtable_blob_ref_inc, + .dec = memtable_blob_ref_dec}; +} + +void +memtable_root_inc_ref(memtable_context *ctxt, uint64 root_addr) +{ + btree_inc_ref(ctxt->cc, ctxt->cfg.btree_cfg, root_addr); +} + +bool32 +memtable_root_dec_ref(memtable_context *ctxt, uint64 root_addr) +{ + return btree_dec_ref( + ctxt->cc, ctxt->cfg.btree_cfg, root_addr, PAGE_TYPE_MEMTABLE); +} + platform_status memtable_insert(memtable_context *ctxt, memtable *mt, @@ -206,6 +244,8 @@ memtable_insert(memtable_context *ctxt, const threadid tid = platform_get_tid(); btree_scratch *scratch = get_btree_scratch(ctxt, tid); + message_blob_ref blob_ref; + memtable_blob_ref_init(ctxt, mt->root_addr, &blob_ref); platform_status rc = btree_insert(ctxt->cc, ctxt->cfg.btree_cfg, heap_id, @@ -215,6 +255,7 @@ memtable_insert(memtable_context *ctxt, tuple_key, msg, old_result, + &blob_ref, leaf_generation); if (!SUCCESS(rc)) { return rc; @@ -225,26 +266,24 @@ memtable_insert(memtable_context *ctxt, return rc; } -/* - * if there are no outstanding refs, then destroy and reinit memtable and - * transition to READY - */ -bool32 -memtable_dec_ref_maybe_recycle(memtable_context *ctxt, memtable *mt) +void +memtable_recycle(memtable_context *ctxt, memtable *mt) { cache *cc = ctxt->cc; - bool32 freed = btree_dec_ref(cc, mt->cfg, mt->root_addr, PAGE_TYPE_MEMTABLE); - if (freed) { - platform_assert(mt->state == MEMTABLE_STATE_INCORPORATED); - mt->root_addr = btree_create(cc, mt->cfg, &mt->mini, PAGE_TYPE_MEMTABLE); - memtable_lock_incorporation_lock(ctxt); - mt->generation += ctxt->cfg.max_memtables; - memtable_unlock_incorporation_lock(ctxt); - memtable_transition( - mt, MEMTABLE_STATE_INCORPORATED, MEMTABLE_STATE_READY); - } - return freed; + platform_assert(mt->state == MEMTABLE_STATE_INCORPORATED); + /* + * Reuse the slot independently of the old tree's lifetime. The old root's + * mini allocator is protected by its own refcount and may be GCed by this + * dec-ref or by a later lookup result / iterator release. + */ + uint64 old_root_addr = mt->root_addr; + mt->root_addr = btree_create(cc, mt->cfg, &mt->mini, PAGE_TYPE_MEMTABLE); + memtable_lock_incorporation_lock(ctxt); + mt->generation += ctxt->cfg.max_memtables; + memtable_unlock_incorporation_lock(ctxt); + memtable_transition(mt, MEMTABLE_STATE_INCORPORATED, MEMTABLE_STATE_READY); + memtable_root_dec_ref(ctxt, old_root_addr); } uint64 diff --git a/src/memtable.h b/src/memtable.h index 568c2212..41443819 100644 --- a/src/memtable.h +++ b/src/memtable.h @@ -162,6 +162,17 @@ memtable_block_lookups(memtable_context *ctxt); void memtable_unblock_lookups(memtable_context *ctxt); +void +memtable_blob_ref_init(memtable_context *ctxt, + uint64 root_addr, + message_blob_ref *blob_ref); + +void +memtable_root_inc_ref(memtable_context *ctxt, uint64 root_addr); + +bool32 +memtable_root_dec_ref(memtable_context *ctxt, uint64 root_addr); + platform_status memtable_insert(memtable_context *ctxt, memtable *mt, @@ -171,8 +182,8 @@ memtable_insert(memtable_context *ctxt, lookup_result *old_result, uint64 *generation); -bool32 -memtable_dec_ref_maybe_recycle(memtable_context *ctxt, memtable *mt); +void +memtable_recycle(memtable_context *ctxt, memtable *mt); uint64 memtable_force_finalize(memtable_context *ctxt); diff --git a/tests/unit/btree_stress_test.c b/tests/unit/btree_stress_test.c index 90b6e1b2..612b167e 100644 --- a/tests/unit/btree_stress_test.c +++ b/tests/unit/btree_stress_test.c @@ -227,6 +227,7 @@ CTEST2(btree_stress, iterator_basics) gen_key(&data->dbtree_cfg, i, keybuf, sizeof(keybuf)), gen_msg(&data->dbtree_cfg, i, msgbuf, sizeof(msgbuf)), NULL, + NULL, &generation))) { ASSERT_TRUE(FALSE, "Failed to insert 4-byte %d\n", i); @@ -393,6 +394,7 @@ CTEST2(btree_stress, overwrite_returns_old_value_after_tree_growth) gen_key(&data->dbtree_cfg, i, keybuf, bt_page_size), gen_msg(&data->dbtree_cfg, i, msgbuf, bt_page_size), &uniqueness_result, + NULL, &generation); ASSERT_TRUE(SUCCESS(rc)); ASSERT_FALSE(lookup_result_found(&uniqueness_result)); @@ -449,6 +451,7 @@ CTEST2(btree_stress, overwrite_returns_old_value_after_tree_growth) gen_key(&data->dbtree_cfg, i, keybuf, bt_page_size), merge_accumulator_to_message(&update_msg), &old_result, + NULL, &generation); ASSERT_TRUE(SUCCESS(rc)); ASSERT_TRUE(lookup_result_found(&old_result)); @@ -531,6 +534,7 @@ insert_tests(cache *cc, gen_key(cfg, i, keybuf, keybuf_size), gen_msg(cfg, i, msgbuf, msgbuf_size), NULL, + NULL, &generation))) { ASSERT_TRUE(FALSE, "Failed to insert 4-byte %ld\n", i); diff --git a/tests/unit/splinterdb_quick_test.c b/tests/unit/splinterdb_quick_test.c index 245441ba..43df0e21 100644 --- a/tests/unit/splinterdb_quick_test.c +++ b/tests/unit/splinterdb_quick_test.c @@ -113,6 +113,9 @@ custom_key_comparator(const data_config *cfg, user_key key1, user_key key2); static uint64 sum_branch_lookups(const splinterdb *kvsb); +static uint64 +force_flush_current_memtable(splinterdb *kvsb); + static void assert_lookup_result_not_found(const splinterdb_lookup_result *result); @@ -390,6 +393,88 @@ CTEST2(splinterdb_quick, test_value_size_gt_max_value_size) platform_free(data->cfg.heap_id, too_large_value_data); } +CTEST2(splinterdb_quick, test_large_lookup_survives_memtable_recycle) +{ + size_t large_value_len = 3 * IO_DEFAULT_PAGE_SIZE + 123; + char *large_value_data; + large_value_data = + TYPED_ARRAY_MALLOC(data->cfg.heap_id, large_value_data, large_value_len); + memset(large_value_data, 'z', large_value_len); + slice large_value = slice_create(large_value_len, large_value_data); + slice user_key = slice_create(sizeof("foo"), "foo"); + + int rc = splinterdb_insert(data->kvsb, user_key, large_value, NULL); + ASSERT_EQUAL(0, rc); + + splinterdb_lookup_result result; + splinterdb_lookup_result_init( + data->kvsb, &result, SPLINTERDB_LOOKUP_VALUE, 0, NULL); + + rc = splinterdb_lookup(data->kvsb, user_key, &result); + ASSERT_EQUAL(0, rc); + ASSERT_TRUE(splinterdb_lookup_found(&result)); + + core_handle *core = (core_handle *)splinterdb_get_trunk_handle(data->kvsb); + uint64 generation = force_flush_current_memtable(data->kvsb); + memtable *mt = + &core->mt_ctxt.mt[generation % core->mt_ctxt.cfg.max_memtables]; + ASSERT_EQUAL(MEMTABLE_STATE_READY, mt->state); + + for (uint64 i = 0; i < core->mt_ctxt.cfg.max_memtables; i++) { + char recycle_key[TEST_INSERT_KEY_LENGTH] = {0}; + char recycle_val[TEST_INSERT_VAL_LENGTH] = {0}; + + snprintf(recycle_key, sizeof(recycle_key), "r-%06lu", i); + snprintf(recycle_val, sizeof(recycle_val), "v-%06lu", i); + rc = splinterdb_insert(data->kvsb, + slice_create(strlen(recycle_key), recycle_key), + slice_create(strlen(recycle_val), recycle_val), + NULL); + ASSERT_EQUAL(0, rc); + force_flush_current_memtable(data->kvsb); + } + + slice value; + rc = splinterdb_lookup_result_value(&result, &value); + ASSERT_EQUAL(0, rc); + ASSERT_EQUAL(large_value_len, slice_length(value)); + ASSERT_EQUAL(0, slice_lex_cmp(large_value, value)); + + splinterdb_lookup_result_deinit(&result); + platform_free(data->cfg.heap_id, large_value_data); +} + +CTEST2(splinterdb_quick, test_iterator_survives_memtable_recycle) +{ + const int num_inserts = 50; + int rc = insert_some_keys(num_inserts, data->kvsb); + ASSERT_EQUAL(0, rc); + + splinterdb_iterator *it = NULL; + rc = splinterdb_iterator_init( + data->kvsb, &it, greater_than_or_equal, NULL_SLICE); + ASSERT_EQUAL(0, rc); + ASSERT_TRUE(splinterdb_iterator_valid(it)); + + core_handle *core = (core_handle *)splinterdb_get_trunk_handle(data->kvsb); + uint64 generation = force_flush_current_memtable(data->kvsb); + memtable *mt = + &core->mt_ctxt.mt[generation % core->mt_ctxt.cfg.max_memtables]; + ASSERT_EQUAL(MEMTABLE_STATE_READY, mt->state); + + int i = 0; + for (; splinterdb_iterator_valid(it); splinterdb_iterator_next(it)) { + rc = check_current_tuple(it, i); + ASSERT_EQUAL(0, rc); + i++; + } + ASSERT_EQUAL(num_inserts, i); + + rc = splinterdb_iterator_status(it); + ASSERT_EQUAL(0, rc); + splinterdb_iterator_deinit(it); +} + /* * Test case to exercise APIs for variable-length values; empty value, * short and somewhat longish value. After inserting this data, the lookup @@ -1635,6 +1720,17 @@ create_default_cfg(splinterdb_config *out_cfg, data_config *default_data_cfg) .data_cfg = default_data_cfg}; } +static uint64 +force_flush_current_memtable(splinterdb *kvsb) +{ + core_handle *core = (core_handle *)splinterdb_get_trunk_handle(kvsb); + uint64 generation = memtable_force_finalize(&core->mt_ctxt); + core->mt_ctxt.process(core->mt_ctxt.process_ctxt, generation); + platform_status rc = task_perform_until_quiescent(core->ts); + ASSERT_TRUE(SUCCESS(rc)); + return generation; +} + static uint64 sum_branch_lookups(const splinterdb *kvsb) { From cf22cbc5691bb79f0ce1c2c7d344d5f9d0d9cdcc Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 13:08:26 -0700 Subject: [PATCH 07/13] generalize ondisk_ref Signed-off-by: Rob Johnson --- src/btree.c | 40 ++++++++++++++-------------- src/btree.h | 4 +-- src/core.c | 2 +- src/data_internal.c | 24 +++++------------ src/data_internal.h | 30 +++++---------------- src/lookup_result.h | 2 +- src/memtable.c | 25 +++++++++--------- src/memtable.h | 3 ++- src/trunk.c | 64 +++++++++++++++++++++++++++++++-------------- src/trunk.h | 7 +++-- 10 files changed, 100 insertions(+), 101 deletions(-) diff --git a/src/btree.c b/src/btree.c index a47a3101..8c62f08c 100644 --- a/src/btree.c +++ b/src/btree.c @@ -603,7 +603,7 @@ btree_record_old_result(const btree_config *cfg, const btree_hdr *hdr, const leaf_incorporate_spec *spec, lookup_result *old_result, - const message_blob_ref *blob_ref) + const ondisk_ref *blob_ref) { if (old_result == NULL || spec->old_entry_state != ENTRY_STILL_EXISTS) { return STATUS_OK; @@ -1305,29 +1305,29 @@ btree_dec_ref(cache *cc, } static void -btree_blob_ref_inc(const message_blob_ref *ref) +btree_blob_ref_inc(const ondisk_ref *ref) { - btree_inc_ref(ref->cc, ref->arg, ref->ref); + btree_inc_ref(ref->cc, ref->arg, ref->addr); } static void -btree_blob_ref_dec(const message_blob_ref *ref) +btree_blob_ref_dec(const ondisk_ref *ref) { - btree_dec_ref(ref->cc, ref->arg, ref->ref, ref->type); + btree_dec_ref(ref->cc, ref->arg, ref->addr, ref->type); } -static message_blob_ref +static ondisk_ref btree_blob_ref(cache *cc, const btree_config *cfg, uint64 root_addr, page_type type) { - return (message_blob_ref){.cc = cc, - .ref = root_addr, - .type = type, - .arg = cfg, - .inc = btree_blob_ref_inc, - .dec = btree_blob_ref_dec}; + return (ondisk_ref){.cc = cc, + .addr = root_addr, + .type = type, + .arg = cfg, + .inc = btree_blob_ref_inc, + .dec = btree_blob_ref_dec}; } /* @@ -1381,7 +1381,7 @@ btree_split_child_leaf(cache *cc, btree_node *child, leaf_incorporate_spec *spec, lookup_result *old_result, - const message_blob_ref *old_result_blob_ref, + const ondisk_ref *old_result_blob_ref, uint64 *generation) // OUT { btree_node right_child; @@ -1513,7 +1513,7 @@ btree_defragment_or_split_child_leaf(cache *cc, btree_node *child, leaf_incorporate_spec *spec, lookup_result *old_result, - const message_blob_ref *old_result_blob_ref, + const ondisk_ref *old_result_blob_ref, uint64 *generation) // OUT { uint64 nentries = btree_num_entries(child->hdr); @@ -1820,7 +1820,7 @@ btree_insert(cache *cc, // IN key tuple_key, // IN message msg, // IN lookup_result *old_result, // IN/OUT - const message_blob_ref *old_result_blob_ref, // IN + const ondisk_ref *old_result_blob_ref, // IN uint64 *generation) // OUT { platform_status rc; @@ -2405,14 +2405,14 @@ btree_lookup_and_merge_with_blob_ref(cache *cc, // IN page_type type, // IN key target, // IN lookup_result *result, // IN/OUT - const message_blob_ref *blob_ref, // IN + const ondisk_ref *blob_ref, // IN bool32 *local_found) // OUT { btree_node node; key found_key; message local_data; platform_status rc = STATUS_OK; - message_blob_ref default_blob_ref = MESSAGE_BLOB_REF_NULL; + ondisk_ref default_blob_ref = ONDISK_REF_NULL; log_trace_key(target, "btree_lookup"); @@ -2426,7 +2426,7 @@ btree_lookup_and_merge_with_blob_ref(cache *cc, // IN if (local_found != NULL) { *local_found = TRUE; } - if (message_isblob(local_data) && message_blob_ref_is_null(blob_ref)) { + if (message_isblob(local_data) && ondisk_ref_is_null(blob_ref)) { default_blob_ref = btree_blob_ref(cc, cfg, root_addr, type); blob_ref = &default_blob_ref; } @@ -2465,7 +2465,7 @@ btree_lookup_and_merge_async(btree_lookup_async_state *state) platform_status rc = STATUS_OK; if (!key_is_null(state->found_key)) { - message_blob_ref blob_ref = MESSAGE_BLOB_REF_NULL; + ondisk_ref blob_ref = ONDISK_REF_NULL; if (message_isblob(state->msg)) { blob_ref = btree_blob_ref( state->cc, state->cfg, state->root_addr, state->type); @@ -2474,7 +2474,7 @@ btree_lookup_and_merge_async(btree_lookup_async_state *state) state->result, state->found_key, state->msg, - message_blob_ref_is_null(&blob_ref) ? NULL : &blob_ref); + ondisk_ref_is_null(&blob_ref) ? NULL : &blob_ref); btree_node_unget(state->cc, state->cfg, &state->node); } async_return(state, rc); diff --git a/src/btree.h b/src/btree.h index bd8615ca..3be391ad 100644 --- a/src/btree.h +++ b/src/btree.h @@ -188,7 +188,7 @@ btree_insert(cache *cc, // IN key tuple_key, // IN message data, // IN lookup_result *old_result, // IN/OUT - const message_blob_ref *old_result_blob_ref, // IN + const ondisk_ref *old_result_blob_ref, // IN uint64 *generation); // OUT uint64 @@ -239,7 +239,7 @@ btree_lookup_and_merge_with_blob_ref(cache *cc, page_type type, key target, lookup_result *result, - const message_blob_ref *blob_ref, + const ondisk_ref *blob_ref, bool32 *local_found); // clang-format off diff --git a/src/core.c b/src/core.c index d4044733..0cd7cb20 100644 --- a/src/core.c +++ b/src/core.c @@ -724,7 +724,7 @@ core_memtable_lookup(core_handle *spl, cc, cfg, root_addr, type, target, result, NULL); } - message_blob_ref blob_ref; + ondisk_ref blob_ref; memtable_blob_ref_init(&spl->mt_ctxt, root_addr, &blob_ref); return btree_lookup_and_merge_with_blob_ref( cc, cfg, root_addr, type, target, result, &blob_ref, NULL); diff --git a/src/data_internal.c b/src/data_internal.c index 9d939562..82c77d3f 100644 --- a/src/data_internal.c +++ b/src/data_internal.c @@ -4,16 +4,6 @@ #include "data_internal.h" #include "poison.h" -static void -message_blob_ref_dec(message_blob_ref *blob_ref) -{ - if (!message_blob_ref_is_null(blob_ref)) { - message_blob_ref ref = *blob_ref; - *blob_ref = MESSAGE_BLOB_REF_NULL; - ref.dec(&ref); - } -} - message_type merge_accumulator_message_class(const merge_accumulator *ma) { @@ -41,7 +31,7 @@ merge_accumulator_to_slice(const merge_accumulator *ma) void merge_accumulator_release_blob_ref(merge_accumulator *ma) { - message_blob_ref_dec(&ma->blob_ref); + ondisk_ref_dec(&ma->blob_ref); } /* Copy a message into an already-initialized merge_accumulator. */ @@ -54,21 +44,21 @@ merge_accumulator_copy_message(merge_accumulator *ma, message msg) _Bool merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, message msg, - const message_blob_ref *blob_ref) + const ondisk_ref *blob_ref) { platform_status rc = writable_buffer_copy_slice(&ma->data, message_slice(msg)); if (!SUCCESS(rc)) { return FALSE; } - message_blob_ref old_ref = ma->blob_ref; - ma->blob_ref = MESSAGE_BLOB_REF_NULL; + ondisk_ref old_ref = ma->blob_ref; + ma->blob_ref = ONDISK_REF_NULL; - if (message_isblob(msg) && !message_blob_ref_is_null(blob_ref)) { - blob_ref->inc(blob_ref); + if (message_isblob(msg) && !ondisk_ref_is_null(blob_ref)) { + ondisk_ref_inc(blob_ref); ma->blob_ref = *blob_ref; } - message_blob_ref_dec(&old_ref); + ondisk_ref_dec(&old_ref); ma->type = message_class(msg); ma->cc = msg.cc; diff --git a/src/data_internal.h b/src/data_internal.h index 141a810c..cd7e33f3 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -12,6 +12,7 @@ #include "splinterdb/data.h" #include "blob.h" #include "cache.h" +#include "ondisk_ref.h" #include "util.h" /* @@ -525,28 +526,11 @@ copy_tuple_to_ondisk_tuple(ondisk_tuple *odt, key k, message msg) * Merge accumulators are basically the message version of a writable buffer. */ -typedef struct message_blob_ref { - cache *cc; - uint64 ref; - page_type type; - const void *arg; - void (*inc)(const struct message_blob_ref *ref); - void (*dec)(const struct message_blob_ref *ref); -} message_blob_ref; - -#define MESSAGE_BLOB_REF_NULL ((message_blob_ref){0}) - -static inline bool32 -message_blob_ref_is_null(const message_blob_ref *ref) -{ - return ref == NULL || ref->dec == NULL; -} - struct merge_accumulator { message_type type; cache *cc; writable_buffer data; - message_blob_ref blob_ref; + ondisk_ref blob_ref; }; void @@ -555,7 +539,7 @@ merge_accumulator_release_blob_ref(merge_accumulator *ma); _Bool merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, message msg, - const message_blob_ref *blob_ref); + const ondisk_ref *blob_ref); static inline void merge_accumulator_init(merge_accumulator *ma, platform_heap_id heap_id) @@ -563,7 +547,7 @@ merge_accumulator_init(merge_accumulator *ma, platform_heap_id heap_id) writable_buffer_init(&ma->data, heap_id); ma->type = MESSAGE_TYPE_INVALID; ma->cc = NULL; - ma->blob_ref = MESSAGE_BLOB_REF_NULL; + ma->blob_ref = ONDISK_REF_NULL; } static inline void @@ -579,7 +563,7 @@ merge_accumulator_init_with_buffer(merge_accumulator *ma, &ma->data, heap_id, allocation_size, data, logical_size); ma->type = type; ma->cc = NULL; - ma->blob_ref = MESSAGE_BLOB_REF_NULL; + ma->blob_ref = ONDISK_REF_NULL; } static inline void @@ -589,7 +573,7 @@ merge_accumulator_deinit(merge_accumulator *ma) writable_buffer_deinit(&ma->data); ma->type = MESSAGE_TYPE_INVALID; ma->cc = NULL; - ma->blob_ref = MESSAGE_BLOB_REF_NULL; + ma->blob_ref = ONDISK_REF_NULL; } static inline bool32 @@ -643,7 +627,7 @@ merge_accumulator_is_null(const merge_accumulator *ma) bool32 r = writable_buffer_is_null(&ma->data); debug_assert(IMPLIES(r, ma->type == MESSAGE_TYPE_INVALID)); debug_assert(IMPLIES(r, ma->cc == NULL)); - debug_assert(IMPLIES(r, message_blob_ref_is_null(&ma->blob_ref))); + debug_assert(IMPLIES(r, ondisk_ref_is_null(&ma->blob_ref))); return r; } diff --git a/src/lookup_result.h b/src/lookup_result.h index 3791d75c..76d991af 100644 --- a/src/lookup_result.h +++ b/src/lookup_result.h @@ -103,7 +103,7 @@ static inline platform_status lookup_result_update_with_blob_ref(lookup_result *result, key found_key, message msg, - const message_blob_ref *blob_ref) + const ondisk_ref *blob_ref) { if (lookup_result_is_existence(result)) { bool32 success = merge_accumulator_resize(&result->value, 0); diff --git a/src/memtable.c b/src/memtable.c index 66df9484..38b84bcc 100644 --- a/src/memtable.c +++ b/src/memtable.c @@ -195,28 +195,29 @@ get_btree_scratch(memtable_context *ctxt, uint64 tid) } static void -memtable_blob_ref_inc(const message_blob_ref *blob_ref) +memtable_blob_ref_inc(const ondisk_ref *blob_ref) { - btree_inc_ref(blob_ref->cc, blob_ref->arg, blob_ref->ref); + btree_inc_ref(blob_ref->cc, blob_ref->arg, blob_ref->addr); } static void -memtable_blob_ref_dec(const message_blob_ref *blob_ref) +memtable_blob_ref_dec(const ondisk_ref *blob_ref) { - btree_dec_ref(blob_ref->cc, blob_ref->arg, blob_ref->ref, blob_ref->type); + btree_dec_ref( + blob_ref->cc, blob_ref->arg, blob_ref->addr, blob_ref->type); } void memtable_blob_ref_init(memtable_context *ctxt, uint64 root_addr, - message_blob_ref *blob_ref) + ondisk_ref *blob_ref) { - *blob_ref = (message_blob_ref){.cc = ctxt->cc, - .ref = root_addr, - .type = PAGE_TYPE_MEMTABLE, - .arg = ctxt->cfg.btree_cfg, - .inc = memtable_blob_ref_inc, - .dec = memtable_blob_ref_dec}; + *blob_ref = (ondisk_ref){.cc = ctxt->cc, + .addr = root_addr, + .type = PAGE_TYPE_MEMTABLE, + .arg = ctxt->cfg.btree_cfg, + .inc = memtable_blob_ref_inc, + .dec = memtable_blob_ref_dec}; } void @@ -244,7 +245,7 @@ memtable_insert(memtable_context *ctxt, const threadid tid = platform_get_tid(); btree_scratch *scratch = get_btree_scratch(ctxt, tid); - message_blob_ref blob_ref; + ondisk_ref blob_ref; memtable_blob_ref_init(ctxt, mt->root_addr, &blob_ref); platform_status rc = btree_insert(ctxt->cc, ctxt->cfg.btree_cfg, diff --git a/src/memtable.h b/src/memtable.h index 41443819..bfcd9d01 100644 --- a/src/memtable.h +++ b/src/memtable.h @@ -12,6 +12,7 @@ #include "platform_mutex.h" #include "task.h" #include "cache.h" +#include "ondisk_ref.h" #include "btree.h" #include "batch_rwlock.h" @@ -165,7 +166,7 @@ memtable_unblock_lookups(memtable_context *ctxt); void memtable_blob_ref_init(memtable_context *ctxt, uint64 root_addr, - message_blob_ref *blob_ref); + ondisk_ref *blob_ref); void memtable_root_inc_ref(memtable_context *ctxt, uint64 root_addr); diff --git a/src/trunk.c b/src/trunk.c index 18acaca7..32915e96 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -1787,9 +1787,26 @@ trunk_node_inc_all_refs(trunk_context *context, trunk_node *node) } } +static void +trunk_ondisk_node_ref_inc(const ondisk_ref *ref) +{ + trunk_context *context = (trunk_context *)ref->arg; + debug_assert(ref->type == PAGE_TYPE_TRUNK); + trunk_ondisk_node_inc_ref(context, ref->addr); +} + +static void +trunk_ondisk_node_ref_dec(const ondisk_ref *ref) +{ + trunk_context *context = (trunk_context *)ref->arg; + debug_assert(ref->type == PAGE_TYPE_TRUNK); + trunk_ondisk_node_dec_ref(context, ref->addr); +} + static trunk_ondisk_node_ref * -trunk_ondisk_node_ref_create(platform_heap_id hid, key k, uint64 child_addr) +trunk_ondisk_node_ref_create(trunk_context *context, key k, uint64 child_addr) { + platform_heap_id hid = context->hid; trunk_ondisk_node_ref *result = TYPED_FLEXIBLE_STRUCT_ZALLOC( hid, result, key.bytes, ondisk_key_required_data_capacity(k)); if (result == NULL) { @@ -1797,7 +1814,12 @@ trunk_ondisk_node_ref_create(platform_heap_id hid, key k, uint64 child_addr) "%s():%d: TYPED_FLEXIBLE_STRUCT_ZALLOC() failed", __func__, __LINE__); return NULL; } - result->addr = child_addr; + result->ref = (ondisk_ref){.cc = context->cc, + .addr = child_addr, + .type = PAGE_TYPE_TRUNK, + .arg = context, + .inc = trunk_ondisk_node_ref_inc, + .dec = trunk_ondisk_node_ref_dec}; copy_key_to_ondisk_key(&result->key, k); return result; } @@ -1807,9 +1829,9 @@ trunk_ondisk_node_ref_destroy(trunk_ondisk_node_ref *odnref, trunk_context *context, platform_heap_id hid) { - if (odnref->addr != 0) { - trunk_ondisk_node_dec_ref(context, odnref->addr); - } + debug_assert(ondisk_ref_is_null(&odnref->ref) + || odnref->ref.arg == context); + ondisk_ref_dec(&odnref->ref); platform_free(hid, odnref); } @@ -1819,7 +1841,7 @@ trunk_pivot_create_from_ondisk_node_ref(trunk_ondisk_node_ref *odnref, { return trunk_pivot_create(hid, ondisk_key_to_key(&odnref->key), - odnref->addr, + odnref->ref.addr, 0, TRUNK_STATS_ZERO, TRUNK_STATS_ZERO); @@ -2118,7 +2140,7 @@ trunk_node_serialize(trunk_context *context, trunk_node *node) trunk_node_inc_all_refs(context, node); result = trunk_ondisk_node_ref_create( - context->hid, trunk_node_pivot_key(node, 0), header_addr); + context, trunk_node_pivot_key(node, 0), header_addr); if (result == NULL) { platform_error_log( "%s():%d: ondisk_node_ref_create() failed", __func__, __LINE__); @@ -2391,7 +2413,7 @@ trunk_init_root_handle(trunk_context *context, trunk_ondisk_node_handle *handle) rc = STATUS_OK; } else { rc = trunk_ondisk_node_handle_init( - handle, context->cc, context->root->addr); + handle, context->cc, context->root->ref.addr); } trunk_read_end(context); return rc; @@ -2516,7 +2538,8 @@ trunk_apply_changes_internal(trunk_context *context, rc = vector_append(&new_child_refs, new_child_ref); platform_assert_status_ok(rc); - trunk_pivot_set_child_addr(child_pivot, new_child_ref->addr); + trunk_pivot_set_child_addr(child_pivot, + new_child_ref->ref.addr); } } } @@ -2544,7 +2567,7 @@ trunk_apply_changes(trunk_context *context, void *arg) { trunk_ondisk_node_ref *new_root_ref = trunk_apply_changes_internal( - context, context->root->addr, minkey, maxkey, height, func, arg); + context, context->root->ref.addr, minkey, maxkey, height, func, arg); if (new_root_ref != NULL) { trunk_set_root(context, new_root_ref); } else { @@ -4904,7 +4927,7 @@ trunk_incorporate_prepare(trunk_context *context, uint64 branch_addr) // Read the old root. trunk_node root; if (context->root != NULL) { - rc = trunk_node_deserialize(context, context->root->addr, &root); + rc = trunk_node_deserialize(context, context->root->ref.addr, &root); if (!SUCCESS(rc)) { platform_error_log("trunk_incorporate: node_deserialize failed: %d\n", rc.r); @@ -5828,22 +5851,23 @@ trunk_context_init(trunk_context *context, { memset(context, 0, sizeof(trunk_context)); + context->cfg = cfg; + context->hid = hid; + context->cc = cc; + context->al = al; + context->ts = ts; + if (root_addr != 0) { - context->root = - trunk_ondisk_node_ref_create(hid, NEGATIVE_INFINITY_KEY, root_addr); + context->root = trunk_ondisk_node_ref_create( + context, NEGATIVE_INFINITY_KEY, root_addr); if (context->root == NULL) { platform_error_log("trunk_node_context_init: " "ondisk_node_ref_create failed\n"); return STATUS_NO_MEMORY; } - allocator_inc_ref(al, root_addr); + ondisk_ref_inc(&context->root->ref); } - context->cfg = cfg; - context->hid = hid; - context->cc = cc; - context->al = al; - context->ts = ts; context->stats = NULL; if (cfg->use_stats) { context->stats = TYPED_ARRAY_MALLOC(hid, context->stats, MAX_THREADS); @@ -6040,7 +6064,7 @@ trunk_print_insertion_stats(platform_log_handle *log_handle, // Get the height of the tree trunk_node root; platform_status rc = - trunk_node_deserialize(context, context->root->addr, &root); + trunk_node_deserialize(context, context->root->ref.addr, &root); if (!SUCCESS(rc)) { platform_error_log("trunk_node_print_insertion_stats: " "node_deserialize failed: %d\n", diff --git a/src/trunk.h b/src/trunk.h index f859f3ab..1fa3e4fe 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -10,6 +10,7 @@ #include "vector.h" #include "cache.h" #include "allocator.h" +#include "ondisk_ref.h" #include "task.h" #include "btree.h" #include "routing_filter.h" @@ -113,11 +114,9 @@ typedef struct trunk_pivot_state_map { trunk_pivot_state *buckets[TRUNK_PIVOT_STATE_MAP_BUCKETS]; } trunk_pivot_state_map; -/* An ondisk_node_ref is a pivot that has an associated bump in the refcount of - * the child, so destroying an ondisk_node_ref will perform an - * ondisk_node_dec_ref. */ +/* An ondisk_node_ref is a pivot key plus an owned in-memory ref to the child. */ typedef struct trunk_ondisk_node_ref { - uint64 addr; + ondisk_ref ref; ondisk_key key; } trunk_ondisk_node_ref; From 4464c67de6754b8fbc55dfcdc79b2264bcd0f4ad Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 28 May 2026 21:34:49 -0700 Subject: [PATCH 08/13] add generalized ondisk_ref Signed-off-by: Rob Johnson --- src/ondisk_ref.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 src/ondisk_ref.h diff --git a/src/ondisk_ref.h b/src/ondisk_ref.h new file mode 100644 index 00000000..5f8855ce --- /dev/null +++ b/src/ondisk_ref.h @@ -0,0 +1,54 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +/* + * ondisk_ref.h -- + * + * In-memory ownership references for allocator-backed on-disk objects. + */ + +#pragma once + +#include "allocator.h" +#include "cache.h" + +/* + * ondisk_ref is an in-memory owner for an allocator-backed on-disk object. + * Callers provide object-specific inc/dec callbacks so the ref can protect + * roots of compound structures such as trunk nodes or btree mini-allocators. + */ +typedef struct ondisk_ref { + cache *cc; + uint64 addr; + page_type type; + const void *arg; + void (*inc)(const struct ondisk_ref *ref); + void (*dec)(const struct ondisk_ref *ref); +} ondisk_ref; + +#define ONDISK_REF_NULL ((ondisk_ref){0}) + +static inline bool32 +ondisk_ref_is_null(const ondisk_ref *ref) +{ + return ref == NULL || ref->dec == NULL; +} + +static inline void +ondisk_ref_inc(const ondisk_ref *ref) +{ + if (!ondisk_ref_is_null(ref)) { + debug_assert(ref->inc != NULL); + ref->inc(ref); + } +} + +static inline void +ondisk_ref_dec(ondisk_ref *ref) +{ + if (!ondisk_ref_is_null(ref)) { + ondisk_ref ref_copy = *ref; + *ref = ONDISK_REF_NULL; + ref_copy.dec(&ref_copy); + } +} From 208a97362ddfc953b6dd37a862874673fbe2cc02 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 29 May 2026 13:11:32 -0700 Subject: [PATCH 09/13] make ondisk_refs harder to misuse Signed-off-by: Rob Johnson --- Makefile | 1 + src/btree.c | 175 ++++++++++++++--------------- src/btree.h | 38 +++---- src/btree_private.h | 4 +- src/core.c | 4 +- src/data_blob_build.c | 10 +- src/data_internal.c | 17 +-- src/data_internal.h | 58 +++++----- src/lookup_result.h | 8 +- src/memtable.c | 25 +++-- src/memtable.h | 2 +- src/ondisk_ref.c | 147 ++++++++++++++++++++++++ src/ondisk_ref.h | 73 +++++++----- src/shard_log.c | 6 +- src/splinterdb.c | 2 +- src/trunk.c | 43 ++++--- src/trunk.h | 3 +- tests/unit/splinterdb_quick_test.c | 6 +- 18 files changed, 399 insertions(+), 223 deletions(-) create mode 100644 src/ondisk_ref.c diff --git a/Makefile b/Makefile index b8dde00f..dd839b10 100644 --- a/Makefile +++ b/Makefile @@ -415,6 +415,7 @@ BTREE_SYS = $(OBJDIR)/$(SRCDIR)/btree.o \ $(OBJDIR)/$(SRCDIR)/data_blob_build.o \ $(OBJDIR)/$(SRCDIR)/data_internal.o \ $(OBJDIR)/$(SRCDIR)/mini_allocator.o \ + $(OBJDIR)/$(SRCDIR)/ondisk_ref.o \ $(CLOCKCACHE_SYS) ################################################################# diff --git a/src/btree.c b/src/btree.c index 8c62f08c..1f0016e3 100644 --- a/src/btree.c +++ b/src/btree.c @@ -169,7 +169,7 @@ leaf_entry_key_size(const leaf_entry *entry) static inline uint64 leaf_entry_message_size(const leaf_entry *entry) { - if (!ondisk_tuple_message_isblob(entry)) { + if (!ondisk_tuple_message_is_blob(entry)) { return entry->message_length; } slice sblob = slice_create(entry->message_length, @@ -413,7 +413,7 @@ btree_copy_leaf_entry(const btree_config *cfg, { key entry_key = ondisk_tuple_key(entry); message entry_msg = ondisk_tuple_message( - ondisk_tuple_message_isblob(entry) ? (cache *)1 : NULL, entry); + ondisk_tuple_message_is_blob(entry) ? (cache *)1 : NULL, entry); return btree_set_leaf_entry(cfg, hdr, k, entry_key, entry_msg); } @@ -591,9 +591,9 @@ static message spec_message(const leaf_incorporate_spec *spec) { if (spec->use_msg) { - return spec->msg.msg; + return spec->msg.new_message; } else { - return merge_accumulator_to_message(&spec->msg.ma); + return merge_accumulator_to_message(&spec->msg.merged_message); } } @@ -610,8 +610,10 @@ btree_record_old_result(const btree_config *cfg, } leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); - return lookup_result_update_with_blob_ref( - old_result, leaf_entry_key(entry), leaf_entry_message(cc, entry), blob_ref); + return lookup_result_update_with_blob_ref(old_result, + leaf_entry_key(entry), + leaf_entry_message(cc, entry), + blob_ref); } platform_status @@ -632,43 +634,47 @@ btree_create_leaf_incorporate_spec(const btree_config *cfg, if (!found) { spec->idx++; - if (message_isblob(msg)) { + if (message_is_blob(msg)) { spec->use_msg = FALSE; - merge_accumulator_init(&spec->msg.ma, heap_id); - rc = message_clone(&btree_blob_cfg, cc, mini, msg, &spec->msg.ma); + merge_accumulator_init(&spec->msg.merged_message, heap_id); + rc = message_clone( + &btree_blob_cfg, cc, mini, msg, &spec->msg.merged_message); } else if (MAX_INLINE_MESSAGE_SIZE(btree_page_size(cfg)) < message_length(msg)) { spec->use_msg = FALSE; - merge_accumulator_init(&spec->msg.ma, heap_id); - rc = message_to_blob(&btree_blob_cfg, cc, mini, msg, &spec->msg.ma); + merge_accumulator_init(&spec->msg.merged_message, heap_id); + rc = message_to_blob( + &btree_blob_cfg, cc, mini, msg, &spec->msg.merged_message); } else { - spec->use_msg = TRUE; - spec->msg.msg = msg; + spec->use_msg = TRUE; + spec->msg.new_message = msg; } } else { leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); message oldmessage = leaf_entry_message(cc, entry); bool32 success; spec->use_msg = FALSE; - success = - merge_accumulator_init_from_message(&spec->msg.ma, heap_id, msg); + success = merge_accumulator_init_from_message( + &spec->msg.merged_message, heap_id, msg); if (!success) { return STATUS_NO_MEMORY; } - if (btree_merge_tuples(cfg, tuple_key, oldmessage, &spec->msg.ma)) { - merge_accumulator_deinit(&spec->msg.ma); + if (btree_merge_tuples( + cfg, tuple_key, oldmessage, &spec->msg.merged_message)) + { + merge_accumulator_deinit(&spec->msg.merged_message); return STATUS_NO_MEMORY; } else if (MAX_INLINE_MESSAGE_SIZE(btree_page_size(cfg)) - < merge_accumulator_length(&spec->msg.ma)) + < merge_accumulator_length(&spec->msg.merged_message)) { rc = merge_accumulator_convert_to_blob( - &btree_blob_cfg, cc, mini, &spec->msg.ma); + &btree_blob_cfg, cc, mini, &spec->msg.merged_message); } } if (!SUCCESS(rc) && !spec->use_msg) { - merge_accumulator_deinit(&spec->msg.ma); + merge_accumulator_deinit(&spec->msg.merged_message); } return rc; } @@ -677,7 +683,7 @@ void destroy_leaf_incorporate_spec(leaf_incorporate_spec *spec) { if (!spec->use_msg) { - merge_accumulator_deinit(&spec->msg.ma); + merge_accumulator_deinit(&spec->msg.merged_message); } } @@ -1316,18 +1322,15 @@ btree_blob_ref_dec(const ondisk_ref *ref) btree_dec_ref(ref->cc, ref->arg, ref->addr, ref->type); } -static ondisk_ref -btree_blob_ref(cache *cc, - const btree_config *cfg, - uint64 root_addr, - page_type type) +static void +btree_blob_ref_init(ondisk_ref *ref, + cache *cc, + const btree_config *cfg, + uint64 root_addr, + page_type type) { - return (ondisk_ref){.cc = cc, - .addr = root_addr, - .type = type, - .arg = cfg, - .inc = btree_blob_ref_inc, - .dec = btree_blob_ref_dec}; + ondisk_ref_init( + ref, cc, root_addr, type, cfg, btree_blob_ref_inc, btree_blob_ref_dec); } /* @@ -1381,7 +1384,7 @@ btree_split_child_leaf(cache *cc, btree_node *child, leaf_incorporate_spec *spec, lookup_result *old_result, - const ondisk_ref *old_result_blob_ref, + const ondisk_ref *old_result_blob_ref, uint64 *generation) // OUT { btree_node right_child; @@ -1440,9 +1443,8 @@ btree_split_child_leaf(cache *cc, btree_node_lock(cc, cfg, child); /* p: write, c: write, rc: write, cn: write if exists */ - platform_status rc = - btree_record_old_result( - cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); + platform_status rc = btree_record_old_result( + cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); if (!SUCCESS(rc)) { if (child_next.addr != 0) { btree_node_full_unlock(cc, cfg, &child_next); @@ -1504,17 +1506,17 @@ btree_split_child_leaf(cache *cc, * - insertion is complete */ static inline platform_status -btree_defragment_or_split_child_leaf(cache *cc, - const btree_config *cfg, - mini_allocator *mini, - btree_scratch *scratch, - btree_node *parent, - uint64 index_of_child_in_parent, - btree_node *child, - leaf_incorporate_spec *spec, - lookup_result *old_result, - const ondisk_ref *old_result_blob_ref, - uint64 *generation) // OUT +btree_defragment_or_split_child_leaf(cache *cc, + const btree_config *cfg, + mini_allocator *mini, + btree_scratch *scratch, + btree_node *parent, + uint64 index_of_child_in_parent, + btree_node *child, + leaf_incorporate_spec *spec, + lookup_result *old_result, + const ondisk_ref *old_result_blob_ref, + uint64 *generation) // OUT { uint64 nentries = btree_num_entries(child->hdr); uint64 live_bytes = 0; @@ -1537,9 +1539,8 @@ btree_defragment_or_split_child_leaf(cache *cc, btree_node_unclaim(cc, cfg, parent); btree_node_unget(cc, cfg, parent); btree_node_lock(cc, cfg, child); - platform_status rc = - btree_record_old_result( - cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); + platform_status rc = btree_record_old_result( + cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, child); return rc; @@ -1811,17 +1812,17 @@ btree_grow_root(cache *cc, // IN *----------------------------------------------------------------------------- */ platform_status -btree_insert(cache *cc, // IN - const btree_config *cfg, // IN - platform_heap_id heap_id, // IN - btree_scratch *scratch, // IN - uint64 root_addr, // IN - mini_allocator *mini, // IN - key tuple_key, // IN - message msg, // IN - lookup_result *old_result, // IN/OUT - const ondisk_ref *old_result_blob_ref, // IN - uint64 *generation) // OUT +btree_insert(cache *cc, // IN + const btree_config *cfg, // IN + platform_heap_id heap_id, // IN + btree_scratch *scratch, // IN + uint64 root_addr, // IN + mini_allocator *mini, // IN + key tuple_key, // IN + message msg, // IN + lookup_result *old_result, // IN/OUT + const ondisk_ref *old_result_blob_ref, // IN + uint64 *generation) // OUT { platform_status rc; leaf_incorporate_spec spec; @@ -1861,12 +1862,8 @@ btree_insert(cache *cc, // IN } btree_node_lock(cc, cfg, &root_node); if (btree_can_perform_leaf_incorporate_spec(cfg, root_node.hdr, &spec)) { - rc = btree_record_old_result(cfg, - cc, - root_node.hdr, - &spec, - old_result, - old_result_blob_ref); + rc = btree_record_old_result( + cfg, cc, root_node.hdr, &spec, old_result, old_result_blob_ref); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &root_node); destroy_leaf_incorporate_spec(&spec); @@ -2079,12 +2076,8 @@ btree_insert(cache *cc, // IN goto start_over; } btree_node_lock(cc, cfg, &child_node); - rc = btree_record_old_result(cfg, - cc, - child_node.hdr, - &spec, - old_result, - old_result_blob_ref); + rc = btree_record_old_result( + cfg, cc, child_node.hdr, &spec, old_result, old_result_blob_ref); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &child_node); destroy_leaf_incorporate_spec(&spec); @@ -2399,20 +2392,20 @@ btree_lookup_and_merge(cache *cc, // IN } platform_status -btree_lookup_and_merge_with_blob_ref(cache *cc, // IN - const btree_config *cfg, // IN - uint64 root_addr, // IN - page_type type, // IN - key target, // IN - lookup_result *result, // IN/OUT - const ondisk_ref *blob_ref, // IN - bool32 *local_found) // OUT +btree_lookup_and_merge_with_blob_ref(cache *cc, // IN + const btree_config *cfg, // IN + uint64 root_addr, // IN + page_type type, // IN + key target, // IN + lookup_result *result, // IN/OUT + const ondisk_ref *blob_ref, // IN + bool32 *local_found) // OUT { btree_node node; key found_key; message local_data; - platform_status rc = STATUS_OK; - ondisk_ref default_blob_ref = ONDISK_REF_NULL; + platform_status rc = STATUS_OK; + ondisk_ref default_blob_ref = ONDISK_REF_NULL; log_trace_key(target, "btree_lookup"); @@ -2426,14 +2419,15 @@ btree_lookup_and_merge_with_blob_ref(cache *cc, // IN if (local_found != NULL) { *local_found = TRUE; } - if (message_isblob(local_data) && ondisk_ref_is_null(blob_ref)) { - default_blob_ref = btree_blob_ref(cc, cfg, root_addr, type); - blob_ref = &default_blob_ref; + if (message_is_blob(local_data) && ondisk_ref_is_null(blob_ref)) { + btree_blob_ref_init(&default_blob_ref, cc, cfg, root_addr, type); + blob_ref = &default_blob_ref; } rc = lookup_result_update_with_blob_ref( result, found_key, local_data, blob_ref); btree_node_unget(cc, cfg, &node); } + ondisk_ref_deinit(&default_blob_ref); return rc; } @@ -2466,15 +2460,16 @@ btree_lookup_and_merge_async(btree_lookup_async_state *state) platform_status rc = STATUS_OK; if (!key_is_null(state->found_key)) { ondisk_ref blob_ref = ONDISK_REF_NULL; - if (message_isblob(state->msg)) { - blob_ref = btree_blob_ref( - state->cc, state->cfg, state->root_addr, state->type); + if (message_is_blob(state->msg)) { + btree_blob_ref_init( + &blob_ref, state->cc, state->cfg, state->root_addr, state->type); } rc = lookup_result_update_with_blob_ref( state->result, state->found_key, state->msg, ondisk_ref_is_null(&blob_ref) ? NULL : &blob_ref); + ondisk_ref_deinit(&blob_ref); btree_node_unget(state->cc, state->cfg, &state->node); } async_return(state, rc); @@ -3639,7 +3634,7 @@ btree_pack_loop(btree_pack_req *req, // IN/OUT return STATUS_INVALID_STATE; } - if (message_isblob(msg)) { + if (message_is_blob(msg)) { rc = message_clone(&btree_blob_cfg, req->cc, &req->mini, msg, &req->ma); if (!SUCCESS(rc)) { return rc; diff --git a/src/btree.h b/src/btree.h index 3be391ad..782c8645 100644 --- a/src/btree.h +++ b/src/btree.h @@ -179,17 +179,17 @@ typedef struct btree_pack_req { } btree_pack_req; platform_status -btree_insert(cache *cc, // IN - const btree_config *cfg, // IN - platform_heap_id heap_id, // IN - btree_scratch *scratch, // IN - uint64 root_addr, // IN - mini_allocator *mini, // IN - key tuple_key, // IN - message data, // IN - lookup_result *old_result, // IN/OUT - const ondisk_ref *old_result_blob_ref, // IN - uint64 *generation); // OUT +btree_insert(cache *cc, // IN + const btree_config *cfg, // IN + platform_heap_id heap_id, // IN + btree_scratch *scratch, // IN + uint64 root_addr, // IN + mini_allocator *mini, // IN + key tuple_key, // IN + message data, // IN + lookup_result *old_result, // IN/OUT + const ondisk_ref *old_result_blob_ref, // IN + uint64 *generation); // OUT uint64 btree_create(cache *cc, @@ -233,14 +233,14 @@ btree_lookup_and_merge(cache *cc, bool32 *local_found); platform_status -btree_lookup_and_merge_with_blob_ref(cache *cc, - const btree_config *cfg, - uint64 root_addr, - page_type type, - key target, - lookup_result *result, - const ondisk_ref *blob_ref, - bool32 *local_found); +btree_lookup_and_merge_with_blob_ref(cache *cc, + const btree_config *cfg, + uint64 root_addr, + page_type type, + key target, + lookup_result *result, + const ondisk_ref *blob_ref, + bool32 *local_found); // clang-format off DEFINE_ASYNC_STATE(btree_lookup_async_state, 3, diff --git a/src/btree_private.h b/src/btree_private.h index c3ff81ac..8cdf3ff4 100644 --- a/src/btree_private.h +++ b/src/btree_private.h @@ -70,8 +70,8 @@ typedef struct leaf_incorporate_spec { bool32 use_msg; union { /* use_msg is the tag on this union. */ - message msg; - merge_accumulator ma; + message new_message; + merge_accumulator merged_message; } msg; } leaf_incorporate_spec; diff --git a/src/core.c b/src/core.c index 0cd7cb20..d5a8a51a 100644 --- a/src/core.c +++ b/src/core.c @@ -726,8 +726,10 @@ core_memtable_lookup(core_handle *spl, ondisk_ref blob_ref; memtable_blob_ref_init(&spl->mt_ctxt, root_addr, &blob_ref); - return btree_lookup_and_merge_with_blob_ref( + platform_status rc = btree_lookup_and_merge_with_blob_ref( cc, cfg, root_addr, type, target, result, &blob_ref, NULL); + ondisk_ref_deinit(&blob_ref); + return rc; } static platform_status diff --git a/src/data_blob_build.c b/src/data_blob_build.c index 6cd1b64b..90f62944 100644 --- a/src/data_blob_build.c +++ b/src/data_blob_build.c @@ -17,10 +17,11 @@ message_to_blob(const blob_build_config *cfg, message msg, merge_accumulator *ma) { - platform_assert(!message_isblob(msg)); + platform_assert(!message_is_blob(msg)); ma->type = message_class(msg); ma->cc = cc; - platform_status rc = blob_build(cfg, cc, mini, message_slice(msg), &ma->data); + platform_status rc = + blob_build(cfg, cc, mini, message_slice(msg), &ma->data); if (SUCCESS(rc)) { merge_accumulator_release_blob_ref(ma); } @@ -34,10 +35,11 @@ message_clone(const blob_build_config *cfg, message msg, merge_accumulator *result) { - platform_assert(message_isblob(msg)); + platform_assert(message_is_blob(msg)); result->type = message_class(msg); result->cc = cc; - platform_status rc = blob_clone(cfg, cc, mini, message_slice(msg), &result->data); + platform_status rc = + blob_clone(cfg, cc, mini, message_slice(msg), &result->data); if (SUCCESS(rc)) { merge_accumulator_release_blob_ref(result); } diff --git a/src/data_internal.c b/src/data_internal.c index 82c77d3f..7372dc51 100644 --- a/src/data_internal.c +++ b/src/data_internal.c @@ -31,7 +31,7 @@ merge_accumulator_to_slice(const merge_accumulator *ma) void merge_accumulator_release_blob_ref(merge_accumulator *ma) { - ondisk_ref_dec(&ma->blob_ref); + ondisk_ref_deinit(&ma->blob_ref); } /* Copy a message into an already-initialized merge_accumulator. */ @@ -42,23 +42,16 @@ merge_accumulator_copy_message(merge_accumulator *ma, message msg) } _Bool -merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, - message msg, - const ondisk_ref *blob_ref) +merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, + message msg, + const ondisk_ref *blob_ref) { platform_status rc = writable_buffer_copy_slice(&ma->data, message_slice(msg)); if (!SUCCESS(rc)) { return FALSE; } - ondisk_ref old_ref = ma->blob_ref; - ma->blob_ref = ONDISK_REF_NULL; - - if (message_isblob(msg) && !ondisk_ref_is_null(blob_ref)) { - ondisk_ref_inc(blob_ref); - ma->blob_ref = *blob_ref; - } - ondisk_ref_dec(&old_ref); + ondisk_ref_replace(&ma->blob_ref, message_is_blob(msg) ? blob_ref : NULL); ma->type = message_class(msg); ma->cc = msg.cc; diff --git a/src/data_internal.h b/src/data_internal.h index cd7e33f3..33cfaf28 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -355,6 +355,12 @@ message_create(message_type type, cache *cc, slice data) return (message){.type = type, .cc = cc, .data = data}; } +static inline bool32 +message_is_blob(message msg) +{ + return message_isblob(msg); +} + static inline bool32 message_is_null(message msg) { @@ -379,7 +385,7 @@ message_is_invalid_user_type(message msg) static inline uint64 message_materialized_length(message msg) { - if (message_isblob(msg)) { + if (message_is_blob(msg)) { return blob_length(message_slice(msg)); } else { return message_length(msg); @@ -395,7 +401,7 @@ message_materialize_if_needed(platform_heap_id heap_id, writable_buffer *tmp, message *result) { - if (message_isblob(msg)) { + if (message_is_blob(msg)) { writable_buffer_init(tmp, heap_id); platform_status rc = blob_materialize_full(msg.cc, message_slice(msg), tmp); @@ -414,7 +420,7 @@ message_materialize_if_needed(platform_heap_id heap_id, static inline void message_dematerialize_if_needed(message msg, writable_buffer *tmp) { - if (message_isblob(msg)) { + if (message_is_blob(msg)) { writable_buffer_deinit(tmp); } } @@ -451,7 +457,7 @@ _Static_assert(MESSAGE_TYPE_MAX_VALID_USER_TYPE < (1ULL << ONDISK_MESSAGE_TYPE_BITS), "ONDISK_MESSAGE_TYPE_BITS is too small"); #define ONDISK_MESSAGE_TYPE_MASK ((0x1 << ONDISK_MESSAGE_TYPE_BITS) - 1) -#define ONDISK_MESSAGE_ISBLOB (0x4) +#define ONDISK_MESSAGE_IS_BLOB (0x4) /* Size of the data part of an existing ondisk_tuple */ static inline uint64 @@ -484,9 +490,9 @@ ondisk_tuple_message_class(const ondisk_tuple *odt) } static inline bool32 -ondisk_tuple_message_isblob(const ondisk_tuple *odt) +ondisk_tuple_message_is_blob(const ondisk_tuple *odt) { - return (odt->flags & ONDISK_MESSAGE_ISBLOB) != 0; + return (odt->flags & ONDISK_MESSAGE_IS_BLOB) != 0; } static inline message @@ -495,7 +501,7 @@ ondisk_tuple_message(cache *cc, const ondisk_tuple *odt) slice data = slice_create(odt->message_length, odt->key_and_message + odt->key_length); return message_create(ondisk_tuple_message_class(odt), - ondisk_tuple_message_isblob(odt) ? cc : NULL, + ondisk_tuple_message_is_blob(odt) ? cc : NULL, data); } @@ -504,7 +510,7 @@ copy_message_to_ondisk_tuple(ondisk_tuple *odt, message msg) { odt->message_length = message_length(msg); odt->flags = message_class(msg); - odt->flags |= message_isblob(msg) ? ONDISK_MESSAGE_ISBLOB : 0; + odt->flags |= message_is_blob(msg) ? ONDISK_MESSAGE_IS_BLOB : 0; memcpy(odt->key_and_message + odt->key_length, message_data(msg), message_length(msg)); @@ -527,27 +533,27 @@ copy_tuple_to_ondisk_tuple(ondisk_tuple *odt, key k, message msg) */ struct merge_accumulator { - message_type type; - cache *cc; - writable_buffer data; - ondisk_ref blob_ref; + message_type type; + cache *cc; + writable_buffer data; + ondisk_ref blob_ref; }; void merge_accumulator_release_blob_ref(merge_accumulator *ma); _Bool -merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, - message msg, - const ondisk_ref *blob_ref); +merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, + message msg, + const ondisk_ref *blob_ref); static inline void merge_accumulator_init(merge_accumulator *ma, platform_heap_id heap_id) { writable_buffer_init(&ma->data, heap_id); - ma->type = MESSAGE_TYPE_INVALID; - ma->cc = NULL; - ma->blob_ref = ONDISK_REF_NULL; + ma->type = MESSAGE_TYPE_INVALID; + ma->cc = NULL; + ondisk_ref_init_null(&ma->blob_ref); } static inline void @@ -561,9 +567,9 @@ merge_accumulator_init_with_buffer(merge_accumulator *ma, { writable_buffer_init_with_buffer( &ma->data, heap_id, allocation_size, data, logical_size); - ma->type = type; - ma->cc = NULL; - ma->blob_ref = ONDISK_REF_NULL; + ma->type = type; + ma->cc = NULL; + ondisk_ref_init_null(&ma->blob_ref); } static inline void @@ -571,9 +577,9 @@ merge_accumulator_deinit(merge_accumulator *ma) { merge_accumulator_release_blob_ref(ma); writable_buffer_deinit(&ma->data); - ma->type = MESSAGE_TYPE_INVALID; - ma->cc = NULL; - ma->blob_ref = ONDISK_REF_NULL; + ma->type = MESSAGE_TYPE_INVALID; + ma->cc = NULL; + ondisk_ref_init_null(&ma->blob_ref); } static inline bool32 @@ -634,7 +640,7 @@ merge_accumulator_is_null(const merge_accumulator *ma) static inline platform_status message_materialize(message msg, merge_accumulator *tmp) { - debug_assert(message_isblob(msg)); + debug_assert(message_is_blob(msg)); tmp->type = message_class(msg); tmp->cc = NULL; return blob_materialize_full(msg.cc, message_slice(msg), &tmp->data); @@ -659,7 +665,7 @@ merge_accumulator_ensure_materialized(merge_accumulator *ma) writable_buffer old_data = ma->data; ma->data = materialized; merge_accumulator_release_blob_ref(ma); - ma->cc = NULL; + ma->cc = NULL; writable_buffer_deinit(&old_data); return STATUS_OK; } diff --git a/src/lookup_result.h b/src/lookup_result.h index 76d991af..d97b376e 100644 --- a/src/lookup_result.h +++ b/src/lookup_result.h @@ -100,10 +100,10 @@ lookup_result_found(const lookup_result *result) } static inline platform_status -lookup_result_update_with_blob_ref(lookup_result *result, - key found_key, - message msg, - const ondisk_ref *blob_ref) +lookup_result_update_with_blob_ref(lookup_result *result, + key found_key, + message msg, + const ondisk_ref *blob_ref) { if (lookup_result_is_existence(result)) { bool32 success = merge_accumulator_resize(&result->value, 0); diff --git a/src/memtable.c b/src/memtable.c index 38b84bcc..dcb38631 100644 --- a/src/memtable.c +++ b/src/memtable.c @@ -203,21 +203,21 @@ memtable_blob_ref_inc(const ondisk_ref *blob_ref) static void memtable_blob_ref_dec(const ondisk_ref *blob_ref) { - btree_dec_ref( - blob_ref->cc, blob_ref->arg, blob_ref->addr, blob_ref->type); + btree_dec_ref(blob_ref->cc, blob_ref->arg, blob_ref->addr, blob_ref->type); } void memtable_blob_ref_init(memtable_context *ctxt, - uint64 root_addr, + uint64 root_addr, ondisk_ref *blob_ref) { - *blob_ref = (ondisk_ref){.cc = ctxt->cc, - .addr = root_addr, - .type = PAGE_TYPE_MEMTABLE, - .arg = ctxt->cfg.btree_cfg, - .inc = memtable_blob_ref_inc, - .dec = memtable_blob_ref_dec}; + ondisk_ref_init(blob_ref, + ctxt->cc, + root_addr, + PAGE_TYPE_MEMTABLE, + ctxt->cfg.btree_cfg, + memtable_blob_ref_inc, + memtable_blob_ref_dec); } void @@ -244,10 +244,10 @@ memtable_insert(memtable_context *ctxt, { const threadid tid = platform_get_tid(); - btree_scratch *scratch = get_btree_scratch(ctxt, tid); - ondisk_ref blob_ref; + btree_scratch *scratch = get_btree_scratch(ctxt, tid); + ondisk_ref blob_ref; memtable_blob_ref_init(ctxt, mt->root_addr, &blob_ref); - platform_status rc = btree_insert(ctxt->cc, + platform_status rc = btree_insert(ctxt->cc, ctxt->cfg.btree_cfg, heap_id, scratch, @@ -258,6 +258,7 @@ memtable_insert(memtable_context *ctxt, old_result, &blob_ref, leaf_generation); + ondisk_ref_deinit(&blob_ref); if (!SUCCESS(rc)) { return rc; } diff --git a/src/memtable.h b/src/memtable.h index bfcd9d01..d9540517 100644 --- a/src/memtable.h +++ b/src/memtable.h @@ -165,7 +165,7 @@ memtable_unblock_lookups(memtable_context *ctxt); void memtable_blob_ref_init(memtable_context *ctxt, - uint64 root_addr, + uint64 root_addr, ondisk_ref *blob_ref); void diff --git a/src/ondisk_ref.c b/src/ondisk_ref.c new file mode 100644 index 00000000..49227760 --- /dev/null +++ b/src/ondisk_ref.c @@ -0,0 +1,147 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +#include "ondisk_ref.h" +#include "poison.h" + +bool32 +ondisk_ref_is_null(const ondisk_ref *ref) +{ + return ref == NULL || ref->dec == NULL; +} + +static void +ondisk_ref_assert_valid(const ondisk_ref *ref) +{ + debug_assert(!ondisk_ref_is_null(ref)); + debug_assert(ref->self == ref); + debug_assert(ref->inc != NULL); + debug_assert(ref->dec != NULL); +} + +static void +ondisk_ref_clear(ondisk_ref *ref) +{ + ref->cc = NULL; + ref->addr = 0; + ref->type = PAGE_TYPE_INVALID; + ref->arg = NULL; + ref->self = NULL; + ref->inc = NULL; + ref->dec = NULL; +} + +static void +ondisk_ref_init_internal(ondisk_ref *ref, + cache *cc, + uint64 addr, + page_type type, + const void *arg, + void (*inc)(const ondisk_ref *ref), + void (*dec)(const ondisk_ref *ref)) +{ + ref->cc = cc; + ref->addr = addr; + ref->type = type; + ref->arg = arg; + ref->self = ref; + ref->inc = inc; + ref->dec = dec; +} + +static void +ondisk_ref_inc(const ondisk_ref *ref) +{ + if (!ondisk_ref_is_null(ref)) { + ondisk_ref_assert_valid(ref); + ref->inc(ref); + } +} + +static void +ondisk_ref_copy_fields(ondisk_ref *dst, const ondisk_ref *src) +{ + dst->cc = src->cc; + dst->addr = src->addr; + dst->type = src->type; + dst->arg = src->arg; + dst->self = dst; + dst->inc = src->inc; + dst->dec = src->dec; +} + +void +ondisk_ref_init_null(ondisk_ref *ref) +{ + ondisk_ref_clear(ref); +} + +void +ondisk_ref_init(ondisk_ref *ref, + cache *cc, + uint64 addr, + page_type type, + const void *arg, + void (*inc)(const ondisk_ref *ref), + void (*dec)(const ondisk_ref *ref)) +{ + ondisk_ref_init_internal(ref, cc, addr, type, arg, inc, dec); + ondisk_ref_inc(ref); +} + +void +ondisk_ref_init_adopt(ondisk_ref *ref, + cache *cc, + uint64 addr, + page_type type, + const void *arg, + void (*inc)(const ondisk_ref *ref), + void (*dec)(const ondisk_ref *ref)) +{ + ondisk_ref_init_internal(ref, cc, addr, type, arg, inc, dec); +} + +void +ondisk_ref_deinit(ondisk_ref *ref) +{ + if (!ondisk_ref_is_null(ref)) { + ondisk_ref_assert_valid(ref); + ondisk_ref ref_copy = ONDISK_REF_NULL; + ondisk_ref_copy_fields(&ref_copy, ref); + ondisk_ref_clear(ref); + ref_copy.dec(&ref_copy); + ondisk_ref_clear(&ref_copy); + } +} + +void +ondisk_ref_copy(ondisk_ref *dst, const ondisk_ref *src) +{ + debug_assert(dst != src); + if (ondisk_ref_is_null(src)) { + ondisk_ref_init_null(dst); + return; + } + ondisk_ref_inc(src); + ondisk_ref_copy_fields(dst, src); +} + +void +ondisk_ref_replace(ondisk_ref *dst, const ondisk_ref *src) +{ + if (dst == src) { + return; + } + + bool32 has_src = !ondisk_ref_is_null(src); + if (has_src) { + ondisk_ref_inc(src); + } + + ondisk_ref_deinit(dst); + if (has_src) { + ondisk_ref_copy_fields(dst, src); + } else { + ondisk_ref_clear(dst); + } +} diff --git a/src/ondisk_ref.h b/src/ondisk_ref.h index 5f8855ce..8e84a1fa 100644 --- a/src/ondisk_ref.h +++ b/src/ondisk_ref.h @@ -16,39 +16,56 @@ * ondisk_ref is an in-memory owner for an allocator-backed on-disk object. * Callers provide object-specific inc/dec callbacks so the ref can protect * roots of compound structures such as trunk nodes or btree mini-allocators. + * + * Lifecycle mirrors RAII: + * init acquires a new reference + * init_adopt records an already-owned reference + * deinit releases the held reference + * copy initializes dst with a newly acquired copy of src + * replace releases dst after acquiring src */ typedef struct ondisk_ref { - cache *cc; - uint64 addr; - page_type type; - const void *arg; + cache *cc; + uint64 addr; + page_type type; + const void *arg; + const struct ondisk_ref *self; void (*inc)(const struct ondisk_ref *ref); void (*dec)(const struct ondisk_ref *ref); + const uint8 no_copy; } ondisk_ref; #define ONDISK_REF_NULL ((ondisk_ref){0}) -static inline bool32 -ondisk_ref_is_null(const ondisk_ref *ref) -{ - return ref == NULL || ref->dec == NULL; -} - -static inline void -ondisk_ref_inc(const ondisk_ref *ref) -{ - if (!ondisk_ref_is_null(ref)) { - debug_assert(ref->inc != NULL); - ref->inc(ref); - } -} - -static inline void -ondisk_ref_dec(ondisk_ref *ref) -{ - if (!ondisk_ref_is_null(ref)) { - ondisk_ref ref_copy = *ref; - *ref = ONDISK_REF_NULL; - ref_copy.dec(&ref_copy); - } -} +bool32 +ondisk_ref_is_null(const ondisk_ref *ref); + +void +ondisk_ref_init_null(ondisk_ref *ref); + +void +ondisk_ref_init(ondisk_ref *ref, + cache *cc, + uint64 addr, + page_type type, + const void *arg, + void (*inc)(const ondisk_ref *ref), + void (*dec)(const ondisk_ref *ref)); + +void +ondisk_ref_init_adopt(ondisk_ref *ref, + cache *cc, + uint64 addr, + page_type type, + const void *arg, + void (*inc)(const ondisk_ref *ref), + void (*dec)(const ondisk_ref *ref)); + +void +ondisk_ref_deinit(ondisk_ref *ref); + +void +ondisk_ref_copy(ondisk_ref *dst, const ondisk_ref *src); + +void +ondisk_ref_replace(ondisk_ref *dst, const ondisk_ref *src); diff --git a/src/shard_log.c b/src/shard_log.c index 35fb9700..7ff58572 100644 --- a/src/shard_log.c +++ b/src/shard_log.c @@ -167,7 +167,7 @@ log_entry_key(log_entry *le) static bool32 log_entry_message_isblob(log_entry *le) { - return ondisk_tuple_message_isblob(&le->tuple); + return ondisk_tuple_message_is_blob(&le->tuple); } static message @@ -238,12 +238,12 @@ shard_log_write(log_handle *logh, key tuple_key, message msg, uint64 generation) uint64 max_entry_size = shard_log_page_size(log->cfg) - sizeof(shard_log_hdr); - if (message_isblob(msg) + if (message_is_blob(msg) || max_entry_size < log_entry_required_capacity(tuple_key, msg)) { merge_accumulator_init(&log_blob, platform_get_heap_id()); platform_status rc; - if (message_isblob(msg)) { + if (message_is_blob(msg)) { rc = message_clone(&log->cfg->blob_cfg, cc, &log->mini, msg, &log_blob); } else { diff --git a/src/splinterdb.c b/src/splinterdb.c index 7617646d..1567ae0f 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -850,7 +850,7 @@ splinterdb_iterator_get_current(splinterdb_iterator *iter, // IN iterator_curr(itor, &result_key, &msg); *outkey = key_slice(result_key); - if (message_isblob(msg)) { + if (message_is_blob(msg)) { iter->last_rc = message_materialize(msg, &iter->materialized_message); if (SUCCESS(iter->last_rc)) { *value = merge_accumulator_to_value(&iter->materialized_message); diff --git a/src/trunk.c b/src/trunk.c index 32915e96..9a0e9e12 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -1804,9 +1804,12 @@ trunk_ondisk_node_ref_dec(const ondisk_ref *ref) } static trunk_ondisk_node_ref * -trunk_ondisk_node_ref_create(trunk_context *context, key k, uint64 child_addr) +trunk_ondisk_node_ref_create(trunk_context *context, + key k, + uint64 child_addr, + bool32 adopt) { - platform_heap_id hid = context->hid; + platform_heap_id hid = context->hid; trunk_ondisk_node_ref *result = TYPED_FLEXIBLE_STRUCT_ZALLOC( hid, result, key.bytes, ondisk_key_required_data_capacity(k)); if (result == NULL) { @@ -1814,12 +1817,23 @@ trunk_ondisk_node_ref_create(trunk_context *context, key k, uint64 child_addr) "%s():%d: TYPED_FLEXIBLE_STRUCT_ZALLOC() failed", __func__, __LINE__); return NULL; } - result->ref = (ondisk_ref){.cc = context->cc, - .addr = child_addr, - .type = PAGE_TYPE_TRUNK, - .arg = context, - .inc = trunk_ondisk_node_ref_inc, - .dec = trunk_ondisk_node_ref_dec}; + if (adopt) { + ondisk_ref_init_adopt(&result->ref, + context->cc, + child_addr, + PAGE_TYPE_TRUNK, + context, + trunk_ondisk_node_ref_inc, + trunk_ondisk_node_ref_dec); + } else { + ondisk_ref_init(&result->ref, + context->cc, + child_addr, + PAGE_TYPE_TRUNK, + context, + trunk_ondisk_node_ref_inc, + trunk_ondisk_node_ref_dec); + } copy_key_to_ondisk_key(&result->key, k); return result; } @@ -1829,9 +1843,8 @@ trunk_ondisk_node_ref_destroy(trunk_ondisk_node_ref *odnref, trunk_context *context, platform_heap_id hid) { - debug_assert(ondisk_ref_is_null(&odnref->ref) - || odnref->ref.arg == context); - ondisk_ref_dec(&odnref->ref); + debug_assert(ondisk_ref_is_null(&odnref->ref) || odnref->ref.arg == context); + ondisk_ref_deinit(&odnref->ref); platform_free(hid, odnref); } @@ -2140,7 +2153,7 @@ trunk_node_serialize(trunk_context *context, trunk_node *node) trunk_node_inc_all_refs(context, node); result = trunk_ondisk_node_ref_create( - context, trunk_node_pivot_key(node, 0), header_addr); + context, trunk_node_pivot_key(node, 0), header_addr, TRUE); if (result == NULL) { platform_error_log( "%s():%d: ondisk_node_ref_create() failed", __func__, __LINE__); @@ -2538,8 +2551,7 @@ trunk_apply_changes_internal(trunk_context *context, rc = vector_append(&new_child_refs, new_child_ref); platform_assert_status_ok(rc); - trunk_pivot_set_child_addr(child_pivot, - new_child_ref->ref.addr); + trunk_pivot_set_child_addr(child_pivot, new_child_ref->ref.addr); } } } @@ -5859,13 +5871,12 @@ trunk_context_init(trunk_context *context, if (root_addr != 0) { context->root = trunk_ondisk_node_ref_create( - context, NEGATIVE_INFINITY_KEY, root_addr); + context, NEGATIVE_INFINITY_KEY, root_addr, FALSE); if (context->root == NULL) { platform_error_log("trunk_node_context_init: " "ondisk_node_ref_create failed\n"); return STATUS_NO_MEMORY; } - ondisk_ref_inc(&context->root->ref); } context->stats = NULL; diff --git a/src/trunk.h b/src/trunk.h index 1fa3e4fe..9daf45fd 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -114,7 +114,8 @@ typedef struct trunk_pivot_state_map { trunk_pivot_state *buckets[TRUNK_PIVOT_STATE_MAP_BUCKETS]; } trunk_pivot_state_map; -/* An ondisk_node_ref is a pivot key plus an owned in-memory ref to the child. */ +/* An ondisk_node_ref is a pivot key plus an owned in-memory ref to the child. + */ typedef struct trunk_ondisk_node_ref { ondisk_ref ref; ondisk_key key; diff --git a/tests/unit/splinterdb_quick_test.c b/tests/unit/splinterdb_quick_test.c index 43df0e21..476ba5d4 100644 --- a/tests/unit/splinterdb_quick_test.c +++ b/tests/unit/splinterdb_quick_test.c @@ -414,7 +414,7 @@ CTEST2(splinterdb_quick, test_large_lookup_survives_memtable_recycle) ASSERT_EQUAL(0, rc); ASSERT_TRUE(splinterdb_lookup_found(&result)); - core_handle *core = (core_handle *)splinterdb_get_trunk_handle(data->kvsb); + core_handle *core = (core_handle *)splinterdb_get_trunk_handle(data->kvsb); uint64 generation = force_flush_current_memtable(data->kvsb); memtable *mt = &core->mt_ctxt.mt[generation % core->mt_ctxt.cfg.max_memtables]; @@ -451,12 +451,12 @@ CTEST2(splinterdb_quick, test_iterator_survives_memtable_recycle) ASSERT_EQUAL(0, rc); splinterdb_iterator *it = NULL; - rc = splinterdb_iterator_init( + rc = splinterdb_iterator_init( data->kvsb, &it, greater_than_or_equal, NULL_SLICE); ASSERT_EQUAL(0, rc); ASSERT_TRUE(splinterdb_iterator_valid(it)); - core_handle *core = (core_handle *)splinterdb_get_trunk_handle(data->kvsb); + core_handle *core = (core_handle *)splinterdb_get_trunk_handle(data->kvsb); uint64 generation = force_flush_current_memtable(data->kvsb); memtable *mt = &core->mt_ctxt.mt[generation % core->mt_ctxt.cfg.max_memtables]; From 5c63246024258a4ae02fce9176a27b6a6356f800 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 29 May 2026 13:57:18 -0700 Subject: [PATCH 10/13] cleanup some _with_blob interfaces Signed-off-by: Rob Johnson --- src/btree.c | 105 ++++++++++++++++----------------- src/btree.h | 11 ---- src/core.c | 13 +--- src/lookup_result.h | 16 ++--- src/memtable.c | 30 ---------- src/memtable.h | 6 -- src/ondisk_ref.c | 5 +- tests/unit/btree_stress_test.c | 4 -- 8 files changed, 59 insertions(+), 131 deletions(-) diff --git a/src/btree.c b/src/btree.c index 1f0016e3..737ea501 100644 --- a/src/btree.c +++ b/src/btree.c @@ -113,6 +113,13 @@ static const blob_build_config btree_blob_cfg = { .alignment = 0, }; +static void +btree_blob_ref_init(ondisk_ref *ref, + cache *cc, + const btree_config *cfg, + uint64 root_addr, + page_type type); + static inline uint8 btree_height(const btree_hdr *hdr) @@ -600,20 +607,27 @@ spec_message(const leaf_incorporate_spec *spec) static inline platform_status btree_record_old_result(const btree_config *cfg, cache *cc, + uint64 root_addr, const btree_hdr *hdr, const leaf_incorporate_spec *spec, - lookup_result *old_result, - const ondisk_ref *blob_ref) + lookup_result *old_result) { if (old_result == NULL || spec->old_entry_state != ENTRY_STILL_EXISTS) { return STATUS_OK; } - leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); - return lookup_result_update_with_blob_ref(old_result, - leaf_entry_key(entry), - leaf_entry_message(cc, entry), - blob_ref); + leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); + message old_msg = leaf_entry_message(cc, entry); + ondisk_ref blob_ref = ONDISK_REF_NULL; + const ondisk_ref *ref = NULL; + if (message_is_blob(old_msg)) { + btree_blob_ref_init(&blob_ref, cc, cfg, root_addr, PAGE_TYPE_MEMTABLE); + ref = &blob_ref; + } + platform_status rc = + lookup_result_update(old_result, leaf_entry_key(entry), old_msg, ref); + ondisk_ref_deinit(&blob_ref); + return rc; } platform_status @@ -1379,12 +1393,12 @@ btree_split_child_leaf(cache *cc, const btree_config *cfg, mini_allocator *mini, btree_scratch *scratch, + uint64 root_addr, btree_node *parent, uint64 index_of_child_in_parent, btree_node *child, leaf_incorporate_spec *spec, lookup_result *old_result, - const ondisk_ref *old_result_blob_ref, uint64 *generation) // OUT { btree_node right_child; @@ -1443,8 +1457,8 @@ btree_split_child_leaf(cache *cc, btree_node_lock(cc, cfg, child); /* p: write, c: write, rc: write, cn: write if exists */ - platform_status rc = btree_record_old_result( - cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); + platform_status rc = + btree_record_old_result(cfg, cc, root_addr, child->hdr, spec, old_result); if (!SUCCESS(rc)) { if (child_next.addr != 0) { btree_node_full_unlock(cc, cfg, &child_next); @@ -1510,12 +1524,12 @@ btree_defragment_or_split_child_leaf(cache *cc, const btree_config *cfg, mini_allocator *mini, btree_scratch *scratch, + uint64 root_addr, btree_node *parent, uint64 index_of_child_in_parent, btree_node *child, leaf_incorporate_spec *spec, lookup_result *old_result, - const ondisk_ref *old_result_blob_ref, uint64 *generation) // OUT { uint64 nentries = btree_num_entries(child->hdr); @@ -1540,7 +1554,7 @@ btree_defragment_or_split_child_leaf(cache *cc, btree_node_unget(cc, cfg, parent); btree_node_lock(cc, cfg, child); platform_status rc = btree_record_old_result( - cfg, cc, child->hdr, spec, old_result, old_result_blob_ref); + cfg, cc, root_addr, child->hdr, spec, old_result); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, child); return rc; @@ -1555,12 +1569,12 @@ btree_defragment_or_split_child_leaf(cache *cc, cfg, mini, scratch, + root_addr, parent, index_of_child_in_parent, child, spec, old_result, - old_result_blob_ref, generation); } @@ -1812,17 +1826,16 @@ btree_grow_root(cache *cc, // IN *----------------------------------------------------------------------------- */ platform_status -btree_insert(cache *cc, // IN - const btree_config *cfg, // IN - platform_heap_id heap_id, // IN - btree_scratch *scratch, // IN - uint64 root_addr, // IN - mini_allocator *mini, // IN - key tuple_key, // IN - message msg, // IN - lookup_result *old_result, // IN/OUT - const ondisk_ref *old_result_blob_ref, // IN - uint64 *generation) // OUT +btree_insert(cache *cc, // IN + const btree_config *cfg, // IN + platform_heap_id heap_id, // IN + btree_scratch *scratch, // IN + uint64 root_addr, // IN + mini_allocator *mini, // IN + key tuple_key, // IN + message msg, // IN + lookup_result *old_result, // IN/OUT + uint64 *generation) // OUT { platform_status rc; leaf_incorporate_spec spec; @@ -1863,7 +1876,7 @@ btree_insert(cache *cc, // IN btree_node_lock(cc, cfg, &root_node); if (btree_can_perform_leaf_incorporate_spec(cfg, root_node.hdr, &spec)) { rc = btree_record_old_result( - cfg, cc, root_node.hdr, &spec, old_result, old_result_blob_ref); + cfg, cc, root_addr, root_node.hdr, &spec, old_result); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &root_node); destroy_leaf_incorporate_spec(&spec); @@ -2077,7 +2090,7 @@ btree_insert(cache *cc, // IN } btree_node_lock(cc, cfg, &child_node); rc = btree_record_old_result( - cfg, cc, child_node.hdr, &spec, old_result, old_result_blob_ref); + cfg, cc, root_addr, child_node.hdr, &spec, old_result); if (!SUCCESS(rc)) { btree_node_full_unlock(cc, cfg, &child_node); destroy_leaf_incorporate_spec(&spec); @@ -2124,12 +2137,12 @@ btree_insert(cache *cc, // IN cfg, mini, scratch, + root_addr, &parent_node, child_idx, &child_node, &spec, old_result, - old_result_blob_ref, generation); destroy_leaf_incorporate_spec(&spec); if (STATUS_IS_EQ(rc, STATUS_BUSY)) { @@ -2386,26 +2399,12 @@ btree_lookup_and_merge(cache *cc, // IN key target, // IN lookup_result *result, // IN/OUT bool32 *local_found) // OUT -{ - return btree_lookup_and_merge_with_blob_ref( - cc, cfg, root_addr, type, target, result, NULL, local_found); -} - -platform_status -btree_lookup_and_merge_with_blob_ref(cache *cc, // IN - const btree_config *cfg, // IN - uint64 root_addr, // IN - page_type type, // IN - key target, // IN - lookup_result *result, // IN/OUT - const ondisk_ref *blob_ref, // IN - bool32 *local_found) // OUT { btree_node node; key found_key; message local_data; - platform_status rc = STATUS_OK; - ondisk_ref default_blob_ref = ONDISK_REF_NULL; + platform_status rc = STATUS_OK; + ondisk_ref blob_ref = ONDISK_REF_NULL; log_trace_key(target, "btree_lookup"); @@ -2419,15 +2418,13 @@ btree_lookup_and_merge_with_blob_ref(cache *cc, // IN if (local_found != NULL) { *local_found = TRUE; } - if (message_is_blob(local_data) && ondisk_ref_is_null(blob_ref)) { - btree_blob_ref_init(&default_blob_ref, cc, cfg, root_addr, type); - blob_ref = &default_blob_ref; + if (message_is_blob(local_data)) { + btree_blob_ref_init(&blob_ref, cc, cfg, root_addr, type); } - rc = lookup_result_update_with_blob_ref( - result, found_key, local_data, blob_ref); + rc = lookup_result_update(result, found_key, local_data, &blob_ref); btree_node_unget(cc, cfg, &node); } - ondisk_ref_deinit(&default_blob_ref); + ondisk_ref_deinit(&blob_ref); return rc; } @@ -2464,11 +2461,11 @@ btree_lookup_and_merge_async(btree_lookup_async_state *state) btree_blob_ref_init( &blob_ref, state->cc, state->cfg, state->root_addr, state->type); } - rc = lookup_result_update_with_blob_ref( - state->result, - state->found_key, - state->msg, - ondisk_ref_is_null(&blob_ref) ? NULL : &blob_ref); + rc = + lookup_result_update(state->result, + state->found_key, + state->msg, + ondisk_ref_is_null(&blob_ref) ? NULL : &blob_ref); ondisk_ref_deinit(&blob_ref); btree_node_unget(state->cc, state->cfg, &state->node); } diff --git a/src/btree.h b/src/btree.h index 782c8645..1b02bbb9 100644 --- a/src/btree.h +++ b/src/btree.h @@ -188,7 +188,6 @@ btree_insert(cache *cc, // IN key tuple_key, // IN message data, // IN lookup_result *old_result, // IN/OUT - const ondisk_ref *old_result_blob_ref, // IN uint64 *generation); // OUT uint64 @@ -232,16 +231,6 @@ btree_lookup_and_merge(cache *cc, lookup_result *result, bool32 *local_found); -platform_status -btree_lookup_and_merge_with_blob_ref(cache *cc, - const btree_config *cfg, - uint64 root_addr, - page_type type, - key target, - lookup_result *result, - const ondisk_ref *blob_ref, - bool32 *local_found); - // clang-format off DEFINE_ASYNC_STATE(btree_lookup_async_state, 3, param, cache *, cc, diff --git a/src/core.c b/src/core.c index d5a8a51a..56e6dc01 100644 --- a/src/core.c +++ b/src/core.c @@ -719,17 +719,8 @@ core_memtable_lookup(core_handle *spl, page_type type = memtable_is_compacted ? PAGE_TYPE_BRANCH : PAGE_TYPE_MEMTABLE; - if (memtable_is_compacted) { - return btree_lookup_and_merge( - cc, cfg, root_addr, type, target, result, NULL); - } - - ondisk_ref blob_ref; - memtable_blob_ref_init(&spl->mt_ctxt, root_addr, &blob_ref); - platform_status rc = btree_lookup_and_merge_with_blob_ref( - cc, cfg, root_addr, type, target, result, &blob_ref, NULL); - ondisk_ref_deinit(&blob_ref); - return rc; + return btree_lookup_and_merge( + cc, cfg, root_addr, type, target, result, NULL); } static platform_status diff --git a/src/lookup_result.h b/src/lookup_result.h index d97b376e..e50ecb93 100644 --- a/src/lookup_result.h +++ b/src/lookup_result.h @@ -100,10 +100,10 @@ lookup_result_found(const lookup_result *result) } static inline platform_status -lookup_result_update_with_blob_ref(lookup_result *result, - key found_key, - message msg, - const ondisk_ref *blob_ref) +lookup_result_update(lookup_result *result, + key found_key, + message msg, + const ondisk_ref *msg_blob_ref) { if (lookup_result_is_existence(result)) { bool32 success = merge_accumulator_resize(&result->value, 0); @@ -116,7 +116,7 @@ lookup_result_update_with_blob_ref(lookup_result *result, if (merge_accumulator_is_null(&result->value)) { bool32 success = merge_accumulator_copy_message_with_blob_ref( - &result->value, msg, blob_ref); + &result->value, msg, msg_blob_ref); return success ? STATUS_OK : STATUS_NO_MEMORY; } @@ -127,12 +127,6 @@ lookup_result_update_with_blob_ref(lookup_result *result, : STATUS_NO_MEMORY; } -static inline platform_status -lookup_result_update(lookup_result *result, key found_key, message msg) -{ - return lookup_result_update_with_blob_ref(result, found_key, msg, NULL); -} - static inline bool32 lookup_result_should_continue(const lookup_result *result) { diff --git a/src/memtable.c b/src/memtable.c index dcb38631..40d0d8b4 100644 --- a/src/memtable.c +++ b/src/memtable.c @@ -194,32 +194,6 @@ get_btree_scratch(memtable_context *ctxt, uint64 tid) + tid * ctxt->btree_scratch_sz); } -static void -memtable_blob_ref_inc(const ondisk_ref *blob_ref) -{ - btree_inc_ref(blob_ref->cc, blob_ref->arg, blob_ref->addr); -} - -static void -memtable_blob_ref_dec(const ondisk_ref *blob_ref) -{ - btree_dec_ref(blob_ref->cc, blob_ref->arg, blob_ref->addr, blob_ref->type); -} - -void -memtable_blob_ref_init(memtable_context *ctxt, - uint64 root_addr, - ondisk_ref *blob_ref) -{ - ondisk_ref_init(blob_ref, - ctxt->cc, - root_addr, - PAGE_TYPE_MEMTABLE, - ctxt->cfg.btree_cfg, - memtable_blob_ref_inc, - memtable_blob_ref_dec); -} - void memtable_root_inc_ref(memtable_context *ctxt, uint64 root_addr) { @@ -245,8 +219,6 @@ memtable_insert(memtable_context *ctxt, const threadid tid = platform_get_tid(); btree_scratch *scratch = get_btree_scratch(ctxt, tid); - ondisk_ref blob_ref; - memtable_blob_ref_init(ctxt, mt->root_addr, &blob_ref); platform_status rc = btree_insert(ctxt->cc, ctxt->cfg.btree_cfg, heap_id, @@ -256,9 +228,7 @@ memtable_insert(memtable_context *ctxt, tuple_key, msg, old_result, - &blob_ref, leaf_generation); - ondisk_ref_deinit(&blob_ref); if (!SUCCESS(rc)) { return rc; } diff --git a/src/memtable.h b/src/memtable.h index d9540517..973ee52a 100644 --- a/src/memtable.h +++ b/src/memtable.h @@ -12,7 +12,6 @@ #include "platform_mutex.h" #include "task.h" #include "cache.h" -#include "ondisk_ref.h" #include "btree.h" #include "batch_rwlock.h" @@ -163,11 +162,6 @@ memtable_block_lookups(memtable_context *ctxt); void memtable_unblock_lookups(memtable_context *ctxt); -void -memtable_blob_ref_init(memtable_context *ctxt, - uint64 root_addr, - ondisk_ref *blob_ref); - void memtable_root_inc_ref(memtable_context *ctxt, uint64 root_addr); diff --git a/src/ondisk_ref.c b/src/ondisk_ref.c index 49227760..45b1e00b 100644 --- a/src/ondisk_ref.c +++ b/src/ondisk_ref.c @@ -106,11 +106,8 @@ ondisk_ref_deinit(ondisk_ref *ref) { if (!ondisk_ref_is_null(ref)) { ondisk_ref_assert_valid(ref); - ondisk_ref ref_copy = ONDISK_REF_NULL; - ondisk_ref_copy_fields(&ref_copy, ref); + ref->dec(ref); ondisk_ref_clear(ref); - ref_copy.dec(&ref_copy); - ondisk_ref_clear(&ref_copy); } } diff --git a/tests/unit/btree_stress_test.c b/tests/unit/btree_stress_test.c index 612b167e..90b6e1b2 100644 --- a/tests/unit/btree_stress_test.c +++ b/tests/unit/btree_stress_test.c @@ -227,7 +227,6 @@ CTEST2(btree_stress, iterator_basics) gen_key(&data->dbtree_cfg, i, keybuf, sizeof(keybuf)), gen_msg(&data->dbtree_cfg, i, msgbuf, sizeof(msgbuf)), NULL, - NULL, &generation))) { ASSERT_TRUE(FALSE, "Failed to insert 4-byte %d\n", i); @@ -394,7 +393,6 @@ CTEST2(btree_stress, overwrite_returns_old_value_after_tree_growth) gen_key(&data->dbtree_cfg, i, keybuf, bt_page_size), gen_msg(&data->dbtree_cfg, i, msgbuf, bt_page_size), &uniqueness_result, - NULL, &generation); ASSERT_TRUE(SUCCESS(rc)); ASSERT_FALSE(lookup_result_found(&uniqueness_result)); @@ -451,7 +449,6 @@ CTEST2(btree_stress, overwrite_returns_old_value_after_tree_growth) gen_key(&data->dbtree_cfg, i, keybuf, bt_page_size), merge_accumulator_to_message(&update_msg), &old_result, - NULL, &generation); ASSERT_TRUE(SUCCESS(rc)); ASSERT_TRUE(lookup_result_found(&old_result)); @@ -534,7 +531,6 @@ insert_tests(cache *cc, gen_key(cfg, i, keybuf, keybuf_size), gen_msg(cfg, i, msgbuf, msgbuf_size), NULL, - NULL, &generation))) { ASSERT_TRUE(FALSE, "Failed to insert 4-byte %ld\n", i); From c72064465433c01e523270701a600fe726df69e2 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 29 May 2026 14:25:22 -0700 Subject: [PATCH 11/13] move blob refs to lookup_results Signed-off-by: Rob Johnson --- src/btree.c | 18 ++++++++--------- src/btree.h | 20 +++++++++---------- src/data_blob_build.c | 15 ++------------ src/data_internal.c | 17 ---------------- src/data_internal.h | 19 +----------------- src/lookup_result.h | 46 ++++++++++++++++++++++++++++++++----------- src/memtable.c | 4 ++-- src/splinterdb.c | 3 +-- 8 files changed, 59 insertions(+), 83 deletions(-) diff --git a/src/btree.c b/src/btree.c index 737ea501..70c0d5b2 100644 --- a/src/btree.c +++ b/src/btree.c @@ -616,16 +616,14 @@ btree_record_old_result(const btree_config *cfg, return STATUS_OK; } - leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); - message old_msg = leaf_entry_message(cc, entry); - ondisk_ref blob_ref = ONDISK_REF_NULL; - const ondisk_ref *ref = NULL; + leaf_entry *entry = btree_get_leaf_entry(cfg, hdr, spec->idx); + message old_msg = leaf_entry_message(cc, entry); + ondisk_ref blob_ref = ONDISK_REF_NULL; if (message_is_blob(old_msg)) { btree_blob_ref_init(&blob_ref, cc, cfg, root_addr, PAGE_TYPE_MEMTABLE); - ref = &blob_ref; } - platform_status rc = - lookup_result_update(old_result, leaf_entry_key(entry), old_msg, ref); + platform_status rc = lookup_result_update( + old_result, leaf_entry_key(entry), old_msg, &blob_ref); ondisk_ref_deinit(&blob_ref); return rc; } @@ -2403,8 +2401,7 @@ btree_lookup_and_merge(cache *cc, // IN btree_node node; key found_key; message local_data; - platform_status rc = STATUS_OK; - ondisk_ref blob_ref = ONDISK_REF_NULL; + platform_status rc = STATUS_OK; log_trace_key(target, "btree_lookup"); @@ -2415,6 +2412,7 @@ btree_lookup_and_merge(cache *cc, // IN btree_lookup_with_ref( cc, cfg, root_addr, type, target, &node, &found_key, &local_data); if (!key_is_null(found_key)) { + ondisk_ref blob_ref = ONDISK_REF_NULL; if (local_found != NULL) { *local_found = TRUE; } @@ -2422,9 +2420,9 @@ btree_lookup_and_merge(cache *cc, // IN btree_blob_ref_init(&blob_ref, cc, cfg, root_addr, type); } rc = lookup_result_update(result, found_key, local_data, &blob_ref); + ondisk_ref_deinit(&blob_ref); btree_node_unget(cc, cfg, &node); } - ondisk_ref_deinit(&blob_ref); return rc; } diff --git a/src/btree.h b/src/btree.h index 1b02bbb9..3018ebb5 100644 --- a/src/btree.h +++ b/src/btree.h @@ -179,16 +179,16 @@ typedef struct btree_pack_req { } btree_pack_req; platform_status -btree_insert(cache *cc, // IN - const btree_config *cfg, // IN - platform_heap_id heap_id, // IN - btree_scratch *scratch, // IN - uint64 root_addr, // IN - mini_allocator *mini, // IN - key tuple_key, // IN - message data, // IN - lookup_result *old_result, // IN/OUT - uint64 *generation); // OUT +btree_insert(cache *cc, // IN + const btree_config *cfg, // IN + platform_heap_id heap_id, // IN + btree_scratch *scratch, // IN + uint64 root_addr, // IN + mini_allocator *mini, // IN + key tuple_key, // IN + message data, // IN + lookup_result *old_result, // IN/OUT + uint64 *generation); // OUT uint64 btree_create(cache *cc, diff --git a/src/data_blob_build.c b/src/data_blob_build.c index 90f62944..208e2690 100644 --- a/src/data_blob_build.c +++ b/src/data_blob_build.c @@ -20,12 +20,7 @@ message_to_blob(const blob_build_config *cfg, platform_assert(!message_is_blob(msg)); ma->type = message_class(msg); ma->cc = cc; - platform_status rc = - blob_build(cfg, cc, mini, message_slice(msg), &ma->data); - if (SUCCESS(rc)) { - merge_accumulator_release_blob_ref(ma); - } - return rc; + return blob_build(cfg, cc, mini, message_slice(msg), &ma->data); } platform_status @@ -38,12 +33,7 @@ message_clone(const blob_build_config *cfg, platform_assert(message_is_blob(msg)); result->type = message_class(msg); result->cc = cc; - platform_status rc = - blob_clone(cfg, cc, mini, message_slice(msg), &result->data); - if (SUCCESS(rc)) { - merge_accumulator_release_blob_ref(result); - } - return rc; + return blob_clone(cfg, cc, mini, message_slice(msg), &result->data); } platform_status @@ -70,7 +60,6 @@ merge_accumulator_convert_to_blob(const blob_build_config *cfg, if (!SUCCESS(rc)) { return rc; } - merge_accumulator_release_blob_ref(ma); ma->cc = cc; return rc; } diff --git a/src/data_internal.c b/src/data_internal.c index 7372dc51..b33b5d55 100644 --- a/src/data_internal.c +++ b/src/data_internal.c @@ -28,30 +28,15 @@ merge_accumulator_to_slice(const merge_accumulator *ma) return writable_buffer_to_slice(&ma->data); } -void -merge_accumulator_release_blob_ref(merge_accumulator *ma) -{ - ondisk_ref_deinit(&ma->blob_ref); -} - /* Copy a message into an already-initialized merge_accumulator. */ _Bool merge_accumulator_copy_message(merge_accumulator *ma, message msg) -{ - return merge_accumulator_copy_message_with_blob_ref(ma, msg, NULL); -} - -_Bool -merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, - message msg, - const ondisk_ref *blob_ref) { platform_status rc = writable_buffer_copy_slice(&ma->data, message_slice(msg)); if (!SUCCESS(rc)) { return FALSE; } - ondisk_ref_replace(&ma->blob_ref, message_is_blob(msg) ? blob_ref : NULL); ma->type = message_class(msg); ma->cc = msg.cc; @@ -63,7 +48,6 @@ merge_accumulator_resize(merge_accumulator *ma, uint64 newsize) { platform_status rc = writable_buffer_resize(&ma->data, newsize); if (SUCCESS(rc) && newsize == 0) { - merge_accumulator_release_blob_ref(ma); ma->cc = NULL; } return SUCCESS(rc); @@ -74,7 +58,6 @@ merge_accumulator_set_class(merge_accumulator *ma, message_type type) { ma->type = type; if (type == MESSAGE_TYPE_INVALID || type == MESSAGE_TYPE_DELETE) { - merge_accumulator_release_blob_ref(ma); ma->cc = NULL; } } diff --git a/src/data_internal.h b/src/data_internal.h index 33cfaf28..f6d485f3 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -12,7 +12,6 @@ #include "splinterdb/data.h" #include "blob.h" #include "cache.h" -#include "ondisk_ref.h" #include "util.h" /* @@ -536,24 +535,14 @@ struct merge_accumulator { message_type type; cache *cc; writable_buffer data; - ondisk_ref blob_ref; }; -void -merge_accumulator_release_blob_ref(merge_accumulator *ma); - -_Bool -merge_accumulator_copy_message_with_blob_ref(merge_accumulator *ma, - message msg, - const ondisk_ref *blob_ref); - static inline void merge_accumulator_init(merge_accumulator *ma, platform_heap_id heap_id) { writable_buffer_init(&ma->data, heap_id); ma->type = MESSAGE_TYPE_INVALID; ma->cc = NULL; - ondisk_ref_init_null(&ma->blob_ref); } static inline void @@ -569,17 +558,14 @@ merge_accumulator_init_with_buffer(merge_accumulator *ma, &ma->data, heap_id, allocation_size, data, logical_size); ma->type = type; ma->cc = NULL; - ondisk_ref_init_null(&ma->blob_ref); } static inline void merge_accumulator_deinit(merge_accumulator *ma) { - merge_accumulator_release_blob_ref(ma); writable_buffer_deinit(&ma->data); ma->type = MESSAGE_TYPE_INVALID; ma->cc = NULL; - ondisk_ref_init_null(&ma->blob_ref); } static inline bool32 @@ -621,7 +607,6 @@ merge_accumulator_init_from_message(merge_accumulator *ma, static inline void merge_accumulator_set_to_null(merge_accumulator *ma) { - merge_accumulator_release_blob_ref(ma); ma->type = MESSAGE_TYPE_INVALID; ma->cc = NULL; writable_buffer_set_to_null(&ma->data); @@ -633,7 +618,6 @@ merge_accumulator_is_null(const merge_accumulator *ma) bool32 r = writable_buffer_is_null(&ma->data); debug_assert(IMPLIES(r, ma->type == MESSAGE_TYPE_INVALID)); debug_assert(IMPLIES(r, ma->cc == NULL)); - debug_assert(IMPLIES(r, ondisk_ref_is_null(&ma->blob_ref))); return r; } @@ -664,8 +648,7 @@ merge_accumulator_ensure_materialized(merge_accumulator *ma) writable_buffer old_data = ma->data; ma->data = materialized; - merge_accumulator_release_blob_ref(ma); - ma->cc = NULL; + ma->cc = NULL; writable_buffer_deinit(&old_data); return STATUS_OK; } diff --git a/src/lookup_result.h b/src/lookup_result.h index e50ecb93..3ab5b1a8 100644 --- a/src/lookup_result.h +++ b/src/lookup_result.h @@ -5,12 +5,14 @@ #include "splinterdb/splinterdb.h" #include "data_internal.h" +#include "ondisk_ref.h" #include "platform_assert.h" typedef struct lookup_result { const data_config *data_cfg; splinterdb_lookup_flags flags; merge_accumulator value; + ondisk_ref blob_ref; } lookup_result; _Static_assert(sizeof(lookup_result) <= sizeof(splinterdb_lookup_result), @@ -50,6 +52,7 @@ lookup_result_init(lookup_result *result, buffer, WRITABLE_BUFFER_NULL_LENGTH, MESSAGE_TYPE_INVALID); + ondisk_ref_init_null(&result->blob_ref); } static inline void @@ -62,6 +65,7 @@ lookup_result_set_data_config(lookup_result *result, static inline void lookup_result_deinit(lookup_result *result) { + ondisk_ref_deinit(&result->blob_ref); merge_accumulator_deinit(&result->value); } @@ -74,6 +78,7 @@ lookup_result_is_existence(const lookup_result *result) static inline void lookup_result_reset(lookup_result *result) { + ondisk_ref_deinit(&result->blob_ref); merge_accumulator_set_to_null(&result->value); } @@ -106,25 +111,39 @@ lookup_result_update(lookup_result *result, const ondisk_ref *msg_blob_ref) { if (lookup_result_is_existence(result)) { - bool32 success = merge_accumulator_resize(&result->value, 0); - if (!success) { - return STATUS_NO_MEMORY; - } + platform_assert(merge_accumulator_length(&result->value) == 0); + platform_assert(ondisk_ref_is_null(&result->blob_ref)); merge_accumulator_set_class(&result->value, message_class(msg)); return STATUS_OK; } if (merge_accumulator_is_null(&result->value)) { - bool32 success = merge_accumulator_copy_message_with_blob_ref( - &result->value, msg, msg_blob_ref); - return success ? STATUS_OK : STATUS_NO_MEMORY; + debug_assert(!message_is_blob(msg) || !ondisk_ref_is_null(msg_blob_ref)); + bool32 success = merge_accumulator_copy_message(&result->value, msg); + if (!success) { + return STATUS_NO_MEMORY; + } + ondisk_ref_replace(&result->blob_ref, + message_is_blob(msg) ? msg_blob_ref : NULL); + return STATUS_OK; } platform_assert(result->data_cfg != NULL); - return data_merge_tuples(result->data_cfg, found_key, msg, &result->value) - == 0 - ? STATUS_OK - : STATUS_NO_MEMORY; + int rc = data_merge_tuples(result->data_cfg, found_key, msg, &result->value); + if (!merge_accumulator_isblob(&result->value)) { + ondisk_ref_deinit(&result->blob_ref); + } + return rc == 0 ? STATUS_OK : STATUS_NO_MEMORY; +} + +static inline platform_status +lookup_result_ensure_materialized(lookup_result *result) +{ + platform_status rc = merge_accumulator_ensure_materialized(&result->value); + if (SUCCESS(rc)) { + ondisk_ref_deinit(&result->blob_ref); + } + return rc; } static inline bool32 @@ -143,6 +162,7 @@ lookup_result_note_filter_hit(lookup_result *result) if (lookup_result_is_existence(result)) { bool32 success = merge_accumulator_resize(&result->value, 0); platform_assert(success); + ondisk_ref_deinit(&result->blob_ref); merge_accumulator_set_class(&result->value, MESSAGE_TYPE_UPDATE); } return lookup_result_should_continue(result); @@ -157,6 +177,9 @@ lookup_result_finalize(lookup_result *result, key query_key) { platform_assert(result->data_cfg != NULL); data_merge_tuples_final(result->data_cfg, query_key, &result->value); + if (!merge_accumulator_isblob(&result->value)) { + ondisk_ref_deinit(&result->blob_ref); + } } if (!lookup_result_is_existence(result) @@ -164,6 +187,7 @@ lookup_result_finalize(lookup_result *result, key query_key) && merge_accumulator_message_class(&result->value) == MESSAGE_TYPE_DELETE) { + ondisk_ref_deinit(&result->blob_ref); merge_accumulator_set_to_null(&result->value); } } diff --git a/src/memtable.c b/src/memtable.c index 40d0d8b4..77ee1589 100644 --- a/src/memtable.c +++ b/src/memtable.c @@ -218,8 +218,8 @@ memtable_insert(memtable_context *ctxt, { const threadid tid = platform_get_tid(); - btree_scratch *scratch = get_btree_scratch(ctxt, tid); - platform_status rc = btree_insert(ctxt->cc, + btree_scratch *scratch = get_btree_scratch(ctxt, tid); + platform_status rc = btree_insert(ctxt->cc, ctxt->cfg.btree_cfg, heap_id, scratch, diff --git a/src/splinterdb.c b/src/splinterdb.c index 1567ae0f..bdbfdb4e 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -537,8 +537,7 @@ splinterdb_lookup_result_value(const splinterdb_lookup_result *result, // IN lookup_result *mutable_result = lookup_result_from_splinterdb((splinterdb_lookup_result *)result); - platform_status status = - merge_accumulator_ensure_materialized(&mutable_result->value); + platform_status status = lookup_result_ensure_materialized(mutable_result); if (!SUCCESS(status)) { return platform_status_to_int(status); } From 5f3b52a62bb2f7c21f844188dff00c20d9791a12 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 29 May 2026 21:19:16 -0700 Subject: [PATCH 12/13] fix gcc snprintf truncation warning Signed-off-by: Rob Johnson --- tests/unit/splinterdb_quick_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/splinterdb_quick_test.c b/tests/unit/splinterdb_quick_test.c index 476ba5d4..242a320e 100644 --- a/tests/unit/splinterdb_quick_test.c +++ b/tests/unit/splinterdb_quick_test.c @@ -49,8 +49,8 @@ // Hard-coded format strings to generate key and values static const char key_fmt[] = "key-%04x"; static const char val_fmt[] = "val-%04x"; -#define KEY_FMT_LENGTH (8) -#define VAL_FMT_LENGTH (8) +#define KEY_FMT_LENGTH (22) +#define VAL_FMT_LENGTH (22) #define TEST_INSERT_KEY_LENGTH (KEY_FMT_LENGTH + 1) #define TEST_INSERT_VAL_LENGTH (VAL_FMT_LENGTH + 1) From 2b79935d3ced4cd1b1e5e9a24fa42db55468106e Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 29 May 2026 21:21:18 -0700 Subject: [PATCH 13/13] isblob -> is_blob Signed-off-by: Rob Johnson --- include/splinterdb/data.h | 2 +- src/data_blob_build.c | 2 +- src/data_internal.h | 6 +++--- src/lookup_result.h | 4 ++-- src/shard_log.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/splinterdb/data.h b/include/splinterdb/data.h index 058b8d87..129fb30d 100644 --- a/include/splinterdb/data.h +++ b/include/splinterdb/data.h @@ -73,7 +73,7 @@ message_length(message msg) } static inline _Bool -message_isblob(message msg) +message_is_blob(message msg) { return msg.cc != NULL; } diff --git a/src/data_blob_build.c b/src/data_blob_build.c index 208e2690..a59c1ab4 100644 --- a/src/data_blob_build.c +++ b/src/data_blob_build.c @@ -42,7 +42,7 @@ merge_accumulator_convert_to_blob(const blob_build_config *cfg, mini_allocator *mini, merge_accumulator *ma) { - if (merge_accumulator_isblob(ma)) { + if (merge_accumulator_is_blob(ma)) { return STATUS_OK; } diff --git a/src/data_internal.h b/src/data_internal.h index f6d485f3..6d3de16a 100644 --- a/src/data_internal.h +++ b/src/data_internal.h @@ -357,7 +357,7 @@ message_create(message_type type, cache *cc, slice data) static inline bool32 message_is_blob(message msg) { - return message_isblob(msg); + return message_is_blob(msg); } static inline bool32 @@ -575,7 +575,7 @@ merge_accumulator_is_definitive(const merge_accumulator *ma) } static inline bool32 -merge_accumulator_isblob(const merge_accumulator *ma) +merge_accumulator_is_blob(const merge_accumulator *ma) { return ma->cc != NULL; } @@ -633,7 +633,7 @@ message_materialize(message msg, merge_accumulator *tmp) static inline platform_status merge_accumulator_ensure_materialized(merge_accumulator *ma) { - if (!merge_accumulator_isblob(ma)) { + if (!merge_accumulator_is_blob(ma)) { return STATUS_OK; } diff --git a/src/lookup_result.h b/src/lookup_result.h index 3ab5b1a8..3e5aa3db 100644 --- a/src/lookup_result.h +++ b/src/lookup_result.h @@ -130,7 +130,7 @@ lookup_result_update(lookup_result *result, platform_assert(result->data_cfg != NULL); int rc = data_merge_tuples(result->data_cfg, found_key, msg, &result->value); - if (!merge_accumulator_isblob(&result->value)) { + if (!merge_accumulator_is_blob(&result->value)) { ondisk_ref_deinit(&result->blob_ref); } return rc == 0 ? STATUS_OK : STATUS_NO_MEMORY; @@ -177,7 +177,7 @@ lookup_result_finalize(lookup_result *result, key query_key) { platform_assert(result->data_cfg != NULL); data_merge_tuples_final(result->data_cfg, query_key, &result->value); - if (!merge_accumulator_isblob(&result->value)) { + if (!merge_accumulator_is_blob(&result->value)) { ondisk_ref_deinit(&result->blob_ref); } } diff --git a/src/shard_log.c b/src/shard_log.c index 7ff58572..d997d998 100644 --- a/src/shard_log.c +++ b/src/shard_log.c @@ -165,7 +165,7 @@ log_entry_key(log_entry *le) } static bool32 -log_entry_message_isblob(log_entry *le) +log_entry_message_is_blob(log_entry *le) { return ondisk_tuple_message_is_blob(&le->tuple); } @@ -566,7 +566,7 @@ shard_log_print(shard_log *log) platform_default_log( "%s -- %s%s : %lu\n", key_string(dcfg, log_entry_key(le)), - log_entry_message_isblob(le) ? "(blob) " : "", + log_entry_message_is_blob(le) ? "(blob) " : "", message_string(dcfg, log_entry_message(cc, le)), le->generation); }