From c73fb26f1e59b2d893c0c8b4ca17c7abc315e12a Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 30 May 2026 22:37:37 -0700 Subject: [PATCH 1/7] pack-objects: honor a ".baddeltas" marker in try_delta() When considering an object pair for delta compression, try_delta() short-circuits (skipping a real delta search) when both objects came from the same on-disk pack and neither is stored there as a delta. The reasoning is that an earlier pack-objects run already had the chance to try this pair and chose not to delta either against the other, so trying again is unlikely to find anything new. That heuristic is correct only if the source pack was actually built by something that did a real delta search. That may not be the case if the pack was written by `git fast-import` (which only does a basic delta comparison for objects in the pack it writes), the bulk_checkin framework when streaming large blobs straight into a pack, or pack-objects with --window=0. Add a `.baddeltas` sidecar marker, parallel to `.keep`, `.promisor`, and `.mtimes`, that lets a producer flag its output pack as one whose delta layout should not be trusted by this skip. For context on the underlying delta skip and earlier discussions of a similar marker, see Jeff King's analysis on the list: https://lore.kernel.org/git/20231009202149.GA3281325@coredump.intra.peff.net/ Assisted-by: Claude Opus 4.7 Signed-off-by: Elijah Newren --- Documentation/gitformat-pack.adoc | 27 +++++++ builtin/pack-objects.c | 6 ++ packfile.c | 15 +++- packfile.h | 3 +- t/meson.build | 1 + t/t5337-pack-baddeltas.sh | 124 ++++++++++++++++++++++++++++++ 6 files changed, 171 insertions(+), 5 deletions(-) create mode 100755 t/t5337-pack-baddeltas.sh diff --git a/Documentation/gitformat-pack.adoc b/Documentation/gitformat-pack.adoc index 3416edceab82e9..1ea5e66e54618c 100644 --- a/Documentation/gitformat-pack.adoc +++ b/Documentation/gitformat-pack.adoc @@ -12,6 +12,7 @@ SYNOPSIS $GIT_DIR/objects/pack/pack-*.{pack,idx} $GIT_DIR/objects/pack/pack-*.rev $GIT_DIR/objects/pack/pack-*.mtimes +$GIT_DIR/objects/pack/pack-*.baddeltas $GIT_DIR/objects/pack/multi-pack-index DESCRIPTION @@ -357,6 +358,32 @@ All 4-byte numbers are in network byte order. and a checksum of all of the above (each having length according to the specified hash function). +== pack-*.baddeltas files + +The optional `.baddeltas` file is an empty marker sitting alongside a +`pack-*.pack` (and its `.idx`). It signals to `git pack-objects` that +the delta layout of the pack should not be trusted: even when two +objects appear together in the same pack and neither is stored as a +delta, the next packing run should still call out to its delta search +routine for the pair instead of assuming a prior pack-objects already +considered (and rejected) the pair. + +This is intended for producers that intentionally skip delta search +when writing a pack (for example, processes that bulk-import objects +or aggregate multiple existing packs without recomputing deltas). +Without this marker, the same-pack delta skip in `git pack-objects` +would silently inherit those producers' lack of delta search into +future repacks. + +The contents of the file are currently ignored. Producers should +write an empty file; consumers must tolerate (and ignore) any +content. + +The marker only affects whether `git pack-objects` will attempt to +compute new deltas for object pairs that share the marked pack. It +does not disable reuse of existing on-disk deltas, nor does it affect +multi-pack-index, bitmap, or pack reuse for transfer. + == multi-pack-index (MIDX) files have the following format: The multi-pack-index files refer to multiple pack-files and loose objects. diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index fe9fbecb30e100..c6baa24c56b9e7 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2794,9 +2794,15 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, * be considered, as even if we produce a suboptimal delta against * it, we will still save the transfer cost, as we already know * the other side has it and we won't send src_entry at all. + * + * If the source pack carries a ".baddeltas" marker, we treat its + * existing delta layout as untrusted: even if the two objects are + * in the same pack and neither is a delta, we have no reason to + * believe a previous packing run actually considered the pair. */ if (reuse_delta && IN_PACK(trg_entry) && IN_PACK(trg_entry) == IN_PACK(src_entry) && + !IN_PACK(trg_entry)->has_bad_deltas && !src_entry->preferred_base && trg_entry->in_pack_type != OBJ_REF_DELTA && trg_entry->in_pack_type != OBJ_OFS_DELTA) diff --git a/packfile.c b/packfile.c index 89366abfe32386..45427e927d53e5 100644 --- a/packfile.c +++ b/packfile.c @@ -451,7 +451,9 @@ void close_pack(struct packed_git *p) void unlink_pack_path(const char *pack_name, int force_delete) { - static const char *exts[] = {".idx", ".pack", ".rev", ".keep", ".bitmap", ".promisor", ".mtimes"}; + static const char *exts[] = {".idx", ".pack", ".rev", ".keep", + ".bitmap", ".promisor", ".mtimes", + ".baddeltas"}; int i; struct strbuf buf = STRBUF_INIT; size_t plen; @@ -807,10 +809,10 @@ struct packed_git *add_packed_git(struct repository *r, const char *path, return NULL; /* - * ".promisor" is long enough to hold any suffix we're adding (and + * ".baddeltas" is long enough to hold any suffix we're adding (and * the use xsnprintf double-checks that) */ - alloc = st_add3(path_len, strlen(".promisor"), 1); + alloc = st_add3(path_len, strlen(".baddeltas"), 1); p = alloc_packed_git(r, alloc); memcpy(p->pack_name, path, path_len); @@ -837,6 +839,10 @@ struct packed_git *add_packed_git(struct repository *r, const char *path, if (!access(p->pack_name, F_OK)) p->is_cruft = 1; + xsnprintf(p->pack_name + path_len, alloc - path_len, ".baddeltas"); + if (!access(p->pack_name, F_OK)) + p->has_bad_deltas = 1; + xsnprintf(p->pack_name + path_len, alloc - path_len, ".pack"); if (stat(p->pack_name, &st) || !S_ISREG(st.st_mode)) { free(p); @@ -1019,7 +1025,8 @@ static void prepare_pack(const char *full_name, size_t full_name_len, ends_with(file_name, ".bitmap") || ends_with(file_name, ".keep") || ends_with(file_name, ".promisor") || - ends_with(file_name, ".mtimes")) + ends_with(file_name, ".mtimes") || + ends_with(file_name, ".baddeltas")) string_list_append(data->garbage, full_name); else report_garbage(PACKDIR_FILE_GARBAGE, full_name); diff --git a/packfile.h b/packfile.h index 49d6bdecf6ea18..a2cee4b1f055f4 100644 --- a/packfile.h +++ b/packfile.h @@ -33,7 +33,8 @@ struct packed_git { do_not_close:1, pack_promisor:1, multi_pack_index:1, - is_cruft:1; + is_cruft:1, + has_bad_deltas:1; unsigned char hash[GIT_MAX_RAWSZ]; struct revindex_entry *revindex; const uint32_t *revindex_data; diff --git a/t/meson.build b/t/meson.build index 2af8d0127991db..3516913411dc5c 100644 --- a/t/meson.build +++ b/t/meson.build @@ -629,6 +629,7 @@ integration_tests = [ 't5333-pseudo-merge-bitmaps.sh', 't5334-incremental-multi-pack-index.sh', 't5335-compact-multi-pack-index.sh', + 't5337-pack-baddeltas.sh', 't5351-unpack-large-objects.sh', 't5400-send-pack.sh', 't5401-update-hooks.sh', diff --git a/t/t5337-pack-baddeltas.sh b/t/t5337-pack-baddeltas.sh new file mode 100755 index 00000000000000..8c1a11e1067af3 --- /dev/null +++ b/t/t5337-pack-baddeltas.sh @@ -0,0 +1,124 @@ +#!/bin/sh + +test_description='`.baddeltas` sidecar disables the same-pack try_delta() skip' + +. ./test-lib.sh + +# Two similar but distinct blobs. The contents are deliberately large +# enough and similar enough that a real delta search will find a useful +# delta between them. +generate_blobs () { + { + printf "header\n" && + i=0 && + while test $i -lt 200 + do + printf "line %d padding-padding-padding-padding\n" $i && + i=$((i + 1)) || return 1 + done + } >a && + { + printf "header\n" && + i=0 && + while test $i -lt 200 + do + printf "line %d padding-padding-padding-padding\n" $i && + i=$((i + 1)) || return 1 + done && + printf "tail-line\n" + } >b +} + +# Build a pack from the two blobs without doing any delta search, and +# echo the basename of the resulting .pack file (the last "pack-*.pack" +# in objects/pack). +build_input_pack () { + A=$(git hash-object -w a) && + B=$(git hash-object -w b) && + printf "%s\n%s\n" "$A" "$B" | + git pack-objects --window=0 .git/objects/pack/pack >/dev/null && + git prune-packed && + basename "$(ls -t .git/objects/pack/pack-*.pack | head -1)" +} + +# Count delta entries in a pack idx. "git verify-pack -v" prints one +# line per object with 5 fields for non-delta entries (oid, type, size, +# size-in-pack, offset) and 7 fields for delta entries (... depth +# base-oid). +count_deltas () { + git verify-pack -v "$1" | + awk 'NF == 7 { n++ } END { print n + 0 }' +} + +# Repack just the two blobs from the existing pack into a fresh pack +# with prefix "out". Echo the basename of the resulting .pack file. +repack_blobs () { + A=$(git hash-object a) && + B=$(git hash-object b) && + rm -f .git/objects/pack/out-*.pack \ + .git/objects/pack/out-*.idx \ + .git/objects/pack/out-*.rev && + printf "%s\n%s\n" "$A" "$B" | + git pack-objects .git/objects/pack/out >/dev/null && + basename "$(ls .git/objects/pack/out-*.pack)" +} + +test_expect_success 'set up two similar blobs' ' + git init repo && + ( + cd repo && + generate_blobs + ) +' + +test_expect_success 'without .baddeltas, same-pack pair is skipped' ' + test_when_finished "rm -fr work" && + cp -R repo work && + ( + cd work && + input_pack=$(build_input_pack) && + test 0 -eq "$(count_deltas .git/objects/pack/${input_pack%.pack}.idx)" && + out_pack=$(repack_blobs) && + test 0 -eq "$(count_deltas .git/objects/pack/${out_pack%.pack}.idx)" + ) +' + +test_expect_success 'with .baddeltas, same-pack pair gets reconsidered' ' + test_when_finished "rm -fr work" && + cp -R repo work && + ( + cd work && + input_pack=$(build_input_pack) && + test 0 -eq "$(count_deltas .git/objects/pack/${input_pack%.pack}.idx)" && + >.git/objects/pack/${input_pack%.pack}.baddeltas && + out_pack=$(repack_blobs) && + test 1 -le "$(count_deltas .git/objects/pack/${out_pack%.pack}.idx)" + ) +' + +test_expect_success '.baddeltas does not trigger garbage warnings' ' + test_when_finished "rm -fr work" && + cp -R repo work && + ( + cd work && + input_pack=$(build_input_pack) && + >.git/objects/pack/${input_pack%.pack}.baddeltas && + git count-objects -v 2>warnings && + ! grep -i garbage warnings + ) +' + +test_expect_success '.baddeltas is removed by git repack -d' ' + test_when_finished "rm -fr work" && + cp -R repo work && + ( + cd work && + input_pack=$(build_input_pack) && + >.git/objects/pack/${input_pack%.pack}.baddeltas && + git repack -ad && + test_path_is_missing .git/objects/pack/${input_pack%.pack}.baddeltas && + test_path_is_missing .git/objects/pack/$input_pack + ) +' + +test_done From eee4074aa81e4260896716050bc93f01b1bd99fc Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 30 May 2026 22:40:13 -0700 Subject: [PATCH 2/7] pack-objects: add a --mark-bad-deltas option The previous patch taught pack-objects to honor a `.baddeltas` sidecar marker that tells the same-pack delta skip in try_delta() to fall through to a real delta search. Add a `--mark-bad-deltas` option that writes such a marker alongside each output pack. The flag is incompatible with `--stdout`, since the marker lives next to the output pack in `$GIT_DIR/objects/pack` and `--stdout` has no on-disk output to mark. When pack-objects splits its output across multiple packs (with `--max-pack-size`), a marker is written for each resulting pack inside the existing per-pack output loop, between stage_tmp_packfiles() and rename_tmp_packfile_idx() so the marker is in place before the `.idx` anchor becomes visible. Document the option, and extend t5335 to cover both the producer side (marker is written and disables the same-pack skip for the resulting pack) and the `--stdout` incompatibility. Assisted-by: Claude Opus 4.7 Signed-off-by: Elijah Newren --- Documentation/git-pack-objects.adoc | 8 +++++++ builtin/pack-objects.c | 24 +++++++++++++++++++++ t/t5337-pack-baddeltas.sh | 33 +++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/Documentation/git-pack-objects.adoc b/Documentation/git-pack-objects.adoc index 8a27aa19fd3f1f..6a6c6ec76932b6 100644 --- a/Documentation/git-pack-objects.adoc +++ b/Documentation/git-pack-objects.adoc @@ -143,6 +143,14 @@ options which imply `--revs`. have an mtime older than ``. If unspecified (and given `--cruft`), then no objects are eliminated. +--mark-bad-deltas:: + Write a `.baddeltas` marker file alongside each output pack. The + marker signals that objects within the pack have not been fully + delta-searched against other objects within the same pack and + that future repacking should consider them. Any deltas that do + exist within this pack can still be reused, however. + Incompatible with `--stdout`. + --window=:: --depth=:: These two options affect how the objects contained in diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index c6baa24c56b9e7..2ac48c517cefa7 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -43,6 +43,7 @@ #include "pack-mtimes.h" #include "parse-options.h" #include "pkt-line.h" +#include "path.h" #include "blob.h" #include "tree.h" #include "path-walk.h" @@ -211,6 +212,7 @@ static int keep_unreachable, unpack_unreachable, include_tag; static timestamp_t unpack_unreachable_expiration; static int pack_loose_unreachable; static int cruft; +static int mark_bad_deltas; static int shallow = 0; static timestamp_t cruft_expiration; static int local; @@ -1459,6 +1461,23 @@ static void write_pack_file(void) &pack_idx_opts, hash, &idx_tmp_name); + if (mark_bad_deltas) { + size_t tmpname_len = tmpname.len; + int fd; + + strbuf_addstr(&tmpname, "baddeltas"); + fd = xopen(tmpname.buf, + O_WRONLY | O_CREAT | O_TRUNC, 0444); + if (close(fd)) + die_errno(_("unable to close '%s'"), + tmpname.buf); + if (adjust_shared_perm(the_repository, + tmpname.buf)) + die_errno(_("unable to make '%s' readable"), + tmpname.buf); + strbuf_setlen(&tmpname, tmpname_len); + } + if (write_bitmap_index) { size_t tmpname_len = tmpname.len; @@ -5091,6 +5110,8 @@ int cmd_pack_objects(int argc, N_("unpack unreachable objects newer than