diff --git a/Documentation/config/repack.adoc b/Documentation/config/repack.adoc index 4c22a499f6216c..9fd250870eb0a4 100644 --- a/Documentation/config/repack.adoc +++ b/Documentation/config/repack.adoc @@ -64,3 +64,13 @@ repack.midxNewLayerThreshold:: When the tip layer has fewer packs than this threshold, those packs are excluded from the geometric repack entirely, and are thus left unmodified. Must be at least 1. Defaults to 8. + +repack.aggregate:: + If set to true, linkgit:git-repack[1] will spawn a background + linkgit:git-pack-aggregate[1] for the duration of its main + pack-objects run. The aggregator rolls up small packs and + loose objects that arrive after pack-objects has enumerated + its inputs, preventing them from accumulating in + `objects/pack/` and slowing other Git operations on busy + servers. Defaults to false. Can be overridden on the + command line with `--aggregate` or `--no-aggregate`. diff --git a/Documentation/git-pack-aggregate.adoc b/Documentation/git-pack-aggregate.adoc new file mode 100644 index 00000000000000..0848dde6d64208 --- /dev/null +++ b/Documentation/git-pack-aggregate.adoc @@ -0,0 +1,115 @@ +git-pack-aggregate(1) +===================== + +NAME +---- +git-pack-aggregate - Periodically roll up loose objects and small packs +into a single pack without doing any delta compression or search + +SYNOPSIS +-------- +[verse] +'git pack-aggregate' (--once | --loop) [--interval=<seconds>] + [--min-loose=<n>] [--min-packs=<n>] + [--exclude-pack-file=<file>] + [--exclude-loose-file=<file>] + [--parent-pipe-fd=<fd>] + +DESCRIPTION +----------- + +`git pack-aggregate` watches `$GIT_DIR/objects` for loose objects and +small packfiles that have accumulated since the last cycle, and +periodically rolls them up into a single new pack. Each cycle has two +steps: + +1. Bundle local loose objects (minus any listed in + `--exclude-loose-file`) into a new pack and unlink the loose copies. +2.
Aggregate small local packs (minus any listed in + `--exclude-pack-file`, or those referenced by the multi-pack-index, + or those carrying a `.keep`, `.promisor`, `.mtimes`, or `.bitmap` + sidecar) into another new pack and unlink the source packs. The + pack written in step 1 is naturally a candidate here and will + normally be folded into the step-2 output. + +No delta search is performed, no bitmaps or commit-graphs are updated, +and no objects are recompressed; the existing on-disk representation +of each object is copied through unchanged. Both new packs are +written with `pack-objects --window=0 --mark-bad-deltas`, so each one +ships with a `.baddeltas` sidecar (see linkgit:gitformat-pack[5]). A +subsequent thorough repack (linkgit:git-repack[1]) honors that marker +and reconsiders intra-pack deltas at that time. + +The purpose of this command is to keep `objects/` from accumulating +large numbers of loose objects or small packs while a long-running +repack is in flight, so unrelated `git` operations are not slowed down +by readdir cost in a sprawling object directory. When +`repack.aggregate` is set (see linkgit:git-repack[1]), `git repack` spawns +this command for the duration of its own work. + +`git pack-aggregate` takes no locks of its own. It mirrors +linkgit:git-repack[1] in this regard: callers that need serialization +must arrange it themselves (for example by running under `git gc`). +The exclusion files supplied via `--exclude-pack-file` and +`--exclude-loose-file` are the only mechanism by which a concurrent +`pack-objects` or `git repack` declares "do not touch these inputs". + +OPTIONS +------- + +--once:: + Run a single cycle and exit. Exactly one of `--once` or + `--loop` is required. + +--loop:: + Run cycles forever, sleeping `--interval` seconds between + cycles. Stops on `SIGTERM`, `SIGHUP`, or `SIGINT`. + +--interval=<seconds>:: + Number of seconds to sleep between the end of one cycle and the + start of the next. Defaults to 60.
Only meaningful with + `--loop`. + +--min-loose=<n>:: + Skip step 1 if fewer than `<n>` loose objects (after applying + `--exclude-loose-file`) are present. Defaults to 5. + +--min-packs=<n>:: + Skip step 2 if fewer than `<n>` aggregatable packs (after + applying all exclusions) are present. Defaults to 5. + +--exclude-pack-file=<file>:: + Read a list of pack basenames (one per line) from `<file>` and + never touch any of those packs. Lines may name a basename with + or without a `.pack` or `.idx` suffix; the suffix is stripped. + Blank lines and lines beginning with `#` are ignored. When + `git pack-aggregate` is spawned by a long-running `git repack`, + this file is populated by that `pack-objects` itself via + `--emit-input-packs`, and may contain a conservative superset + of the packs it will actually consume (such as in `--geometric` + mode). + +--exclude-loose-file=<file>:: + Read a list of loose object IDs (one per line) from `<file>` + and never pack or unlink any of those loose objects. Blank + lines and lines beginning with `#` are ignored. When `git + pack-aggregate` is spawned by a long-running `git repack`, this + file is populated by that `pack-objects` itself via + `--emit-input-loose`. + +--parent-pipe-fd=<fd>:: + An inherited pipe file descriptor whose write end the parent + process holds open. When the parent exits the pipe is closed + and `git pack-aggregate` exits at the next cycle boundary, + without waiting out the remainder of `--interval`. This is an + internal plumbing option used by `git repack` to keep its + companion aggregator from outliving it. + +SEE ALSO +-------- +linkgit:git-repack[1], linkgit:git-pack-objects[1], +linkgit:gitformat-pack[5] + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Documentation/git-pack-objects.adoc b/Documentation/git-pack-objects.adoc index 8a27aa19fd3f1f..6a6c6ec76932b6 100644 --- a/Documentation/git-pack-objects.adoc +++ b/Documentation/git-pack-objects.adoc @@ -143,6 +143,14 @@ options which imply `--revs`.
have an mtime older than ``. If unspecified (and given `--cruft`), then no objects are eliminated. +--mark-bad-deltas:: + Write a `.baddeltas` marker file alongside each output pack. The + marker signals that objects within the pack have not been fully + delta-searched against other objects within the same pack and + that future repacking should consider them. Any deltas that do + exist within this pack can still be reused, however. + Incompatible with `--stdout`. + --window=:: --depth=:: These two options affect how the objects contained in diff --git a/Documentation/git-repack.adoc b/Documentation/git-repack.adoc index 72c42015e23f94..d140c68d40a99d 100644 --- a/Documentation/git-repack.adoc +++ b/Documentation/git-repack.adoc @@ -12,6 +12,7 @@ SYNOPSIS 'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=] [--depth=] [--threads=] [--keep-pack=] [--write-midx[=]] [--name-hash-version=] [--path-walk] + [--[no-]aggregate] DESCRIPTION ----------- @@ -300,6 +301,16 @@ created for any new pack(s) without disturbing the existing chain. Pass the `--path-walk` option to the underlying `git pack-objects` process. See linkgit:git-pack-objects[1] for full details. +--aggregate:: +--no-aggregate:: + Spawn a background linkgit:git-pack-aggregate[1] for the + duration of the main pack-objects run. The aggregator rolls + up small packs and loose objects that arrive after + pack-objects has enumerated its inputs, preventing them from + accumulating in `objects/pack/` and slowing other Git + operations on busy servers. Overrides the + `repack.aggregate` configuration variable. Off by default. 
+ CONFIGURATION ------------- @@ -324,6 +335,7 @@ SEE ALSO -------- linkgit:git-pack-objects[1] linkgit:git-prune-packed[1] +linkgit:git-pack-aggregate[1] GIT --- diff --git a/Documentation/gitformat-pack.adoc b/Documentation/gitformat-pack.adoc index 3416edceab82e9..1ea5e66e54618c 100644 --- a/Documentation/gitformat-pack.adoc +++ b/Documentation/gitformat-pack.adoc @@ -12,6 +12,7 @@ SYNOPSIS $GIT_DIR/objects/pack/pack-*.{pack,idx} $GIT_DIR/objects/pack/pack-*.rev $GIT_DIR/objects/pack/pack-*.mtimes +$GIT_DIR/objects/pack/pack-*.baddeltas $GIT_DIR/objects/pack/multi-pack-index DESCRIPTION @@ -357,6 +358,32 @@ All 4-byte numbers are in network byte order. and a checksum of all of the above (each having length according to the specified hash function). +== pack-*.baddeltas files + +The optional `.baddeltas` file is an empty marker sitting alongside a +`pack-*.pack` (and its `.idx`). It signals to `git pack-objects` that +the delta layout of the pack should not be trusted: even when two +objects appear together in the same pack and neither is stored as a +delta, the next packing run should still call out to its delta search +routine for the pair instead of assuming a prior pack-objects already +considered (and rejected) the pair. + +This is intended for producers that intentionally skip delta search +when writing a pack (for example, processes that bulk-import objects +or aggregate multiple existing packs without recomputing deltas). +Without this marker, the same-pack delta skip in `git pack-objects` +would silently inherit those producers' lack of delta search into +future repacks. + +The contents of the file are currently ignored. Producers should +write an empty file; consumers must tolerate (and ignore) any +content. + +The marker only affects whether `git pack-objects` will attempt to +compute new deltas for object pairs that share the marked pack. 
It +does not disable reuse of existing on-disk deltas, nor does it affect +multi-pack-index, bitmap, or pack reuse for transfer. + == multi-pack-index (MIDX) files have the following format: The multi-pack-index files refer to multiple pack-files and loose objects. diff --git a/Documentation/meson.build b/Documentation/meson.build index f4854f802d455f..c1c7d2bf1ea68b 100644 --- a/Documentation/meson.build +++ b/Documentation/meson.build @@ -99,6 +99,7 @@ manpages = { 'git-name-rev.adoc' : 1, 'git-notes.adoc' : 1, 'git-p4.adoc' : 1, + 'git-pack-aggregate.adoc' : 1, 'git-pack-objects.adoc' : 1, 'git-pack-refs.adoc' : 1, 'git-patch-id.adoc' : 1, diff --git a/Makefile b/Makefile index b31ecb07564a73..d6776db1df84fc 100644 --- a/Makefile +++ b/Makefile @@ -1450,6 +1450,7 @@ BUILTIN_OBJS += builtin/multi-pack-index.o BUILTIN_OBJS += builtin/mv.o BUILTIN_OBJS += builtin/name-rev.o BUILTIN_OBJS += builtin/notes.o +BUILTIN_OBJS += builtin/pack-aggregate.o BUILTIN_OBJS += builtin/pack-objects.o ifndef WITH_BREAKING_CHANGES BUILTIN_OBJS += builtin/pack-redundant.o diff --git a/builtin.h b/builtin.h index 4e47a4ebd30ba3..8fdd14b49360cf 100644 --- a/builtin.h +++ b/builtin.h @@ -224,6 +224,7 @@ int cmd_multi_pack_index(int argc, const char **argv, const char *prefix, struct int cmd_mv(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_name_rev(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_notes(int argc, const char **argv, const char *prefix, struct repository *repo); +int cmd_pack_aggregate(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_pack_objects(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_pack_redundant(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_patch_id(int argc, const char **argv, const char *prefix, struct repository *repo); diff --git a/builtin/pack-aggregate.c b/builtin/pack-aggregate.c 
new file mode 100644 index 00000000000000..e6bbee47967a93 --- /dev/null +++ b/builtin/pack-aggregate.c @@ -0,0 +1,636 @@ +#define USE_THE_REPOSITORY_VARIABLE + +#include "builtin.h" +#include "gettext.h" +#include "hash.h" +#include "hex.h" +#include "midx.h" +#include "object-file.h" +#include "odb.h" +#include "oid-array.h" +#include "packfile.h" +#include "parse-options.h" +#include "path.h" +#include "repository.h" +#include "run-command.h" +#include "sigchain.h" +#include "strbuf.h" +#include "string-list.h" +#include "strmap.h" +#include "strvec.h" +#include "wrapper.h" + +static const char *const pack_aggregate_usage[] = { + N_("git pack-aggregate (--once | --loop) [--interval=<seconds>]\n" + " [--min-loose=<n>] [--min-packs=<n>]\n" + " [--exclude-pack-file=<file>]\n" + " [--exclude-loose-file=<file>]\n" + " [--parent-pipe-fd=<fd>]"), + NULL +}; + +static volatile sig_atomic_t stop_signaled; +static volatile sig_atomic_t active_child_pid; +static int parent_pipe_fd = -1; + +static void term_handler(int sig) +{ + stop_signaled = 1; + /* + * Best-effort forward to an in-flight pack-objects so it + * doesn't keep running after we have been asked to stop. + * The check-and-signal is racy with finish_command() in the + * main thread; the worst case is signaling a pid we no longer + * own (kill() returns ESRCH and we ignore it). + */ + if (active_child_pid > 0) + kill(active_child_pid, sig); +} + +static int is_hex(const char *s, size_t len) +{ + while (len--) + if (!isxdigit(*s++)) + return 0; + return 1; +} + +static int looks_like_pack_basename(const char *name, size_t hexsz) +{ + static const char prefix[] = "pack-"; + size_t plen = strlen(prefix); + size_t nlen = strlen(name); + + /* Does this look like "pack-<hex>" with no extension?
*/ + if (nlen != plen + hexsz || strncmp(name, prefix, plen)) + return 0; + return is_hex(name + plen, hexsz); +} + +static int has_sidecar(const char *packdir, const char *basename, + const char *ext) +{ + struct strbuf buf = STRBUF_INIT; + struct stat st; + int ret; + + strbuf_addf(&buf, "%s/%s.%s", packdir, basename, ext); + ret = !lstat(buf.buf, &st); + strbuf_release(&buf); + return ret; +} + +static int has_protective_sidecar(const char *packdir, const char *basename) +{ + /* + * Ignore packs with sidecars that mean "don't touch me". .baddeltas + * is intentionally absent: rolling those up is the point. + */ + static const char *exts[] = { + "keep", "promisor", "mtimes", "bitmap", NULL + }; + int i; + + for (i = 0; exts[i]; i++) + if (has_sidecar(packdir, basename, exts[i])) + return 1; + return 0; +} + +static int has_idx(const char *packdir, const char *basename) +{ + return has_sidecar(packdir, basename, "idx"); +} + +static void load_exclusions_from_file(const char *path, struct strset *set) +{ + FILE *fp; + struct strbuf line = STRBUF_INIT; + + fp = fopen(path, "r"); + if (!fp) + die_errno(_("could not open exclude file '%s'"), path); + + while (strbuf_getline_lf(&line, fp) != EOF) { + strbuf_trim(&line); + if (!line.len || line.buf[0] == '#') + continue; + strbuf_strip_suffix(&line, ".pack"); + strbuf_strip_suffix(&line, ".idx"); + strset_add(set, line.buf); + } + strbuf_release(&line); + fclose(fp); +} + +/* + * Re-read the multi-pack-index (and any incremental layers) and + * populate `set` with the basenames of every referenced pack. The + * strset is cleared first so this is safe to call once per cycle. 
+ */ +static int refresh_midx_exclusions(struct repository *repo, + struct strset *set) +{ + struct multi_pack_index *m; + int had_midx = 0; + + strset_clear(set); + + odb_reprepare(repo->objects); + m = get_multi_pack_index(repo->objects->sources); + for (; m; m = m->base_midx) { + uint32_t i; + had_midx = 1; + for (i = 0; i < m->num_packs; i++) { + struct strbuf base = STRBUF_INIT; + strbuf_addstr(&base, m->pack_names[i]); + strbuf_strip_suffix(&base, ".idx"); + strbuf_strip_suffix(&base, ".pack"); + strset_add(set, base.buf); + strbuf_release(&base); + } + } + return had_midx; +} + +/* ---------- loose-object pre-pass ---------- */ + +struct loose_scan { + struct strset *exclude; + struct oid_array *oids; + struct string_list *paths; +}; + +static int loose_scan_cb(const struct object_id *oid, const char *path, + void *cb_data) +{ + struct loose_scan *data = cb_data; + + if (strset_contains(data->exclude, oid_to_hex(oid))) + return 0; + oid_array_append(data->oids, oid); + string_list_append(data->paths, path); + return 0; +} + +static int run_pack_objects_loose(const char *packtmp, struct oid_array *oids, + struct strbuf *out_hash) +{ + struct child_process cmd = CHILD_PROCESS_INIT; + struct strbuf line = STRBUF_INIT; + FILE *in, *out; + size_t i; + int ret; + + strvec_pushl(&cmd.args, + "pack-objects", + "--window=0", + "--mark-bad-deltas", + "--delta-base-offset", + "--no-write-bitmap-index", + "--quiet", + packtmp, + NULL); + cmd.git_cmd = 1; + cmd.in = -1; + cmd.out = -1; + + if (start_command(&cmd)) + return -1; + active_child_pid = cmd.pid; + + /* + * The order is "write all stdin, then read all stdout". This + * is only safe because --quiet bounds pack-objects's stdout to + * a single short pack-hash line, which fits in the pipe buffer; + * pack-objects can therefore drain its stdin without blocking + * on stdout writes back to us. 
+ */ + in = xfdopen(cmd.in, "w"); + for (i = 0; i < oids->nr; i++) + fprintf(in, "%s\n", oid_to_hex(&oids->oid[i])); + if (fclose(in)) + warning_errno(_("could not close pack-objects input")); + + /* + * Capture the first line as the pack hash, then drain to EOF so + * pack-objects observes a clean close on its stdout and we never + * end up in finish_command() with bytes left in the pipe. + */ + out = xfdopen(cmd.out, "r"); + while (strbuf_getline_lf(&line, out) != EOF) { + if (!out_hash->len) + strbuf_addbuf(out_hash, &line); + } + fclose(out); + + ret = finish_command(&cmd); + active_child_pid = 0; + strbuf_release(&line); + return ret; +} + +static void unlink_loose_paths(const struct string_list *paths) +{ + size_t i; + + for (i = 0; i < paths->nr; i++) { + const char *p = paths->items[i].string; + if (unlink(p) < 0 && errno != ENOENT) + warning_errno(_("could not unlink loose object '%s'"), + p); + } +} + +/* ---------- pack aggregation ---------- */ + +struct pack_scan { + const char *packdir; + struct strset *file_exclude; + struct strset *midx_exclude; + struct string_list *candidates; + size_t hexsz; +}; + +static void pack_scan_cb(const char *full_path UNUSED, + size_t full_path_len UNUSED, + const char *file_name, void *cb_data) +{ + struct pack_scan *data = cb_data; + struct strbuf base = STRBUF_INIT; + + if (!ends_with(file_name, ".pack")) + return; + strbuf_addstr(&base, file_name); + strbuf_setlen(&base, base.len - strlen(".pack")); + + if (!looks_like_pack_basename(base.buf, data->hexsz)) + goto out; + if (strset_contains(data->file_exclude, base.buf)) + goto out; + if (strset_contains(data->midx_exclude, base.buf)) + goto out; + if (has_protective_sidecar(data->packdir, base.buf)) + goto out; + if (!has_idx(data->packdir, base.buf)) + goto out; + + string_list_append(data->candidates, base.buf); +out: + strbuf_release(&base); +} + +static int run_pack_objects_packs(const char *packtmp, + const struct string_list *bases, + struct strbuf *out_hash) 
+{ + struct child_process cmd = CHILD_PROCESS_INIT; + struct strbuf line = STRBUF_INIT; + FILE *in, *out; + size_t i; + int ret; + + strvec_pushl(&cmd.args, + "pack-objects", + "--stdin-packs", + "--window=0", + "--mark-bad-deltas", + "--delta-base-offset", + "--no-write-bitmap-index", + "--quiet", + packtmp, + NULL); + cmd.git_cmd = 1; + cmd.in = -1; + cmd.out = -1; + + if (start_command(&cmd)) + return -1; + active_child_pid = cmd.pid; + + /* See run_pack_objects_loose() for why write-then-read is safe. */ + in = xfdopen(cmd.in, "w"); + for (i = 0; i < bases->nr; i++) + fprintf(in, "%s.pack\n", bases->items[i].string); + if (fclose(in)) + warning_errno(_("could not close pack-objects input")); + + out = xfdopen(cmd.out, "r"); + while (strbuf_getline_lf(&line, out) != EOF) { + if (!out_hash->len) + strbuf_addbuf(out_hash, &line); + } + fclose(out); + + ret = finish_command(&cmd); + active_child_pid = 0; + strbuf_release(&line); + return ret; +} + +/* + * pack-objects writes its output as <packtmp>-<hash>.<ext>; rename it + * into place as <packdir>/pack-<hash>.<ext>. .idx is renamed last so + * a concurrent reader scanning the pack directory never sees a .idx + * without its companion .pack.
+ */ +static void install_pack(const char *packtmp, const char *packdir, + const char *hash) +{ + static const char *exts[] = { + ".pack", ".rev", ".baddeltas", ".idx" + }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(exts); i++) { + struct strbuf src = STRBUF_INIT; + struct strbuf dst = STRBUF_INIT; + struct stat st; + + strbuf_addf(&src, "%s-%s%s", packtmp, hash, exts[i]); + strbuf_addf(&dst, "%s/pack-%s%s", packdir, hash, exts[i]); + + if (!stat(src.buf, &st)) { + if (chmod(src.buf, st.st_mode & ~(S_IWUSR | S_IWGRP | S_IWOTH))) + warning_errno(_("could not make '%s' read-only"), + src.buf); + if (rename(src.buf, dst.buf)) + die_errno(_("renaming '%s' to '%s' failed"), + src.buf, dst.buf); + } else if (errno != ENOENT) { + die_errno(_("could not stat '%s'"), src.buf); + } + strbuf_release(&src); + strbuf_release(&dst); + } +} + +static void unlink_consumed_packs(const char *packdir, + const struct string_list *bases, + const char *keep_basename) +{ + /* .idx goes first so no reader finds an index whose pack is gone */ + static const char *exts[] = { + "idx", "pack", "rev", "baddeltas", NULL }; + size_t i; + + for (i = 0; i < bases->nr; i++) { + const char *base = bases->items[i].string; + int j; + + /* + * Recheck protective sidecars: a .keep (or similar) may + * have appeared between scan and now, in which case the + * pack must stay. + */ + if (has_protective_sidecar(packdir, base)) + continue; + /* + * Never unlink the pack we just installed. Aggregating + * packs whose union equals one of the inputs (e.g. one + * input is a strict superset of the others) can yield a + * byte-identical output, which lands at the same final + * name as that input. In that case, we do not want the + * file serving as both input and output to be deleted.
+ */ + if (keep_basename && !strcmp(base, keep_basename)) + continue; + + for (j = 0; exts[j]; j++) { + struct strbuf fname = STRBUF_INIT; + strbuf_addf(&fname, "%s/%s.%s", packdir, base, + exts[j]); + if (unlink(fname.buf) < 0 && errno != ENOENT) + warning_errno(_("could not unlink '%s'"), + fname.buf); + strbuf_release(&fname); + } + } +} + +static int do_one_cycle(struct repository *repo, const char *packdir, + struct strset *pack_exclude, + struct strset *loose_exclude, + struct strset *midx_exclude, + int min_loose, int min_packs) +{ + struct oid_array loose_oids = OID_ARRAY_INIT; + struct string_list loose_paths = STRING_LIST_INIT_DUP; + struct loose_scan loose_data = { + .exclude = loose_exclude, + .oids = &loose_oids, + .paths = &loose_paths, + }; + struct string_list candidates = STRING_LIST_INIT_DUP; + struct pack_scan pack_data = { + .packdir = packdir, + .file_exclude = pack_exclude, + .midx_exclude = midx_exclude, + .candidates = &candidates, + .hexsz = repo->hash_algo->hexsz, + }; + struct strbuf loose_hash = STRBUF_INIT; + struct strbuf packs_hash = STRBUF_INIT; + char *packtmp_loose = NULL; + char *packtmp_packs = NULL; + int ret = 0; + + /* + * Step 1: bundle local loose objects (minus excluded ones) into + * a single new pack and remove the on-disk loose copies. The + * resulting pack picks up a .baddeltas marker thanks to + * --mark-bad-deltas, and is in turn a candidate for the pack + * aggregation step below.
+ */ + for_each_loose_file_in_source(repo->objects->sources, + loose_scan_cb, NULL, NULL, &loose_data); + if ((int)loose_oids.nr >= min_loose && !stop_signaled) { + packtmp_loose = mkpathdup("%s/.tmp-%d-loose-pack", + packdir, (int)getpid()); + if (run_pack_objects_loose(packtmp_loose, &loose_oids, + &loose_hash)) { + ret = error(_("pack-objects failed during " + "loose-object rollup")); + goto out; + } + if (loose_hash.len) { + install_pack(packtmp_loose, packdir, loose_hash.buf); + unlink_loose_paths(&loose_paths); + } + } + + if (stop_signaled) + goto out; + + /* + * Step 2: aggregate small packs into a single bigger pack. The + * freshly-installed loose-rollup pack from step 1 is picked up + * naturally by the directory scan below (it has no protective + * sidecars and isn't in any exclusion set). Re-read the MIDX + * just before scanning so a pack added to a MIDX between cycles + * is honored. + */ + refresh_midx_exclusions(repo, midx_exclude); + for_each_file_in_pack_dir(repo_get_object_directory(repo), + pack_scan_cb, &pack_data); + + if ((int)candidates.nr < min_packs) + goto out; + + packtmp_packs = mkpathdup("%s/.tmp-%d-pack", + packdir, (int)getpid()); + if (run_pack_objects_packs(packtmp_packs, &candidates, + &packs_hash)) { + ret = error(_("pack-objects failed during " + "pack aggregation")); + goto out; + } + if (packs_hash.len) { + struct strbuf output_base = STRBUF_INIT; + install_pack(packtmp_packs, packdir, packs_hash.buf); + strbuf_addf(&output_base, "pack-%s", packs_hash.buf); + unlink_consumed_packs(packdir, &candidates, output_base.buf); + strbuf_release(&output_base); + } + +out: + free(packtmp_loose); + free(packtmp_packs); + strbuf_release(&loose_hash); + strbuf_release(&packs_hash); + string_list_clear(&candidates, 0); + oid_array_clear(&loose_oids); + string_list_clear(&loose_paths, 0); + return ret; +} + +static void interruptible_sleep(unsigned int seconds) +{ + struct pollfd pfd; + int timeout_ms; + + if (stop_signaled) + return; + + 
if (parent_pipe_fd < 0) { + unsigned int remaining = seconds; + while (remaining > 0 && !stop_signaled) + remaining = sleep(remaining); + return; + } + + /* + * Watch the parent pipe so we wake immediately if the process + * that spawned us exits. POLLHUP is reported in revents + * regardless of whether it appears in events, so we leave + * events==0; any of POLLHUP/POLLERR/POLLNVAL/POLLIN means the + * other end of the pipe is gone and we should stop. + */ + pfd.fd = parent_pipe_fd; + pfd.events = 0; + timeout_ms = (seconds > INT_MAX / 1000) ? INT_MAX + : (int)(seconds * 1000); + + while (!stop_signaled) { + int ret; + pfd.revents = 0; + ret = poll(&pfd, 1, timeout_ms); + if (ret < 0) { + if (errno == EINTR) + continue; + break; + } + if (ret == 0) + break; + if (pfd.revents & (POLLHUP | POLLERR | POLLNVAL | POLLIN)) { + stop_signaled = 1; + break; + } + } +} + +int cmd_pack_aggregate(int argc, const char **argv, + const char *prefix, struct repository *repo) +{ + const char *exclude_pack_file = NULL; + const char *exclude_loose_file = NULL; + int interval = 60; + int min_packs = 5; + int min_loose = 5; + int once = 0; + int loop = 0; + struct option options[] = { + OPT_BOOL(0, "once", &once, + N_("run a single cycle and exit")), + OPT_BOOL(0, "loop", &loop, + N_("loop forever, sleeping --interval seconds " + "between cycles")), + OPT_INTEGER(0, "interval", &interval, + N_("seconds to sleep between cycles " + "(default 60)")), + OPT_INTEGER(0, "min-loose", &min_loose, + N_("skip loose-object rollup if fewer " + "candidates (default 5)")), + OPT_INTEGER(0, "min-packs", &min_packs, + N_("skip pack aggregation if fewer " + "candidates (default 5)")), + OPT_STRING(0, "exclude-pack-file", &exclude_pack_file, + N_("file"), + N_("file listing pack basenames never to " + "touch")), + OPT_STRING(0, "exclude-loose-file", &exclude_loose_file, + N_("file"), + N_("file listing loose object OIDs never to " + "touch")), + OPT_INTEGER(0, "parent-pipe-fd", &parent_pipe_fd, + 
N_("inherited fd of a pipe whose write end " + "the parent holds; EOF triggers exit")), + OPT_END(), + }; + struct strset pack_exclude = STRSET_INIT; + struct strset loose_exclude = STRSET_INIT; + struct strset midx_exclude = STRSET_INIT; + char *packdir; + int ret = 0; + + argc = parse_options(argc, argv, prefix, options, + pack_aggregate_usage, 0); + if (argc > 0) + usage_with_options(pack_aggregate_usage, options); + if (once == loop) + die(_("exactly one of --once or --loop is required")); + if (interval < 1) + die(_("--interval must be at least 1")); + if (min_loose < 1) + die(_("--min-loose must be at least 1")); + if (min_packs < 1) + die(_("--min-packs must be at least 1")); + + packdir = mkpathdup("%s/pack", repo_get_object_directory(repo)); + + if (exclude_pack_file) + load_exclusions_from_file(exclude_pack_file, &pack_exclude); + if (exclude_loose_file) + load_exclusions_from_file(exclude_loose_file, &loose_exclude); + + sigchain_push(SIGTERM, term_handler); + sigchain_push(SIGHUP, term_handler); + sigchain_push(SIGINT, term_handler); + + do { + if (stop_signaled) + break; + ret = do_one_cycle(repo, packdir, &pack_exclude, + &loose_exclude, &midx_exclude, + min_loose, min_packs); + if (ret || once || stop_signaled) + break; + interruptible_sleep((unsigned int)interval); + } while (!stop_signaled); + + strset_clear(&pack_exclude); + strset_clear(&loose_exclude); + strset_clear(&midx_exclude); + free(packdir); + return ret ? 
1 : 0; +} diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index fe9fbecb30e100..c5758116504496 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -43,6 +43,7 @@ #include "pack-mtimes.h" #include "parse-options.h" #include "pkt-line.h" +#include "path.h" #include "blob.h" #include "tree.h" #include "path-walk.h" @@ -211,6 +212,9 @@ static int keep_unreachable, unpack_unreachable, include_tag; static timestamp_t unpack_unreachable_expiration; static int pack_loose_unreachable; static int cruft; +static int mark_bad_deltas; +static const char *emit_input_packs_path; +static const char *emit_input_loose_path; static int shallow = 0; static timestamp_t cruft_expiration; static int local; @@ -1459,6 +1463,23 @@ static void write_pack_file(void) &pack_idx_opts, hash, &idx_tmp_name); + if (mark_bad_deltas) { + size_t tmpname_len = tmpname.len; + int fd; + + strbuf_addstr(&tmpname, "baddeltas"); + fd = xopen(tmpname.buf, + O_WRONLY | O_CREAT | O_TRUNC, 0444); + if (close(fd)) + die_errno(_("unable to close '%s'"), + tmpname.buf); + if (adjust_shared_perm(the_repository, + tmpname.buf)) + die_errno(_("unable to make '%s' readable"), + tmpname.buf); + strbuf_setlen(&tmpname, tmpname_len); + } + if (write_bitmap_index) { size_t tmpname_len = tmpname.len; @@ -2794,9 +2815,15 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, * be considered, as even if we produce a suboptimal delta against * it, we will still save the transfer cost, as we already know * the other side has it and we won't send src_entry at all. + * + * If the source pack carries a ".baddeltas" marker, we treat its + * existing delta layout as untrusted: even if the two objects are + * in the same pack and neither is a delta, we have no reason to + * believe a previous packing run actually considered the pair. 
*/ if (reuse_delta && IN_PACK(trg_entry) && IN_PACK(trg_entry) == IN_PACK(src_entry) && + !IN_PACK(trg_entry)->has_bad_deltas && !src_entry->preferred_base && trg_entry->in_pack_type != OBJ_REF_DELTA && trg_entry->in_pack_type != OBJ_OFS_DELTA) @@ -4057,6 +4084,7 @@ static void read_stdin_packs(enum stdin_packs_mode mode, int rev_list_unpacked) { int prev_fetch_if_missing = fetch_if_missing; struct rev_info revs; + int need_walk; /* * The revision walk may hit objects that are promised, only. As the @@ -4074,7 +4102,14 @@ static void read_stdin_packs(enum stdin_packs_mode mode, int rev_list_unpacked) * That may cause us to avoid populating all of the namehash fields of * all included objects, but our goal is best-effort, since this is only * an optimization during delta selection. + * + * However, the walk is only needed for delta selection (which + * consumes the namehash) and for STDIN_PACKS_MODE_FOLLOW (which + * uses the walk to discover additional reachable objects); skip + * it when neither applies. 
*/ + need_walk = (window && depth) || mode == STDIN_PACKS_MODE_FOLLOW; + revs.no_kept_objects = 1; revs.keep_pack_cache_flags |= KEPT_PACK_IN_CORE; revs.blob_objects = 1; @@ -4097,12 +4132,14 @@ static void read_stdin_packs(enum stdin_packs_mode mode, int rev_list_unpacked) if (rev_list_unpacked) add_unreachable_loose_objects(&revs); - if (prepare_revision_walk(&revs)) - die(_("revision walk setup failed")); - traverse_commit_list(&revs, - show_commit_pack_hint, - show_object_pack_hint, - &mode); + if (need_walk) { + if (prepare_revision_walk(&revs)) + die(_("revision walk setup failed")); + traverse_commit_list(&revs, + show_commit_pack_hint, + show_object_pack_hint, + &mode); + } release_revisions(&revs); @@ -5005,6 +5042,62 @@ static int parse_stdin_packs_mode(const struct option *opt, const char *arg, return 0; } +static void emit_input_packs_to_file(const char *path) +{ + struct strbuf tmp = STRBUF_INIT; + struct packed_git *p; + FILE *fp; + + strbuf_addf(&tmp, "%s.tmp", path); + fp = fopen(tmp.buf, "w"); + if (!fp) + die_errno(_("unable to write '%s'"), tmp.buf); + repo_for_each_pack(the_repository, p) { + /* Exclude alternates */ + if (!p->pack_local) + continue; + fprintf(fp, "%s\n", pack_basename(p)); + } + if (fclose(fp)) + die_errno(_("unable to write '%s'"), tmp.buf); + if (rename(tmp.buf, path)) + die_errno(_("unable to rename '%s' to '%s'"), tmp.buf, path); + strbuf_release(&tmp); +} + +static int emit_input_loose_cb(const struct object_id *oid, + const char *path UNUSED, + void *data) +{ + FILE *fp = data; + fprintf(fp, "%s\n", oid_to_hex(oid)); + return 0; +} + +static void emit_input_loose_to_file(const char *path) +{ + struct strbuf tmp = STRBUF_INIT; + FILE *fp; + + strbuf_addf(&tmp, "%s.tmp", path); + fp = fopen(tmp.buf, "w"); + if (!fp) + die_errno(_("unable to write '%s'"), tmp.buf); + /* + * Note: for_each_loose_file_in_source() walks only the local + * source (sources->next is skipped), thus excluding + * alternates and matching the 
"p->pack_local" check in + * emit_input_packs_to_file(). + */ + for_each_loose_file_in_source(the_repository->objects->sources, + emit_input_loose_cb, NULL, NULL, fp); + if (fclose(fp)) + die_errno(_("unable to write '%s'"), tmp.buf); + if (rename(tmp.buf, path)) + die_errno(_("unable to rename '%s' to '%s'"), tmp.buf, path); + strbuf_release(&tmp); +} + int cmd_pack_objects(int argc, const char **argv, const char *prefix, @@ -5085,6 +5178,8 @@ int cmd_pack_objects(int argc, N_("unpack unreachable objects newer than