aboutsummaryrefslogtreecommitdiff
path: root/midx-write.c
diff options
context:
space:
mode:
authorTaylor Blau <me@ttaylorr.com>2026-02-24 14:00:41 -0500
committerJunio C Hamano <gitster@pobox.com>2026-02-24 11:16:34 -0800
commit9df44a97f165bbdd8d591c6e99c8894ae036b5bd (patch)
treef0fdbc701c8e8e47966817eda9c74cfdcd6e3360 /midx-write.c
parentdedf71f0b1b9a64d5aa7558ef45b26166d8d66fc (diff)
downloadgit-9df44a97f165bbdd8d591c6e99c8894ae036b5bd.tar.xz
midx: implement MIDX compaction
When managing a MIDX chain with many layers, it is convenient to combine a sequence of adjacent layers into a single layer to prevent the chain from growing too long. While it is conceptually possible to "compact" a sequence of MIDX layers together by running "git multi-pack-index write --stdin-packs", there are a few drawbacks that make this less than desirable: - Preserving the MIDX chain is impossible, since there is no way to write a MIDX layer that contains objects or packs found in an earlier MIDX layer already part of the chain. So callers would have to write an entirely new (non-incremental) MIDX containing only the compacted layers, discarding all other objects/packs from the MIDX. - There is (currently) no way to write a MIDX layer outside of the MIDX chain to work around the above, such that the MIDX chain could be reassembled substituting the compacted layers with the MIDX that was written. - The `--stdin-packs` command-line option does not allow us to specify the order of packs as they appear in the MIDX. Therefore, even if there were workarounds for the previous two challenges, any bitmaps belonging to layers which come after the compacted layer(s) would no longer be valid. This commit introduces a way to compact a sequence of adjacent MIDX layers into a single layer while preserving the MIDX chain, as well as any bitmap(s) in layers which are newer than the compacted ones. Implementing MIDX compaction does not require a significant number of changes to how MIDX layers are written. The main changes are as follows: - Instead of calling `fill_packs_from_midx()`, we call a new function `fill_packs_from_midx_range()`, which walks backwards along the portion of the MIDX chain which we are compacting, and adds packs one layer a time. In order to preserve the pseudo-pack order, the concatenated pack order is preserved, with the exception of preferred packs which are always added first. - After adding entries from the set of packs in the compaction range, `compute_sorted_entries()` must adjust the `pack_int_id`'s for all objects added in each fanout layer to match their original `pack_int_id`'s (as opposed to the index at which each pack appears in `ctx.info`). Note that we cannot reuse `midx_fanout_add_midx_fanout()` directly here, as it unconditionally recurs through the `->base_midx`. Factor out a `_1()` variant that operates on a single layer, reimplement the existing function in terms of it, and use the new variant from `midx_fanout_add_compact()`. Since we are sorting the list of objects ourselves, the order we add them in does not matter. - When writing out the new 'multi-pack-index-chain' file, discard any layers in the compaction range, replacing them with the newly written layer, instead of keeping them and placing the new layer at the end of the chain. This ends up being sufficient to implement MIDX compaction in such a way that preserves bitmaps corresponding to more recent layers in the MIDX chain. The tests for MIDX compaction are so far fairly spartan, since the main interesting behavior here is ensuring that the right packs/objects are selected from each layer, and that the pack order is preserved despite whether or not they are sorted in lexicographic order in the original MIDX chain. Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'midx-write.c')
-rw-r--r--midx-write.c277
1 files changed, 250 insertions, 27 deletions
diff --git a/midx-write.c b/midx-write.c
index ca2469213e..bf53ad1c4b 100644
--- a/midx-write.c
+++ b/midx-write.c
@@ -113,6 +113,10 @@ struct write_midx_context {
int incremental;
uint32_t num_multi_pack_indexes_before;
+ struct multi_pack_index *compact_from;
+ struct multi_pack_index *compact_to;
+ int compact;
+
struct string_list *to_include;
struct repository *repo;
@@ -122,6 +126,8 @@ struct write_midx_context {
static uint32_t midx_pack_perm(struct write_midx_context *ctx,
uint32_t orig_pack_int_id)
{
+ if (ctx->compact)
+ orig_pack_int_id -= ctx->compact_from->num_packs_in_base;
return ctx->pack_perm[orig_pack_int_id];
}
@@ -268,18 +274,14 @@ static void midx_fanout_sort(struct midx_fanout *fanout)
QSORT(fanout->entries, fanout->nr, midx_oid_compare);
}
-static void midx_fanout_add_midx_fanout(struct midx_fanout *fanout,
- struct multi_pack_index *m,
- uint32_t cur_fanout,
- uint32_t preferred_pack)
+static void midx_fanout_add_midx_fanout_1(struct midx_fanout *fanout,
+ struct multi_pack_index *m,
+ uint32_t cur_fanout,
+ uint32_t preferred_pack)
{
uint32_t start = m->num_objects_in_base, end;
uint32_t cur_object;
- if (m->base_midx)
- midx_fanout_add_midx_fanout(fanout, m->base_midx, cur_fanout,
- preferred_pack);
-
if (cur_fanout)
start += ntohl(m->chunk_oid_fanout[cur_fanout - 1]);
end = m->num_objects_in_base + ntohl(m->chunk_oid_fanout[cur_fanout]);
@@ -303,6 +305,17 @@ static void midx_fanout_add_midx_fanout(struct midx_fanout *fanout,
}
}
+static void midx_fanout_add_midx_fanout(struct midx_fanout *fanout,
+ struct multi_pack_index *m,
+ uint32_t cur_fanout,
+ uint32_t preferred_pack)
+{
+ if (m->base_midx)
+ midx_fanout_add_midx_fanout(fanout, m->base_midx, cur_fanout,
+ preferred_pack);
+ midx_fanout_add_midx_fanout_1(fanout, m, cur_fanout, preferred_pack);
+}
+
static void midx_fanout_add_pack_fanout(struct midx_fanout *fanout,
struct pack_info *info,
uint32_t cur_pack,
@@ -352,6 +365,21 @@ static void midx_fanout_add(struct midx_fanout *fanout,
cur_fanout);
}
+static void midx_fanout_add_compact(struct midx_fanout *fanout,
+ struct write_midx_context *ctx,
+ uint32_t cur_fanout)
+{
+ struct multi_pack_index *m = ctx->compact_to;
+
+ ASSERT(ctx->compact);
+
+ while (m && m != ctx->compact_from->base_midx) {
+ midx_fanout_add_midx_fanout_1(fanout, m, cur_fanout,
+ NO_PREFERRED_PACK);
+ m = m->base_midx;
+ }
+}
+
/*
* It is possible to artificially get into a state where there are many
* duplicate copies of objects. That can create high memory pressure if
@@ -370,6 +398,9 @@ static void compute_sorted_entries(struct write_midx_context *ctx,
size_t alloc_objects, total_objects = 0;
struct midx_fanout fanout = { 0 };
+ if (ctx->compact)
+ ASSERT(!start_pack);
+
for (cur_pack = start_pack; cur_pack < ctx->nr; cur_pack++)
total_objects = st_add(total_objects,
ctx->info[cur_pack].p->num_objects);
@@ -388,7 +419,10 @@ static void compute_sorted_entries(struct write_midx_context *ctx,
for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
fanout.nr = 0;
- midx_fanout_add(&fanout, ctx, start_pack, cur_fanout);
+ if (ctx->compact)
+ midx_fanout_add_compact(&fanout, ctx, cur_fanout);
+ else
+ midx_fanout_add(&fanout, ctx, start_pack, cur_fanout);
midx_fanout_sort(&fanout);
/*
@@ -956,6 +990,75 @@ static int fill_packs_from_midx(struct write_midx_context *ctx)
return 0;
}
+static uint32_t compactible_packs_between(const struct multi_pack_index *from,
+ const struct multi_pack_index *to)
+{
+ uint32_t nr;
+
+ ASSERT(from && to);
+
+ if (unsigned_add_overflows(to->num_packs, to->num_packs_in_base))
+ die(_("too many packs, unable to compact"));
+
+ nr = to->num_packs + to->num_packs_in_base;
+ if (nr < from->num_packs_in_base)
+ BUG("unexpected number of packs in base during compaction: "
+ "%"PRIu32" < %"PRIu32, nr, from->num_packs_in_base);
+
+ return nr - from->num_packs_in_base;
+}
+
+static int fill_packs_from_midx_range(struct write_midx_context *ctx,
+ int bitmap_order)
+{
+ struct multi_pack_index *m = ctx->compact_to;
+ uint32_t packs_nr;
+
+ ASSERT(ctx->compact && !ctx->nr);
+ ASSERT(ctx->compact_from);
+ ASSERT(ctx->compact_to);
+
+ packs_nr = compactible_packs_between(ctx->compact_from,
+ ctx->compact_to);
+
+ ALLOC_GROW(ctx->info, packs_nr, ctx->alloc);
+
+ while (m != ctx->compact_from->base_midx) {
+ uint32_t pack_int_id, preferred_pack_id;
+ uint32_t i;
+
+ if (bitmap_order) {
+ if (midx_preferred_pack(m, &preferred_pack_id) < 0)
+ die(_("could not determine preferred pack"));
+ } else {
+ preferred_pack_id = m->num_packs_in_base;
+ }
+
+ pack_int_id = m->num_packs_in_base - ctx->compact_from->num_packs_in_base;
+
+ if (fill_pack_from_midx(&ctx->info[pack_int_id++], m,
+ preferred_pack_id) < 0)
+ return -1;
+
+ for (i = m->num_packs_in_base;
+ i < m->num_packs_in_base + m->num_packs; i++) {
+ if (preferred_pack_id == i)
+ continue;
+
+ if (fill_pack_from_midx(&ctx->info[pack_int_id++], m,
+ i) < 0)
+ return -1;
+ }
+
+ ctx->nr += m->num_packs;
+ m = m->base_midx;
+ }
+
+ ASSERT(ctx->nr == packs_nr);
+
+ return 0;
+}
+
static struct {
const char *non_split;
const char *split;
@@ -1075,6 +1178,9 @@ static bool midx_needs_update(struct multi_pack_index *midx, struct write_midx_c
if (ctx->incremental)
goto out;
+ if (ctx->compact)
+ goto out; /* Compaction always requires an update. */
+
/*
* Otherwise, we need to verify that the packs covered by the existing
* MIDX match the packs that we already have. The logic to do so is way
@@ -1120,12 +1226,23 @@ out:
return needed;
}
+static int midx_hashcmp(const struct multi_pack_index *a,
+ const struct multi_pack_index *b,
+ const struct git_hash_algo *algop)
+{
+ return hashcmp(midx_get_checksum_hash(a), midx_get_checksum_hash(b),
+ algop);
+}
+
struct write_midx_opts {
struct odb_source *source; /* non-optional */
struct string_list *packs_to_include;
struct string_list *packs_to_drop;
+ struct multi_pack_index *compact_from;
+ struct multi_pack_index *compact_to;
+
const char *preferred_pack_name;
const char *refs_snapshot;
unsigned flags;
@@ -1150,6 +1267,7 @@ static int write_midx_internal(struct write_midx_opts *opts)
int dropped_packs = 0;
int result = -1;
const char **keep_hashes = NULL;
+ size_t keep_hashes_nr = 0;
struct chunkfile *cf;
trace2_region_enter("midx", "write_midx_internal", r);
@@ -1162,6 +1280,19 @@ static int write_midx_internal(struct write_midx_opts *opts)
die(_("unknown MIDX version: %d"), ctx.version);
ctx.incremental = !!(opts->flags & MIDX_WRITE_INCREMENTAL);
+ ctx.compact = !!(opts->flags & MIDX_WRITE_COMPACT);
+
+ if (ctx.compact) {
+ if (ctx.version != MIDX_VERSION_V2)
+ die(_("cannot perform MIDX compaction with v1 format"));
+ if (!opts->compact_from)
+ BUG("expected non-NULL 'from' MIDX during compaction");
+ if (!opts->compact_to)
+ BUG("expected non-NULL 'to' MIDX during compaction");
+
+ ctx.compact_from = opts->compact_from;
+ ctx.compact_to = opts->compact_to;
+ }
if (ctx.incremental)
strbuf_addf(&midx_name,
@@ -1189,11 +1320,18 @@ static int write_midx_internal(struct write_midx_opts *opts)
*/
if (ctx.incremental)
ctx.base_midx = m;
- else if (!opts->packs_to_include)
+ if (!opts->packs_to_include)
ctx.m = m;
}
}
+ /*
+ * If compacting MIDX layer(s) in the range [from, to], then the
+ * compacted MIDX will share the same base MIDX as 'from'.
+ */
+ if (ctx.compact)
+ ctx.base_midx = ctx.compact_from->base_midx;
+
ctx.nr = 0;
ctx.alloc = ctx.m ? ctx.m->num_packs + ctx.m->num_packs_in_base : 16;
ctx.info = NULL;
@@ -1210,7 +1348,7 @@ static int write_midx_internal(struct write_midx_opts *opts)
ctx.num_multi_pack_indexes_before++;
m = m->base_midx;
}
- } else if (ctx.m && fill_packs_from_midx(&ctx)) {
+ } else if (ctx.m && !ctx.compact && fill_packs_from_midx(&ctx)) {
goto cleanup;
}
@@ -1223,9 +1361,18 @@ static int write_midx_internal(struct write_midx_opts *opts)
else
ctx.progress = NULL;
- ctx.to_include = opts->packs_to_include;
+ if (ctx.compact) {
+ int bitmap_order = 0;
+ if (opts->preferred_pack_name)
+ bitmap_order |= 1;
+ else if (opts->flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))
+ bitmap_order |= 1;
- for_each_file_in_pack_dir(opts->source->path, add_pack_to_midx, &ctx);
+ fill_packs_from_midx_range(&ctx, bitmap_order);
+ } else {
+ ctx.to_include = opts->packs_to_include;
+ for_each_file_in_pack_dir(opts->source->path, add_pack_to_midx, &ctx);
+ }
stop_progress(&ctx.progress);
if (!opts->packs_to_drop) {
@@ -1354,12 +1501,19 @@ static int write_midx_internal(struct write_midx_opts *opts)
ctx.large_offsets_needed = 1;
}
- QSORT(ctx.info, ctx.nr, pack_info_compare);
+ if (ctx.compact) {
+ if (ctx.version != MIDX_VERSION_V2)
+ BUG("performing MIDX compaction with v1 MIDX");
+ } else {
+ QSORT(ctx.info, ctx.nr, pack_info_compare);
+ }
if (opts->packs_to_drop && opts->packs_to_drop->nr) {
size_t drop_index = 0;
int missing_drops = 0;
+ ASSERT(!ctx.compact);
+
for (size_t i = 0;
i < ctx.nr && drop_index < opts->packs_to_drop->nr; i++) {
int cmp = strcmp(ctx.info[i].pack_name,
@@ -1391,12 +1545,20 @@ static int write_midx_internal(struct write_midx_opts *opts)
*/
ALLOC_ARRAY(ctx.pack_perm, ctx.nr);
for (size_t i = 0; i < ctx.nr; i++) {
+ uint32_t from = ctx.info[i].orig_pack_int_id;
+ uint32_t to;
+
if (ctx.info[i].expired) {
+ to = PACK_EXPIRED;
dropped_packs++;
- ctx.pack_perm[ctx.info[i].orig_pack_int_id] = PACK_EXPIRED;
} else {
- ctx.pack_perm[ctx.info[i].orig_pack_int_id] = i - dropped_packs;
+ to = i - dropped_packs;
}
+
+ if (ctx.compact)
+ from -= ctx.compact_from->num_packs_in_base;
+
+ ctx.pack_perm[from] = to;
}
for (size_t i = 0; i < ctx.nr; i++) {
@@ -1542,7 +1704,24 @@ static int write_midx_internal(struct write_midx_opts *opts)
if (ctx.num_multi_pack_indexes_before == UINT32_MAX)
die(_("too many multi-pack-indexes"));
- CALLOC_ARRAY(keep_hashes, ctx.num_multi_pack_indexes_before + 1);
+ if (ctx.compact) {
+ struct multi_pack_index *m;
+
+ /*
+ * Keep all MIDX layers excluding those in the range [from, to].
+ */
+ for (m = ctx.base_midx; m; m = m->base_midx)
+ keep_hashes_nr++;
+ for (m = ctx.m;
+ m && midx_hashcmp(m, ctx.compact_to, r->hash_algo);
+ m = m->base_midx)
+ keep_hashes_nr++;
+
+ keep_hashes_nr++; /* include the compacted layer */
+ } else {
+ keep_hashes_nr = ctx.num_multi_pack_indexes_before + 1;
+ }
+ CALLOC_ARRAY(keep_hashes, keep_hashes_nr);
if (ctx.incremental) {
FILE *chainf = fdopen_lock_file(&lk, "w");
@@ -1567,17 +1746,47 @@ static int write_midx_internal(struct write_midx_opts *opts)
strbuf_release(&final_midx_name);
- keep_hashes[ctx.num_multi_pack_indexes_before] =
- xstrdup(hash_to_hex_algop(midx_hash, r->hash_algo));
+ if (ctx.compact) {
+ struct multi_pack_index *m;
+ uint32_t num_layers_before_from = 0;
+ uint32_t i;
- for (uint32_t i = 0; i < ctx.num_multi_pack_indexes_before; i++) {
- uint32_t j = ctx.num_multi_pack_indexes_before - i - 1;
+ for (m = ctx.base_midx; m; m = m->base_midx)
+ num_layers_before_from++;
- keep_hashes[j] = xstrdup(midx_get_checksum_hex(m));
- m = m->base_midx;
+ m = ctx.base_midx;
+ for (i = 0; i < num_layers_before_from; i++) {
+ uint32_t j = num_layers_before_from - i - 1;
+
+ keep_hashes[j] = xstrdup(midx_get_checksum_hex(m));
+ m = m->base_midx;
+ }
+
+ keep_hashes[i] = xstrdup(hash_to_hex_algop(midx_hash,
+ r->hash_algo));
+
+ i = 0;
+ for (m = ctx.m;
+ m && midx_hashcmp(m, ctx.compact_to, r->hash_algo);
+ m = m->base_midx) {
+ keep_hashes[keep_hashes_nr - i - 1] =
+ xstrdup(midx_get_checksum_hex(m));
+ i++;
+ }
+ } else {
+ keep_hashes[ctx.num_multi_pack_indexes_before] =
+ xstrdup(hash_to_hex_algop(midx_hash,
+ r->hash_algo));
+
+ for (uint32_t i = 0; i < ctx.num_multi_pack_indexes_before; i++) {
+ uint32_t j = ctx.num_multi_pack_indexes_before - i - 1;
+
+ keep_hashes[j] = xstrdup(midx_get_checksum_hex(m));
+ m = m->base_midx;
+ }
}
- for (uint32_t i = 0; i <= ctx.num_multi_pack_indexes_before; i++)
+ for (uint32_t i = 0; i < keep_hashes_nr; i++)
fprintf(get_lock_file_fp(&lk), "%s\n", keep_hashes[i]);
} else {
keep_hashes[ctx.num_multi_pack_indexes_before] =
@@ -1590,8 +1799,7 @@ static int write_midx_internal(struct write_midx_opts *opts)
if (commit_lock_file(&lk) < 0)
die_errno(_("could not write multi-pack-index"));
- clear_midx_files(opts->source, keep_hashes,
- ctx.num_multi_pack_indexes_before + 1,
+ clear_midx_files(opts->source, keep_hashes, keep_hashes_nr,
ctx.incremental);
result = 0;
@@ -1609,7 +1817,7 @@ cleanup:
free(ctx.pack_perm);
free(ctx.pack_order);
if (keep_hashes) {
- for (uint32_t i = 0; i <= ctx.num_multi_pack_indexes_before; i++)
+ for (uint32_t i = 0; i < keep_hashes_nr; i++)
free((char *)keep_hashes[i]);
free(keep_hashes);
}
@@ -1651,6 +1859,21 @@ int write_midx_file_only(struct odb_source *source,
return write_midx_internal(&opts);
}
+int write_midx_file_compact(struct odb_source *source,
+ struct multi_pack_index *from,
+ struct multi_pack_index *to,
+ unsigned flags)
+{
+ struct write_midx_opts opts = {
+ .source = source,
+ .compact_from = from,
+ .compact_to = to,
+ .flags = flags | MIDX_WRITE_COMPACT,
+ };
+
+ return write_midx_internal(&opts);
+}
+
int expire_midx_packs(struct odb_source *source, unsigned flags)
{
uint32_t i, *count, result = 0;