From 3f9ac8d259fb919e001671c5e403e5fceaabf0d8 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Wed, 15 Feb 2006 17:34:29 -0800 Subject: pack-objects: reuse data from existing packs. When generating a new pack, notice if we have already needed objects in existing packs. If an object is stored deltified, and its base object is also what we are going to pack, then reuse the existing deltified representation unconditionally, bypassing all the expensive find_deltas() and try_deltas() calls. Also, notice if what we are going to write out exactly match what is already in an existing pack (either deltified or just compressed). In such a case, we can just copy it instead of going through the usual uncompressing & recompressing cycle. Without this patch, in linux-2.6 repository with about 1500 loose objects and a single mega pack: $ git-rev-list --objects v2.6.16-rc3 >RL $ wc -l RL 184141 RL $ time git-pack-objects p --- pack-objects.c | 367 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 310 insertions(+), 57 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index c5a5e61605..70fb2afeb8 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -9,15 +9,31 @@ static const char pack_usage[] = "git-pack-objects [-q] [--non-empty] [--local] struct object_entry { unsigned char sha1[20]; - unsigned long size; - unsigned long offset; - unsigned int depth; - unsigned int hash; + unsigned long size; /* uncompressed size */ + unsigned long offset; /* offset into the final pack file (nonzero if already written) */ + unsigned int depth; /* delta depth */ + unsigned int hash; /* name hint hash */ enum object_type type; - unsigned long delta_size; - struct object_entry *delta; + unsigned long delta_size; /* delta data size (uncompressed) */ + struct object_entry *delta; /* delta base object */ + struct packed_git *in_pack; /* already in pack */ + enum object_type in_pack_type; /* could be delta */ + unsigned int in_pack_offset; }; +/* + * Objects we are going to pack are colected in objects array (dynamically + * expanded). nr_objects & nr_alloc controls this array. They are stored + * in the order we see -- typically rev-list --objects order that gives us + * nice "minimum seek" order. + * + * sorted-by-sha ans sorted-by-type are arrays of pointers that point at + * elements in the objects array. The former is used to build the pack + * index (lists object names in the ascending order to help offset lookup), + * and the latter is used to group similar things together by try_delta() + * heuristics. + */ + static unsigned char object_list_sha1[20]; static int non_empty = 0; static int local = 0; @@ -29,6 +45,135 @@ static const char *base_name; static unsigned char pack_file_sha1[20]; static int progress = 1; +/* + * The object names in objects array are hashed with this hashtable, + * to help looking up the entry by object name. Binary search from + * sorted_by_sha is also possible but this was easier to code and faster. + * This hashtable is built after all the objects are seen. + */ +static int *object_ix = NULL; +static int object_ix_hashsz = 0; + +/* + * Pack index for existing packs give us easy access to the offsets into + * corresponding pack file where each object's data starts, but the entries + * do not store the size of the compressed representation (uncompressed + * size is easily available by examining the pack entry header). We build + * a hashtable of existing packs (pack_revindex), and keep reverse index + * here -- pack index file is sorted by object name mapping to offset; this + * pack_revindex[].revindex array is an ordered list of offsets, so if you + * know the offset of an object, next offset is where its packed + * representation ends. + */ +struct pack_revindex { + struct packed_git *p; + unsigned long *revindex; +} *pack_revindex = NULL; +static int pack_revindex_hashsz = 0; + +/* + * stats + */ +static int written = 0; +static int reused = 0; + +static int pack_revindex_ix(struct packed_git *p) +{ + unsigned int ui = (unsigned int) p; + int i; + + ui = ui ^ (ui >> 16); /* defeat structure alignment */ + i = (int)(ui % pack_revindex_hashsz); + while (pack_revindex[i].p) { + if (pack_revindex[i].p == p) + return i; + if (++i == pack_revindex_hashsz) + i = 0; + } + return -1 - i; +} + +static void prepare_pack_ix(void) +{ + int num; + struct packed_git *p; + for (num = 0, p = packed_git; p; p = p->next) + num++; + if (!num) + return; + pack_revindex_hashsz = num * 11; + pack_revindex = xcalloc(sizeof(*pack_revindex), pack_revindex_hashsz); + for (p = packed_git; p; p = p->next) { + num = pack_revindex_ix(p); + num = - 1 - num; + pack_revindex[num].p = p; + } + /* revindex elements are lazily initialized */ +} + +static int cmp_offset(const void *a_, const void *b_) +{ + unsigned long a = *(unsigned long *) a_; + unsigned long b = *(unsigned long *) b_; + if (a < b) + return -1; + else if (a == b) + return 0; + else + return 1; +} + +/* + * Ordered list of offsets of objects in the pack. + */ +static void prepare_pack_revindex(struct pack_revindex *rix) +{ + struct packed_git *p = rix->p; + int num_ent = num_packed_objects(p); + int i; + void *index = p->index_base + 256; + + rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1)); + for (i = 0; i < num_ent; i++) { + long hl = *((long *)(index + 24 * i)); + rix->revindex[i] = ntohl(hl); + } + /* This knows the pack format -- the 20-byte trailer + * follows immediately after the last object data. + */ + rix->revindex[num_ent] = p->pack_size - 20; + qsort(rix->revindex, num_ent, sizeof(unsigned long), cmp_offset); +} + +static unsigned long find_packed_object_size(struct packed_git *p, + unsigned long ofs) +{ + int num; + int lo, hi; + struct pack_revindex *rix; + unsigned long *revindex; + num = pack_revindex_ix(p); + if (num < 0) + die("internal error: pack revindex uninitialized"); + rix = &pack_revindex[num]; + if (!rix->revindex) + prepare_pack_revindex(rix); + revindex = rix->revindex; + lo = 0; + hi = num_packed_objects(p) + 1; + do { + int mi = (lo + hi) / 2; + if (revindex[mi] == ofs) { + return revindex[mi+1] - ofs; + } + else if (ofs < revindex[mi]) + hi = mi; + else + lo = mi + 1; + } while (lo < hi); + die("internal error: pack revindex corrupt"); +} + static void *delta_against(void *buf, unsigned long size, struct object_entry *entry) { unsigned long othersize, delta_size; @@ -78,35 +223,52 @@ static unsigned long write_object(struct sha1file *f, struct object_entry *entry { unsigned long size; char type[10]; - void *buf = read_sha1_file(entry->sha1, type, &size); + void *buf; unsigned char header[10]; unsigned hdrlen, datalen; enum object_type obj_type; - if (!buf) - die("unable to read %s", sha1_to_hex(entry->sha1)); - if (size != entry->size) - die("object %s size inconsistency (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size); - - /* - * The object header is a byte of 'type' followed by zero or - * more bytes of length. For deltas, the 20 bytes of delta sha1 - * follows that. - */ obj_type = entry->type; - if (entry->delta) { - buf = delta_against(buf, size, entry); - size = entry->delta_size; - obj_type = OBJ_DELTA; + if (!entry->in_pack || + (obj_type != entry->in_pack_type)) { + buf = read_sha1_file(entry->sha1, type, &size); + if (!buf) + die("unable to read %s", sha1_to_hex(entry->sha1)); + if (size != entry->size) + die("object %s size inconsistency (%lu vs %lu)", + sha1_to_hex(entry->sha1), size, entry->size); + if (entry->delta) { + buf = delta_against(buf, size, entry); + size = entry->delta_size; + obj_type = OBJ_DELTA; + } + /* + * The object header is a byte of 'type' followed by zero or + * more bytes of length. For deltas, the 20 bytes of delta + * sha1 follows that. + */ + hdrlen = encode_header(obj_type, size, header); + sha1write(f, header, hdrlen); + + if (entry->delta) { + sha1write(f, entry->delta, 20); + hdrlen += 20; + } + datalen = sha1write_compressed(f, buf, size); + free(buf); } - hdrlen = encode_header(obj_type, size, header); - sha1write(f, header, hdrlen); - if (entry->delta) { - sha1write(f, entry->delta, 20); - hdrlen += 20; + else { + struct packed_git *p = entry->in_pack; + use_packed_git(p); + + datalen = find_packed_object_size(p, entry->in_pack_offset); + buf = p->pack_base + entry->in_pack_offset; + sha1write(f, buf, datalen); + unuse_packed_git(p); + hdrlen = 0; /* not really */ + reused++; } - datalen = sha1write_compressed(f, buf, size); - free(buf); + written++; return hdrlen + datalen; } @@ -148,8 +310,6 @@ static void write_pack_file(void) offset = write_one(f, objects + i, offset); sha1close(f, pack_file_sha1, 1); - mb = offset >> 20; - offset &= 0xfffff; } static void write_index_file(void) @@ -196,18 +356,21 @@ static int add_object_entry(unsigned char *sha1, unsigned int hash) { unsigned int idx = nr_objects; struct object_entry *entry; - - if (incremental || local) { - struct packed_git *p; - - for (p = packed_git; p; p = p->next) { - struct pack_entry e; - - if (find_pack_entry_one(sha1, &e, p)) { - if (incremental) - return 0; - if (local && !p->pack_local) - return 0; + struct packed_git *p; + unsigned int found_offset; + struct packed_git *found_pack; + + found_pack = NULL; + for (p = packed_git; p; p = p->next) { + struct pack_entry e; + if (find_pack_entry_one(sha1, &e, p)) { + if (incremental) + return 0; + if (local && !p->pack_local) + return 0; + if (!found_pack) { + found_offset = e.offset; + found_pack = e.p; } } } @@ -221,30 +384,107 @@ static int add_object_entry(unsigned char *sha1, unsigned int hash) memset(entry, 0, sizeof(*entry)); memcpy(entry->sha1, sha1, 20); entry->hash = hash; + if (found_pack) { + entry->in_pack = found_pack; + entry->in_pack_offset = found_offset; + } nr_objects = idx+1; return 1; } +static int locate_object_entry_hash(unsigned char *sha1) +{ + int i; + unsigned int ui; + memcpy(&ui, sha1, sizeof(unsigned int)); + i = ui % object_ix_hashsz; + while (0 < object_ix[i]) { + if (!memcmp(sha1, objects[object_ix[i]-1].sha1, 20)) + return i; + if (++i == object_ix_hashsz) + i = 0; + } + return -1 - i; +} + +static struct object_entry *locate_object_entry(unsigned char *sha1) +{ + int i = locate_object_entry_hash(sha1); + if (0 <= i) + return &objects[object_ix[i]-1]; + return NULL; +} + static void check_object(struct object_entry *entry) { char type[20]; - if (!sha1_object_info(entry->sha1, type, &entry->size)) { - if (!strcmp(type, "commit")) { - entry->type = OBJ_COMMIT; - } else if (!strcmp(type, "tree")) { - entry->type = OBJ_TREE; - } else if (!strcmp(type, "blob")) { - entry->type = OBJ_BLOB; - } else if (!strcmp(type, "tag")) { - entry->type = OBJ_TAG; - } else - die("unable to pack object %s of type %s", - sha1_to_hex(entry->sha1), type); + if (entry->in_pack) { + /* Check if it is delta, and the base is also an object + * we are going to pack. If so we will reuse the existing + * delta. + */ + unsigned char base[20]; + unsigned long size; + struct object_entry *base_entry; + if (!check_reuse_pack_delta(entry->in_pack, + entry->in_pack_offset, + base, &size, + &entry->in_pack_type) && + (base_entry = locate_object_entry(base))) { + /* We do not know depth at this point, but it + * does not matter. Getting delta_chain_length + * with packed_object_info_detail() is not so + * expensive, so we could do that later if we + * wanted to. Calling sha1_object_info to get + * the true size (and later an uncompressed + * representation) of deeply deltified object + * is quite expensive. + */ + entry->depth = 1; + /* uncompressed size */ + entry->size = entry->delta_size = size; + entry->delta = base_entry; + entry->type = OBJ_DELTA; + return; + } + /* Otherwise we would do the usual */ } - else + + if (sha1_object_info(entry->sha1, type, &entry->size)) die("unable to get type of object %s", sha1_to_hex(entry->sha1)); + + if (!strcmp(type, "commit")) { + entry->type = OBJ_COMMIT; + } else if (!strcmp(type, "tree")) { + entry->type = OBJ_TREE; + } else if (!strcmp(type, "blob")) { + entry->type = OBJ_BLOB; + } else if (!strcmp(type, "tag")) { + entry->type = OBJ_TAG; + } else + die("unable to pack object %s of type %s", + sha1_to_hex(entry->sha1), type); +} + +static void hash_objects(void) +{ + int i; + struct object_entry *oe; + + object_ix_hashsz = nr_objects * 2; + object_ix = xcalloc(sizeof(int), object_ix_hashsz); + for (i = 0, oe = objects; i < nr_objects; i++, oe++) { + int ix = locate_object_entry_hash(oe->sha1); + if (0 <= ix) { + error("the same object '%s' added twice", + sha1_to_hex(oe->sha1)); + continue; + } + ix = -1 - ix; + object_ix[ix] = i + 1; + } } static void get_object_details(void) @@ -252,6 +492,8 @@ static void get_object_details(void) int i; struct object_entry *entry = objects; + hash_objects(); + prepare_pack_ix(); for (i = 0; i < nr_objects; i++) check_object(entry++); } @@ -382,6 +624,13 @@ static void find_deltas(struct object_entry **list, int window, int depth) eye_candy -= nr_objects / 20; fputc('.', stderr); } + + if (entry->delta) + /* This happens if we decided to reuse existing + * delta from a pack. + */ + continue; + free(n->data); n->entry = entry; n->data = read_sha1_file(entry->sha1, type, &size); @@ -411,10 +660,12 @@ static void find_deltas(struct object_entry **list, int window, int depth) static void prepare_pack(int window, int depth) { - get_object_details(); - if (progress) fprintf(stderr, "Packing %d objects", nr_objects); + get_object_details(); + if (progress) + fprintf(stderr, "."); + sorted_by_type = create_sorted_list(type_size_sort); if (window && depth) find_deltas(sorted_by_type, window+1, depth); @@ -599,5 +850,7 @@ int main(int argc, char **argv) puts(sha1_to_hex(object_list_sha1)); } } + fprintf(stderr, "Total %d, written %d, reused %d\n", + nr_objects, written, reused); return 0; } -- cgit v1.3 From ab7cd7bb8c02dc40ca3a909653e8f56226f9e440 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Feb 2006 11:55:51 -0800 Subject: pack-objects: finishing touches. This introduces --no-reuse-delta option to disable reusing of existing delta, which is a large part of the optimization introduced by this series. This may become necessary if repeated repacking makes delta chain too long. With this, the output of the command becomes identical to that of the older implementation. But the performance suffers greatly. It still allows reusing non-deltified representations; there is no point uncompressing and recompressing the whole text. It also adds a couple more statistics output, while squelching it under -q flag, which the last round forgot to do. $ time old-git-pack-objects --stdout >/dev/null /dev/null /dev/null --- Documentation/git-pack-objects.txt | 21 +++++++- pack-objects.c | 102 ++++++++++++++++++++++++++----------- 2 files changed, 91 insertions(+), 32 deletions(-) (limited to 'pack-objects.c') diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 2d67d3911e..4cb2e83faa 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -8,7 +8,10 @@ git-pack-objects - Create a packed archive of objects. SYNOPSIS -------- -'git-pack-objects' [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list +[verse] +'git-pack-objects' [-q] [--no-reuse-delta] [--non-empty] + [--local] [--incremental] [--window=N] [--depth=N] + {--stdout | base-name} < object-list DESCRIPTION @@ -32,6 +35,10 @@ Placing both in the pack/ subdirectory of $GIT_OBJECT_DIRECTORY (or any of the directories on $GIT_ALTERNATE_OBJECT_DIRECTORIES) enables git to read from such an archive. +In a packed archive, an object is either stored as a compressed +whole, or as a difference from some other object. The latter is +often called a delta. + OPTIONS ------- @@ -74,6 +81,18 @@ base-name:: Only create a packed archive if it would contain at least one object. +-q:: + This flag makes the command not to report its progress + on the standard error stream. + +--no-reuse-delta:: + When creating a packed archive in a repository that + has existing packs, the command reuses existing deltas. + This sometimes results in a slightly suboptimal pack. + This flag tells the command not to reuse existing deltas + but compute them from scratch. + + Author ------ Written by Linus Torvalds diff --git a/pack-objects.c b/pack-objects.c index 70fb2afeb8..38e1c9921b 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -5,7 +5,7 @@ #include "csum-file.h" #include -static const char pack_usage[] = "git-pack-objects [-q] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list"; +static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list"; struct object_entry { unsigned char sha1[20]; @@ -14,10 +14,11 @@ struct object_entry { unsigned int depth; /* delta depth */ unsigned int hash; /* name hint hash */ enum object_type type; + unsigned char edge; /* reused delta chain points at this entry. */ + enum object_type in_pack_type; /* could be delta */ unsigned long delta_size; /* delta data size (uncompressed) */ struct object_entry *delta; /* delta base object */ struct packed_git *in_pack; /* already in pack */ - enum object_type in_pack_type; /* could be delta */ unsigned int in_pack_offset; }; @@ -36,6 +37,7 @@ struct object_entry { static unsigned char object_list_sha1[20]; static int non_empty = 0; +static int no_reuse_delta = 0; static int local = 0; static int incremental = 0; static struct object_entry **sorted_by_sha, **sorted_by_type; @@ -75,7 +77,9 @@ static int pack_revindex_hashsz = 0; * stats */ static int written = 0; +static int written_delta = 0; static int reused = 0; +static int reused_delta = 0; static int pack_revindex_ix(struct packed_git *p) { @@ -227,10 +231,23 @@ static unsigned long write_object(struct sha1file *f, struct object_entry *entry unsigned char header[10]; unsigned hdrlen, datalen; enum object_type obj_type; + int to_reuse = 0; obj_type = entry->type; - if (!entry->in_pack || - (obj_type != entry->in_pack_type)) { + if (! entry->in_pack) + to_reuse = 0; /* can't reuse what we don't have */ + else if (obj_type == OBJ_DELTA) + to_reuse = 1; /* check_object() decided it for us */ + else if (obj_type != entry->in_pack_type) + to_reuse = 0; /* pack has delta which is unusable */ + else if (entry->delta) + to_reuse = 0; /* we want to pack afresh */ + else + to_reuse = 1; /* we have it in-pack undeltified, + * and we do not need to deltify it. + */ + + if (! to_reuse) { buf = read_sha1_file(entry->sha1, type, &size); if (!buf) die("unable to read %s", sha1_to_hex(entry->sha1)); @@ -266,8 +283,12 @@ static unsigned long write_object(struct sha1file *f, struct object_entry *entry sha1write(f, buf, datalen); unuse_packed_git(p); hdrlen = 0; /* not really */ + if (obj_type == OBJ_DELTA) + reused_delta++; reused++; } + if (obj_type == OBJ_DELTA) + written_delta++; written++; return hdrlen + datalen; } @@ -294,7 +315,6 @@ static void write_pack_file(void) int i; struct sha1file *f; unsigned long offset; - unsigned long mb; struct pack_header hdr; if (!base_name) @@ -357,10 +377,9 @@ static int add_object_entry(unsigned char *sha1, unsigned int hash) unsigned int idx = nr_objects; struct object_entry *entry; struct packed_git *p; - unsigned int found_offset; - struct packed_git *found_pack; + unsigned int found_offset = 0; + struct packed_git *found_pack = NULL; - found_pack = NULL; for (p = packed_git; p; p = p->next) { struct pack_entry e; if (find_pack_entry_one(sha1, &e, p)) { @@ -420,32 +439,39 @@ static void check_object(struct object_entry *entry) char type[20]; if (entry->in_pack) { + unsigned char base[20]; + unsigned long size; + struct object_entry *base_entry; + + /* We want in_pack_type even if we do not reuse delta. + * There is no point not reusing non-delta representations. + */ + check_reuse_pack_delta(entry->in_pack, + entry->in_pack_offset, + base, &size, + &entry->in_pack_type); + /* Check if it is delta, and the base is also an object * we are going to pack. If so we will reuse the existing * delta. */ - unsigned char base[20]; - unsigned long size; - struct object_entry *base_entry; - if (!check_reuse_pack_delta(entry->in_pack, - entry->in_pack_offset, - base, &size, - &entry->in_pack_type) && + if (!no_reuse_delta && + entry->in_pack_type == OBJ_DELTA && (base_entry = locate_object_entry(base))) { - /* We do not know depth at this point, but it - * does not matter. Getting delta_chain_length - * with packed_object_info_detail() is not so - * expensive, so we could do that later if we - * wanted to. Calling sha1_object_info to get - * the true size (and later an uncompressed - * representation) of deeply deltified object - * is quite expensive. + + /* Depth value does not matter - find_deltas() + * will never consider reused delta as the + * base object to deltify other objects + * against, in order to avoid circular deltas. */ - entry->depth = 1; - /* uncompressed size */ + + /* uncompressed size of the delta data */ entry->size = entry->delta_size = size; entry->delta = base_entry; entry->type = OBJ_DELTA; + + base_entry->edge = 1; + return; } /* Otherwise we would do the usual */ @@ -568,6 +594,13 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de if (cur_entry->type != old_entry->type) return -1; + /* If the current object is at edge, take the depth the objects + * that depend on the current object into account -- otherwise + * they would become too deep. + */ + if (cur_entry->edge) + max_depth /= 4; + size = cur_entry->size; if (size < 50) return -1; @@ -627,7 +660,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (entry->delta) /* This happens if we decided to reuse existing - * delta from a pack. + * delta from a pack. "!no_reuse_delta &&" is implied. */ continue; @@ -636,6 +669,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) n->data = read_sha1_file(entry->sha1, type, &size); if (size != entry->size) die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size); + j = window; while (--j > 0) { unsigned int other_idx = idx + j; @@ -664,7 +698,7 @@ static void prepare_pack(int window, int depth) fprintf(stderr, "Packing %d objects", nr_objects); get_object_details(); if (progress) - fprintf(stderr, "."); + fputc('.', stderr); sorted_by_type = create_sorted_list(type_size_sort); if (window && depth) @@ -694,8 +728,9 @@ static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout) } } - fprintf(stderr, "Reusing %d objects pack %s\n", nr_objects, - sha1_to_hex(sha1)); + if (progress) + fprintf(stderr, "Reusing %d objects pack %s\n", nr_objects, + sha1_to_hex(sha1)); if (pack_to_stdout) { if (copy_fd(ifd, 1)) @@ -775,6 +810,10 @@ int main(int argc, char **argv) progress = 0; continue; } + if (!strcmp("--no-reuse-delta", arg)) { + no_reuse_delta = 1; + continue; + } if (!strcmp("--stdout", arg)) { pack_to_stdout = 1; continue; @@ -850,7 +889,8 @@ int main(int argc, char **argv) puts(sha1_to_hex(object_list_sha1)); } } - fprintf(stderr, "Total %d, written %d, reused %d\n", - nr_objects, written, reused); + if (progress) + fprintf(stderr, "Total %d, written %d (delta %d), reused %d (delta %d)\n", + nr_objects, written, written_delta, reused, reused_delta); return 0; } -- cgit v1.3 From 15b4d577ae2e0117b7b5a4add2217442a8458812 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 17 Feb 2006 20:58:45 -0800 Subject: pack-objects: avoid delta chains that are too long. This tries to rework the solution for the excess delta chain problem. An earlier commit worked it around ``cheaply'', but repeated repacking risks unbound growth of delta chains. This version counts the length of delta chain we are reusing from the existing pack, and makes sure a base object that has sufficiently long delta chain does not get deltified. Signed-off-by: Junio C Hamano --- pack-objects.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index 38e1c9921b..0c9f4c9d23 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -10,16 +10,22 @@ static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--no struct object_entry { unsigned char sha1[20]; unsigned long size; /* uncompressed size */ - unsigned long offset; /* offset into the final pack file (nonzero if already written) */ + unsigned long offset; /* offset into the final pack file; + * nonzero if already written. + */ unsigned int depth; /* delta depth */ + unsigned int delta_limit; /* base adjustment for in-pack delta */ unsigned int hash; /* name hint hash */ enum object_type type; - unsigned char edge; /* reused delta chain points at this entry. */ enum object_type in_pack_type; /* could be delta */ unsigned long delta_size; /* delta data size (uncompressed) */ struct object_entry *delta; /* delta base object */ struct packed_git *in_pack; /* already in pack */ unsigned int in_pack_offset; + struct object_entry *delta_child; /* delitified objects who bases me */ + struct object_entry *delta_sibling; /* other deltified objects who + * uses the same base as me + */ }; /* @@ -470,7 +476,8 @@ static void check_object(struct object_entry *entry) entry->delta = base_entry; entry->type = OBJ_DELTA; - base_entry->edge = 1; + entry->delta_sibling = base_entry->delta_child; + base_entry->delta_child = entry; return; } @@ -513,15 +520,32 @@ static void hash_objects(void) } } +static unsigned int check_delta_limit(struct object_entry *me, unsigned int n) +{ + struct object_entry *child = me->delta_child; + unsigned int m = n; + while (child) { + unsigned int c = check_delta_limit(child, n + 1); + if (m < c) + m = c; + child = child->delta_sibling; + } + return m; +} + static void get_object_details(void) { int i; - struct object_entry *entry = objects; + struct object_entry *entry; hash_objects(); prepare_pack_ix(); - for (i = 0; i < nr_objects; i++) - check_object(entry++); + for (i = 0, entry = objects; i < nr_objects; i++, entry++) + check_object(entry); + for (i = 0, entry = objects; i < nr_objects; i++, entry++) + if (!entry->delta && entry->delta_child) + entry->delta_limit = + check_delta_limit(entry, 1); } typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *); @@ -598,8 +622,11 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de * that depend on the current object into account -- otherwise * they would become too deep. */ - if (cur_entry->edge) - max_depth /= 4; + if (cur_entry->delta_child) { + if (max_depth <= cur_entry->delta_limit) + return 0; + max_depth -= cur_entry->delta_limit; + } size = cur_entry->size; if (size < 50) -- cgit v1.3 From b2504a0d2ff5a51feb516f7732beb9549b5db454 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 22 Feb 2006 16:00:08 -0500 Subject: nicer eye candies for pack-objects This provides a stable and simpler progress reporting mechanism that updates progress as often as possible but accurately not updating more than once a second. The deltification phase is also made more interesting to watch (since repacking a big repository and only seeing a dot appear once every many seconds is rather boring and doesn't provide much food for anticipation). Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- pack-objects.c | 56 +++++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index 0c9f4c9d23..5e1e14cf70 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -4,6 +4,7 @@ #include "pack.h" #include "csum-file.h" #include +#include static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list"; @@ -661,17 +662,22 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de return 0; } +static volatile int progress_update = 0; +static void progress_interval(int signum) +{ + signal(SIGALRM, progress_interval); + progress_update = 1; +} + static void find_deltas(struct object_entry **list, int window, int depth) { int i, idx; unsigned int array_size = window * sizeof(struct unpacked); struct unpacked *array = xmalloc(array_size); - int eye_candy; memset(array, 0, array_size); i = nr_objects; idx = 0; - eye_candy = i - (nr_objects / 20); while (--i >= 0) { struct object_entry *entry = list[i]; @@ -680,9 +686,10 @@ static void find_deltas(struct object_entry **list, int window, int depth) char type[10]; int j; - if (progress && i <= eye_candy) { - eye_candy -= nr_objects / 20; - fputc('.', stderr); + if (progress_update || i == 0) { + fprintf(stderr, "Deltifying (%d %d%%)\r", + nr_objects-i, (nr_objects-i) * 100/nr_objects); + progress_update = 0; } if (entry->delta) @@ -714,6 +721,9 @@ static void find_deltas(struct object_entry **list, int window, int depth) idx = 0; } + if (progress) + fputc('\n', stderr); + for (i = 0; i < window; ++i) free(array[i].data); free(array); @@ -721,17 +731,10 @@ static void find_deltas(struct object_entry **list, int window, int depth) static void prepare_pack(int window, int depth) { - if (progress) - fprintf(stderr, "Packing %d objects", nr_objects); get_object_details(); - if (progress) - fputc('.', stderr); - sorted_by_type = create_sorted_list(type_size_sort); if (window && depth) find_deltas(sorted_by_type, window+1, depth); - if (progress) - fputc('\n', stderr); write_pack_file(); } @@ -796,10 +799,6 @@ int main(int argc, char **argv) int window = 10, depth = 10, pack_to_stdout = 0; struct object_entry **list; int i; - struct timeval prev_tv; - int eye_candy = 0; - int eye_candy_incr = 500; - setup_git_directory(); @@ -856,30 +855,25 @@ int main(int argc, char **argv) usage(pack_usage); prepare_packed_git(); + if (progress) { + struct itimerval v; + v.it_interval.tv_sec = 1; + v.it_interval.tv_usec = 0; + v.it_value = v.it_interval; + signal(SIGALRM, progress_interval); + setitimer(ITIMER_REAL, &v, NULL); fprintf(stderr, "Generating pack...\n"); - gettimeofday(&prev_tv, NULL); } + while (fgets(line, sizeof(line), stdin) != NULL) { unsigned int hash; char *p; unsigned char sha1[20]; - if (progress && (eye_candy <= nr_objects)) { + if (progress_update) { fprintf(stderr, "Counting objects...%d\r", nr_objects); - if (eye_candy && (50 <= eye_candy_incr)) { - struct timeval tv; - int time_diff; - gettimeofday(&tv, NULL); - time_diff = (tv.tv_sec - prev_tv.tv_sec); - time_diff <<= 10; - time_diff += (tv.tv_usec - prev_tv.tv_usec); - if ((1 << 9) < time_diff) - eye_candy_incr += 50; - else if (50 < eye_candy_incr) - eye_candy_incr -= 50; - } - eye_candy += eye_candy_incr; + progress_update = 0; } if (get_sha1_hex(line, sha1)) die("expected sha1, got garbage:\n %s", line); -- cgit v1.3 From 5e8dc750ee56d8c295ecd7478a6bd5d148cb7177 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 22 Feb 2006 17:41:32 -0500 Subject: also adds progress when actually writing a pack If that pack is big, it takes significant time to write and might benefit from some more eye candies as well. This is however disabled when the pack is written to stdout since in that case the output is usually piped into unpack_objects which already does its own progress reporting. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- pack-objects.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index 5e1e14cf70..dc928b3d8f 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -53,6 +53,7 @@ static int nr_objects = 0, nr_alloc = 0; static const char *base_name; static unsigned char pack_file_sha1[20]; static int progress = 1; +static volatile int progress_update = 0; /* * The object names in objects array are hashed with this hashtable, @@ -333,8 +334,14 @@ static void write_pack_file(void) hdr.hdr_entries = htonl(nr_objects); sha1write(f, &hdr, sizeof(hdr)); offset = sizeof(hdr); - for (i = 0; i < nr_objects; i++) + for (i = 0; i < nr_objects; i++) { offset = write_one(f, objects + i, offset); + if (progress_update) { + fprintf(stderr, "Writing (%d %d%%)\r", + i+1, (i+1) * 100/nr_objects); + progress_update = 0; + } + } sha1close(f, pack_file_sha1, 1); } @@ -662,7 +669,6 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de return 0; } -static volatile int progress_update = 0; static void progress_interval(int signum) { signal(SIGALRM, progress_interval); @@ -735,7 +741,6 @@ static void prepare_pack(int window, int depth) sorted_by_type = create_sorted_list(type_size_sort); if (window && depth) find_deltas(sorted_by_type, window+1, depth); - write_pack_file(); } static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout) @@ -905,6 +910,14 @@ int main(int argc, char **argv) ; else { prepare_pack(window, depth); + if (progress && pack_to_stdout) { + /* the other end usually displays progress itself */ + struct itimerval v = {{0,},}; + setitimer(ITIMER_REAL, &v, NULL); + signal(SIGALRM, SIG_IGN ); + progress_update = 0; + } + write_pack_file(); if (!pack_to_stdout) { write_index_file(); puts(sha1_to_hex(object_list_sha1)); -- cgit v1.3 From 183bdb2cccff792f11fd9e825df67af446aff171 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Wed, 22 Feb 2006 16:02:59 -0800 Subject: pack-objects eye-candy: finishing touches. This updates the progress output to match "every one second or every percent whichever comes early" used by unpack-objects, as discussed on the list. Signed-off-by: Junio C Hamano --- pack-objects.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index dc928b3d8f..8f352aa6c1 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -324,11 +324,19 @@ static void write_pack_file(void) struct sha1file *f; unsigned long offset; struct pack_header hdr; + unsigned last_percent = 999; + int do_progress = 0; if (!base_name) f = sha1fd(1, ""); - else - f = sha1create("%s-%s.%s", base_name, sha1_to_hex(object_list_sha1), "pack"); + else { + f = sha1create("%s-%s.%s", base_name, + sha1_to_hex(object_list_sha1), "pack"); + do_progress = progress; + } + if (do_progress) + fprintf(stderr, "Writing %d objects.\n", nr_objects); + hdr.hdr_signature = htonl(PACK_SIGNATURE); hdr.hdr_version = htonl(PACK_VERSION); hdr.hdr_entries = htonl(nr_objects); @@ -336,12 +344,18 @@ static void write_pack_file(void) offset = sizeof(hdr); for (i = 0; i < nr_objects; i++) { offset = write_one(f, objects + i, offset); - if (progress_update) { - fprintf(stderr, "Writing (%d %d%%)\r", - i+1, (i+1) * 100/nr_objects); - progress_update = 0; + if (do_progress) { + unsigned percent = written * 100 / nr_objects; + if (progress_update || percent != last_percent) { + fprintf(stderr, "%4u%% (%u/%u) done\r", + percent, written, nr_objects); + progress_update = 0; + last_percent = percent; + } } } + if (do_progress) + fputc('\n', stderr); sha1close(f, pack_file_sha1, 1); } @@ -680,10 +694,14 @@ static void find_deltas(struct object_entry **list, int window, int depth) int i, idx; unsigned int array_size = window * sizeof(struct unpacked); struct unpacked *array = xmalloc(array_size); + unsigned processed = 0; + unsigned last_percent = 999; memset(array, 0, array_size); i = nr_objects; idx = 0; + if (progress) + fprintf(stderr, "Deltifying %d objects.\n", nr_objects); while (--i >= 0) { struct object_entry *entry = list[i]; @@ -692,10 +710,15 @@ static void find_deltas(struct object_entry **list, int window, int depth) char type[10]; int j; - if (progress_update || i == 0) { - fprintf(stderr, "Deltifying (%d %d%%)\r", - nr_objects-i, (nr_objects-i) * 100/nr_objects); - progress_update = 0; + processed++; + if (progress) { + unsigned percent = processed * 100 / nr_objects; + if (percent != last_percent || progress_update) { + fprintf(stderr, "%4u%% (%u/%u) done\r", + percent, processed, nr_objects); + progress_update = 0; + last_percent = percent; + } } if (entry->delta) -- cgit v1.3