From f6c7081aa97aa67baa06390a1ef36088c33bf010 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 26 Apr 2006 23:58:00 -0400 Subject: use delta index data when finding best delta matches This patch allows for computing the delta index for each base object only once and reuse it when trying to find the best delta match. This should set the mark and pave the way for possibly better delta generator algorithms. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- pack-objects.c | 78 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 37 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index c0acc460bb..5b2ef9a513 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -994,6 +994,7 @@ static int type_size_sort(const struct object_entry *a, const struct object_entr struct unpacked { struct object_entry *entry; void *data; + struct delta_index *index; }; /* @@ -1004,64 +1005,56 @@ struct unpacked { * more importantly, the bigger file is likely the more recent * one. */ -static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth) +static int try_delta(struct unpacked *trg, struct unpacked *src, + struct delta_index *src_index, unsigned max_depth) { - struct object_entry *cur_entry = cur->entry; - struct object_entry *old_entry = old->entry; - unsigned long size, oldsize, delta_size, sizediff; - long max_size; + struct object_entry *trg_entry = trg->entry; + struct object_entry *src_entry = src->entry; + unsigned long size, src_size, delta_size, sizediff, max_size; void *delta_buf; /* Don't bother doing diffs between different types */ - if (cur_entry->type != old_entry->type) + if (trg_entry->type != src_entry->type) return -1; /* We do not compute delta to *create* objects we are not * going to pack. */ - if (cur_entry->preferred_base) + if (trg_entry->preferred_base) return -1; - /* If the current object is at pack edge, take the depth the + /* + * If the current object is at pack edge, take the depth the * objects that depend on the current object into account -- * otherwise they would become too deep. */ - if (cur_entry->delta_child) { - if (max_depth <= cur_entry->delta_limit) + if (trg_entry->delta_child) { + if (max_depth <= trg_entry->delta_limit) return 0; - max_depth -= cur_entry->delta_limit; + max_depth -= trg_entry->delta_limit; } - - size = cur_entry->size; - oldsize = old_entry->size; - sizediff = oldsize > size ? oldsize - size : size - oldsize; - - if (size < 50) - return -1; - if (old_entry->depth >= max_depth) + if (src_entry->depth >= max_depth) return 0; - /* - * NOTE! - * - * We always delta from the bigger to the smaller, since that's - * more space-efficient (deletes don't have to say _what_ they - * delete). - */ + /* Now some size filtering euristics. */ + size = trg_entry->size; max_size = size / 2 - 20; - if (cur_entry->delta) - max_size = cur_entry->delta_size-1; + if (trg_entry->delta) + max_size = trg_entry->delta_size-1; + src_size = src_entry->size; + sizediff = src_size < size ? size - src_size : 0; if (sizediff >= max_size) return 0; - delta_buf = diff_delta(old->data, oldsize, - cur->data, size, &delta_size, max_size); + + delta_buf = create_delta(src_index, trg->data, size, &delta_size, max_size); if (!delta_buf) return 0; - cur_entry->delta = old_entry; - cur_entry->delta_size = delta_size; - cur_entry->depth = old_entry->depth + 1; + + trg_entry->delta = src_entry; + trg_entry->delta_size = delta_size; + trg_entry->depth = src_entry->depth + 1; free(delta_buf); - return 0; + return 1; } static void progress_interval(int signum) @@ -1109,11 +1102,19 @@ static void find_deltas(struct object_entry **list, int window, int depth) */ continue; + if (entry->size < 50) + continue; + if (n->index) + free_delta_index(n->index); free(n->data); n->entry = entry; n->data = read_sha1_file(entry->sha1, type, &size); if (size != entry->size) - die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size); + die("object %s inconsistent object length (%lu vs %lu)", + sha1_to_hex(entry->sha1), size, entry->size); + n->index = create_delta_index(n->data, size); + if (!n->index) + die("out of memory"); j = window; while (--j > 0) { @@ -1124,7 +1125,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) m = array + other_idx; if (!m->entry) break; - if (try_delta(n, m, depth) < 0) + if (try_delta(n, m, m->index, depth) < 0) break; } #if 0 @@ -1144,8 +1145,11 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (progress) fputc('\n', stderr); - for (i = 0; i < window; ++i) + for (i = 0; i < window; ++i) { + if (array[i].index) + free_delta_index(array[i].index); free(array[i].data); + } free(array); } -- cgit v1.3 From 9a8b6a0a9d4de3d9c1005b872b9b57a213d3e9f8 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 27 Apr 2006 19:31:46 -0700 Subject: pack-objects: update size heuristucs. We used to omit delta base candidates that is much bigger than the target, but delta size does not grow when we delete more, so that was not a very good heuristics. Signed-off-by: Junio C Hamano --- pack-objects.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index c0acc460bb..6604338131 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -1032,12 +1032,6 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de max_depth -= cur_entry->delta_limit; } - size = cur_entry->size; - oldsize = old_entry->size; - sizediff = oldsize > size ? oldsize - size : size - oldsize; - - if (size < 50) - return -1; if (old_entry->depth >= max_depth) return 0; @@ -1048,9 +1042,12 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de * more space-efficient (deletes don't have to say _what_ they * delete). */ + size = cur_entry->size; max_size = size / 2 - 20; if (cur_entry->delta) max_size = cur_entry->delta_size-1; + oldsize = old_entry->size; + sizediff = oldsize < size ? size - oldsize : 0; if (sizediff >= max_size) return 0; delta_buf = diff_delta(old->data, oldsize, @@ -1109,6 +1106,9 @@ static void find_deltas(struct object_entry **list, int window, int depth) */ continue; + if (entry->size < 50) + continue; + free(n->data); n->entry = entry; n->data = read_sha1_file(entry->sha1, type, &size); -- cgit v1.3 From 86118bcb463e3f34b3df21d550335a40586dfb66 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 5 May 2006 03:20:44 -0700 Subject: pack-object: squelch eye-candy on non-tty One of my post-update scripts runs a git-fetch into a separate repository and sends the results back to me (2>&1); I end up getting this in the mail: Generating pack... Done counting 180 objects. Result has 131 objects. Deltifying 131 objects. 0% (0/131) done^M 1% (2/131) done^M... This defaults not to do the progress report when not on a tty. You could give --progress to force the progress report, but let's not bother even documenting it nor mentioning it in the usage string. Signed-off-by: Junio C Hamano --- pack-objects.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index 6604338131..53caed42dd 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -1239,6 +1239,7 @@ int main(int argc, char **argv) setup_git_directory(); + progress = isatty(2); for (i = 1; i < argc; i++) { const char *arg = argv[i]; @@ -1269,6 +1270,10 @@ int main(int argc, char **argv) usage(pack_usage); continue; } + if (!strcmp("--progress", arg)) { + progress = 1; + continue; + } if (!strcmp("-q", arg)) { progress = 0; continue; -- cgit v1.3 From 66561f5a776f2343331fff5b98adff1000622f42 Mon Sep 17 00:00:00 2001 From: Dennis Stosberg Date: Thu, 11 May 2006 19:36:32 +0200 Subject: Fix git-pack-objects for 64-bit platforms The offset of an object in the pack is recorded as a 4-byte integer in the index file. When reading the offset from the mmap'ed index in prepare_pack_revindex(), the address is dereferenced as a long*. This works fine as long as the long type is four bytes wide. On NetBSD/sparc64, however, a long is 8 bytes wide and so dereferencing the offset produces garbage. [jc: taking suggestion by Linus to use uint32_t] Signed-off-by: Dennis Stosberg Signed-off-by: Junio C Hamano --- pack-objects.c | 2 +- sha1_file.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index c0acc460bb..a81d609b26 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -156,7 +156,7 @@ static void prepare_pack_revindex(struct pack_revindex *rix) rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1)); for (i = 0; i < num_ent; i++) { - long hl = *((long *)(index + 24 * i)); + uint32_t hl = *((uint32_t *)(index + 24 * i)); rix->revindex[i] = ntohl(hl); } /* This knows the pack format -- the 20-byte trailer diff --git a/sha1_file.c b/sha1_file.c index f2d33afb27..642c45ad75 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1126,7 +1126,7 @@ int find_pack_entry_one(const unsigned char *sha1, int mi = (lo + hi) / 2; int cmp = memcmp(index + 24 * mi + 4, sha1, 20); if (!cmp) { - e->offset = ntohl(*((int*)(index + 24 * mi))); + e->offset = ntohl(*((uint32_t *)(index + 24 * mi))); memcpy(e->sha1, sha1, 20); e->p = p; return 1; -- cgit v1.3 From d9635e9c539465792b1920437b52fa8792a71650 Mon Sep 17 00:00:00 2001 From: Ben Clifford Date: Sun, 14 May 2006 21:34:56 +0100 Subject: include header to define uint32_t, necessary on Mac OS X Signed-off-by: Junio C Hamano --- pack-objects.c | 1 + sha1_file.c | 1 + 2 files changed, 2 insertions(+) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index a81d609b26..aa2c098617 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -10,6 +10,7 @@ #include "tree-walk.h" #include #include +#include static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list"; diff --git a/sha1_file.c b/sha1_file.c index 642c45ad75..673c58d450 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -13,6 +13,7 @@ #include "commit.h" #include "tag.h" #include "tree.h" +#include #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) -- cgit v1.3 From 4e8da1958111796d55ad63b229ebd3ae6c54bf87 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 15 May 2006 11:40:05 -0400 Subject: simple euristic for further free packing improvements Given that the early eviction of objects with maximum delta depth may exhibit bad packing on its own, why not considering a bias against deep base objects in try_delta() to mitigate that bad behavior. This patch adjust the MAX_size allowed for a delta based on the depth of the base object as well as enabling the early eviction of max depth objects from the object window. When used separately, those two things produce slightly better and much worse results respectively. But their combined effect is a surprising significant packing improvement. With this really simple patch the GIT repo gets nearly 15% smaller, and the Linux kernel repo about 5% smaller, with no significantly measurable CPU usage difference. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- pack-objects.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index 5466b15167..526c090c61 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -1039,8 +1039,8 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, /* Now some size filtering euristics. */ size = trg_entry->size; - max_size = size / 2 - 20; - if (trg_entry->delta) + max_size = (size/2 - 20) / (src_entry->depth + 1); + if (trg_entry->delta && trg_entry->delta_size <= max_size) max_size = trg_entry->delta_size-1; src_size = src_entry->size; sizediff = src_size < size ? size - src_size : 0; @@ -1129,15 +1129,12 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (try_delta(n, m, m->index, depth) < 0) break; } -#if 0 /* if we made n a delta, and if n is already at max * depth, leaving it in the window is pointless. we * should evict it first. - * ... in theory only; somehow this makes things worse. */ if (entry->delta && depth <= entry->depth) continue; -#endif idx++; if (idx >= window) idx = 0; -- cgit v1.3 From ff45715ce50b80ab16ee0d0dc7fff0c47a51959a Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 15 May 2006 13:47:16 -0400 Subject: pack-object: slightly more efficient Avoid creating a delta index for objects with maximum depth since they are not going to be used as delta base anyway. This also reduce peak memory usage slightly as the current object's delta index is not useful until the next object in the loop is considered for deltification. This saves a bit more than 1% on CPU usage. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- delta.h | 2 ++ pack-objects.c | 15 ++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'pack-objects.c') diff --git a/delta.h b/delta.h index 727ae30e9e..7b3f86d85f 100644 --- a/delta.h +++ b/delta.h @@ -18,6 +18,8 @@ create_delta_index(const void *buf, unsigned long bufsize); /* * free_delta_index: free the index created by create_delta_index() + * + * Given pointer must be what create_delta_index() returned, or NULL. */ extern void free_delta_index(struct delta_index *index); diff --git a/pack-objects.c b/pack-objects.c index 526c090c61..b430b02cf7 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -1105,17 +1105,14 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (entry->size < 50) continue; - if (n->index) - free_delta_index(n->index); + free_delta_index(n->index); + n->index = NULL; free(n->data); n->entry = entry; n->data = read_sha1_file(entry->sha1, type, &size); if (size != entry->size) die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size); - n->index = create_delta_index(n->data, size); - if (!n->index) - die("out of memory"); j = window; while (--j > 0) { @@ -1135,6 +1132,11 @@ static void find_deltas(struct object_entry **list, int window, int depth) */ if (entry->delta && depth <= entry->depth) continue; + + n->index = create_delta_index(n->data, size); + if (!n->index) + die("out of memory"); + idx++; if (idx >= window) idx = 0; @@ -1144,8 +1146,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) fputc('\n', stderr); for (i = 0; i < window; ++i) { - if (array[i].index) - free_delta_index(array[i].index); + free_delta_index(array[i].index); free(array[i].data); } free(array); -- cgit v1.3 From 1b9bc5a7b7434d771726011613a00cb202bd9f44 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 15 May 2006 12:52:00 -0700 Subject: Fix pack-index issue on 64-bit platforms a bit more portably. Apparently is not enough for uint32_t on OpenBSD; use "unsigned int" -- hopefully that would stay 32-bit on every platform we care about, at least until we update the pack-index file format. Our sha1 routines optimized for architectures use uint32_t and expects '#include ' to be enough, so OpenBSD on arm or ppc might have similar issues down the road, I dunno. Signed-off-by: Junio C Hamano --- pack-objects.c | 3 +-- sha1_file.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index aa2c098617..614e87bc8c 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -10,7 +10,6 @@ #include "tree-walk.h" #include #include -#include static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list"; @@ -157,7 +156,7 @@ static void prepare_pack_revindex(struct pack_revindex *rix) rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1)); for (i = 0; i < num_ent; i++) { - uint32_t hl = *((uint32_t *)(index + 24 * i)); + unsigned int hl = *((unsigned int *)(index + 24 * i)); rix->revindex[i] = ntohl(hl); } /* This knows the pack format -- the 20-byte trailer diff --git a/sha1_file.c b/sha1_file.c index 673c58d450..66db206d9b 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -13,7 +13,6 @@ #include "commit.h" #include "tag.h" #include "tree.h" -#include #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) @@ -1127,7 +1126,7 @@ int find_pack_entry_one(const unsigned char *sha1, int mi = (lo + hi) / 2; int cmp = memcmp(index + 24 * mi + 4, sha1, 20); if (!cmp) { - e->offset = ntohl(*((uint32_t *)(index + 24 * mi))); + e->offset = ntohl(*((unsigned int *)(index + 24 * mi))); memcpy(e->sha1, sha1, 20); e->p = p; return 1; -- cgit v1.3 From c3b06a69ffc41b3ac3600628593dd0fdd3988607 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 16 May 2006 16:29:14 -0400 Subject: improve depth heuristic for maximum delta size This provides a linear decrement on the penalty related to delta depth instead of being an 1/x function. With this another 5% reduction is observed on packs for both the GIT repo and the Linux kernel repo, as well as fixing a pack size regression in another sample repo I have. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- pack-objects.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'pack-objects.c') diff --git a/pack-objects.c b/pack-objects.c index b430b02cf7..33751797fa 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -1037,9 +1037,12 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, if (src_entry->depth >= max_depth) return 0; - /* Now some size filtering euristics. */ + /* Now some size filtering heuristics. */ size = trg_entry->size; - max_size = (size/2 - 20) / (src_entry->depth + 1); + max_size = size/2 - 20; + max_size = max_size * (max_depth - src_entry->depth) / max_depth; + if (max_size == 0) + return 0; if (trg_entry->delta && trg_entry->delta_size <= max_size) max_size = trg_entry->delta_size-1; src_size = src_entry->size; -- cgit v1.3