From 9208e318f548d28c677203a222f02a31a15c5d0e Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 12 Jul 2018 15:39:25 -0400 Subject: packfile: generalize pack directory list In anticipation of sharing the pack directory listing with the multi-pack-index, generalize prepare_packed_git_one() into for_each_file_in_pack_dir(). Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- packfile.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'packfile.h') diff --git a/packfile.h b/packfile.h index e0a38aba93..d2ad30300a 100644 --- a/packfile.h +++ b/packfile.h @@ -28,6 +28,12 @@ extern char *sha1_pack_index_name(const unsigned char *sha1); extern struct packed_git *parse_pack_index(unsigned char *sha1, const char *idx_path); +typedef void each_file_in_pack_dir_fn(const char *full_path, size_t full_path_len, + const char *file_pach, void *data); +void for_each_file_in_pack_dir(const char *objdir, + each_file_in_pack_dir_fn fn, + void *data); + /* A hook to report invalid files in pack directory */ #define PACKDIR_FILE_PACK 1 #define PACKDIR_FILE_IDX 2 -- cgit v1.3-5-g9baa From fe1ed56f5e482507b54a4fb491273f122c5fd9ea Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 12 Jul 2018 15:39:29 -0400 Subject: midx: sort and deduplicate objects from packfiles Before writing a list of objects and their offsets to a multi-pack-index, we need to collect the list of objects contained in the packfiles. There may be multiple copies of some objects, so this list must be deduplicated. It is possible to artificially get into a state where there are many duplicate copies of objects. That can create high memory pressure if we are to create a list of all objects before de-duplication. To reduce this memory pressure without a significant performance drop, automatically group objects by the first byte of their object id. Use the IDX fanout tables to group the data, copy to a local array, then sort. Copy only the de-duplicated entries. Select the duplicate based on the most-recent modified time of a packfile containing the object. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ packfile.c | 17 ++++++++ packfile.h | 2 + 3 files changed, 147 insertions(+) (limited to 'packfile.h') diff --git a/midx.c b/midx.c index fcdf6553ce..29f8de5ee6 100644 --- a/midx.c +++ b/midx.c @@ -4,6 +4,7 @@ #include "lockfile.h" #include "packfile.h" #include "object-store.h" +#include "packfile.h" #include "midx.h" #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ @@ -182,12 +183,21 @@ static void add_pack_to_midx(const char *full_path, size_t full_path_len, packs->list[packs->nr] = add_packed_git(full_path, full_path_len, 0); + if (!packs->list[packs->nr]) { warning(_("failed to add packfile '%s'"), full_path); return; } + if (open_pack_index(packs->list[packs->nr])) { + warning(_("failed to open pack-index '%s'"), + full_path); + close_pack(packs->list[packs->nr]); + FREE_AND_NULL(packs->list[packs->nr]); + return; + } + packs->names[packs->nr] = xstrdup(file_name); packs->pack_name_concat_len += strlen(file_name) + 1; packs->nr++; @@ -228,6 +238,119 @@ static void sort_packs_by_name(char **pack_names, uint32_t nr_packs, uint32_t *p free(pairs); } +struct pack_midx_entry { + struct object_id oid; + uint32_t pack_int_id; + time_t pack_mtime; + uint64_t offset; +}; + +static int midx_oid_compare(const void *_a, const void *_b) +{ + const struct pack_midx_entry *a = (const struct pack_midx_entry *)_a; + const struct pack_midx_entry *b = (const struct pack_midx_entry *)_b; + int cmp = oidcmp(&a->oid, &b->oid); + + if (cmp) + return cmp; + + if (a->pack_mtime > b->pack_mtime) + return -1; + else if (a->pack_mtime < b->pack_mtime) + return 1; + + return a->pack_int_id - b->pack_int_id; +} + +static void fill_pack_entry(uint32_t pack_int_id, + struct packed_git *p, + uint32_t cur_object, + struct pack_midx_entry *entry) +{ + if (!nth_packed_object_oid(&entry->oid, p, cur_object)) + die(_("failed to locate object %d in packfile"), cur_object); + + entry->pack_int_id = pack_int_id; + entry->pack_mtime = p->mtime; + + entry->offset = nth_packed_object_offset(p, cur_object); +} + +/* + * It is possible to artificially get into a state where there are many + * duplicate copies of objects. That can create high memory pressure if + * we are to create a list of all objects before de-duplication. To reduce + * this memory pressure without a significant performance drop, automatically + * group objects by the first byte of their object id. Use the IDX fanout + * tables to group the data, copy to a local array, then sort. + * + * Copy only the de-duplicated entries (selected by most-recent modified time + * of a packfile containing the object). + */ +static struct pack_midx_entry *get_sorted_entries(struct packed_git **p, + uint32_t *perm, + uint32_t nr_packs, + uint32_t *nr_objects) +{ + uint32_t cur_fanout, cur_pack, cur_object; + uint32_t alloc_fanout, alloc_objects, total_objects = 0; + struct pack_midx_entry *entries_by_fanout = NULL; + struct pack_midx_entry *deduplicated_entries = NULL; + + for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) + total_objects += p[cur_pack]->num_objects; + + /* + * As we de-duplicate by fanout value, we expect the fanout + * slices to be evenly distributed, with some noise. Hence, + * allocate slightly more than one 256th. + */ + alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16; + + ALLOC_ARRAY(entries_by_fanout, alloc_fanout); + ALLOC_ARRAY(deduplicated_entries, alloc_objects); + *nr_objects = 0; + + for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) { + uint32_t nr_fanout = 0; + + for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) { + uint32_t start = 0, end; + + if (cur_fanout) + start = get_pack_fanout(p[cur_pack], cur_fanout - 1); + end = get_pack_fanout(p[cur_pack], cur_fanout); + + for (cur_object = start; cur_object < end; cur_object++) { + ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout); + fill_pack_entry(perm[cur_pack], p[cur_pack], cur_object, &entries_by_fanout[nr_fanout]); + nr_fanout++; + } + } + + QSORT(entries_by_fanout, nr_fanout, midx_oid_compare); + + /* + * The batch is now sorted by OID and then mtime (descending). + * Take only the first duplicate. + */ + for (cur_object = 0; cur_object < nr_fanout; cur_object++) { + if (cur_object && !oidcmp(&entries_by_fanout[cur_object - 1].oid, + &entries_by_fanout[cur_object].oid)) + continue; + + ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects); + memcpy(&deduplicated_entries[*nr_objects], + &entries_by_fanout[cur_object], + sizeof(struct pack_midx_entry)); + (*nr_objects)++; + } + } + + free(entries_by_fanout); + return deduplicated_entries; +} + static size_t write_midx_pack_names(struct hashfile *f, char **pack_names, uint32_t num_packs) @@ -271,6 +394,8 @@ int write_midx_file(const char *object_dir) uint64_t written = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; + uint32_t nr_entries; + struct pack_midx_entry *entries = NULL; midx_name = get_midx_filename(object_dir); if (safe_create_leading_directories(midx_name)) { @@ -296,6 +421,8 @@ int write_midx_file(const char *object_dir) ALLOC_ARRAY(pack_perm, packs.nr); sort_packs_by_name(packs.names, packs.nr, pack_perm); + entries = get_sorted_entries(packs.list, pack_perm, packs.nr, &nr_entries); + hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR); f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf); FREE_AND_NULL(midx_name); @@ -365,5 +492,6 @@ int write_midx_file(const char *object_dir) free(packs.list); free(packs.names); + free(entries); return 0; } diff --git a/packfile.c b/packfile.c index ee1ab9b804..3d652212c6 100644 --- a/packfile.c +++ b/packfile.c @@ -196,6 +196,23 @@ int open_pack_index(struct packed_git *p) return ret; } +uint32_t get_pack_fanout(struct packed_git *p, uint32_t value) +{ + const uint32_t *level1_ofs = p->index_data; + + if (!level1_ofs) { + if (open_pack_index(p)) + return 0; + level1_ofs = p->index_data; + } + + if (p->index_version > 1) { + level1_ofs += 2; + } + + return ntohl(level1_ofs[value]); +} + static struct packed_git *alloc_packed_git(int extra) { struct packed_git *p = xmalloc(st_add(sizeof(*p), extra)); diff --git a/packfile.h b/packfile.h index d2ad30300a..b0eed44c0b 100644 --- a/packfile.h +++ b/packfile.h @@ -69,6 +69,8 @@ extern int open_pack_index(struct packed_git *); */ extern void close_pack_index(struct packed_git *); +extern uint32_t get_pack_fanout(struct packed_git *p, uint32_t value); + extern unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t, unsigned long *); extern void close_pack_windows(struct packed_git *); extern void close_pack(struct packed_git *); -- cgit v1.3-5-g9baa From 8aac67a174061a0744557a3984a433f926bf5cb3 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 12 Jul 2018 15:39:35 -0400 Subject: midx: use midx in abbreviation calculations Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 11 ++++++++++ midx.h | 3 +++ packfile.c | 6 ++++++ packfile.h | 1 + sha1-name.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 91 insertions(+) (limited to 'packfile.h') diff --git a/midx.c b/midx.c index 182535933c..4e014ff6e3 100644 --- a/midx.c +++ b/midx.c @@ -203,6 +203,17 @@ int bsearch_midx(const struct object_id *oid, struct multi_pack_index *m, uint32 MIDX_HASH_LEN, result); } +struct object_id *nth_midxed_object_oid(struct object_id *oid, + struct multi_pack_index *m, + uint32_t n) +{ + if (n >= m->num_objects) + return NULL; + + hashcpy(oid->hash, m->chunk_oid_lookup + m->hash_len * n); + return oid; +} + static off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos) { const unsigned char *offset_data; diff --git a/midx.h b/midx.h index 377838c9ca..1b976df873 100644 --- a/midx.h +++ b/midx.h @@ -31,6 +31,9 @@ struct multi_pack_index { struct multi_pack_index *load_multi_pack_index(const char *object_dir); int bsearch_midx(const struct object_id *oid, struct multi_pack_index *m, uint32_t *result); +struct object_id *nth_midxed_object_oid(struct object_id *oid, + struct multi_pack_index *m, + uint32_t n); int fill_midx_entry(const struct object_id *oid, struct pack_entry *e, struct multi_pack_index *m); int prepare_multi_pack_index_one(struct repository *r, const char *object_dir); diff --git a/packfile.c b/packfile.c index bc763d91b9..c0eb5ac885 100644 --- a/packfile.c +++ b/packfile.c @@ -961,6 +961,12 @@ struct packed_git *get_packed_git(struct repository *r) return r->objects->packed_git; } +struct multi_pack_index *get_multi_pack_index(struct repository *r) +{ + prepare_packed_git(r); + return r->objects->multi_pack_index; +} + struct list_head *get_packed_git_mru(struct repository *r) { prepare_packed_git(r); diff --git a/packfile.h b/packfile.h index b0eed44c0b..046280caf3 100644 --- a/packfile.h +++ b/packfile.h @@ -45,6 +45,7 @@ extern void install_packed_git(struct repository *r, struct packed_git *pack); struct packed_git *get_packed_git(struct repository *r); struct list_head *get_packed_git_mru(struct repository *r); +struct multi_pack_index *get_multi_pack_index(struct repository *r); /* * Give a rough count of objects in the repository. This sacrifices accuracy diff --git a/sha1-name.c b/sha1-name.c index 60d9ef3c7e..7dc71201e6 100644 --- a/sha1-name.c +++ b/sha1-name.c @@ -12,6 +12,7 @@ #include "packfile.h" #include "object-store.h" #include "repository.h" +#include "midx.h" static int get_oid_oneline(const char *, struct object_id *, struct commit_list *); @@ -149,6 +150,32 @@ static int match_sha(unsigned len, const unsigned char *a, const unsigned char * return 1; } +static void unique_in_midx(struct multi_pack_index *m, + struct disambiguate_state *ds) +{ + uint32_t num, i, first = 0; + const struct object_id *current = NULL; + num = m->num_objects; + + if (!num) + return; + + bsearch_midx(&ds->bin_pfx, m, &first); + + /* + * At this point, "first" is the location of the lowest object + * with an object name that could match "bin_pfx". See if we have + * 0, 1 or more objects that actually match(es). + */ + for (i = first; i < num && !ds->ambiguous; i++) { + struct object_id oid; + current = nth_midxed_object_oid(&oid, m, i); + if (!match_sha(ds->len, ds->bin_pfx.hash, current->hash)) + break; + update_candidates(ds, current); + } +} + static void unique_in_pack(struct packed_git *p, struct disambiguate_state *ds) { @@ -177,8 +204,12 @@ static void unique_in_pack(struct packed_git *p, static void find_short_packed_object(struct disambiguate_state *ds) { + struct multi_pack_index *m; struct packed_git *p; + for (m = get_multi_pack_index(the_repository); m && !ds->ambiguous; + m = m->next) + unique_in_midx(m, ds); for (p = get_packed_git(the_repository); p && !ds->ambiguous; p = p->next) unique_in_pack(p, ds); @@ -527,6 +558,42 @@ static int extend_abbrev_len(const struct object_id *oid, void *cb_data) return 0; } +static void find_abbrev_len_for_midx(struct multi_pack_index *m, + struct min_abbrev_data *mad) +{ + int match = 0; + uint32_t num, first = 0; + struct object_id oid; + const struct object_id *mad_oid; + + if (!m->num_objects) + return; + + num = m->num_objects; + mad_oid = mad->oid; + match = bsearch_midx(mad_oid, m, &first); + + /* + * first is now the position in the packfile where we would insert + * mad->hash if it does not exist (or the position of mad->hash if + * it does exist). Hence, we consider a maximum of two objects + * nearby for the abbreviation length. + */ + mad->init_len = 0; + if (!match) { + if (nth_midxed_object_oid(&oid, m, first)) + extend_abbrev_len(&oid, mad); + } else if (first < num - 1) { + if (nth_midxed_object_oid(&oid, m, first + 1)) + extend_abbrev_len(&oid, mad); + } + if (first > 0) { + if (nth_midxed_object_oid(&oid, m, first - 1)) + extend_abbrev_len(&oid, mad); + } + mad->init_len = mad->cur_len; +} + static void find_abbrev_len_for_pack(struct packed_git *p, struct min_abbrev_data *mad) { @@ -565,8 +632,11 @@ static void find_abbrev_len_for_pack(struct packed_git *p, static void find_abbrev_len_packed(struct min_abbrev_data *mad) { + struct multi_pack_index *m; struct packed_git *p; + for (m = get_multi_pack_index(the_repository); m; m = m->next) + find_abbrev_len_for_midx(m, mad); for (p = get_packed_git(the_repository); p; p = p->next) find_abbrev_len_for_pack(p, mad); } -- cgit v1.3-5-g9baa