From ddac965965719f19c2905a7184a7dd55287bf817 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Jan 2024 07:22:21 +0100 Subject: reftable/writer: fix index corruption when writing multiple indices Each reftable may contain multiple types of blocks for refs, objects and reflog records, where each of these may have an index that makes it more efficient to find the records. It was observed that the index for log records can become corrupted under certain circumstances, where the first entry of the index points into the object index instead of to the log records. As it turns out, this corruption can occur whenever we write a log index as well as at least one additional index. Writing records and their index is basically a two-step process: 1. We write all blocks for the corresponding record. Each block that gets written is added to a list of blocks to index. 2. Once all blocks were written we finish the section. If at least two blocks have been added to the list of blocks to index then we will now write the index for those blocks and flush it, as well. When we have a very large number of blocks then we may decide to write a multi-level index, which is why we also keep track of the list of the index blocks in the same way as we previously kept track of the blocks to index. Now when we have finished writing all index blocks we clear the index and flush the last block to disk. This is done in the wrong order though because flushing the block to disk will re-add it to the list of blocks to be indexed. The result is that the next section we are about to write will have an entry in the list of blocks to index that points to the last block of the preceding section's index, which will corrupt the log index. Fix this corruption by clearing the index after having written the last block. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/readwrite_test.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'reftable/readwrite_test.c') diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c index 278663f22d..9c16e0504e 100644 --- a/reftable/readwrite_test.c +++ b/reftable/readwrite_test.c @@ -798,6 +798,85 @@ static void test_write_key_order(void) strbuf_release(&buf); } +static void test_write_multiple_indices(void) +{ + struct reftable_write_options opts = { + .block_size = 100, + }; + struct strbuf writer_buf = STRBUF_INIT, buf = STRBUF_INIT; + struct reftable_block_source source = { 0 }; + struct reftable_iterator it = { 0 }; + const struct reftable_stats *stats; + struct reftable_writer *writer; + struct reftable_reader *reader; + int err, i; + + writer = reftable_new_writer(&strbuf_add_void, &writer_buf, &opts); + reftable_writer_set_limits(writer, 1, 1); + for (i = 0; i < 100; i++) { + unsigned char hash[GIT_SHA1_RAWSZ] = {i}; + struct reftable_ref_record ref = { + .update_index = 1, + .value_type = REFTABLE_REF_VAL1, + .value.val1 = hash, + }; + + strbuf_reset(&buf); + strbuf_addf(&buf, "refs/heads/%04d", i); + ref.refname = buf.buf, + + err = reftable_writer_add_ref(writer, &ref); + EXPECT_ERR(err); + } + + for (i = 0; i < 100; i++) { + unsigned char hash[GIT_SHA1_RAWSZ] = {i}; + struct reftable_log_record log = { + .update_index = 1, + .value_type = REFTABLE_LOG_UPDATE, + .value.update = { + .old_hash = hash, + .new_hash = hash, + }, + }; + + strbuf_reset(&buf); + strbuf_addf(&buf, "refs/heads/%04d", i); + log.refname = buf.buf, + + err = reftable_writer_add_log(writer, &log); + EXPECT_ERR(err); + } + + reftable_writer_close(writer); + + /* + * The written data should be sufficiently large to result in indices + * for each of the block types. + */ + stats = reftable_writer_stats(writer); + EXPECT(stats->ref_stats.index_offset > 0); + EXPECT(stats->obj_stats.index_offset > 0); + EXPECT(stats->log_stats.index_offset > 0); + + block_source_from_strbuf(&source, &writer_buf); + err = reftable_new_reader(&reader, &source, "filename"); + EXPECT_ERR(err); + + /* + * Seeking the log uses the log index now. In case there is any + * confusion regarding indices we would notice here. + */ + err = reftable_reader_seek_log(reader, &it, ""); + EXPECT_ERR(err); + + reftable_iterator_destroy(&it); + reftable_writer_free(writer); + reftable_reader_free(reader); + strbuf_release(&writer_buf); + strbuf_release(&buf); +} + static void test_corrupt_table_empty(void) { struct strbuf buf = STRBUF_INIT; @@ -847,5 +926,6 @@ int readwrite_test_main(int argc, const char *argv[]) RUN_TEST(test_log_overflow); RUN_TEST(test_write_object_id_length); RUN_TEST(test_write_object_id_min_length); + RUN_TEST(test_write_multiple_indices); return 0; } -- cgit v1.3-5-g9baa From 7af607c58d7985a0eb70fc3bca6eef8eb2381f14 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Jan 2024 07:22:30 +0100 Subject: reftable/record: store "val1" hashes as static arrays When reading ref records of type "val1", we store its object ID in an allocated array. This results in an additional allocation for every single ref record we read, which is rather inefficient especially when iterating over refs. Refactor the code to instead use an embedded array of `GIT_MAX_RAWSZ` bytes. While this means that `struct ref_record` is bigger now, we typically do not store all refs in an array anyway and instead only handle a limited number of records at the same point in time. Using `git show-ref --quiet` in a repository with ~350k refs this leads to a significant drop in allocations. Before: HEAP SUMMARY: in use at exit: 21,098 bytes in 192 blocks total heap usage: 2,116,683 allocs, 2,116,491 frees, 76,098,060 bytes allocated After: HEAP SUMMARY: in use at exit: 21,098 bytes in 192 blocks total heap usage: 1,419,031 allocs, 1,418,839 frees, 62,145,036 bytes allocated Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/block_test.c | 4 +--- reftable/merged_test.c | 16 ++++++---------- reftable/readwrite_test.c | 14 ++++---------- reftable/record.c | 3 --- reftable/record_test.c | 1 - reftable/reftable-record.h | 3 ++- reftable/stack_test.c | 2 -- 7 files changed, 13 insertions(+), 30 deletions(-) (limited to 'reftable/readwrite_test.c') diff --git a/reftable/block_test.c b/reftable/block_test.c index c00bbc8aed..dedb05c7d8 100644 --- a/reftable/block_test.c +++ b/reftable/block_test.c @@ -49,13 +49,11 @@ static void test_block_read_write(void) for (i = 0; i < N; i++) { char name[100]; - uint8_t hash[GIT_SHA1_RAWSZ]; snprintf(name, sizeof(name), "branch%02d", i); - memset(hash, i, sizeof(hash)); rec.u.ref.refname = name; rec.u.ref.value_type = REFTABLE_REF_VAL1; - rec.u.ref.value.val1 = hash; + memset(rec.u.ref.value.val1, i, GIT_SHA1_RAWSZ); names[i] = xstrdup(name); n = block_writer_add(&bw, &rec); diff --git a/reftable/merged_test.c b/reftable/merged_test.c index d08c16abef..b3927a5d73 100644 --- a/reftable/merged_test.c +++ b/reftable/merged_test.c @@ -123,13 +123,11 @@ static void readers_destroy(struct reftable_reader **readers, size_t n) static void test_merged_between(void) { - uint8_t hash1[GIT_SHA1_RAWSZ] = { 1, 2, 3, 0 }; - struct reftable_ref_record r1[] = { { .refname = "b", .update_index = 1, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash1, + .value.val1 = { 1, 2, 3, 0 }, } }; struct reftable_ref_record r2[] = { { .refname = "a", @@ -165,26 +163,24 @@ static void test_merged_between(void) static void test_merged(void) { - uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 }; - uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 }; struct reftable_ref_record r1[] = { { .refname = "a", .update_index = 1, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash1, + .value.val1 = { 1 }, }, { .refname = "b", .update_index = 1, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash1, + .value.val1 = { 1 }, }, { .refname = "c", .update_index = 1, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash1, + .value.val1 = { 1 }, } }; struct reftable_ref_record r2[] = { { @@ -197,13 +193,13 @@ static void test_merged(void) .refname = "c", .update_index = 3, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash2, + .value.val1 = { 2 }, }, { .refname = "d", .update_index = 3, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash1, + .value.val1 = { 1 }, }, }; diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c index 9c16e0504e..87b238105c 100644 --- a/reftable/readwrite_test.c +++ b/reftable/readwrite_test.c @@ -60,18 +60,15 @@ static void write_table(char ***names, struct strbuf *buf, int N, *names = reftable_calloc(sizeof(char *) * (N + 1)); reftable_writer_set_limits(w, update_index, update_index); for (i = 0; i < N; i++) { - uint8_t hash[GIT_SHA256_RAWSZ] = { 0 }; char name[100]; int n; - set_test_hash(hash, i); - snprintf(name, sizeof(name), "refs/heads/branch%02d", i); ref.refname = name; ref.update_index = update_index; ref.value_type = REFTABLE_REF_VAL1; - ref.value.val1 = hash; + set_test_hash(ref.value.val1, i); (*names)[i] = xstrdup(name); n = reftable_writer_add_ref(w, &ref); @@ -675,11 +672,10 @@ static void test_write_object_id_min_length(void) struct strbuf buf = STRBUF_INIT; struct reftable_writer *w = reftable_new_writer(&strbuf_add_void, &buf, &opts); - uint8_t hash[GIT_SHA1_RAWSZ] = {42}; struct reftable_ref_record ref = { .update_index = 1, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash, + .value.val1 = {42}, }; int err; int i; @@ -711,11 +707,10 @@ static void test_write_object_id_length(void) struct strbuf buf = STRBUF_INIT; struct reftable_writer *w = reftable_new_writer(&strbuf_add_void, &buf, &opts); - uint8_t hash[GIT_SHA1_RAWSZ] = {42}; struct reftable_ref_record ref = { .update_index = 1, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash, + .value.val1 = {42}, }; int err; int i; @@ -814,11 +809,10 @@ static void test_write_multiple_indices(void) writer = reftable_new_writer(&strbuf_add_void, &writer_buf, &opts); reftable_writer_set_limits(writer, 1, 1); for (i = 0; i < 100; i++) { - unsigned char hash[GIT_SHA1_RAWSZ] = {i}; struct reftable_ref_record ref = { .update_index = 1, .value_type = REFTABLE_REF_VAL1, - .value.val1 = hash, + .value.val1 = {i}, }; strbuf_reset(&buf); diff --git a/reftable/record.c b/reftable/record.c index 5e258c734b..a67a6b4d8a 100644 --- a/reftable/record.c +++ b/reftable/record.c @@ -219,7 +219,6 @@ static void reftable_ref_record_copy_from(void *rec, const void *src_rec, case REFTABLE_REF_DELETION: break; case REFTABLE_REF_VAL1: - ref->value.val1 = reftable_malloc(hash_size); memcpy(ref->value.val1, src->value.val1, hash_size); break; case REFTABLE_REF_VAL2: @@ -303,7 +302,6 @@ void reftable_ref_record_release(struct reftable_ref_record *ref) reftable_free(ref->value.val2.value); break; case REFTABLE_REF_VAL1: - reftable_free(ref->value.val1); break; case REFTABLE_REF_DELETION: break; @@ -394,7 +392,6 @@ static int reftable_ref_record_decode(void *rec, struct strbuf key, return -1; } - r->value.val1 = reftable_malloc(hash_size); memcpy(r->value.val1, in.buf, hash_size); string_view_consume(&in, hash_size); break; diff --git a/reftable/record_test.c b/reftable/record_test.c index 70ae78feca..5c94d26e35 100644 --- a/reftable/record_test.c +++ b/reftable/record_test.c @@ -119,7 +119,6 @@ static void test_reftable_ref_record_roundtrip(void) case REFTABLE_REF_DELETION: break; case REFTABLE_REF_VAL1: - in.u.ref.value.val1 = reftable_malloc(GIT_SHA1_RAWSZ); set_hash(in.u.ref.value.val1, 1); break; case REFTABLE_REF_VAL2: diff --git a/reftable/reftable-record.h b/reftable/reftable-record.h index f7eb2d6015..7f3a0df635 100644 --- a/reftable/reftable-record.h +++ b/reftable/reftable-record.h @@ -9,6 +9,7 @@ https://developers.google.com/open-source/licenses/bsd #ifndef REFTABLE_RECORD_H #define REFTABLE_RECORD_H +#include "hash-ll.h" #include /* @@ -38,7 +39,7 @@ struct reftable_ref_record { #define REFTABLE_NR_REF_VALUETYPES 4 } value_type; union { - uint8_t *val1; /* malloced hash. */ + unsigned char val1[GIT_MAX_RAWSZ]; struct { uint8_t *value; /* first value, malloced hash */ uint8_t *target_value; /* second value, malloced hash */ diff --git a/reftable/stack_test.c b/reftable/stack_test.c index 14a3fc11ee..feab49d7f7 100644 --- a/reftable/stack_test.c +++ b/reftable/stack_test.c @@ -463,7 +463,6 @@ static void test_reftable_stack_add(void) refs[i].refname = xstrdup(buf); refs[i].update_index = i + 1; refs[i].value_type = REFTABLE_REF_VAL1; - refs[i].value.val1 = reftable_malloc(GIT_SHA1_RAWSZ); set_test_hash(refs[i].value.val1, i); logs[i].refname = xstrdup(buf); @@ -600,7 +599,6 @@ static void test_reftable_stack_tombstone(void) refs[i].update_index = i + 1; if (i % 2 == 0) { refs[i].value_type = REFTABLE_REF_VAL1; - refs[i].value.val1 = reftable_malloc(GIT_SHA1_RAWSZ); set_test_hash(refs[i].value.val1, i); } -- cgit v1.3-5-g9baa From b31e3cc620f926273af9346fbda4ff507f60682e Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Jan 2024 07:22:34 +0100 Subject: reftable/record: store "val2" hashes as static arrays Similar to the preceding commit, convert ref records of type "val2" to store their object IDs in static arrays instead of allocating them for every single record. We're using the same benchmark as in the preceding commit, with `git show-ref --quiet` in a repository with ~350k refs. This time around though the effects aren't this huge. Before: HEAP SUMMARY: in use at exit: 21,163 bytes in 193 blocks total heap usage: 1,419,040 allocs, 1,418,847 frees, 62,153,868 bytes allocated After: HEAP SUMMARY: in use at exit: 21,163 bytes in 193 blocks total heap usage: 1,410,148 allocs, 1,409,955 frees, 61,976,068 bytes allocated This is because "val2"-type records are typically only stored for peeled tags, and the number of annotated tags in the benchmark repository is rather low. Still, it can be seen that this change leads to a reduction of allocations overall, even if only a small one. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- reftable/readwrite_test.c | 12 ++++-------- reftable/record.c | 6 ------ reftable/record_test.c | 4 ---- reftable/reftable-record.h | 4 ++-- 4 files changed, 6 insertions(+), 20 deletions(-) (limited to 'reftable/readwrite_test.c') diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c index 87b238105c..178766dfa8 100644 --- a/reftable/readwrite_test.c +++ b/reftable/readwrite_test.c @@ -547,8 +547,6 @@ static void test_table_refs_for(int indexed) uint8_t hash[GIT_SHA1_RAWSZ]; char fill[51] = { 0 }; char name[100]; - uint8_t hash1[GIT_SHA1_RAWSZ]; - uint8_t hash2[GIT_SHA1_RAWSZ]; struct reftable_ref_record ref = { NULL }; memset(hash, i, sizeof(hash)); @@ -558,11 +556,9 @@ static void test_table_refs_for(int indexed) name[40] = 0; ref.refname = name; - set_test_hash(hash1, i / 4); - set_test_hash(hash2, 3 + i / 4); ref.value_type = REFTABLE_REF_VAL2; - ref.value.val2.value = hash1; - ref.value.val2.target_value = hash2; + set_test_hash(ref.value.val2.value, i / 4); + set_test_hash(ref.value.val2.target_value, 3 + i / 4); /* 80 bytes / entry, so 3 entries per block. Yields 17 */ @@ -570,8 +566,8 @@ static void test_table_refs_for(int indexed) n = reftable_writer_add_ref(w, &ref); EXPECT(n == 0); - if (!memcmp(hash1, want_hash, GIT_SHA1_RAWSZ) || - !memcmp(hash2, want_hash, GIT_SHA1_RAWSZ)) { + if (!memcmp(ref.value.val2.value, want_hash, GIT_SHA1_RAWSZ) || + !memcmp(ref.value.val2.target_value, want_hash, GIT_SHA1_RAWSZ)) { want_names[want_names_len++] = xstrdup(name); } } diff --git a/reftable/record.c b/reftable/record.c index a67a6b4d8a..5c3fbb7b2a 100644 --- a/reftable/record.c +++ b/reftable/record.c @@ -222,9 +222,7 @@ static void reftable_ref_record_copy_from(void *rec, const void *src_rec, memcpy(ref->value.val1, src->value.val1, hash_size); break; case REFTABLE_REF_VAL2: - ref->value.val2.value = reftable_malloc(hash_size); memcpy(ref->value.val2.value, src->value.val2.value, hash_size); - ref->value.val2.target_value = reftable_malloc(hash_size); memcpy(ref->value.val2.target_value, src->value.val2.target_value, hash_size); break; @@ -298,8 +296,6 @@ void reftable_ref_record_release(struct reftable_ref_record *ref) reftable_free(ref->value.symref); break; case REFTABLE_REF_VAL2: - reftable_free(ref->value.val2.target_value); - reftable_free(ref->value.val2.value); break; case REFTABLE_REF_VAL1: break; @@ -401,11 +397,9 @@ static int reftable_ref_record_decode(void *rec, struct strbuf key, return -1; } - r->value.val2.value = reftable_malloc(hash_size); memcpy(r->value.val2.value, in.buf, hash_size); string_view_consume(&in, hash_size); - r->value.val2.target_value = reftable_malloc(hash_size); memcpy(r->value.val2.target_value, in.buf, hash_size); string_view_consume(&in, hash_size); break; diff --git a/reftable/record_test.c b/reftable/record_test.c index 5c94d26e35..2876db7d27 100644 --- a/reftable/record_test.c +++ b/reftable/record_test.c @@ -122,11 +122,7 @@ static void test_reftable_ref_record_roundtrip(void) set_hash(in.u.ref.value.val1, 1); break; case REFTABLE_REF_VAL2: - in.u.ref.value.val2.value = - reftable_malloc(GIT_SHA1_RAWSZ); set_hash(in.u.ref.value.val2.value, 1); - in.u.ref.value.val2.target_value = - reftable_malloc(GIT_SHA1_RAWSZ); set_hash(in.u.ref.value.val2.target_value, 2); break; case REFTABLE_REF_SYMREF: diff --git a/reftable/reftable-record.h b/reftable/reftable-record.h index 7f3a0df635..bb6e99acd3 100644 --- a/reftable/reftable-record.h +++ b/reftable/reftable-record.h @@ -41,8 +41,8 @@ struct reftable_ref_record { union { unsigned char val1[GIT_MAX_RAWSZ]; struct { - uint8_t *value; /* first value, malloced hash */ - uint8_t *target_value; /* second value, malloced hash */ + unsigned char value[GIT_MAX_RAWSZ]; /* first hash */ + unsigned char target_value[GIT_MAX_RAWSZ]; /* second hash */ } val2; char *symref; /* referent, malloced 0-terminated string */ } value; -- cgit v1.3-5-g9baa