From ee1f0c242efc022c185e9be9f672289d5420a664 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 6 Jan 2023 16:31:54 +0000 Subject: read-cache: add index.skipHash config option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous change allowed skipping the hashing portion of the hashwrite API, using it instead as a buffered write API. Disabling the hashwrite can be particularly helpful when the write operation is in a critical path. One such critical path is the writing of the index. This operation is so critical that the sparse index was created specifically to reduce the size of the index to make these writes (and reads) faster. This trade-off between file stability at rest and write-time performance is not easy to balance. The index is an interesting case for a couple reasons: 1. Writes block users. Writing the index takes place in many user- blocking foreground operations. The speed improvement directly impacts their use. Other file formats are typically written in the background (commit-graph, multi-pack-index) or are super-critical to correctness (pack-files). 2. Index files are short lived. It is rare that a user leaves an index for a long time with many staged changes. Outside of staged changes, the index can be completely destroyed and rewritten with minimal impact to the user. Following a similar approach to one used in the microsoft/git fork [1], add a new config option (index.skipHash) that allows disabling this hashing during the index write. The cost is that we can no longer validate the contents for corruption-at-rest using the trailing hash. [1] https://github.com/microsoft/git/commit/21fed2d91410f45d85279467f21d717a2db45201 We load this config from the repository config given by istate->repo, with a fallback to the_repository if it is not set. While older Git versions will not recognize the null hash as a special case, the file format itself is still being met in terms of its structure. Using this null hash will still allow Git operations to function across older versions. The one exception is 'git fsck' which checks the hash of the index file. This used to be a check on every index read, but was split out to just the index in a33fc72fe91 (read-cache: force_verify_index_checksum, 2017-04-14) and released first in Git 2.13.0. Document the versions that relaxed these restrictions, with the optimistic expectation that this change will be included in Git 2.40.0. Here, we disable this check if the trailing hash is all zeroes. We add a warning to the config option that this may cause undesirable behavior with older Git versions. As a quick comparison, I tested 'git update-index --force-write' with and without index.skipHash=true on a copy of the Linux kernel repository. Benchmark 1: with hash Time (mean ± σ): 46.3 ms ± 13.8 ms [User: 34.3 ms, System: 11.9 ms] Range (min … max): 34.3 ms … 79.1 ms 82 runs Benchmark 2: without hash Time (mean ± σ): 26.0 ms ± 7.9 ms [User: 11.8 ms, System: 14.2 ms] Range (min … max): 16.3 ms … 42.0 ms 69 runs Summary 'without hash' ran 1.78 ± 0.76 times faster than 'with hash' These performance benefits are substantial enough to allow users the ability to opt-in to this feature, even with the potential confusion with older 'git fsck' versions. Test this new config option, both at a command-line level and within a submodule. The confirmation is currently limited to confirm that 'git fsck' does not complain about the index. Future updates will make this test more robust. It is critical that this test is placed before the test_index_version tests, since those tests obliterate the .git/config file and hence lose the setting from GIT_TEST_DEFAULT_HASH, if set. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- read-cache.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'read-cache.c') diff --git a/read-cache.c b/read-cache.c index 46f5e497b1..d73a81e41a 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1817,6 +1817,8 @@ static int verify_hdr(const struct cache_header *hdr, unsigned long size) git_hash_ctx c; unsigned char hash[GIT_MAX_RAWSZ]; int hdr_version; + unsigned char *start, *end; + struct object_id oid; if (hdr->hdr_signature != htonl(CACHE_SIGNATURE)) return error(_("bad signature 0x%08x"), hdr->hdr_signature); @@ -1827,10 +1829,16 @@ static int verify_hdr(const struct cache_header *hdr, unsigned long size) if (!verify_index_checksum) return 0; + end = (unsigned char *)hdr + size; + start = end - the_hash_algo->rawsz; + oidread(&oid, start); + if (oideq(&oid, null_oid())) + return 0; + the_hash_algo->init_fn(&c); the_hash_algo->update_fn(&c, hdr, size - the_hash_algo->rawsz); the_hash_algo->final_fn(hash, &c); - if (!hasheq(hash, (unsigned char *)hdr + size - the_hash_algo->rawsz)) + if (!hasheq(hash, start)) return error(_("bad index file sha1 signature")); return 0; } @@ -2915,9 +2923,12 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, int ieot_entries = 1; struct index_entry_offset_table *ieot = NULL; int nr, nr_threads; + struct repository *r = istate->repo ? istate->repo : the_repository; f = hashfd(tempfile->fd, tempfile->filename.buf); + repo_config_get_bool(r, "index.skiphash", &f->skip_hash); + for (i = removed = extended = 0; i < entries; i++) { if (cache[i]->ce_flags & CE_REMOVE) removed++; -- cgit v1.3-5-g9baa From 17194b195d5db1cfd19af57e817c29bd3fa75c02 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 6 Jan 2023 16:31:56 +0000 Subject: features: feature.manyFiles implies fast index writes The recent addition of the index.skipHash config option allows index writes to speed up by skipping the hash computation for the trailing checksum. This is particularly critical for repositories with many files at HEAD, so add this config option to two cases where users in that scenario may opt-in to such behavior: 1. The feature.manyFiles config option enables some options that are helpful for repositories with many files at HEAD. 2. 'scalar register' and 'scalar reconfigure' set config options that optimize for large repositories. In both of these cases, set index.skipHash=true to gain this speedup. Add tests that demonstrate the proper way that index.skipHash=true can override feature.manyFiles=true. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/config/feature.txt | 5 +++++ read-cache.c | 3 ++- repo-settings.c | 2 ++ repository.h | 1 + scalar.c | 1 + t/t1600-index.sh | 11 +++++++++++ 6 files changed, 22 insertions(+), 1 deletion(-) (limited to 'read-cache.c') diff --git a/Documentation/config/feature.txt b/Documentation/config/feature.txt index 95975e5091..e52bc6b858 100644 --- a/Documentation/config/feature.txt +++ b/Documentation/config/feature.txt @@ -23,6 +23,11 @@ feature.manyFiles:: working directory. With many files, commands such as `git status` and `git checkout` may be slow and these new defaults improve performance: + +* `index.skipHash=true` speeds up index writes by not computing a trailing + checksum. Note that this will cause Git versions earlier than 2.13.0 to + refuse to parse the index and Git versions earlier than 2.40.0 will report + a corrupted index during `git fsck`. ++ * `index.version=4` enables path-prefix compression in the index. + * `core.untrackedCache=true` enables the untracked cache. This setting assumes diff --git a/read-cache.c b/read-cache.c index d73a81e41a..feefa0f68b 100644 --- a/read-cache.c +++ b/read-cache.c @@ -2927,7 +2927,8 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile, f = hashfd(tempfile->fd, tempfile->filename.buf); - repo_config_get_bool(r, "index.skiphash", &f->skip_hash); + prepare_repo_settings(r); + f->skip_hash = r->settings.index_skip_hash; for (i = removed = extended = 0; i < entries; i++) { if (cache[i]->ce_flags & CE_REMOVE) diff --git a/repo-settings.c b/repo-settings.c index 3021921c53..3dbd3f0e2e 100644 --- a/repo-settings.c +++ b/repo-settings.c @@ -47,6 +47,7 @@ void prepare_repo_settings(struct repository *r) } if (manyfiles) { r->settings.index_version = 4; + r->settings.index_skip_hash = 1; r->settings.core_untracked_cache = UNTRACKED_CACHE_WRITE; } @@ -61,6 +62,7 @@ void prepare_repo_settings(struct repository *r) repo_cfg_bool(r, "pack.usesparse", &r->settings.pack_use_sparse, 1); repo_cfg_bool(r, "core.multipackindex", &r->settings.core_multi_pack_index, 1); repo_cfg_bool(r, "index.sparse", &r->settings.sparse_index, 0); + repo_cfg_bool(r, "index.skiphash", &r->settings.index_skip_hash, r->settings.index_skip_hash); /* * The GIT_TEST_MULTI_PACK_INDEX variable is special in that diff --git a/repository.h b/repository.h index 6c461c5b9d..e8c67ffe16 100644 --- a/repository.h +++ b/repository.h @@ -42,6 +42,7 @@ struct repo_settings { struct fsmonitor_settings *fsmonitor; /* lazily loaded */ int index_version; + int index_skip_hash; enum untracked_cache_setting core_untracked_cache; int pack_use_sparse; diff --git a/scalar.c b/scalar.c index 6c52243cdf..b49bb8c24e 100644 --- a/scalar.c +++ b/scalar.c @@ -143,6 +143,7 @@ static int set_recommended_config(int reconfigure) { "credential.validate", "false", 1 }, /* GCM4W-only */ { "gc.auto", "0", 1 }, { "gui.GCWarning", "false", 1 }, + { "index.skipHash", "false", 1 }, { "index.threads", "true", 1 }, { "index.version", "4", 1 }, { "merge.stat", "false", 1 }, diff --git a/t/t1600-index.sh b/t/t1600-index.sh index 2f792bb8ff..0ebbae1305 100755 --- a/t/t1600-index.sh +++ b/t/t1600-index.sh @@ -73,6 +73,17 @@ test_expect_success 'index.skipHash config option' ' test_cmp expect hash && git fsck && + rm -f .git/index && + git -c feature.manyFiles=true add a && + test_trailing_hash .git/index >hash && + cmp expect hash && + + rm -f .git/index && + git -c feature.manyFiles=true \ + -c index.skipHash=false add a && + test_trailing_hash .git/index >hash && + ! cmp expect hash && + test_commit start && git -c protocol.file.allow=always submodule add ./ sub && git config index.skipHash false && -- cgit v1.3-5-g9baa