From 7be182045a6a113b118982fc81296d5b9746779e Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 26 Mar 2026 15:14:52 +0000 Subject: backfill: work with prefix pathspecs The previous change allowed specifying revision arguments on the 'git backfill' command line. This created the opportunity for restricting the initial commit set by filtering the revision walk through a pathspec. Other than filtering the commit set (and thereby the root trees), this did not restrict the path-walk implementation of 'git backfill' and did not restrict the blobs that were downloaded to only those matching the pathspec. Update the path-walk API to accept certain kinds of pathspecs and to silently ignore anything too complex, for now. We will update this in the next change to properly restrict to even complex pathspecs. The current behavior focuses on pathspecs that match paths exactly. This covers exact filenames as well as directory names used as prefixes. Pathspecs containing wildcards or magic are cleared so the path walk downloads all blobs, as before. The reason for this restriction is to allow for a faster execution by pruning the path walk to only trees that could contribute towards one of those paths as a parent directory. The test directory 'd/f/' (next to 'd/file*.txt') was prepared in a previous commit to exercise the subtlety in prefix matching. 
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- path-walk.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'path-walk.c') diff --git a/path-walk.c b/path-walk.c index 364e4cfa19..3750552978 100644 --- a/path-walk.c +++ b/path-walk.c @@ -11,6 +11,7 @@ #include "list-objects.h" #include "object.h" #include "oid-array.h" +#include "path.h" #include "prio-queue.h" #include "repository.h" #include "revision.h" @@ -206,6 +207,33 @@ static int add_tree_entries(struct path_walk_context *ctx, match != MATCHED) continue; } + if (ctx->revs->prune_data.nr) { + struct pathspec *pd = &ctx->revs->prune_data; + bool found = false; + int did_strip_suffix = strbuf_strip_suffix(&path, "/"); + + + for (int i = 0; i < pd->nr; i++) { + struct pathspec_item *item = &pd->items[i]; + + /* + * Continue if either is a directory prefix + * of the other. + */ + if (dir_prefix(path.buf, item->match) || + dir_prefix(item->match, path.buf)) { + found = true; + break; + } + } + + if (did_strip_suffix) + strbuf_addch(&path, '/'); + + /* Skip paths that do not match the prefix. */ + if (!found) + continue; + } add_path_to_list(ctx, path.buf, type, &entry.oid, !(o->flags & UNINTERESTING)); @@ -481,6 +509,17 @@ int walk_objects_by_path(struct path_walk_info *info) if (info->tags) info->revs->tag_objects = 1; + if (ctx.revs->prune_data.nr) { + /* + * Only exact prefix pathspecs are currently supported. + * Clear any wildcard or magic pathspecs to avoid + * incorrect prefix matching. + */ + if (ctx.revs->prune_data.has_wildcard || + ctx.revs->prune_data.magic) + clear_pathspec(&ctx.revs->prune_data); + } + /* Insert a single list for the root tree into the paths. 
*/ CALLOC_ARRAY(root_tree_list, 1); root_tree_list->type = OBJ_TREE; -- cgit v1.3-5-g45d5 From 3f20c21a1ceeb796e121147a53ba10d28041b1fe Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 26 Mar 2026 15:14:53 +0000 Subject: path-walk: support wildcard pathspecs for blob filtering Previously, walk_objects_by_path() silently ignored pathspecs containing wildcards or magic by clearing them. This caused all blobs to be downloaded regardless of the given pathspec. Wildcard pathspecs like "d/file.*.txt" are useful for narrowing which blobs to process (e.g., during 'git backfill'). Support wildcard pathspecs by making two changes: 1. Add an 'exact_pathspecs' flag to path_walk_context. When the pathspec has no wildcards or magic, set this flag and use the existing fast-path prefix matching in add_tree_entries(). When wildcards are present, skip that block since prefix matching cannot handle glob patterns. 2. Add a match_pathspec() check in walk_path() to filter out blobs whose full path does not match the pathspec. This provides the actual blob-level filtering for wildcard pathspecs. 
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- path-walk.c | 22 +++++++++++++--------- t/t5620-backfill.sh | 7 +++---- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'path-walk.c') diff --git a/path-walk.c b/path-walk.c index 3750552978..2aa3e7d8a4 100644 --- a/path-walk.c +++ b/path-walk.c @@ -63,6 +63,8 @@ struct path_walk_context { */ struct prio_queue path_stack; struct strset path_stack_pushed; + + unsigned exact_pathspecs:1; }; static int compare_by_type(const void *one, const void *two, void *cb_data) @@ -207,7 +209,7 @@ static int add_tree_entries(struct path_walk_context *ctx, match != MATCHED) continue; } - if (ctx->revs->prune_data.nr) { + if (ctx->revs->prune_data.nr && ctx->exact_pathspecs) { struct pathspec *pd = &ctx->revs->prune_data; bool found = false; int did_strip_suffix = strbuf_strip_suffix(&path, "/"); @@ -302,6 +304,13 @@ static int walk_path(struct path_walk_context *ctx, return 0; } + if (list->type == OBJ_BLOB && + ctx->revs->prune_data.nr && + !match_pathspec(ctx->repo->index, &ctx->revs->prune_data, + path, strlen(path), 0, + NULL, 0)) + return 0; + /* Evaluate function pointer on this data, if requested. */ if ((list->type == OBJ_TREE && ctx->info->trees) || (list->type == OBJ_BLOB && ctx->info->blobs) || @@ -510,14 +519,9 @@ int walk_objects_by_path(struct path_walk_info *info) info->revs->tag_objects = 1; if (ctx.revs->prune_data.nr) { - /* - * Only exact prefix pathspecs are currently supported. - * Clear any wildcard or magic pathspecs to avoid - * incorrect prefix matching. - */ - if (ctx.revs->prune_data.has_wildcard || - ctx.revs->prune_data.magic) - clear_pathspec(&ctx.revs->prune_data); + if (!ctx.revs->prune_data.has_wildcard && + !ctx.revs->prune_data.magic) + ctx.exact_pathspecs = 1; } /* Insert a single list for the root tree into the paths. 
*/ diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh index 52f6484ca1..c6f54ee91c 100755 --- a/t/t5620-backfill.sh +++ b/t/t5620-backfill.sh @@ -307,12 +307,11 @@ test_expect_success 'backfill with wildcard pathspec' ' git -C backfill-path rev-list --quiet --objects --missing=print HEAD >missing && test_line_count = 48 missing && - # TODO: The wildcard pathspec should limit downloaded blobs, - # but currently all blobs are downloaded. - git -C backfill-path backfill HEAD -- "d/file.*.txt" && + git -C backfill-path backfill HEAD -- "d/file.*.txt" 2>err && + test_must_be_empty err && git -C backfill-path rev-list --quiet --objects --missing=print HEAD >missing && - test_line_count = 0 missing + test_line_count = 40 missing ' test_expect_success 'backfill with --all' ' -- cgit v1.3-5-g45d5