From 22d87333e5ee8871a9d42a15834ad91168a95928 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 13 Aug 2018 04:33:00 -0700 Subject: linear-assignment: a function to solve least-cost assignment problems The problem solved by the code introduced in this commit goes like this: given two sets of items, and a cost matrix which says how much it "costs" to assign any given item of the first set to any given item of the second, assign all items (except when the sets have different size) in the cheapest way. We use the Jonker-Volgenant algorithm to solve the assignment problem to answer questions such as: given two different versions of a topic branch (or iterations of a patch series), what is the best pairing of commits/patches between the different versions? Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'Makefile') diff --git a/Makefile b/Makefile index bc4fc8eeab..1af719b448 100644 --- a/Makefile +++ b/Makefile @@ -870,6 +870,7 @@ LIB_OBJS += gpg-interface.o LIB_OBJS += graph.o LIB_OBJS += grep.o LIB_OBJS += hashmap.o +LIB_OBJS += linear-assignment.o LIB_OBJS += help.o LIB_OBJS += hex.o LIB_OBJS += ident.o -- cgit v1.3 From 348ae56cb2266d3294611112ae0368386124d720 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 13 Aug 2018 04:33:02 -0700 Subject: Introduce `range-diff` to compare iterations of a topic branch This command does not do a whole lot so far, apart from showing a usage that is oddly similar to that of `git tbdiff`. And for a good reason: the next commits will turn `range-branch` into a full-blown replacement for `tbdiff`. At this point, we ignore tbdiff's color options, as they will all be implemented later using diff_options. Since f318d739159 (generate-cmds.sh: export all commands to command-list.h, 2018-05-10), every new command *requires* a man page to build right away, so let's also add a blank man page, too. Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- .gitignore | 1 + Documentation/git-range-diff.txt | 10 ++++++++++ Makefile | 1 + builtin.h | 1 + builtin/range-diff.c | 25 +++++++++++++++++++++++++ command-list.txt | 1 + git.c | 1 + 7 files changed, 40 insertions(+) create mode 100644 Documentation/git-range-diff.txt create mode 100644 builtin/range-diff.c (limited to 'Makefile') diff --git a/.gitignore b/.gitignore index 3284a1e9b1..cc0ad74b4b 100644 --- a/.gitignore +++ b/.gitignore @@ -113,6 +113,7 @@ /git-pull /git-push /git-quiltimport +/git-range-diff /git-read-tree /git-rebase /git-rebase--am diff --git a/Documentation/git-range-diff.txt b/Documentation/git-range-diff.txt new file mode 100644 index 0000000000..49f717db8b --- /dev/null +++ b/Documentation/git-range-diff.txt @@ -0,0 +1,10 @@ +git-range-diff(1) +================= + +NAME +---- +git-range-diff - Compare two commit ranges (e.g. two versions of a branch) + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index 1af719b448..7ff7eba42b 100644 --- a/Makefile +++ b/Makefile @@ -1063,6 +1063,7 @@ BUILTIN_OBJS += builtin/prune-packed.o BUILTIN_OBJS += builtin/prune.o BUILTIN_OBJS += builtin/pull.o BUILTIN_OBJS += builtin/push.o +BUILTIN_OBJS += builtin/range-diff.o BUILTIN_OBJS += builtin/read-tree.o BUILTIN_OBJS += builtin/rebase--helper.o BUILTIN_OBJS += builtin/receive-pack.o diff --git a/builtin.h b/builtin.h index 0362f1ce25..99206df4bd 100644 --- a/builtin.h +++ b/builtin.h @@ -201,6 +201,7 @@ extern int cmd_prune(int argc, const char **argv, const char *prefix); extern int cmd_prune_packed(int argc, const char **argv, const char *prefix); extern int cmd_pull(int argc, const char **argv, const char *prefix); extern int cmd_push(int argc, const char **argv, const char *prefix); +extern int cmd_range_diff(int argc, const char **argv, const char *prefix); extern int cmd_read_tree(int argc, const char **argv, const char *prefix); extern int cmd_rebase__helper(int argc, const char **argv, const char *prefix); extern int cmd_receive_pack(int argc, const char **argv, const char *prefix); diff --git a/builtin/range-diff.c b/builtin/range-diff.c new file mode 100644 index 0000000000..36788ea4f2 --- /dev/null +++ b/builtin/range-diff.c @@ -0,0 +1,25 @@ +#include "cache.h" +#include "builtin.h" +#include "parse-options.h" + +static const char * const builtin_range_diff_usage[] = { +N_("git range-diff [] .. .."), +N_("git range-diff [] ..."), +N_("git range-diff [] "), +NULL +}; + +int cmd_range_diff(int argc, const char **argv, const char *prefix) +{ + int creation_factor = 60; + struct option options[] = { + OPT_INTEGER(0, "creation-factor", &creation_factor, + N_("Percentage by which creation is weighted")), + OPT_END() + }; + + argc = parse_options(argc, argv, NULL, options, + builtin_range_diff_usage, 0); + + return 0; +} diff --git a/command-list.txt b/command-list.txt index e1c26c1bb7..a9dda3b8af 100644 --- a/command-list.txt +++ b/command-list.txt @@ -139,6 +139,7 @@ git-prune-packed plumbingmanipulators git-pull mainporcelain remote git-push mainporcelain remote git-quiltimport foreignscminterface +git-range-diff mainporcelain git-read-tree plumbingmanipulators git-rebase mainporcelain history git-receive-pack synchelpers diff --git a/git.c b/git.c index fc7d15d549..5b48cac3a9 100644 --- a/git.c +++ b/git.c @@ -520,6 +520,7 @@ static struct cmd_struct commands[] = { { "prune-packed", cmd_prune_packed, RUN_SETUP }, { "pull", cmd_pull, RUN_SETUP | NEED_WORK_TREE }, { "push", cmd_push, RUN_SETUP }, + { "range-diff", cmd_range_diff, RUN_SETUP | USE_PAGER }, { "read-tree", cmd_read_tree, RUN_SETUP | SUPPORT_SUPER_PREFIX}, { "rebase--helper", cmd_rebase__helper, RUN_SETUP | NEED_WORK_TREE }, { "receive-pack", cmd_receive_pack }, -- cgit v1.3 From d9c66f0b5bfdf3fc2898b7baad1bb9a72bfd7bf7 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 13 Aug 2018 04:33:04 -0700 Subject: range-diff: first rudimentary implementation At this stage, `git range-diff` can determine corresponding commits of two related commit ranges. This makes use of the recently introduced implementation of the linear assignment algorithm. The core of this patch is a straight port of the ideas of tbdiff, the apparently dormant project at https://github.com/trast/tbdiff. The output does not at all match `tbdiff`'s output yet, as this patch really concentrates on getting the patch matching part right. Note: due to differences in the diff algorithm (`tbdiff` uses the Python module `difflib`, Git uses its xdiff fork), the cost matrix calculated by `range-diff` is different (but very similar) to the one calculated by `tbdiff`. Therefore, it is possible that they find different matching commits in corner cases (e.g. when a patch was split into two patches of roughly equal length). Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- Makefile | 1 + builtin/range-diff.c | 45 +++++++- range-diff.c | 311 +++++++++++++++++++++++++++++++++++++++++++++++++++ range-diff.h | 7 ++ 4 files changed, 363 insertions(+), 1 deletion(-) create mode 100644 range-diff.c create mode 100644 range-diff.h (limited to 'Makefile') diff --git a/Makefile b/Makefile index 7ff7eba42b..72f16882e6 100644 --- a/Makefile +++ b/Makefile @@ -925,6 +925,7 @@ LIB_OBJS += progress.o LIB_OBJS += prompt.o LIB_OBJS += protocol.o LIB_OBJS += quote.o +LIB_OBJS += range-diff.o LIB_OBJS += reachable.o LIB_OBJS += read-cache.o LIB_OBJS += reflog-walk.o diff --git a/builtin/range-diff.c b/builtin/range-diff.c index 36788ea4f2..94c1f362cc 100644 --- a/builtin/range-diff.c +++ b/builtin/range-diff.c @@ -1,6 +1,7 @@ #include "cache.h" #include "builtin.h" #include "parse-options.h" +#include "range-diff.h" static const char * const builtin_range_diff_usage[] = { N_("git range-diff [] .. .."), @@ -17,9 +18,51 @@ int cmd_range_diff(int argc, const char **argv, const char *prefix) N_("Percentage by which creation is weighted")), OPT_END() }; + int res = 0; + struct strbuf range1 = STRBUF_INIT, range2 = STRBUF_INIT; argc = parse_options(argc, argv, NULL, options, builtin_range_diff_usage, 0); - return 0; + if (argc == 2) { + if (!strstr(argv[0], "..")) + die(_("no .. in range: '%s'"), argv[0]); + strbuf_addstr(&range1, argv[0]); + + if (!strstr(argv[1], "..")) + die(_("no .. in range: '%s'"), argv[1]); + strbuf_addstr(&range2, argv[1]); + } else if (argc == 3) { + strbuf_addf(&range1, "%s..%s", argv[0], argv[1]); + strbuf_addf(&range2, "%s..%s", argv[0], argv[2]); + } else if (argc == 1) { + const char *b = strstr(argv[0], "..."), *a = argv[0]; + int a_len; + + if (!b) { + error(_("single arg format must be symmetric range")); + usage_with_options(builtin_range_diff_usage, options); + } + + a_len = (int)(b - a); + if (!a_len) { + a = "HEAD"; + a_len = strlen(a); + } + b += 3; + if (!*b) + b = "HEAD"; + strbuf_addf(&range1, "%s..%.*s", b, a_len, a); + strbuf_addf(&range2, "%.*s..%s", a_len, a, b); + } else { + error(_("need two commit ranges")); + usage_with_options(builtin_range_diff_usage, options); + } + + res = show_range_diff(range1.buf, range2.buf, creation_factor); + + strbuf_release(&range1); + strbuf_release(&range2); + + return res; } diff --git a/range-diff.c b/range-diff.c new file mode 100644 index 0000000000..15d418afa6 --- /dev/null +++ b/range-diff.c @@ -0,0 +1,311 @@ +#include "cache.h" +#include "range-diff.h" +#include "string-list.h" +#include "run-command.h" +#include "argv-array.h" +#include "hashmap.h" +#include "xdiff-interface.h" +#include "linear-assignment.h" + +struct patch_util { + /* For the search for an exact match */ + struct hashmap_entry e; + const char *diff, *patch; + + int i; + int diffsize; + size_t diff_offset; + /* the index of the matching item in the other branch, or -1 */ + int matching; + struct object_id oid; +}; + +/* + * Reads the patches into a string list, with the `util` field being populated + * as struct object_id (will need to be free()d). + */ +static int read_patches(const char *range, struct string_list *list) +{ + struct child_process cp = CHILD_PROCESS_INIT; + FILE *in; + struct strbuf buf = STRBUF_INIT, line = STRBUF_INIT; + struct patch_util *util = NULL; + int in_header = 1; + + argv_array_pushl(&cp.args, "log", "--no-color", "-p", "--no-merges", + "--reverse", "--date-order", "--decorate=no", + "--no-abbrev-commit", range, + NULL); + cp.out = -1; + cp.no_stdin = 1; + cp.git_cmd = 1; + + if (start_command(&cp)) + return error_errno(_("could not start `log`")); + in = fdopen(cp.out, "r"); + if (!in) { + error_errno(_("could not read `log` output")); + finish_command(&cp); + return -1; + } + + while (strbuf_getline(&line, in) != EOF) { + const char *p; + + if (skip_prefix(line.buf, "commit ", &p)) { + if (util) { + string_list_append(list, buf.buf)->util = util; + strbuf_reset(&buf); + } + util = xcalloc(sizeof(*util), 1); + if (get_oid(p, &util->oid)) { + error(_("could not parse commit '%s'"), p); + free(util); + string_list_clear(list, 1); + strbuf_release(&buf); + strbuf_release(&line); + fclose(in); + finish_command(&cp); + return -1; + } + util->matching = -1; + in_header = 1; + continue; + } + + if (starts_with(line.buf, "diff --git")) { + in_header = 0; + strbuf_addch(&buf, '\n'); + if (!util->diff_offset) + util->diff_offset = buf.len; + strbuf_addbuf(&buf, &line); + } else if (in_header) { + if (starts_with(line.buf, "Author: ")) { + strbuf_addbuf(&buf, &line); + strbuf_addstr(&buf, "\n\n"); + } else if (starts_with(line.buf, " ")) { + strbuf_addbuf(&buf, &line); + strbuf_addch(&buf, '\n'); + } + continue; + } else if (starts_with(line.buf, "@@ ")) + strbuf_addstr(&buf, "@@"); + else if (!line.buf[0] || starts_with(line.buf, "index ")) + /* + * A completely blank (not ' \n', which is context) + * line is not valid in a diff. We skip it + * silently, because this neatly handles the blank + * separator line between commits in git-log + * output. + * + * We also want to ignore the diff's `index` lines + * because they contain exact blob hashes in which + * we are not interested. + */ + continue; + else + strbuf_addbuf(&buf, &line); + + strbuf_addch(&buf, '\n'); + util->diffsize++; + } + fclose(in); + strbuf_release(&line); + + if (util) + string_list_append(list, buf.buf)->util = util; + strbuf_release(&buf); + + if (finish_command(&cp)) + return -1; + + return 0; +} + +static int patch_util_cmp(const void *dummy, const struct patch_util *a, + const struct patch_util *b, const char *keydata) +{ + return strcmp(a->diff, keydata ? keydata : b->diff); +} + +static void find_exact_matches(struct string_list *a, struct string_list *b) +{ + struct hashmap map; + int i; + + hashmap_init(&map, (hashmap_cmp_fn)patch_util_cmp, NULL, 0); + + /* First, add the patches of a to a hash map */ + for (i = 0; i < a->nr; i++) { + struct patch_util *util = a->items[i].util; + + util->i = i; + util->patch = a->items[i].string; + util->diff = util->patch + util->diff_offset; + hashmap_entry_init(util, strhash(util->diff)); + hashmap_add(&map, util); + } + + /* Now try to find exact matches in b */ + for (i = 0; i < b->nr; i++) { + struct patch_util *util = b->items[i].util, *other; + + util->i = i; + util->patch = b->items[i].string; + util->diff = util->patch + util->diff_offset; + hashmap_entry_init(util, strhash(util->diff)); + other = hashmap_remove(&map, util, NULL); + if (other) { + if (other->matching >= 0) + BUG("already assigned!"); + + other->matching = i; + util->matching = other->i; + } + } + + hashmap_free(&map, 0); +} + +static void diffsize_consume(void *data, char *line, unsigned long len) +{ + (*(int *)data)++; +} + +static int diffsize(const char *a, const char *b) +{ + xpparam_t pp = { 0 }; + xdemitconf_t cfg = { 0 }; + mmfile_t mf1, mf2; + int count = 0; + + mf1.ptr = (char *)a; + mf1.size = strlen(a); + mf2.ptr = (char *)b; + mf2.size = strlen(b); + + cfg.ctxlen = 3; + if (!xdi_diff_outf(&mf1, &mf2, diffsize_consume, &count, &pp, &cfg)) + return count; + + error(_("failed to generate diff")); + return COST_MAX; +} + +static void get_correspondences(struct string_list *a, struct string_list *b, + int creation_factor) +{ + int n = a->nr + b->nr; + int *cost, c, *a2b, *b2a; + int i, j; + + ALLOC_ARRAY(cost, st_mult(n, n)); + ALLOC_ARRAY(a2b, n); + ALLOC_ARRAY(b2a, n); + + for (i = 0; i < a->nr; i++) { + struct patch_util *a_util = a->items[i].util; + + for (j = 0; j < b->nr; j++) { + struct patch_util *b_util = b->items[j].util; + + if (a_util->matching == j) + c = 0; + else if (a_util->matching < 0 && b_util->matching < 0) + c = diffsize(a_util->diff, b_util->diff); + else + c = COST_MAX; + cost[i + n * j] = c; + } + + c = a_util->matching < 0 ? + a_util->diffsize * creation_factor / 100 : COST_MAX; + for (j = b->nr; j < n; j++) + cost[i + n * j] = c; + } + + for (j = 0; j < b->nr; j++) { + struct patch_util *util = b->items[j].util; + + c = util->matching < 0 ? + util->diffsize * creation_factor / 100 : COST_MAX; + for (i = a->nr; i < n; i++) + cost[i + n * j] = c; + } + + for (i = a->nr; i < n; i++) + for (j = b->nr; j < n; j++) + cost[i + n * j] = 0; + + compute_assignment(n, n, cost, a2b, b2a); + + for (i = 0; i < a->nr; i++) + if (a2b[i] >= 0 && a2b[i] < b->nr) { + struct patch_util *a_util = a->items[i].util; + struct patch_util *b_util = b->items[a2b[i]].util; + + a_util->matching = a2b[i]; + b_util->matching = i; + } + + free(cost); + free(a2b); + free(b2a); +} + +static const char *short_oid(struct patch_util *util) +{ + return find_unique_abbrev(&util->oid, DEFAULT_ABBREV); +} + +static void output(struct string_list *a, struct string_list *b) +{ + int i; + + for (i = 0; i < b->nr; i++) { + struct patch_util *util = b->items[i].util, *prev; + + if (util->matching < 0) + printf("-: -------- > %d: %s\n", + i + 1, short_oid(util)); + else { + prev = a->items[util->matching].util; + printf("%d: %s ! %d: %s\n", + util->matching + 1, short_oid(prev), + i + 1, short_oid(util)); + } + } + + for (i = 0; i < a->nr; i++) { + struct patch_util *util = a->items[i].util; + + if (util->matching < 0) + printf("%d: %s < -: --------\n", + i + 1, short_oid(util)); + } +} + +int show_range_diff(const char *range1, const char *range2, + int creation_factor) +{ + int res = 0; + + struct string_list branch1 = STRING_LIST_INIT_DUP; + struct string_list branch2 = STRING_LIST_INIT_DUP; + + if (read_patches(range1, &branch1)) + res = error(_("could not parse log for '%s'"), range1); + if (!res && read_patches(range2, &branch2)) + res = error(_("could not parse log for '%s'"), range2); + + if (!res) { + find_exact_matches(&branch1, &branch2); + get_correspondences(&branch1, &branch2, creation_factor); + output(&branch1, &branch2); + } + + string_list_clear(&branch1, 1); + string_list_clear(&branch2, 1); + + return res; +} diff --git a/range-diff.h b/range-diff.h new file mode 100644 index 0000000000..7b6eef303f --- /dev/null +++ b/range-diff.h @@ -0,0 +1,7 @@ +#ifndef RANGE_DIFF_H +#define RANGE_DIFF_H + +int show_range_diff(const char *range1, const char *range2, + int creation_factor); + +#endif -- cgit v1.3