From 0133dab75d8b15c559aa9df66134d72dce0e0476 Mon Sep 17 00:00:00 2001 From: Johan Herland Date: Mon, 11 Apr 2011 00:48:51 +0200 Subject: --dirstat-by-file: Make it faster and more correct Currently, when using --dirstat-by-file, it first does the full --dirstat analysis (using diffcore_count_changes()), and then resets 'damage' to 1, if any damage was found by diffcore_count_changes(). But --dirstat-by-file is not interested in the file damage per se. It only cares if the file changed at all. In that sense it only cares if the blob object for a file has changed. We therefore only need to compare the object names of each file pair in the diff queue and we can skip the entire --dirstat analysis and simply set 'damage' to 1 for each entry where the object name has changed. This makes --dirstat-by-file faster, and also bypasses --dirstat's practice of ignoring rearranged lines within a file. The patch also contains an added testcase verifying that --dirstat-by-file now detects changes that only rearrange lines within a file. Signed-off-by: Johan Herland Acked-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diff.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 3fd9e0c703..4f5270b8db 100644 --- a/diff.c +++ b/diff.c @@ -1539,9 +1539,27 @@ static void show_dirstat(struct diff_options *options) struct diff_filepair *p = q->queue[i]; const char *name; unsigned long copied, added, damage; + int content_changed; name = p->one->path ? p->one->path : p->two->path; + if (p->one->sha1_valid && p->two->sha1_valid) + content_changed = hashcmp(p->one->sha1, p->two->sha1); + else + content_changed = 1; + + if (DIFF_OPT_TST(options, DIRSTAT_BY_FILE)) { + /* + * In --dirstat-by-file mode, we don't really need to + * look at the actual file contents at all. 
+ * The fact that the SHA1 changed is enough for us to + * add this file to the list of results + * (with each file contributing equal damage). + */ + damage = content_changed ? 1 : 0; + goto found_damage; + } + if (DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) { diff_populate_filespec(p->one, 0); diff_populate_filespec(p->two, 0); @@ -1564,14 +1582,11 @@ static void show_dirstat(struct diff_options *options) /* * Original minus copied is the removed material, * added is the new material. They are both damages - * made to the preimage. In --dirstat-by-file mode, count - * damaged files, not damaged lines. This is done by - * counting only a single damaged line per file. + * made to the preimage. */ damage = (p->one->size - copied) + added; - if (DIFF_OPT_TST(options, DIRSTAT_BY_FILE) && damage > 0) - damage = 1; +found_damage: ALLOC_GROW(dir.files, dir.nr + 1, dir.alloc); dir.files[dir.nr].name = name; dir.files[dir.nr].changed = damage; -- cgit v1.3 From 2ff3a80334115797b8446909655e536f43900bc5 Mon Sep 17 00:00:00 2001 From: Johan Herland Date: Mon, 11 Apr 2011 00:48:52 +0200 Subject: Teach --dirstat not to completely ignore rearranged lines within a file Currently, the --dirstat analysis ignores when lines within a file are rearranged, because the "damage" calculated by show_dirstat() is 0. However, if the object name has changed, we already know that there is some damage, and it is unintuitive to claim there is _no_ damage. Teach show_dirstat() to assign a minimum amount of damage (== 1) to entries for which the analysis otherwise yields zero damage, to still represent that these files are changed, instead of saying that there is no change. Also, skip --dirstat analysis when the object names are the same (e.g. for a pure file rename). 
Signed-off-by: Johan Herland Acked-by: Linus Torvalds Signed-off-by: Junio C Hamano --- Documentation/diff-options.txt | 4 ++-- diff.c | 19 ++++++++++++++++++- t/t4013-diff-various.sh | 2 -- t/t4013/diff.diff_--dirstat_initial_rearrange | 1 + 4 files changed, 21 insertions(+), 5 deletions(-) (limited to 'diff.c') diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt index 23772d615d..7e4bd425e1 100644 --- a/Documentation/diff-options.txt +++ b/Documentation/diff-options.txt @@ -74,8 +74,8 @@ endif::git-format-patch[] counted for the parent directory, unless `--cumulative` is used. + Note that the `--dirstat` option computes the changes while ignoring -pure code movements within a file. In other words, rearranging lines -in a file is not counted as a change. +the amount of pure code movements within a file. In other words, +rearranging lines in a file is not counted as much as other changes. --dirstat-by-file[=<limit>]:: Same as `--dirstat`, but counts changed files instead of lines. diff --git a/diff.c b/diff.c index 4f5270b8db..1f44cb4237 100644 --- a/diff.c +++ b/diff.c @@ -1548,6 +1548,16 @@ static void show_dirstat(struct diff_options *options) else content_changed = 1; + if (!content_changed) { + /* + * The SHA1 has not changed, so pre-/post-content is + * identical. We can therefore skip looking at the + * file contents altogether. + */ + damage = 0; + goto found_damage; + } + if (DIFF_OPT_TST(options, DIRSTAT_BY_FILE)) { /* * In --dirstat-by-file mode, we don't really need to @@ -1556,7 +1566,7 @@ static void show_dirstat(struct diff_options *options) * add this file to the list of results * (with each file contributing equal damage). */ - damage = content_changed ? 1 : 0; + damage = 1; goto found_damage; } @@ -1583,8 +1593,15 @@ static void show_dirstat(struct diff_options *options) * Original minus copied is the removed material, * added is the new material. They are both damages * made to the preimage. 
+ * If the resulting damage is zero, we know that + * diffcore_count_changes() considers the two entries to + * be identical, but since content_changed is true, we + * know that there must have been _some_ kind of change, + * so we force all entries to have damage > 0. */ damage = (p->one->size - copied) + added; + if (!damage) + damage = 1; found_damage: ALLOC_GROW(dir.files, dir.nr + 1, dir.alloc); diff --git a/t/t4013-diff-various.sh b/t/t4013-diff-various.sh index 6428a905ab..93a6f20871 100755 --- a/t/t4013-diff-various.sh +++ b/t/t4013-diff-various.sh @@ -300,9 +300,7 @@ diff --no-index --name-status -- dir2 dir diff --no-index dir dir3 diff master master^ side diff --dirstat master~1 master~2 -# --dirstat doesn't notice changes that simply rearrange existing lines diff --dirstat initial rearrange -# ...but --dirstat-by-file does notice changes that only rearrange lines diff --dirstat-by-file initial rearrange EOF diff --git a/t/t4013/diff.diff_--dirstat_initial_rearrange b/t/t4013/diff.diff_--dirstat_initial_rearrange index fb2e17dd2e..5fb02c13bc 100644 --- a/t/t4013/diff.diff_--dirstat_initial_rearrange +++ b/t/t4013/diff.diff_--dirstat_initial_rearrange @@ -1,2 +1,3 @@ $ git diff --dirstat initial rearrange + 100.0% dir/ $ -- cgit v1.3 From 2ca86714703f81f9dd5dfb31f8d97a8a0089634d Mon Sep 17 00:00:00 2001 From: Johan Herland Date: Tue, 12 Apr 2011 11:24:34 +0200 Subject: --dirstat: In case of renames, use target filename instead of source filename This changes --dirstat analysis to count "damage" toward the target filename, rather than the source filename. For renames within a directory, this won't matter to the final output, but when moving files between directories, the output now lists the target directory rather than the source directory. 
Signed-off-by: Johan Herland Acked-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 1f44cb4237..abd9cd5f33 100644 --- a/diff.c +++ b/diff.c @@ -1541,7 +1541,7 @@ static void show_dirstat(struct diff_options *options) unsigned long copied, added, damage; int content_changed; - name = p->one->path ? p->one->path : p->two->path; + name = p->two->path ? p->two->path : p->one->path; if (p->one->sha1_valid && p->two->sha1_valid) content_changed = hashcmp(p->one->sha1, p->two->sha1); -- cgit v1.3