diff options
| author | Shulhan <ms@kilabit.info> | 2026-04-13 19:14:38 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-04-13 19:14:38 +0700 |
| commit | 2d3a245cbba808e58699efcce263cc380491f921 (patch) | |
| tree | 9e52a801972bab3f6523eb0bfe13d2d800519123 | |
| parent | ece7d12c4ed13e7e2b9a5355dc4579afa55f8711 (diff) | |
| download | kilabit.info-2d3a245cbba808e58699efcce263cc380491f921.tar.xz | |
journal/2026: new journal "Implementing Unified Diff"
I am writing this journal to understand the format in details
while working on the implementations.
| -rw-r--r-- | REUSE.toml | 3 | ||||
| -rw-r--r-- | _content/journal/2026/index.adoc | 4 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/Makefile | 18 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/add.uni | 6 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/del.new | 2 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/del.old | 3 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/del.uni | 6 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/index.adoc | 536 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/mixed.new | 14 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/mixed.old | 16 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/mixed.uni | 22 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/multi_add.uni | 11 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/multi_del.new | 8 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/multi_del.old | 10 | ||||
| -rw-r--r-- | _content/journal/2026/unified_diff/multi_del.uni | 11 |
15 files changed, 670 insertions, 0 deletions
@@ -16,6 +16,9 @@ path = [ "**/*.png", "**/*.svg", "**/*.webp", + "**/*.new", + "**/*.old", + "**/*.uni", ] SPDX-License-Identifier = "CC-BY-SA-4.0" SPDX-FileCopyrightText = "2019-2026 M. Shulhan <ms@kilabit.info>" diff --git a/_content/journal/2026/index.adoc b/_content/journal/2026/index.adoc index 531ac52..c5f583b 100644 --- a/_content/journal/2026/index.adoc +++ b/_content/journal/2026/index.adoc @@ -3,6 +3,10 @@ === 2026 +link:/journal/2026/unified_diff[Implementing Unified Diff^]. +The output is simple, but generating it is hard. + + link:/journal/2026/reducing_idle_services/[Reducing Idle Services with SystemD^]. In this journal, we will take a look on how to minimize idle services diff --git a/_content/journal/2026/unified_diff/Makefile b/_content/journal/2026/unified_diff/Makefile new file mode 100644 index 0000000..47ca454 --- /dev/null +++ b/_content/journal/2026/unified_diff/Makefile @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: CC-BY-SA-4.0 +// SPDX-FileCopyrightText: 2026 M. Shulhan <ms@kilabit.info> + +.PHONY: all +all: del.uni \ + multi_del.uni \ + add.uni \ + multi_add.uni \ + mixed.uni + +%.uni: %.old %.new + -diff -u --label="old" --label="new" $^ > $@ + +add.uni: del.new del.old + -diff -u --label="old" --label="new" $^ > $@ + +multi_add.uni: multi_del.new multi_del.old + -diff -u --label="old" --label="new" $^ > $@ diff --git a/_content/journal/2026/unified_diff/add.uni b/_content/journal/2026/unified_diff/add.uni new file mode 100644 index 0000000..70bcf66 --- /dev/null +++ b/_content/journal/2026/unified_diff/add.uni @@ -0,0 +1,6 @@ +--- old ++++ new +@@ -1,2 +1,3 @@ + a ++b + c diff --git a/_content/journal/2026/unified_diff/del.new b/_content/journal/2026/unified_diff/del.new new file mode 100644 index 0000000..0f7bc76 --- /dev/null +++ b/_content/journal/2026/unified_diff/del.new @@ -0,0 +1,2 @@ +a +c diff --git a/_content/journal/2026/unified_diff/del.old b/_content/journal/2026/unified_diff/del.old new file mode 100644 index 0000000..de98044 --- /dev/null +++ b/_content/journal/2026/unified_diff/del.old @@ -0,0 +1,3 @@ +a +b +c diff --git a/_content/journal/2026/unified_diff/del.uni b/_content/journal/2026/unified_diff/del.uni new file mode 100644 index 0000000..28a260b --- /dev/null +++ b/_content/journal/2026/unified_diff/del.uni @@ -0,0 +1,6 @@ +--- old ++++ new +@@ -1,3 +1,2 @@ + a +-b + c diff --git a/_content/journal/2026/unified_diff/index.adoc b/_content/journal/2026/unified_diff/index.adoc new file mode 100644 index 0000000..9e01f94 --- /dev/null +++ b/_content/journal/2026/unified_diff/index.adoc @@ -0,0 +1,536 @@ +// SPDX-License-Identifier: CC-BY-SA-4.0 +// SPDX-FileCopyrightText: 2026 M. Shulhan <ms@kilabit.info> + += Implementing Unified Diff +Shulhan <ms@kilabit.info> +13 April 2026 +:sectanchors: +:toc: + +[blockquote] +The unified output format is a variation on the context format that is more +compact because it omits redundant context lines +-- https://www.gnu.org/software/diffutils/manual/html_node/Unified-Format.html[GNU +diffutils manual]. + +We commonly see the output of unified diff when running `diff -u` command or +`git diff` command (actually it is from `git diff \-\-patch`, only that the +`\-\-patch` option is the default one). +In this journal, we will explore the format of unified diff and implements +it in my +https://pkg.go.dev/git.sr.ht/~shulhan/pakakeh.go/lib/text/diff[Go diff package^]. + +I am writing this journal to understand the format in details while working +on the implementations. + + +== The format + +The closest official document that describe the format is from +https://www.gnu.org/software/diffutils/manual/html_node/Detailed-Unified.html[GNU +diffutils manual]. + +Sample from the manual, + +---- +--- lao 2002-02-21 23:30:39.942229878 -0800 ++++ tzu 2002-02-21 23:30:50.442260588 -0800 +@@ -1,7 +1,6 @@ +-The Way that can be told of is not the eternal Way; +-The name that can be named is not the eternal name. + The Nameless is the origin of Heaven and Earth; +-The Named is the mother of all things. ++The named is the mother of all things. ++ + Therefore let there always be non-being, + so we may see their subtlety, + And let there always be being, +@@ -9,3 +8,6 @@ + The two are the same, + But after they are produced, + they have different names. ++They both may be called deep and profound. ++Deeper and more profound, ++The door of all subtleties! +---- + +The first two lines are self explanatory. +The `--- lao ...` means the old (or original) file name followed by +modification time, and the `\+\+\+ tzu ...` means the new file name, also +followed by modification time. +In the `git diff` format, the timestamp output is omitted. + +The lines after "@@ ... @@" is called it +https://www.gnu.org/software/diffutils/manual/html_node/Hunks.html[_hunk_^]. +Inside the hunk, the line that started with minus "-" is the one that is +removed from the old file, and the line started with plus "\+" is the +one that added in new file. +Lines that does not start with "-" or "+" are unmodified lines (its actually +prefixed with single space " "). +The minimum number of unmodified lines to be added to unified format usually +three (the manual said that this is to make the `patch` program works). +We also call the unmodified lines to be context lines. + +The line with "@@ ... @@" is the one that quite hard to decipher. +The manual only mention it as + + @@ from-file-line-numbers to-file-line-numbers @@ + +The `from-file-line-numbers` or `from-file-line-numbers` can written as + + start + +or + + start,count + +Where `from-file` prefixed with "-" and `to-file` prefixed with "+". + +https://www.gnu.org/software/diffutils/manual/html_node/Detailed-Unified.html[Taking +from manual^], +lets decipher the meanings of conditions for those line-numbers. + +> _If a hunk contains just one line, only its start line number appears._ + +This condition only applicable when we run diff with `0` context, `diff +--unified=0`, which is possible but negates the previous requirements that +it needs at least three context lines; or when old and new files only +contains one line. + +The "hunk contains just one line" can be read as "hunk contains only one +modified line", because the minimum output of hunk is actually two lines. + +The output is most likely like this: +---- +@@ -1 +1 @@ +-old line ++new new +---- + +> _An empty hunk is considered to start at the line that follows the hunk._ + +Why would an empty hunk needs to be printed? +I think this part of information is unnecessary. + +> _If a hunk and its context contain two or more lines, its line numbers look +like ‘start,count’._ +_Otherwise only its end line number appears._ + +What is the value of `start`? +What is the value of `count`? + +What if the context lines overlap between hunks? +To answer this questions we need to run and inspect the diff output on +multiple conditions. + +== Sample of diff outputs + +=== Case: Deletion only + +Given the following old and new file, + +[cols="a,a"] +|=== +| Old file | New file +| +---- +include::del.old[] +---- +| +---- +include::del.new[] +---- +|=== + +The diff output, +---- +include::del.uni[] +---- + +The `-1,3` means, from the old file, the hunk start from line 1 with 3 +lines affected/included after. +The `+1,2` means, from the new file, the hunk start from line 1 with 2 +lines affected/included after. + +The next case is when multiple deletion occurs after each others, +the length of new file become shorten. + +[cols="a,a"] +|=== +| Old file | New file +| +---- +include::multi_del.old[] +---- +Contains 10 lines. +| +---- +include::multi_del.new[] +---- +Contains 8 lines, 2 lines deleted from old file. +|=== + +The diff output, +---- +include::multi_del.uni[] +---- + +The `-3,8`, from the old file, the hunk start from line 3 until lines 10 +(counted as 8 including line 3 itself). +The `+3,6`, from the perspective of new file, the hunk start from line 3 +until line 8 (counted as 6 including 3 itself). + + +=== Case: Addition only + +For the addition we can reverse the input, new become old, old become new. + +The diff output for single addition, +---- +include::add.uni[] +---- + +The diff output for multi additions, +---- +include::multi_add.uni[] +---- + +Well, I think I am overthinking it. +While implementing the unified output, I see the line numbers, and my +thought is that the start and count based on diff on line numbers. +Turns out the count is the total number of lines. + +For example, in `-3,6`, the 6 is the total number of unmodified lines and +removed lines in hunk. +In `+3,8`, the 8 is total number of unmodified lines and added lines in +hunk. + +Lets see if my theory is right using the mixed case below. + +=== Case: Mixed + +In this case we inspect the diff for deletions and additions. + +[cols="a,a"] +|=== +| Old file | New file +| +---- +include::mixed.old[] +---- +Contains 16 lines. +| +---- +include::mixed.new[] +---- +Contains 14 lines, 5 lines deleted from old file `[d e g o p]`, and 3 new +lines added on new file `[E O P]`. +|=== + +The diff output, +---- +include::mixed.uni[] +---- + +Seems like my assumption is correct for the value of `count`. +In `-1,10`, 10 is `[a b c -d -e f -g h i j]` and in `+1,8`, 8 is +`[a b c +E f h i j]`. +Next, in `-12,5`, 5 is `[l m n -o -p]` and in `+10,5`, 5 is `[l m n +O +P]`. + +The `start` number is little bit tricky. +In the second hunk, `l` start from line 12 in old file and from line 10 in +new file. + +When processing the diff between tow files, one needs to record the other +line number when matching lines found, so the diff algorithm can derive the +`start` number based on the context line. + + +== Implementations + +Assume that we already have two array of lines: `$DELS` is an array that +contains deleted lines from the old file and `$ADDS` an array that contains +added lines from the new file. + +Using the sample from Mixed case, the `$DELS` and `$ADDS` would looks like +these, + +[cols="a,a"] +|=== +| DELS | ADDS +| +---- +Index Line Num. Value +0 4 d +1 5 e +2 7 g +3 15 o +4 16 p +---- +| +---- +Line Num. Value +4 E +13 O +14 P +---- +(We did not to show the index for this part). +|=== + +The algorithm that I use to implement the unified diff, + +1) Merge `$ADDS` into `$DELS` become one array: `$UNIFIED`. + +2) From the `$UNIFIED` array, generate `$HUNKS`. + +3) For each `$HUNK` in `$HUNKS`, + +3.1) add context lines before, after, and/or between changes. + +3.2) complete the hunk by generating label `@@ ... @@` + +That is the simplified algorithm, the next section explain it in details. + + +=== Merging algorithm + +The detailed steps on merging `$ADDS` and `$DELS` into `$UNIFIED`, + +1) Copy all items of `$DELS` into `$UNIFIED` + +2) For each `$line` in `$ADDS`, + +3.1) Find the index in `$UNIFIED` where the `$line` number is, + +3.1.a) greater or equal than line number in `$UNIFIED` AND + +3.1.b) not in consecutive lines. + +Here, in consecutive line means previous line number + 1 is equal to +current line number. +If line is in consecutive orders, continue to the next line and stop when it +found not consecutive lines. + +4) Insert `$line` into index + +There are two conditions here: 3.1.a) greater or equal than; +and 3.1.b) not consecutive line numbers. + +**Examples** + +For condition 3.1.a): Given added line `{4 E}`, we search the insert +index from `0` in `$UNIFIED` + +- index 0: +-- is line `{4 d}` >= `{4 E}`? Yes. One condition match. +-- Since this is the first line, we must continue to the next line to + determine if the next line is consecutive or not. +- index 1: +-- is line `{5 e}` >= `{4 E}`? Yes +-- is line 5 continued from 4? Yes (failed on second condition). + Continue to the next line. + +- index 2: +-- is line `{7 g}` >= `{4 E}`? Yes. +-- is 7 continued from 5? No. Stop here, we found the insert index. + +The insert index is 2. +Insert the `{4 E}` at index 2 in unified array and shift the previous values +to the right. +Now the `$UNIFIED` array looks like these, + +---- +0: -{4 d} +1: -{5 e} +2: +{4 E} +3: -{7 g} +4: -{15 o} +5: -{16 p} +---- + +For condition 3.1.b): Continue with the next added line `{13 O}`, we +search the insert index by iterating unified array start from previous +insert index plus one (2 + 1), 3. + +- index 3: +-- is line number `{7 g}` >= `{13 O}`? No. Next. +- index 4: +-- is line number `{15 o}` >= `{13 O}`? Yes. +-- is 15 after 8? Yes. Continue to the next line to check if its consecutive + or not. +- index 5: +-- is line number `{16 p}` >= `{13 O`}? Yes. +-- is 16 after 15? Yes. Continue to the next line. + +Since there is no more lines, we found the insert index is at 6 or at the +end of unified array. + +Continue processing the line in `$ADDS` until no more lines left. +The final `$UNIFIED` array looks like these after merging, + +---- +0: -{4 d} +1: -{5 e} +2: +{4 E} +3: -{7 g} +4: -{15 o} +5: -{16 p} +6: +{13 O} +7: +{14 P} +---- + + +=== Generating hunk + +The hunk algorithm split the `$UNIFIED` array based on gap. +Gap is the number of context `ncontext` lines multiplied by 2. + +For example, if the `ncontext` is 3 then the gap would be 3*2 = 6 (three +lines before and three lines after). + +Start from index x=1 from `$UNIFIED`, + +- (line[x].Number - line[x-1].Number) < 6 +- 5-4 > 6? No. +- 4-5 > 6? No. +- 7-4 > 6? No. +- 15-7 > 6? Yes + +We found the first hunk, which is index 0 until 3. +---- +0: -{4 d} +1: -{5 e} +2: +{4 E} +3: -{7 g} +---- +And the second hunk is from index 4 until 7. +---- +4: -{15 o} +5: -{16 p} +6: +{13 O} +7: +{14 P} +---- + + +=== Adding context lines + +Once we have the list of hunk, we can add `n` number of context lines +before, after, and/or between the changes. + +The context lines is added from old or new file based on the kind of line +changes, deletion or addition. + +Given in the first hunk we got earlier, +---- +0: -{4 d} +1: -{5 e} +2: +{4 E} +3: -{7 g} +---- +The context lines before line -4 is added from old file, because it is a +deletion. +The context lines between line +4 and -7 is added from new file, because `{4 +E}` is an addition. +The context lines after `-{7 g}` is added from old file, because it is a +deletion. + +Given the second hunk, +---- +4: -{15 o} +5: -{16 p} +6: +{13 O} +7: +{14 P} +---- +The context lines before `-{15 o}` is added from old file. +The context lines after `+{14 P}` is added from new file. +There is no context lines for in-between changes here. + +The result of adding context lines is like what we see earlier in the output +of Mixed case. Result for first hunk, +---- + 0: {1 a} + 1: {2 b} + 2: {3 c} + 3: -{4 d} + 4: -{5 e} + 5: +{4 E} + 6: {6 f} + 7: -{7 g} + 8: {8 h} + 9: {9 i} +10: {10 j} +---- + +Result for the second hunk, +---- +0: {12 l} +1: {13 m} +2: {14 n} +4: -{15 o} +5: -{16 p} +6: +{13 O} +7: +{14 P} +---- + + +=== Labeling hunk + +The hunk label is generated based on the line number of the first change in +hunk and the count of changes in hunk, in the following format, + + @@ -{oldNumber},{oldCount} +{newNumber},{newCount} @@ + +for two or more deletions+additions in the hunk, or + + @@ -{oldNumber} +{newNumber} @@ + +for single line deletion/addition in the hunk. + +The `oldNumber` is the line number of the first line in hunk from the old +file. +The `oldCount` is the total of unmodified lines plus deletions in the hunk. + +The `newNumber` is the line number of the first line in hunk from the new +file. +The `newCount` is the total of unmodified lines plus additions in the hunk. + +Lets use the second hunk for example. + +---- +Old line New line +0: {12 l} {10 l} +1: {13 m} {11 m} +2: {14 n} {12 n} +4: -{15 o} {13 O} +5: -{16 p} {14 P} +6: +{13 O} +7: +{14 P} +---- + +The `oldNumber`, the first line `{l}` based on perspective of old file, is +12; and the `oldCount` is `[l m n -o -p]` = 5. + +The `newNumber`, the first line `{l}` based on perspective of new file, is +10; and the `newCount` is `[l m n +O +P]` = 5. + +So, the final hunk label is `@@ -12,5 +10,5 @@`. + + +== Conclusion + +Implementing unified diff was fun. +There are many corner cases, trials and errors, when we implement it on our +Go diff package, until we found the exact output that match with `diff` +program. + +If you like what you see here and needs a diff package in Go, you can use +unified diff in your Go code with this simple snippet, + +---- +import "git.sr.ht/~shulhan/pakakeh.go/lib/text/diff" + +func diff(w io.writer, old, new []byte, ncontext int) + result := diff.Unified(old, new) + result.WriteUnified(w, ncontext) +} +---- diff --git a/_content/journal/2026/unified_diff/mixed.new b/_content/journal/2026/unified_diff/mixed.new new file mode 100644 index 0000000..2a666e8 --- /dev/null +++ b/_content/journal/2026/unified_diff/mixed.new @@ -0,0 +1,14 @@ +a +b +c +E +f +h +i +j +k +l +m +n +O +P diff --git a/_content/journal/2026/unified_diff/mixed.old b/_content/journal/2026/unified_diff/mixed.old new file mode 100644 index 0000000..b03757e --- /dev/null +++ b/_content/journal/2026/unified_diff/mixed.old @@ -0,0 +1,16 @@ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p diff --git a/_content/journal/2026/unified_diff/mixed.uni b/_content/journal/2026/unified_diff/mixed.uni new file mode 100644 index 0000000..abd883d --- /dev/null +++ b/_content/journal/2026/unified_diff/mixed.uni @@ -0,0 +1,22 @@ +--- old ++++ new +@@ -1,10 +1,8 @@ + a + b + c +-d +-e ++E + f +-g + h + i + j +@@ -12,5 +10,5 @@ + l + m + n +-o +-p ++O ++P diff --git a/_content/journal/2026/unified_diff/multi_add.uni b/_content/journal/2026/unified_diff/multi_add.uni new file mode 100644 index 0000000..1ba0c42 --- /dev/null +++ b/_content/journal/2026/unified_diff/multi_add.uni @@ -0,0 +1,11 @@ +--- old ++++ new +@@ -3,6 +3,8 @@ + c + d + e ++f + g + h + i ++j diff --git a/_content/journal/2026/unified_diff/multi_del.new b/_content/journal/2026/unified_diff/multi_del.new new file mode 100644 index 0000000..95f7905 --- /dev/null +++ b/_content/journal/2026/unified_diff/multi_del.new @@ -0,0 +1,8 @@ +a +b +c +d +e +g +h +i diff --git a/_content/journal/2026/unified_diff/multi_del.old b/_content/journal/2026/unified_diff/multi_del.old new file mode 100644 index 0000000..92dfa21 --- /dev/null +++ b/_content/journal/2026/unified_diff/multi_del.old @@ -0,0 +1,10 @@ +a +b +c +d +e +f +g +h +i +j diff --git a/_content/journal/2026/unified_diff/multi_del.uni b/_content/journal/2026/unified_diff/multi_del.uni new file mode 100644 index 0000000..8d53fb0 --- /dev/null +++ b/_content/journal/2026/unified_diff/multi_del.uni @@ -0,0 +1,11 @@ +--- old ++++ new +@@ -3,8 +3,6 @@ + c + d + e +-f + g + h + i +-j |
