From 8604b86d6e18bfdc9d2f8123a19bc8ed4dbfddb2 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Sun, 12 Apr 2026 17:13:10 +0700 Subject: text/diff: add an examples for Bytes and BytesRatio While at it, rename the source file to reflect the content, instead of diffinterface.go name it func.go. --- lib/text/diff/diff.go | 19 ++ lib/text/diff/diffinterface.go | 369 ------------------------------------- lib/text/diff/func.go | 319 ++++++++++++++++++++++++++++++++ lib/text/diff/func_example_test.go | 88 +++++++++ 4 files changed, 426 insertions(+), 369 deletions(-) delete mode 100644 lib/text/diff/diffinterface.go create mode 100644 lib/text/diff/func.go create mode 100644 lib/text/diff/func_example_test.go diff --git a/lib/text/diff/diff.go b/lib/text/diff/diff.go index 037ac79b..7a2844c9 100644 --- a/lib/text/diff/diff.go +++ b/lib/text/diff/diff.go @@ -281,3 +281,22 @@ func (diff Data) String() (s string) { } return sb.String() } + +// findLine returns true if line is found in the beginning of text start at +// line `startat`. +// It also return line number of matching line. +// If no match found, it will return false and `startat` value. +func findLine(line Line, text []Line, startat int) ( + found bool, + n int, +) { + textlen := len(text) + + for n = startat; n < textlen; n++ { + if bytes.Equal(line.Val, text[n].Val) { + return true, n + } + } + + return false, startat +} diff --git a/lib/text/diff/diffinterface.go b/lib/text/diff/diffinterface.go deleted file mode 100644 index be9d739e..00000000 --- a/lib/text/diff/diffinterface.go +++ /dev/null @@ -1,369 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -// SPDX-FileCopyrightText: 2018 Shulhan - -package diff - -import ( - "bytes" - - inbytes "git.sr.ht/~shulhan/pakakeh.go/internal/bytes" - "git.sr.ht/~shulhan/pakakeh.go/lib/text" -) - -const ( - // LevelLines define that we want only lines change set. - LevelLines = iota - // LevelWords define that we want the change not only capture the - // different per line, but also changes inside the line. - LevelWords -) - -const ( - // DefMatchLen minimum number of bytes used for searching the next - // matched chunk in line. - DefMatchLen = 5 - // DefMatchRatio define default minimum match ratio to be considered as - // change. - DefMatchRatio = 0.7 -) - -// BytesRatio compare two slice of bytes and return ratio of matching bytes. -// The ratio in in range of 0.0 to 1.0, where 1.0 if both are similar, and 0.0 -// if no matchs even found. -// `minTokenLen` define the minimum length of token for searching in both of -// slice. -func BytesRatio(old, newline []byte, minTokenLen int) (ratio float32, m int, maxlen int) { - x, y := 0, 0 - - oldlen := len(old) - newlen := len(newline) - minlen := oldlen - maxlen = newlen - if newlen < oldlen { - minlen = newlen - maxlen = oldlen - } - - if minTokenLen < 0 { - minTokenLen = DefMatchLen - } - - for { - // Count matching bytes from beginning of slice. - for x < minlen { - if old[x] != newline[y] { - break - } - m++ - x++ - y++ - } - - if x == minlen { - // All bytes is matched but probably some trailing in - // one of them. - break - } - - // Count matching bytes from end of slice - xend := oldlen - 1 - yend := newlen - 1 - - for xend >= x && yend >= y { - if old[xend] != newline[yend] { - break - } - m++ - xend-- - yend-- - } - - // One of the line have changes in the middle. - if xend == x || yend == y { - break - } - - // Cut the matching bytes - old = old[x : xend+1] - newline = newline[y : yend+1] - oldlen = len(old) - - // Get minimal token to search in the newline left over. - minlen = min(oldlen, minTokenLen) - - // Search old token in newline, chunk by chunk. - x = 0 - y = -1 - max := oldlen - minlen - for ; x < max; x++ { - token := old[x : x+minlen] - - y = inbytes.TokenFind(newline, token, 0) - if y > 0 { - break - } - } - - if y < 0 { - // We did not found anything. - break - } - - // Cut the changes - old = old[x:] - newline = newline[y:] - oldlen = len(old) - newlen = len(newline) - - minlen = min(newlen, oldlen) - - x, y = 0, 0 - // start again from beginning... - } - - ratio = float32(m) / float32(maxlen) - - return ratio, m, maxlen -} - -// findLine return true if line is found in text beginning at line `startat`. -// It also return line number of matching line. -// If no match found, it will return false and `startat` value. -func findLine(line Line, text []Line, startat int) ( - found bool, - n int, -) { - textlen := len(text) - - for n = startat; n < textlen; n++ { - if bytes.Equal(line.Val, text[n].Val) { - return true, n - } - } - - return false, startat -} - -// Bytes given two similar lines, find and return the differences (additions and -// deletion) between them. -// -// Case 1: addition on new or deletion on old. -// -// old: 00000 -// new: 00000111 -// -// or -// -// old: 00000111 -// new: 00000 -// -// Case 2: addition on new line -// -// old: 000000 -// new: 0001000 -// -// Case 3: deletion on old line (reverse of case 2) -// -// old: 0001000 -// new: 000000 -// -// Case 4: change happened in the beginning -// -// old: 11000 -// new: 22000 -// -// Case 5: both changed -// -// old: 0001000 -// new: 0002000 -func Bytes(old, new []byte, atx, aty int) (adds, dels text.Chunks) { - oldlen := len(old) - newlen := len(new) - - minlen := 0 - minlen = min(oldlen, newlen) - - // Find the position of unmatched byte from the beginning. - x, y := 0, 0 - for ; x < minlen; x++ { - if old[x] != new[x] { - break - } - } - y = x - - // Case 1: Check if addition or deletion is at the end. - if x == minlen { - if oldlen < newlen { - v := new[y:] - adds = append(adds, text.Chunk{StartAt: atx + y, V: v}) - } else { - v := old[x:] - dels = append(dels, text.Chunk{StartAt: atx + x, V: v}) - } - return adds, dels - } - - // Find the position of unmatched byte from the end - xend := oldlen - 1 - yend := newlen - 1 - - for xend >= x && yend >= y { - if old[xend] != new[yend] { - break - } - xend-- - yend-- - } - - // Case 2: addition in new line. - if x == xend+1 { - v := new[y : yend+1] - adds = append(adds, text.Chunk{StartAt: aty + y, V: v}) - return adds, dels - } - - // Case 3: deletion in old line. - if y == yend+1 { - v := old[x : xend+1] - dels = append(dels, text.Chunk{StartAt: atx + x, V: v}) - return adds, dels - } - - // Calculate possible match len. - // After we found similar bytes in the beginning and end of line, now - // we have `n` number of bytes left in old and new. - oldleft := old[x : xend+1] - newleft := new[y : yend+1] - oldleftlen := len(oldleft) - newleftlen := len(newleft) - - // Get minimal token to search in the new left over. - minlen = min(oldleftlen, DefMatchLen) - xtoken := oldleft[:minlen] - - xaty := inbytes.TokenFind(newleft, xtoken, 0) - - // Get miniminal token to search in the old left over. - minlen = min(newleftlen, DefMatchLen) - ytoken := newleft[:minlen] - - yatx := inbytes.TokenFind(oldleft, ytoken, 0) - - // Case 4: - // We did not find matching token of x in y, its mean the some chunk - // in x and y has been replaced. - if xaty < 0 && yatx < 0 { - addsleft, delsleft := searchForward(atx, aty, &x, &y, &oldleft, - &newleft) - - if len(addsleft) > 0 { - adds = append(adds, addsleft...) - } - if len(delsleft) > 0 { - dels = append(dels, delsleft...) - } - - // Check for possible empty left - if len(oldleft) == 0 { - if len(newleft) > 0 { - adds = append(adds, text.Chunk{ - StartAt: atx + x, - V: newleft, - }) - } - return adds, dels - } - if len(newleft) == 0 { - if len(oldleft) > 0 { - dels = append(dels, text.Chunk{ - StartAt: aty + y, - V: oldleft, - }) - } - return adds, dels - } - } - - // Case 5: is combination of case 2 and 3. - // Case 2: We found x token at y: xaty. Previous byte before that must - // be an addition. - if xaty >= 0 { - v := new[y : y+xaty] - adds = append(adds, text.Chunk{StartAt: aty + y, V: v}) - newleft = new[y+xaty : yend+1] - } else if yatx >= 0 { - // Case 3: We found y token at x: yatx. Previous byte before that must - // be a deletion. - v := old[x : x+yatx] - dels = append(dels, text.Chunk{StartAt: atx + x, V: v}) - oldleft = old[x+yatx : xend+1] - } - - addsleft, delsleft := Bytes(oldleft, newleft, atx+x, aty+y) - - if len(addsleft) > 0 { - adds = append(adds, addsleft...) - } - if len(delsleft) > 0 { - dels = append(dels, delsleft...) - } - - return adds, dels -} - -func searchForward(atx, aty int, x, y *int, oldleft, newleft *[]byte) ( - adds, dels text.Chunks, -) { - oldleftlen := len(*oldleft) - newleftlen := len(*newleft) - - minlen := min(oldleftlen, DefMatchLen) - - // Loop through old line to find matching token - xaty := -1 - xx := 1 - for ; xx < oldleftlen-minlen; xx++ { - token := (*oldleft)[xx : xx+minlen] - - xaty = inbytes.TokenFind(*newleft, token, 0) - if xaty > 0 { - break - } - } - - minlen = min(newleftlen, DefMatchLen) - - yatx := -1 - yy := 1 - for ; yy < newleftlen-minlen; yy++ { - token := (*newleft)[yy : yy+minlen] - - yatx = inbytes.TokenFind(*oldleft, token, 0) - if yatx > 0 { - break - } - } - - if xaty < 0 && yatx < 0 { - // still no token found, means whole chunk has been replaced. - dels = append(dels, text.Chunk{StartAt: atx + *x, V: *oldleft}) - adds = append(adds, text.Chunk{StartAt: aty + *y, V: *newleft}) - *oldleft = []byte{} - *newleft = []byte{} - return adds, dels - } - - // Some chunk has been replaced. - v := (*oldleft)[:xx] - dels = append(dels, text.Chunk{StartAt: atx + *x, V: v}) - *oldleft = (*oldleft)[xx:] - *x += xx - - v = (*newleft)[:yy] - adds = append(adds, text.Chunk{StartAt: aty + *y, V: v}) - *newleft = (*newleft)[yy:] - *y += yy - - return adds, dels -} diff --git a/lib/text/diff/func.go b/lib/text/diff/func.go new file mode 100644 index 00000000..6acc5178 --- /dev/null +++ b/lib/text/diff/func.go @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: BSD-3-Clause +// SPDX-FileCopyrightText: 2018 Shulhan + +package diff + +import ( + inbytes "git.sr.ht/~shulhan/pakakeh.go/internal/bytes" + "git.sr.ht/~shulhan/pakakeh.go/lib/text" +) + +const ( + // LevelLines define that we want only lines change set. + LevelLines = iota + // LevelWords define that we want the change not only capture the + // different per line, but also changes inside the line. + LevelWords +) + +const ( + // DefMatchLen minimum number of bytes used for searching the next + // matched chunk in line. + DefMatchLen = 5 + // DefMatchRatio define default minimum match ratio to be considered as + // change. + DefMatchRatio = 0.7 +) + +// Bytes returns the character differences (additions and deletion) between +// old and new bytes, start at specific position. +func Bytes(old, new []byte, oldat, newat int) (adds, dels text.Chunks) { + oldlen := len(old) + newlen := len(new) + + minlen := 0 + minlen = min(oldlen, newlen) + + // Find the position of unmatched byte from the beginning. + x, y := 0, 0 + for ; x < minlen; x++ { + if old[x] != new[x] { + break + } + } + y = x + + // Case 1: Check if addition or deletion is at the end. + if x == minlen { + if oldlen < newlen { + v := new[y:] + adds = append(adds, text.Chunk{StartAt: oldat + y, V: v}) + } else { + v := old[x:] + dels = append(dels, text.Chunk{StartAt: oldat + x, V: v}) + } + return adds, dels + } + + // Find the position of unmatched byte from the end + xend := oldlen - 1 + yend := newlen - 1 + + for xend >= x && yend >= y { + if old[xend] != new[yend] { + break + } + xend-- + yend-- + } + + // Case 2: addition in new line. + if x == xend+1 { + v := new[y : yend+1] + adds = append(adds, text.Chunk{StartAt: newat + y, V: v}) + return adds, dels + } + + // Case 3: deletion in old line. + if y == yend+1 { + v := old[x : xend+1] + dels = append(dels, text.Chunk{StartAt: oldat + x, V: v}) + return adds, dels + } + + // Calculate possible match len. + // After we found similar bytes in the beginning and end of line, now + // we have `n` number of bytes left in old and new. + oldleft := old[x : xend+1] + newleft := new[y : yend+1] + oldleftlen := len(oldleft) + newleftlen := len(newleft) + + // Get minimal token to search in the new left over. + minlen = min(oldleftlen, DefMatchLen) + xtoken := oldleft[:minlen] + + xaty := inbytes.TokenFind(newleft, xtoken, 0) + + // Get miniminal token to search in the old left over. + minlen = min(newleftlen, DefMatchLen) + ytoken := newleft[:minlen] + + yatx := inbytes.TokenFind(oldleft, ytoken, 0) + + // Case 4: + // We did not find matching token of x in y, its mean the some chunk + // in x and y has been replaced. + if xaty < 0 && yatx < 0 { + addsleft, delsleft := searchForward(oldat, newat, &x, &y, &oldleft, + &newleft) + + if len(addsleft) > 0 { + adds = append(adds, addsleft...) + } + if len(delsleft) > 0 { + dels = append(dels, delsleft...) + } + + // Check for possible empty left + if len(oldleft) == 0 { + if len(newleft) > 0 { + adds = append(adds, text.Chunk{ + StartAt: oldat + x, + V: newleft, + }) + } + return adds, dels + } + if len(newleft) == 0 { + if len(oldleft) > 0 { + dels = append(dels, text.Chunk{ + StartAt: newat + y, + V: oldleft, + }) + } + return adds, dels + } + } + + // Case 5: is combination of case 2 and 3. + // Case 2: We found x token at y: xaty. Previous byte before that must + // be an addition. + if xaty >= 0 { + v := new[y : y+xaty] + adds = append(adds, text.Chunk{StartAt: newat + y, V: v}) + newleft = new[y+xaty : yend+1] + } else if yatx >= 0 { + // Case 3: We found y token at x: yatx. Previous byte before that must + // be a deletion. + v := old[x : x+yatx] + dels = append(dels, text.Chunk{StartAt: oldat + x, V: v}) + oldleft = old[x+yatx : xend+1] + } + + addsleft, delsleft := Bytes(oldleft, newleft, oldat+x, newat+y) + + if len(addsleft) > 0 { + adds = append(adds, addsleft...) + } + if len(delsleft) > 0 { + dels = append(dels, delsleft...) + } + + return adds, dels +} + +// BytesRatio returns the ratio of matching bytes between old and new. +// The ratio in in range of 0.0 to 1.0, where 1.0 when both are similar, and +// 0.0 when no single character matchs found. +// The minTokenLen define the minimum length of token for searching in both of +// slice. +func BytesRatio(old, newline []byte, minTokenLen int) (ratio float32, m int, maxlen int) { + x, y := 0, 0 + + oldlen := len(old) + newlen := len(newline) + minlen := oldlen + maxlen = newlen + if newlen < oldlen { + minlen = newlen + maxlen = oldlen + } + + if minTokenLen < 0 { + minTokenLen = DefMatchLen + } + + for { + // Count matching bytes from beginning of slice. + for x < minlen { + if old[x] != newline[y] { + break + } + m++ + x++ + y++ + } + + if x == minlen { + // All bytes is matched but probably some trailing in + // one of them. + break + } + + // Count matching bytes from end of slice + xend := oldlen - 1 + yend := newlen - 1 + + for xend >= x && yend >= y { + if old[xend] != newline[yend] { + break + } + m++ + xend-- + yend-- + } + + // One of the line have changes in the middle. + if xend == x || yend == y { + break + } + + // Cut the matching bytes + old = old[x : xend+1] + newline = newline[y : yend+1] + oldlen = len(old) + + // Get minimal token to search in the newline left over. + minlen = min(oldlen, minTokenLen) + + // Search old token in newline, chunk by chunk. + x = 0 + y = -1 + max := oldlen - minlen + for ; x < max; x++ { + token := old[x : x+minlen] + + y = inbytes.TokenFind(newline, token, 0) + if y > 0 { + break + } + } + + if y < 0 { + // We did not found anything. + break + } + + // Cut the changes + old = old[x:] + newline = newline[y:] + oldlen = len(old) + newlen = len(newline) + + minlen = min(newlen, oldlen) + + x, y = 0, 0 + // start again from beginning... + } + + ratio = float32(m) / float32(maxlen) + + return ratio, m, maxlen +} + +func searchForward(oldat, newat int, x, y *int, oldleft, newleft *[]byte) ( + adds, dels text.Chunks, +) { + oldleftlen := len(*oldleft) + newleftlen := len(*newleft) + + minlen := min(oldleftlen, DefMatchLen) + + // Loop through old line to find matching token + xaty := -1 + xx := 1 + for ; xx < oldleftlen-minlen; xx++ { + token := (*oldleft)[xx : xx+minlen] + + xaty = inbytes.TokenFind(*newleft, token, 0) + if xaty > 0 { + break + } + } + + minlen = min(newleftlen, DefMatchLen) + + yatx := -1 + yy := 1 + for ; yy < newleftlen-minlen; yy++ { + token := (*newleft)[yy : yy+minlen] + + yatx = inbytes.TokenFind(*oldleft, token, 0) + if yatx > 0 { + break + } + } + + if xaty < 0 && yatx < 0 { + // still no token found, means whole chunk has been replaced. + dels = append(dels, text.Chunk{StartAt: oldat + *x, V: *oldleft}) + adds = append(adds, text.Chunk{StartAt: newat + *y, V: *newleft}) + *oldleft = []byte{} + *newleft = []byte{} + return adds, dels + } + + // Some chunk has been replaced. + v := (*oldleft)[:xx] + dels = append(dels, text.Chunk{StartAt: oldat + *x, V: v}) + *oldleft = (*oldleft)[xx:] + *x += xx + + v = (*newleft)[:yy] + adds = append(adds, text.Chunk{StartAt: newat + *y, V: v}) + *newleft = (*newleft)[yy:] + *y += yy + + return adds, dels +} diff --git a/lib/text/diff/func_example_test.go b/lib/text/diff/func_example_test.go new file mode 100644 index 00000000..713e70fd --- /dev/null +++ b/lib/text/diff/func_example_test.go @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: BSD-3-Clause +// SPDX-FileCopyrightText: 2026 Shulhan + +package diff + +import ( + "fmt" +) + +func ExampleBytes() { + fmt.Println(`Additions 1:`) + adds, dels := Bytes([]byte(`00000`), []byte(`00000111`), 0, 0) + fmt.Printf(" adds: %+v\n", adds) + fmt.Printf(" dels: %+v\n", dels) + + fmt.Println(`Additions 2:`) + adds, dels = Bytes([]byte(`000000`), []byte(`0001000`), 0, 0) + fmt.Printf(" adds: %+v\n", adds) + fmt.Printf(" dels: %+v\n", dels) + + fmt.Println(`Deletions 1:`) + adds, dels = Bytes([]byte(`00000111`), []byte(`00000`), 0, 0) + fmt.Printf(" adds: %+v\n", adds) + fmt.Printf(" dels: %+v\n", dels) + + fmt.Println(`Deletions 2:`) + adds, dels = Bytes([]byte(`0001000`), []byte(`000000`), 0, 0) + fmt.Printf(" adds: %+v\n", adds) + fmt.Printf(" dels: %+v\n", dels) + + fmt.Println(`Both changes 1:`) + adds, dels = Bytes([]byte(`11000`), []byte(`22000`), 0, 0) + fmt.Printf(" adds: %+v\n", adds) + fmt.Printf(" dels: %+v\n", dels) + + fmt.Println(`Both changes 2:`) + adds, dels = Bytes([]byte(`0001000`), []byte(`0002000`), 0, 0) + fmt.Printf(" adds: %+v\n", adds) + fmt.Printf(" dels: %+v\n", dels) + + // Output: + // Additions 1: + // adds: [{StartAt:5,V:111}] + // dels: [] + // Additions 2: + // adds: [{StartAt:3,V:1}] + // dels: [] + // Deletions 1: + // adds: [] + // dels: [{StartAt:5,V:111}] + // Deletions 2: + // adds: [] + // dels: [{StartAt:3,V:1}] + // Both changes 1: + // adds: [{StartAt:0,V:22}] + // dels: [{StartAt:0,V:11}] + // Both changes 2: + // adds: [{StartAt:3,V:2}] + // dels: [{StartAt:3,V:1}] +} + +func ExampleBytesRatio() { + ratio, m, maxlen := BytesRatio([]byte(`00000`), []byte(`00000111`), 1) + fmt.Printf("Additions 1: %f %d %d\n", ratio, m, maxlen) + + ratio, m, maxlen = BytesRatio([]byte(`000000`), []byte(`0001000`), 1) + fmt.Printf("Additions 2: %f %d %d\n", ratio, m, maxlen) + + ratio, m, maxlen = BytesRatio([]byte(`00000111`), []byte(`00000`), 1) + fmt.Printf("Deletions 1: %f %d %d\n", ratio, m, maxlen) + + ratio, m, maxlen = BytesRatio([]byte(`0001000`), []byte(`000000`), 1) + fmt.Printf("Deletions 2: %f %d %d\n", ratio, m, maxlen) + + ratio, m, maxlen = BytesRatio([]byte(`11000`), []byte(`22000`), 1) + fmt.Printf("Both changes 1: %f %d %d\n", ratio, m, maxlen) + + ratio, m, maxlen = BytesRatio([]byte(`0001000`), []byte(`0002000`), 1) + fmt.Printf("Both changes 2: %f %d %d\n", ratio, m, maxlen) + + // Output: + // Additions 1: 0.625000 5 8 + // Additions 2: 0.857143 6 7 + // Deletions 1: 0.625000 5 8 + // Deletions 2: 0.857143 6 7 + // Both changes 1: 0.600000 3 5 + // Both changes 2: 0.857143 6 7 +} -- cgit v1.3