diff options
| author | Shulhan <ms@kilabit.info> | 2026-04-12 17:13:10 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-04-12 17:39:22 +0700 |
| commit | 8604b86d6e18bfdc9d2f8123a19bc8ed4dbfddb2 (patch) | |
| tree | 797984d5312b3c4fd280ee381bbc3fad2941af06 /lib/text/diff/func.go | |
| parent | 50178d3a571213161675752ac1f32dbc75cd448d (diff) | |
| download | pakakeh.go-8604b86d6e18bfdc9d2f8123a19bc8ed4dbfddb2.tar.xz | |
text/diff: add an examples for Bytes and BytesRatio
While at it, rename the source file to reflect the content, instead of
diffinterface.go name it func.go.
Diffstat (limited to 'lib/text/diff/func.go')
| -rw-r--r-- | lib/text/diff/func.go | 319 |
1 files changed, 319 insertions, 0 deletions
diff --git a/lib/text/diff/func.go b/lib/text/diff/func.go new file mode 100644 index 00000000..6acc5178 --- /dev/null +++ b/lib/text/diff/func.go @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: BSD-3-Clause +// SPDX-FileCopyrightText: 2018 Shulhan <ms@kilabit.info> + +package diff + +import ( + inbytes "git.sr.ht/~shulhan/pakakeh.go/internal/bytes" + "git.sr.ht/~shulhan/pakakeh.go/lib/text" +) + +const ( + // LevelLines define that we want only lines change set. + LevelLines = iota + // LevelWords define that we want the change not only capture the + // different per line, but also changes inside the line. + LevelWords +) + +const ( + // DefMatchLen minimum number of bytes used for searching the next + // matched chunk in line. + DefMatchLen = 5 + // DefMatchRatio define default minimum match ratio to be considered as + // change. + DefMatchRatio = 0.7 +) + +// Bytes returns the character differences (additions and deletion) between +// old and new bytes, start at specific position. +func Bytes(old, new []byte, oldat, newat int) (adds, dels text.Chunks) { + oldlen := len(old) + newlen := len(new) + + minlen := 0 + minlen = min(oldlen, newlen) + + // Find the position of unmatched byte from the beginning. + x, y := 0, 0 + for ; x < minlen; x++ { + if old[x] != new[x] { + break + } + } + y = x + + // Case 1: Check if addition or deletion is at the end. + if x == minlen { + if oldlen < newlen { + v := new[y:] + adds = append(adds, text.Chunk{StartAt: oldat + y, V: v}) + } else { + v := old[x:] + dels = append(dels, text.Chunk{StartAt: oldat + x, V: v}) + } + return adds, dels + } + + // Find the position of unmatched byte from the end + xend := oldlen - 1 + yend := newlen - 1 + + for xend >= x && yend >= y { + if old[xend] != new[yend] { + break + } + xend-- + yend-- + } + + // Case 2: addition in new line. + if x == xend+1 { + v := new[y : yend+1] + adds = append(adds, text.Chunk{StartAt: newat + y, V: v}) + return adds, dels + } + + // Case 3: deletion in old line. + if y == yend+1 { + v := old[x : xend+1] + dels = append(dels, text.Chunk{StartAt: oldat + x, V: v}) + return adds, dels + } + + // Calculate possible match len. + // After we found similar bytes in the beginning and end of line, now + // we have `n` number of bytes left in old and new. + oldleft := old[x : xend+1] + newleft := new[y : yend+1] + oldleftlen := len(oldleft) + newleftlen := len(newleft) + + // Get minimal token to search in the new left over. + minlen = min(oldleftlen, DefMatchLen) + xtoken := oldleft[:minlen] + + xaty := inbytes.TokenFind(newleft, xtoken, 0) + + // Get miniminal token to search in the old left over. + minlen = min(newleftlen, DefMatchLen) + ytoken := newleft[:minlen] + + yatx := inbytes.TokenFind(oldleft, ytoken, 0) + + // Case 4: + // We did not find matching token of x in y, its mean the some chunk + // in x and y has been replaced. + if xaty < 0 && yatx < 0 { + addsleft, delsleft := searchForward(oldat, newat, &x, &y, &oldleft, + &newleft) + + if len(addsleft) > 0 { + adds = append(adds, addsleft...) + } + if len(delsleft) > 0 { + dels = append(dels, delsleft...) + } + + // Check for possible empty left + if len(oldleft) == 0 { + if len(newleft) > 0 { + adds = append(adds, text.Chunk{ + StartAt: oldat + x, + V: newleft, + }) + } + return adds, dels + } + if len(newleft) == 0 { + if len(oldleft) > 0 { + dels = append(dels, text.Chunk{ + StartAt: newat + y, + V: oldleft, + }) + } + return adds, dels + } + } + + // Case 5: is combination of case 2 and 3. + // Case 2: We found x token at y: xaty. Previous byte before that must + // be an addition. + if xaty >= 0 { + v := new[y : y+xaty] + adds = append(adds, text.Chunk{StartAt: newat + y, V: v}) + newleft = new[y+xaty : yend+1] + } else if yatx >= 0 { + // Case 3: We found y token at x: yatx. Previous byte before that must + // be a deletion. + v := old[x : x+yatx] + dels = append(dels, text.Chunk{StartAt: oldat + x, V: v}) + oldleft = old[x+yatx : xend+1] + } + + addsleft, delsleft := Bytes(oldleft, newleft, oldat+x, newat+y) + + if len(addsleft) > 0 { + adds = append(adds, addsleft...) + } + if len(delsleft) > 0 { + dels = append(dels, delsleft...) + } + + return adds, dels +} + +// BytesRatio returns the ratio of matching bytes between old and new. +// The ratio in in range of 0.0 to 1.0, where 1.0 when both are similar, and +// 0.0 when no single character matchs found. +// The minTokenLen define the minimum length of token for searching in both of +// slice. +func BytesRatio(old, newline []byte, minTokenLen int) (ratio float32, m int, maxlen int) { + x, y := 0, 0 + + oldlen := len(old) + newlen := len(newline) + minlen := oldlen + maxlen = newlen + if newlen < oldlen { + minlen = newlen + maxlen = oldlen + } + + if minTokenLen < 0 { + minTokenLen = DefMatchLen + } + + for { + // Count matching bytes from beginning of slice. + for x < minlen { + if old[x] != newline[y] { + break + } + m++ + x++ + y++ + } + + if x == minlen { + // All bytes is matched but probably some trailing in + // one of them. + break + } + + // Count matching bytes from end of slice + xend := oldlen - 1 + yend := newlen - 1 + + for xend >= x && yend >= y { + if old[xend] != newline[yend] { + break + } + m++ + xend-- + yend-- + } + + // One of the line have changes in the middle. + if xend == x || yend == y { + break + } + + // Cut the matching bytes + old = old[x : xend+1] + newline = newline[y : yend+1] + oldlen = len(old) + + // Get minimal token to search in the newline left over. + minlen = min(oldlen, minTokenLen) + + // Search old token in newline, chunk by chunk. + x = 0 + y = -1 + max := oldlen - minlen + for ; x < max; x++ { + token := old[x : x+minlen] + + y = inbytes.TokenFind(newline, token, 0) + if y > 0 { + break + } + } + + if y < 0 { + // We did not found anything. + break + } + + // Cut the changes + old = old[x:] + newline = newline[y:] + oldlen = len(old) + newlen = len(newline) + + minlen = min(newlen, oldlen) + + x, y = 0, 0 + // start again from beginning... + } + + ratio = float32(m) / float32(maxlen) + + return ratio, m, maxlen +} + +func searchForward(oldat, newat int, x, y *int, oldleft, newleft *[]byte) ( + adds, dels text.Chunks, +) { + oldleftlen := len(*oldleft) + newleftlen := len(*newleft) + + minlen := min(oldleftlen, DefMatchLen) + + // Loop through old line to find matching token + xaty := -1 + xx := 1 + for ; xx < oldleftlen-minlen; xx++ { + token := (*oldleft)[xx : xx+minlen] + + xaty = inbytes.TokenFind(*newleft, token, 0) + if xaty > 0 { + break + } + } + + minlen = min(newleftlen, DefMatchLen) + + yatx := -1 + yy := 1 + for ; yy < newleftlen-minlen; yy++ { + token := (*newleft)[yy : yy+minlen] + + yatx = inbytes.TokenFind(*oldleft, token, 0) + if yatx > 0 { + break + } + } + + if xaty < 0 && yatx < 0 { + // still no token found, means whole chunk has been replaced. + dels = append(dels, text.Chunk{StartAt: oldat + *x, V: *oldleft}) + adds = append(adds, text.Chunk{StartAt: newat + *y, V: *newleft}) + *oldleft = []byte{} + *newleft = []byte{} + return adds, dels + } + + // Some chunk has been replaced. + v := (*oldleft)[:xx] + dels = append(dels, text.Chunk{StartAt: oldat + *x, V: v}) + *oldleft = (*oldleft)[xx:] + *x += xx + + v = (*newleft)[:yy] + adds = append(adds, text.Chunk{StartAt: newat + *y, V: v}) + *newleft = (*newleft)[yy:] + *y += yy + + return adds, dels +} |
