[dev.link] cmd/link: fix hash collision check

For content-addressable symbols, we build its content hash based on the symbol data and relocations. When the compiler builds the symbol data, it may not always include the trailing zeros, e.g. the data of [10]int64{1,2,3} is only the first 24 bytes. Therefore, we may end up with symbols with the same contents (thus same hash) but different sizes. This is not actually a hash collision. In this case, we can deduplicate them and keep the one with the larger size. Change-Id: If6834542d7914cc00f917d7db151955e5aee6f30 Reviewed-on: https://go-review.googlesource.com/c/go/+/243718 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Jeremy Faller <jeremy@golang.org>
author: Cherry Zhang <cherryyz@google.com> 2020-07-20 18:07:00 -0400
committer: Cherry Zhang <cherryyz@google.com> 2020-07-21 21:14:19 +0000
commit: 0e3114871e221485d89bf94dc019fcfa3df9c21a (patch)
tree: d45ef036d194686789b778cfa64ad961ffcded20 /src/cmd/link/internal/loader
parent: 526d99a49ae67bfde15134b96159680988615d2d (diff)
download: go-0e3114871e221485d89bf94dc019fcfa3df9c21a.tar.xz
1 files changed, 31 insertions, 32 deletions
diff --git a/src/cmd/link/internal/loader/loader.go b/src/cmd/link/internal/loader/loader.go
index 6d541af950..86fdbeffd8 100644
--- a/src/cmd/link/internal/loader/loader.go
+++ b/src/cmd/link/internal/loader/loader.go
@@ -434,24 +434,46 @@ func (l *Loader) addSym(name string, ver int, r *oReader, li uint32, kind int, o
 		l.symsByName[ver][name] = i
 		addToGlobal()
 		return i, true
-	case hashed64Def:
+	case hashed64Def, hashedDef:
 		// Hashed (content-addressable) symbol. Check the hash
 		// but don't add to name lookup table, as they are not
 		// referenced by name. Also no need to do overwriting
 		// check, as same hash indicates same content.
-		hash := r.Hash64(li - uint32(r.ndef))
+		var checkHash func() (symSizeAlign, bool)
+		var addToHashMap func(symSizeAlign)
+		var h64 uint64         // only used for hashed64Def
+		var h *goobj2.HashType // only used for hashedDef
+		if kind == hashed64Def {
+			checkHash = func() (symSizeAlign, bool) {
+				h64 = r.Hash64(li - uint32(r.ndef))
+				s, existed := l.hashed64Syms[h64]
+				return s, existed
+			}
+			addToHashMap = func(ss symSizeAlign) { l.hashed64Syms[h64] = ss }
+		} else {
+			checkHash = func() (symSizeAlign, bool) {
+				h = r.Hash(li - uint32(r.ndef+r.nhashed64def))
+				s, existed := l.hashedSyms[*h]
+				return s, existed
+			}
+			addToHashMap = func(ss symSizeAlign) { l.hashedSyms[*h] = ss }
+		}
 		siz := osym.Siz()
 		align := osym.Align()
-		if s, existed := l.hashed64Syms[hash]; existed {
-			// For short symbols, the content hash is the identity function of the
-			// 8 bytes, and trailing zeros doesn't change the hash value, e.g.
+		if s, existed := checkHash(); existed {
+			// The content hash is built from symbol data and relocations. In the
+			// object file, the symbol data may not always contain trailing zeros,
+			// e.g. for [5]int{1,2,3} and [100]int{1,2,3}, the data is same
+			// (although the size is different).
+			// Also, for short symbols, the content hash is the identity function of
+			// the 8 bytes, and trailing zeros doesn't change the hash value, e.g.
 			// hash("A") == hash("A\0\0\0").
 			// So when two symbols have the same hash, we need to use the one with
-			// larget size.
+			// larger size.
 			if siz <= s.size {
 				if align > s.align { // we need to use the biggest alignment
 					l.SetSymAlign(s.sym, int32(align))
-					l.hashed64Syms[hash] = symSizeAlign{s.sym, s.size, align}
+					addToHashMap(symSizeAlign{s.sym, s.size, align})
 				}
 			} else {
 				// New symbol has larger size, use the new one. Rewrite the index mapping.
@@ -460,34 +482,11 @@ func (l *Loader) addSym(name string, ver int, r *oReader, li uint32, kind int, o
 					align = s.align // keep the biggest alignment
 					l.SetSymAlign(s.sym, int32(align))
 				}
-				l.hashed64Syms[hash] = symSizeAlign{s.sym, siz, align}
-			}
-			return s.sym, false
-		}
-		l.hashed64Syms[hash] = symSizeAlign{i, siz, align}
-		addToGlobal()
-		return i, true
-	case hashedDef:
-		// Hashed (content-addressable) symbol. Check the hash
-		// but don't add to name lookup table, as they are not
-		// referenced by name. Also no need to do overwriting
-		// check, as same hash indicates same content.
-		hash := r.Hash(li - uint32(r.ndef+r.nhashed64def))
-		if s, existed := l.hashedSyms[*hash]; existed {
-			if s.size != osym.Siz() {
-				fmt.Printf("hash collision: %v (size %d) and %v (size %d), hash %x\n", l.SymName(s.sym), s.size, osym.Name(r.Reader), osym.Siz(), *hash)
-				panic("hash collision")
-			}
-			if l.flags&FlagStrictDups != 0 {
-				l.checkdup(name, r, li, s.sym)
-			}
-			if a := osym.Align(); a > s.align { // we need to use the biggest alignment
-				l.SetSymAlign(s.sym, int32(a))
-				l.hashedSyms[*hash] = symSizeAlign{s.sym, s.size, a}
+				addToHashMap(symSizeAlign{s.sym, siz, align})
 			}
 			return s.sym, false
 		}
-		l.hashedSyms[*hash] = symSizeAlign{i, osym.Siz(), osym.Align()}
+		addToHashMap(symSizeAlign{i, siz, align})
 		addToGlobal()
 		return i, true
 	}
author	Cherry Zhang <cherryyz@google.com>	2020-07-20 18:07:00 -0400
committer	Cherry Zhang <cherryyz@google.com>	2020-07-21 21:14:19 +0000
commit	0e3114871e221485d89bf94dc019fcfa3df9c21a (patch)
tree	d45ef036d194686789b778cfa64ad961ffcded20 /src/cmd/link/internal/loader
parent	526d99a49ae67bfde15134b96159680988615d2d (diff)
download	go-0e3114871e221485d89bf94dc019fcfa3df9c21a.tar.xz