diff options
| author | Robert Griesemer <gri@golang.org> | 2015-03-24 16:36:16 -0700 |
|---|---|---|
| committer | Robert Griesemer <gri@golang.org> | 2015-03-31 19:36:55 +0000 |
| commit | 8d267b9b592001da4c7a4d73996f92094fd23152 (patch) | |
| tree | ea0eab8d6f4ae666df9274572c2fa17a1ef714db /src | |
| parent | f8fd5502ecdba0f40d794d22f7eb14c9b471a773 (diff) | |
| download | go-8d267b9b592001da4c7a4d73996f92094fd23152.tar.xz | |
math/big: fixed Float.Float64, implemented Float.Float32
- fix bounds checks for exponent range of denormalized numbers
- use correct rounding precision for denormalized numbers
- added extra tests
Change-Id: I6be56399afd0d9a603300a2e44b5539e08d6f592
Reviewed-on: https://go-review.googlesource.com/8096
Reviewed-by: Alan Donovan <adonovan@google.com>
Diffstat (limited to 'src')
| -rw-r--r-- | src/math/big/float.go | 198 | ||||
| -rw-r--r-- | src/math/big/float_test.go | 107 |
2 files changed, 259 insertions, 46 deletions
diff --git a/src/math/big/float.go b/src/math/big/float.go index fa3751d0c7..629510a18e 100644 --- a/src/math/big/float.go +++ b/src/math/big/float.go @@ -750,6 +750,11 @@ func (z *Float) Copy(x *Float) *Float { return z } +func high32(x nat) uint32 { + // TODO(gri) This can be done more efficiently on 32bit platforms. + return uint32(high64(x) >> 32) +} + func high64(x nat) uint64 { i := len(x) if i == 0 { @@ -872,15 +877,16 @@ func (x *Float) Int64() (int64, Accuracy) { panic("unreachable") } -// Float64 returns the float64 value nearest to x by rounding ToNearestEven -// with 53 bits of precision. -// If x is too small to be represented by a float64 -// (|x| < math.SmallestNonzeroFloat64), the result is (0, Below) or -// (-0, Above), respectively, depending on the sign of x. -// If x is too large to be represented by a float64 (|x| > math.MaxFloat64), +// TODO(gri) Float32 and Float64 are very similar internally but for the +// floatxx parameters and some conversions. Should factor out shared code. + +// Float32 returns the float32 value nearest to x. If x is too small to be +// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result +// is (0, Below) or (-0, Above), respectively, depending on the sign of x. +// If x is too large to be represented by a float32 (|x| > math.MaxFloat32), // the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. // The result is (NaN, Undef) for NaNs. -func (x *Float) Float64() (float64, Accuracy) { +func (x *Float) Float32() (float32, Accuracy) { if debugFloat { x.validate() } @@ -888,61 +894,183 @@ func (x *Float) Float64() (float64, Accuracy) { switch x.form { case finite: // 0 < |x| < +Inf + + const ( + fbits = 32 // float size + mbits = 23 // mantissa size (excluding implicit msb) + ebits = fbits - mbits - 1 // 8 exponent size + bias = 1<<(ebits-1) - 1 // 127 exponent bias + dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal) + emin = 1 - bias // -126 smallest unbiased exponent (normal) + emax = bias // 127 largest unbiased exponent (normal) + ) + + // Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0. + // floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0. + // For a given mantissa m, we need to add 1 to a floatxx exponent to get the + // corresponding Float exponent. + // (see also implementation of math.Ldexp for similar code) + + if x.exp < dmin+1 { + // underflow + if x.neg { + var z float32 + return -z, Above + } + return 0.0, Below + } + // x.exp >= dmin+1 + var r Float - r.prec = 53 + r.prec = mbits + 1 // +1 for implicit msb + if x.exp < emin+1 { + // denormal number - round to fewer bits + r.prec = uint32(x.exp - dmin) + } r.Set(x) - // Rounding via Set may have caused r to overflow - // to ±Inf (rounding never causes underflows to 0). + // Rounding may have caused r to overflow to ±Inf + // (rounding never causes underflows to 0). if r.form == inf { - r.exp = 10000 // cause overflow below + r.exp = emax + 2 // cause overflow below } - // see also implementation of math.Ldexp + if r.exp > emax+1 { + // overflow + if x.neg { + return float32(math.Inf(-1)), Below + } + return float32(math.Inf(+1)), Above + } + // dmin+1 <= r.exp <= emax+1 + + var s uint32 + if r.neg { + s = 1 << (fbits - 1) + } + + m := high32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) + + // Rounding may have caused a denormal number to + // become normal. Check again. + c := float32(1.0) + if r.exp < emin+1 { + // denormal number + r.exp += mbits + c = 1.0 / (1 << mbits) // 2**-mbits + } + // emin+1 <= r.exp <= emax+1 + e := uint32(r.exp-emin) << mbits + + return c * math.Float32frombits(s|e|m), r.acc - e := int64(r.exp) + 1022 - if e <= -52 { + case zero: + if x.neg { + var z float32 + return -z, Exact + } + return 0.0, Exact + + case inf: + if x.neg { + return float32(math.Inf(-1)), Exact + } + return float32(math.Inf(+1)), Exact + + case nan: + return float32(math.NaN()), Undef + } + + panic("unreachable") +} + +// Float64 returns the float64 value nearest to x. If x is too small to be +// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result +// is (0, Below) or (-0, Above), respectively, depending on the sign of x. +// If x is too large to be represented by a float64 (|x| > math.MaxFloat64), +// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. +// The result is (NaN, Undef) for NaNs. +func (x *Float) Float64() (float64, Accuracy) { + if debugFloat { + x.validate() + } + + switch x.form { + case finite: + // 0 < |x| < +Inf + + const ( + fbits = 64 // float size + mbits = 52 // mantissa size (excluding implicit msb) + ebits = fbits - mbits - 1 // 11 exponent size + bias = 1<<(ebits-1) - 1 // 1023 exponent bias + dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal) + emin = 1 - bias // -1022 smallest unbiased exponent (normal) + emax = bias // 1023 largest unbiased exponent (normal) + ) + + // Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0. + // floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0. + // For a given mantissa m, we need to add 1 to a floatxx exponent to get the + // corresponding Float exponent. + // (see also implementation of math.Ldexp for similar code) + + if x.exp < dmin+1 { // underflow if x.neg { - z := 0.0 + var z float64 return -z, Above } return 0.0, Below } - // e > -52 + // x.exp >= dmin+1 + + var r Float + r.prec = mbits + 1 // +1 for implicit msb + if x.exp < emin+1 { + // denormal number - round to fewer bits + r.prec = uint32(x.exp - dmin) + } + r.Set(x) - if e >= 2047 { + // Rounding may have caused r to overflow to ±Inf + // (rounding never causes underflows to 0). + if r.form == inf { + r.exp = emax + 2 // cause overflow below + } + + if r.exp > emax+1 { // overflow if x.neg { return math.Inf(-1), Below } return math.Inf(+1), Above } - // -52 < e < 2047 - - denormal := false - if e < 0 { - denormal = true - e += 52 - } - // 0 < e < 2047 + // dmin+1 <= r.exp <= emax+1 - s := uint64(0) + var s uint64 if r.neg { - s = 1 << 63 + s = 1 << (fbits - 1) } - m := high64(r.mant) >> 11 & (1<<52 - 1) // cut off msb (implicit 1 bit) - z := math.Float64frombits(s | uint64(e)<<52 | m) - if denormal { - // adjust for denormal - // TODO(gri) does this change accuracy? - z /= 1 << 52 + + m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) + + // Rounding may have caused a denormal number to + // become normal. Check again. + c := 1.0 + if r.exp < emin+1 { + // denormal number + r.exp += mbits + c = 1.0 / (1 << mbits) // 2**-mbits } - return z, r.acc + // emin+1 <= r.exp <= emax+1 + e := uint64(r.exp-emin) << mbits + + return c * math.Float64frombits(s|e|m), r.acc case zero: if x.neg { - z := 0.0 + var z float64 return -z, Exact } return 0.0, Exact diff --git a/src/math/big/float_test.go b/src/math/big/float_test.go index 7bfac5d66b..9d101531de 100644 --- a/src/math/big/float_test.go +++ b/src/math/big/float_test.go @@ -537,14 +537,14 @@ func TestFloatRound(t *testing.T) { // TestFloatRound24 tests that rounding a float64 to 24 bits // matches IEEE-754 rounding to nearest when converting a -// float64 to a float32. +// float64 to a float32 (excluding denormal numbers). func TestFloatRound24(t *testing.T) { const x0 = 1<<26 - 0x10 // 11...110000 (26 bits) for d := 0; d <= 0x10; d++ { x := float64(x0 + d) f := new(Float).SetPrec(24).SetFloat64(x) - got, _ := f.Float64() - want := float64(float32(x)) + got, _ := f.Float32() + want := float32(x) if got != want { t.Errorf("Round(%g, 24) = %g; want %g", x, got, want) } @@ -837,7 +837,70 @@ func TestFloatInt64(t *testing.T) { } } +func TestFloatFloat32(t *testing.T) { + for _, test := range []struct { + x string + out float32 + acc Accuracy + }{ + {"-Inf", float32(math.Inf(-1)), Exact}, + {"-0x1.ffffff0p2147483646", float32(-math.Inf(+1)), Below}, // overflow in rounding + {"-1e10000", float32(math.Inf(-1)), Below}, // overflow + {"-0x1p128", float32(math.Inf(-1)), Below}, // overflow + {"-0x1.ffffff0p127", float32(-math.Inf(+1)), Below}, // overflow + {"-0x1.fffffe8p127", -math.MaxFloat32, Above}, + {"-0x1.fffffe0p127", -math.MaxFloat32, Exact}, + {"-12345.000000000000000000001", -12345, Above}, + {"-12345.0", -12345, Exact}, + {"-1.000000000000000000001", -1, Above}, + {"-1", -1, Exact}, + {"-0x0.000002p-126", -math.SmallestNonzeroFloat32, Exact}, + {"-0x0.000002p-127", -0, Above}, // underflow + {"-1e-1000", -0, Above}, // underflow + {"0", 0, Exact}, + {"1e-1000", 0, Below}, // underflow + {"0x0.000002p-127", 0, Below}, // underflow + {"0x0.000002p-126", math.SmallestNonzeroFloat32, Exact}, + {"1", 1, Exact}, + {"1.000000000000000000001", 1, Below}, + {"12345.0", 12345, Exact}, + {"12345.000000000000000000001", 12345, Below}, + {"0x1.fffffe0p127", math.MaxFloat32, Exact}, + {"0x1.fffffe8p127", math.MaxFloat32, Below}, + {"0x1.ffffff0p127", float32(math.Inf(+1)), Above}, // overflow + {"0x1p128", float32(math.Inf(+1)), Above}, // overflow + {"1e10000", float32(math.Inf(+1)), Above}, // overflow + {"0x1.ffffff0p2147483646", float32(math.Inf(+1)), Above}, // overflow in rounding + {"+Inf", float32(math.Inf(+1)), Exact}, + } { + // conversion should match strconv where syntax is agreeable + if f, err := strconv.ParseFloat(test.x, 32); err == nil && float32(f) != test.out { + t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out) + } + + x := makeFloat(test.x) + out, acc := x.Float32() + if out != test.out || acc != test.acc { + t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), test.acc) + } + + // test that x.SetFloat64(float64(f)).Float32() == f + var x2 Float + out2, acc2 := x2.SetFloat64(float64(out)).Float32() + if out2 != out || acc2 != Exact { + t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out) + } + } + + // test NaN + x := makeFloat("NaN") + if out, acc := x.Float32(); out == out || acc != Undef { + t.Errorf("NaN: got %g (%s); want NaN (Undef)", out, acc) + } +} + func TestFloatFloat64(t *testing.T) { + const smallestNormalFloat64 = 2.2250738585072014e-308 // 1p-1022 for _, test := range []struct { x string out float64 @@ -849,7 +912,7 @@ func TestFloatFloat64(t *testing.T) { {"-0x1p1024", math.Inf(-1), Below}, // overflow {"-0x1.fffffffffffff8p1023", -math.Inf(+1), Below}, // overflow {"-0x1.fffffffffffff4p1023", -math.MaxFloat64, Above}, - {"-0x1.fffffffffffffp1023", -math.MaxFloat64, Exact}, + {"-0x1.fffffffffffff0p1023", -math.MaxFloat64, Exact}, {"-12345.000000000000000000001", -12345, Above}, {"-12345.0", -12345, Exact}, {"-1.000000000000000000001", -1, Above}, @@ -865,18 +928,39 @@ func TestFloatFloat64(t *testing.T) { {"1.000000000000000000001", 1, Below}, {"12345.0", 12345, Exact}, {"12345.000000000000000000001", 12345, Below}, - {"0x1.fffffffffffffp1023", math.MaxFloat64, Exact}, + {"0x1.fffffffffffff0p1023", math.MaxFloat64, Exact}, {"0x1.fffffffffffff4p1023", math.MaxFloat64, Below}, {"0x1.fffffffffffff8p1023", math.Inf(+1), Above}, // overflow {"0x1p1024", math.Inf(+1), Above}, // overflow {"1e10000", math.Inf(+1), Above}, // overflow {"0x1.fffffffffffff8p2147483646", math.Inf(+1), Above}, // overflow in rounding {"+Inf", math.Inf(+1), Exact}, + + // selected denormalized values that were handled incorrectly in the past + {"0x.fffffffffffffp-1022", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact}, + {"4503599627370495p-1074", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact}, + + // http://www.exploringbinary.com/php-hangs-on-numeric-value-2-2250738585072011e-308/ + {"2.2250738585072011e-308", 2.225073858507201e-308, Below}, + // http://www.exploringbinary.com/java-hangs-when-converting-2-2250738585072012e-308/ + {"2.2250738585072012e-308", 2.2250738585072014e-308, Above}, } { + // conversion should match strconv where syntax is agreeable + if f, err := strconv.ParseFloat(test.x, 64); err == nil && f != test.out { + t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out) + } + x := makeFloat(test.x) out, acc := x.Float64() if out != test.out || acc != test.acc { - t.Errorf("%s: got %g (%s); want %g (%s)", test.x, out, acc, test.out, test.acc) + t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), test.acc) + } + + // test that x.SetFloat64(f).Float64() == f + var x2 Float + out2, acc2 := x2.SetFloat64(out).Float64() + if out2 != out || acc2 != Exact { + t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out) } } @@ -1108,7 +1192,8 @@ func TestFloatAdd(t *testing.T) { } // TestFloatAdd32 tests that Float.Add/Sub of numbers with -// 24bit mantissa behaves like float32 addition/subtraction. +// 24bit mantissa behaves like float32 addition/subtraction +// (excluding denormal numbers). func TestFloatAdd32(t *testing.T) { // chose base such that we cross the mantissa precision limit const base = 1<<26 - 0x10 // 11...110000 (26 bits) @@ -1124,15 +1209,15 @@ func TestFloatAdd32(t *testing.T) { z := new(Float).SetPrec(24) z.Add(x, y) - got, acc := z.Float64() - want := float64(float32(y0) + float32(x0)) + got, acc := z.Float32() + want := float32(y0) + float32(x0) if got != want || acc != Exact { t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want) } z.Sub(z, y) - got, acc = z.Float64() - want = float64(float32(want) - float32(y0)) + got, acc = z.Float32() + want = float32(want) - float32(y0) if got != want || acc != Exact { t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want) } |
