aboutsummaryrefslogtreecommitdiff
path: root/src/image
diff options
context:
space:
mode:
authorCherry Mui <cherryyz@google.com>2025-09-25 13:33:58 -0400
committerCherry Mui <cherryyz@google.com>2025-09-25 13:33:59 -0400
commita693ae1e9aebac896f6634583dbdd1cd319f3983 (patch)
tree44ef04e84afe5ef8652222c5500ab6c779d09650 /src/image
parent5a78e1a4a1c79185e86b5c18efffba2a9b9d3739 (diff)
parentd70ad4e740e24b4b76961c4b56d698fa23668aa2 (diff)
downloadgo-a693ae1e9aebac896f6634583dbdd1cd319f3983.tar.xz
[dev.simd] all: merge master (d70ad4e) into dev.simd
Conflicts: - src/cmd/compile/internal/types2/stdlib_test.go - src/go/types/stdlib_test.go Merge List: + 2025-09-25 d70ad4e740 sync/atomic: correct Uintptr.Or return doc + 2025-09-25 d7abfe4f0d runtime: acquire/release C TSAN lock when calling cgo symbolizer/tracebacker + 2025-09-25 393d91aea0 cmd/fix: remove all functionality + 2025-09-25 6dceff8bad cmd/link: handle -w flag in external linking mode + 2025-09-25 76d088eb74 cmd/internal/obj/riscv: remove ACFLWSP/ACFSWSP and ACFLW/ACFSW + 2025-09-25 5225e9dc49 doc/next: document new image/jpeg DCT in release notes + 2025-09-25 81a83bba21 cmd: update x/tools@4df13e3 + 2025-09-25 6b32c613ca go/types: make typeset return an iterator + 2025-09-25 fbba930271 image/jpeg: replace fdct.go and idct.go with new implementation in dct.go + 2025-09-25 92e093467f image/jpeg: correct and test reference slowFDCT and slowIDCT + 2025-09-25 27c7bbc51c image/jpeg: prepare for new FDCT/IDCT implementations + 2025-09-24 f15cd63ec4 cmd/compile: don't rely on loop info when there are irreducible loops + 2025-09-24 371c1d2fcb cmd/internal/obj/riscv: add support for vector unit-stride fault-only-first load instructions + 2025-09-23 411c250d64 runtime: add specialized malloc functions for sizes up to 512 bytes + 2025-09-23 d7a38adf4c runtime: eliminate global span queue [green tea] + 2025-09-23 7bc1935db5 cmd/compile/internal: support new(expr) + 2025-09-23 eb78f13c9f doc/go_spec.html: document new(expr) + 2025-09-23 74cc463f9e go/token: add TestRemovedFileFileReturnsNil test + 2025-09-23 902dc27ae9 go/token: clear cache after grabbing the mutex in RemoveFile + 2025-09-23 a13d085a5b cmd/cgo: don't hardcode section name in TestNumberOfExportedFunctions + 2025-09-23 61bf26a9ee cmd/link: fix Macho-O X86_64_RELOC_SUBTRACTOR in internal linking + 2025-09-23 4b787c8c2b reflect: remove stale comment in unpackEface + 2025-09-23 3df27cd21a cmd/compile: fix typo in comment + 2025-09-23 684e8d3363 reflect: allocate memory in TypeAssert[I] only when the assertion succeeds + 2025-09-23 a5866ebe40 cmd/compile: prevent shapifying of pointer shape type + 2025-09-23 a27261c42f go/types,types2: allow new(expr) + 2025-09-23 e93f439ac4 runtime/cgo: retry when CreateThread fails with ERROR_ACCESS_DENIED + 2025-09-23 69e74b0aac runtime: deduplicate pMask resize code + 2025-09-23 fde10c4ce7 runtime: split gcMarkWorkAvailable into two separate conditions + 2025-09-23 5d040df092 runtime: use scan kernels in scanSpan [green tea] + 2025-09-23 7e0251bf58 runtime: don't report non-blocked goroutines as "(durable)" in stacks + 2025-09-23 22ac328856 cmd/link: make -w behavior consistent on Windows Change-Id: Id76b5a30a3b6f6669437f97e3320c9bca65a1e96
Diffstat (limited to 'src/image')
-rw-r--r--src/image/decode_example_test.go32
-rw-r--r--src/image/jpeg/dct.go521
-rw-r--r--src/image/jpeg/dct_test.go285
-rw-r--r--src/image/jpeg/fdct.go192
-rw-r--r--src/image/jpeg/idct.go194
-rw-r--r--src/image/jpeg/writer_test.go4
6 files changed, 762 insertions, 466 deletions
diff --git a/src/image/decode_example_test.go b/src/image/decode_example_test.go
index 526c03f3c1..252ac868c0 100644
--- a/src/image/decode_example_test.go
+++ b/src/image/decode_example_test.go
@@ -70,22 +70,22 @@ func Example() {
}
// Output:
// bin red green blue alpha
- // 0x0000-0x0fff: 364 790 7242 0
- // 0x1000-0x1fff: 645 2967 1039 0
- // 0x2000-0x2fff: 1072 2299 979 0
- // 0x3000-0x3fff: 820 2266 980 0
- // 0x4000-0x4fff: 537 1305 541 0
- // 0x5000-0x5fff: 319 962 261 0
- // 0x6000-0x6fff: 322 375 177 0
- // 0x7000-0x7fff: 601 279 214 0
- // 0x8000-0x8fff: 3478 227 273 0
- // 0x9000-0x9fff: 2260 234 329 0
- // 0xa000-0xafff: 921 282 373 0
- // 0xb000-0xbfff: 321 335 397 0
- // 0xc000-0xcfff: 229 388 298 0
- // 0xd000-0xdfff: 260 414 277 0
- // 0xe000-0xefff: 516 428 298 0
- // 0xf000-0xffff: 2785 1899 1772 15450
+ // 0x0000-0x0fff: 362 793 7245 0
+ // 0x1000-0x1fff: 648 2963 1036 0
+ // 0x2000-0x2fff: 1072 2301 977 0
+ // 0x3000-0x3fff: 819 2266 982 0
+ // 0x4000-0x4fff: 537 1303 541 0
+ // 0x5000-0x5fff: 321 964 261 0
+ // 0x6000-0x6fff: 321 375 177 0
+ // 0x7000-0x7fff: 599 278 213 0
+ // 0x8000-0x8fff: 3478 228 275 0
+ // 0x9000-0x9fff: 2260 233 328 0
+ // 0xa000-0xafff: 921 282 374 0
+ // 0xb000-0xbfff: 322 335 395 0
+ // 0xc000-0xcfff: 228 388 299 0
+ // 0xd000-0xdfff: 261 415 277 0
+ // 0xe000-0xefff: 516 423 297 0
+ // 0xf000-0xffff: 2785 1903 1773 15450
}
const data = `
diff --git a/src/image/jpeg/dct.go b/src/image/jpeg/dct.go
new file mode 100644
index 0000000000..b2bf27d826
--- /dev/null
+++ b/src/image/jpeg/dct.go
@@ -0,0 +1,521 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+// Discrete Cosine Transformation (DCT) implementations using the algorithm from
+// Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz,
+// “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989.
+// https://ieeexplore.ieee.org/document/266596
+//
+// Since the paper is paywalled, the rest of this comment gives a summary.
+//
+// A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7
+// and transforms them in place into the result values.
+//
+// The mathematical definition of the N-point 1D FDCT is:
+//
+// X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N
+//
+// where α₀ = √2 and α_k = 1 for k > 0.
+//
+// For our purposes, N=8, so the angles end up being multiples of π/16.
+// The most direct implementation of this definition would require 64 multiplications.
+//
+// Loeffler's paper presents a more efficient computation that requires only
+// 11 multiplications and works in terms of three basic operations:
+//
+// - A “butterfly” x0, x1 = x0+x1, x0-x1.
+// The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2.
+//
+// - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k.
+//
+// - A rotation of x0, x1 by θ, defined as:
+// x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ.
+// The inverse is rotation by -θ.
+//
+// The algorithm proceeds in four stages:
+//
+// Stage 1:
+// - butterfly x0, x7; x1, x6; x2, x5; x3, x4.
+//
+// Stage 2:
+// - butterfly x0, x3; x1, x2
+// - rotate x4, x7 by 3π/16
+// - rotate x5, x6 by π/16.
+//
+// Stage 3:
+// - butterfly x0, x1; x4, x6; x7, x5
+// - rotate x2, x3 by 6π/16 and scale by √2.
+//
+// Stage 4:
+// - butterfly x7, x4
+// - scale x5, x6 by √2.
+//
+// Finally, the values are permuted. The permutation can be read as either:
+// - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form)
+// - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS)
+// The code below uses the second form to make it easier to merge adjacent stores.
+// (Note that unlike in recursive FFT implementations, the permutation here is
+// not always mapping indexes to their bit reversals.)
+//
+// As written above, the rotation requires four multiplications, but it can be
+// reduced to three by refactoring (see [dctBox] below), and the scaling in
+// stage 3 can be merged into the rotation constants, so the overall cost
+// of a 1D FDCT is 11 multiplies.
+//
+// The 1D inverse DCT (IDCT) is the 1D FDCT run backward
+// with all the basic operations inverted.
+
+// dctBox implements a 3-multiply, 3-add rotation+scaling.
+// Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the
+// rotated and scaled coordinates.
+// (It is called dctBox because the rotate+scale operation
+// is drawn as a box in Figures 1 and 2 in the paper.)
+func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) {
+ // y0 = x0*kcos + x1*ksin
+ // y1 = -x0*ksin + x1*kcos
+ ksum := kcos * (x0 + x1)
+ y0 = ksum + (ksin-kcos)*x1
+ y1 = ksum - (kcos+ksin)*x0
+ return y0, y1
+}
+
+// A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT).
+// The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16,
+// but it is convenient to use int32s for intermediate storage,
+// so we define only a single block type of [8*8]int32.
+//
+// A 2D DCT is implemented as 1D DCTs over the rows and columns.
+//
+// dct_test.go defines a String method for nice printing in tests.
+type block [blockSize]int32
+
+const blockSize = 8 * 8
+
+// Note on Numerical Precision
+//
+// The inputs to both the FDCT and IDCT are uint8 values stored in a block,
+// and the outputs are int16s in the same block, but the overall operation
+// uses int32 values as fixed-point intermediate values.
+// In the code comments below, the notation “QN.M” refers to a
+// signed value of 1+N+M significant bits, one of which is the sign bit,
+// and M of which hold fractional (sub-integer) precision.
+// For example, 255 as a Q8.0 value is stored as int32(255),
+// while 255 as a Q8.1 value is stored as int32(510),
+// and 255.5 as a Q8.1 value is int32(511).
+// The notation UQN.M refers to an unsigned value of N+M significant bits.
+// See https://en.wikipedia.org/wiki/Q_(number_format) for more.
+//
+// In general we only need to keep about 16 significant bits, but it is more
+// efficient and somewhat more precise to let unnecessary fractional bits
+// accumulate and shift them away in bulk rather than after every operation.
+// As such, it is important to keep track of the number of fractional bits
+// in each variable at different points in the code, to avoid mistakes like
+// adding numbers with different fractional precisions, as well as to keep
+// track of the total number of bits, to avoid overflow. A comment like:
+//
+// // x[123] now Q8.2.
+//
+// means that x1, x2, and x3 are all Q8.2 (11-bit) values.
+// Keeping extra precision bits also reduces the size of the errors introduced
+// by using right shift to approximate rounded division.
+
+// Constants needed for the implementation.
+// These are all 60-bit precision fixed-point constants.
+// The function c(val, b) rounds the constant to b bits.
+// c is simple enough that calls to it with constant args
+// are inlined and constant-propagated down to an inline constant.
+// Each constant is commented with its Ivy definition (see robpike.io/ivy),
+// using this scaling helper function:
+//
+// op fix x = floor 0.5 + x * 2**60
+const (
+ cos1 = 1130768441178740757 // fix cos 1*pi/16
+ sin1 = 224923827593068887 // fix sin 1*pi/16
+ cos3 = 958619196450722178 // fix cos 3*pi/16
+ sin3 = 640528868967736374 // fix sin 3*pi/16
+ sqrt2 = 1630477228166597777 // fix sqrt 2
+ sqrt2_cos6 = 623956622067911264 // fix (sqrt 2)*cos 6*pi/16
+ sqrt2_sin6 = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16
+ sqrt2inv = 815238614083298888 // fix 1/sqrt 2
+ sqrt2inv_cos6 = 311978311033955632 // fix (1/sqrt 2)*cos 6*pi/16
+ sqrt2inv_sin6 = 753182269664427492 // fix (1/sqrt 2)*sin 6*pi/16
+)
+
+func c(x uint64, bits int) int32 {
+ return int32((x + (1 << (59 - bits))) >> (60 - bits))
+}
+
+// fdct implements the forward DCT.
+// Inputs are UQ8.0; outputs are Q13.0.
+func fdct(b *block) {
+ fdctCols(b)
+ fdctRows(b)
+}
+
+// fdctCols applies the 1D DCT to the columns of b.
+// Inputs are UQ8.0 in [0,255] but interpreted as [-128,127].
+// Outputs are Q10.18.
+func fdctCols(b *block) {
+ for i := range 8 {
+ x0 := b[0*8+i]
+ x1 := b[1*8+i]
+ x2 := b[2*8+i]
+ x3 := b[3*8+i]
+ x4 := b[4*8+i]
+ x5 := b[5*8+i]
+ x6 := b[6*8+i]
+ x7 := b[7*8+i]
+
+ // x[01234567] are UQ8.0 in [0,255].
+
+ // Stage 1: four butterflies.
+ // In general a butterfly of QN.M inputs produces Q(N+1).M outputs.
+ // A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference.
+
+ x0, x7 = x0+x7, x0-x7
+ x1, x6 = x1+x6, x1-x6
+ x2, x5 = x2+x5, x2-x5
+ x3, x4 = x3+x4, x3-x4
+ // x[0123] now UQ9.0 in [0, 510].
+ // x[4567] now Q8.0 in [-255,255].
+
+ // Stage 2: two boxes and two butterflies.
+ // A box on QN.M inputs with B-bit constants
+ // produces Q(N+1).(M+B) outputs.
+ // (The +1 is from the addition.)
+
+ x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18))
+ x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18))
+ // x[47] now Q9.18 in [-354, 354].
+ // x[56] now Q9.18 in [-300, 300].
+
+ x0, x3 = x0+x3, x0-x3
+ x1, x2 = x1+x2, x1-x2
+ // x[01] now UQ10.0 in [0, 1020].
+ // x[23] now Q9.0 in [-510, 510].
+
+ // Stage 3: one box and three butterflies.
+
+ x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18))
+ // x[23] now Q10.18 in [-943, 943].
+
+ x0, x1 = x0+x1, x0-x1
+ // x0 now UQ11.0 in [0, 2040].
+ // x1 now Q10.0 in [-1020, 1020].
+
+ // Store x0, x1, x2, x3 to their permuted targets.
+ // The original +128 in every input value
+ // has cancelled out except in the “DC signal” x0.
+ // Subtracting 128*8 here is equivalent to subtracting 128
+ // from every input before we started, but cheaper.
+ // It also converts x0 from UQ11.18 to Q10.18.
+ b[0*8+i] = (x0 - 128*8) << 18
+ b[4*8+i] = x1 << 18
+ b[2*8+i] = x2
+ b[6*8+i] = x3
+
+ x4, x6 = x4+x6, x4-x6
+ x7, x5 = x7+x5, x7-x5
+ // x[4567] now Q10.18 in [-654, 654].
+
+ // Stage 4: two √2 scalings and one butterfly.
+
+ x5 = (x5 >> 12) * c(sqrt2, 12)
+ x6 = (x6 >> 12) * c(sqrt2, 12)
+ // x[56] still Q10.18 in [-925, 925] (= 654√2).
+ x7, x4 = x7+x4, x7-x4
+ // x[47] still Q10.18 in [-925, 925] (not Q11.18!).
+ // This is not obvious at all! See “Note on 925” below.
+
+ // Store x4 x5 x6 x7 to their permuted targets.
+ b[1*8+i] = x7
+ b[3*8+i] = x5
+ b[5*8+i] = x6
+ b[7*8+i] = x4
+ }
+}
+
+// fdctRows applies the 1D DCT to the rows of b.
+// Inputs are Q10.18; outputs are Q13.0.
+func fdctRows(b *block) {
+ for i := range 8 {
+ x := b[8*i : 8*i+8 : 8*i+8]
+ x0 := x[0]
+ x1 := x[1]
+ x2 := x[2]
+ x3 := x[3]
+ x4 := x[4]
+ x5 := x[5]
+ x6 := x[6]
+ x7 := x[7]
+
+ // x[01234567] are Q10.18 [-1020, 1020].
+
+ // Stage 1: four butterflies.
+
+ x0, x7 = x0+x7, x0-x7
+ x1, x6 = x1+x6, x1-x6
+ x2, x5 = x2+x5, x2-x5
+ x3, x4 = x3+x4, x3-x4
+ // x[01234567] now Q11.18 in [-2040, 2040].
+
+ // Stage 2: two boxes and two butterflies.
+
+ x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14))
+ x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14))
+ // x[47] now Q12.18 in [-2830, 2830].
+ // x[56] now Q12.18 in [-2400, 2400].
+ x0, x3 = x0+x3, x0-x3
+ x1, x2 = x1+x2, x1-x2
+ // x[01234567] now Q12.18 in [-4080, 4080].
+
+ // Stage 3: one box and three butterflies.
+
+ x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14))
+ // x[23] now Q13.18 in [-7539, 7539].
+ x0, x1 = x0+x1, x0-x1
+ // x[01] now Q13.18 in [-8160, 8160].
+ x4, x6 = x4+x6, x4-x6
+ x7, x5 = x7+x5, x7-x5
+ // x[4567] now Q13.18 in [-5230, 5230].
+
+ // Stage 4: two √2 scalings and one butterfly.
+
+ x5 = (x5 >> 14) * c(sqrt2, 14)
+ x6 = (x6 >> 14) * c(sqrt2, 14)
+ // x[56] still Q13.18 in [-7397, 7397] (= 5230√2).
+ x7, x4 = x7+x4, x7-x4
+ // x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246).
+ // See “Note on 925” below.
+
+ // Cut from Q13.18 to Q13.0.
+ x0 = (x0 + 1<<17) >> 18
+ x1 = (x1 + 1<<17) >> 18
+ x2 = (x2 + 1<<17) >> 18
+ x3 = (x3 + 1<<17) >> 18
+ x4 = (x4 + 1<<17) >> 18
+ x5 = (x5 + 1<<17) >> 18
+ x6 = (x6 + 1<<17) >> 18
+ x7 = (x7 + 1<<17) >> 18
+
+ // Note: Unlike in fdctCols, saved all stores for the end
+ // because they are adjacent memory locations and some systems
+ // can use multiword stores.
+ x[0] = x0
+ x[1] = x7
+ x[2] = x2
+ x[3] = x5
+ x[4] = x1
+ x[5] = x6
+ x[6] = x3
+ x[7] = x4
+ }
+}
+
+// “Note on 925”, deferred from above to avoid interrupting code.
+//
+// In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255].
+// Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve:
+//
+// Stage 2:
+// x4 = b4*cos3 + b7*sin3
+// x7 = -b4*sin3 + b7*cos3
+// x5 = b5*cos1 + b6*sin1
+// x6 = -b5*sin1 + b6*cos1
+//
+// Stage 3:
+//
+// x4 = x4+x6 = b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
+// x6 = x4-x6 = b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1
+// x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1
+// x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1
+//
+// Stage 4:
+//
+// x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1
+// = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3)
+// < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925.
+//
+// x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1
+// = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3)
+// < same 925.
+//
+// The fact that x5, x6 are also at most 925 is not a coincidence: we are computing
+// the same kinds of numbers for all four, just with different paths to them.
+//
+// In fdctRows, the same analysis applies, but the initial values are
+// in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395.
+
+// idct implements the inverse DCT.
+// Inputs are UQ8.0; outputs are Q10.3.
+func idct(b *block) {
+ // A 2D IDCT is a 1D IDCT on rows followed by columns.
+ idctRows(b)
+ idctCols(b)
+}
+
+// idctRows applies the 1D IDCT to the rows of b.
+// Inputs are UQ8.0; outputs are Q9.20.
+func idctRows(b *block) {
+ for i := range 8 {
+ x := b[8*i : 8*i+8 : 8*i+8]
+ x0 := x[0]
+ x7 := x[1]
+ x2 := x[2]
+ x5 := x[3]
+ x1 := x[4]
+ x6 := x[5]
+ x3 := x[6]
+ x4 := x[7]
+
+ // Run FDCT backward.
+ // Independent operations have been reordered somewhat
+ // to make precision tracking easier.
+ //
+ // Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly
+ // and carries with it an implicit divide by two: the extra bit
+ // is added to the precision, not the value size.
+
+ // x[01234567] are UQ8.0 in [0, 255].
+
+ // Stages 4, 3, 2: x0, x1, x2, x3.
+
+ x0 <<= 17
+ x1 <<= 17
+ // x0, x1 now UQ8.17.
+ x0, x1 = x0+x1, x0-x1
+ // x0 now UQ8.18 in [0, 255].
+ // x1 now Q7.18 in [-127½, 127½].
+
+ // Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit.
+ x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18))
+ // x[23] now Q8.18 in [-236, 236].
+ x1, x2 = x1+x2, x1-x2
+ x0, x3 = x0+x3, x0-x3
+ // x[0123] now Q8.19 in [-246, 246].
+
+ // Stages 4, 3, 2: x4, x5, x6, x7.
+
+ x4 <<= 7
+ x7 <<= 7
+ // x[47] now UQ8.7
+ x7, x4 = x7+x4, x7-x4
+ // x7 now UQ8.8 in [0, 255].
+ // x4 now Q7.8 in [-127½, 127½].
+
+ x6 = x6 * c(sqrt2inv, 8)
+ x5 = x5 * c(sqrt2inv, 8)
+ // x[56] now UQ8.8 in [0, 181].
+ // Note that 1/√2 has five 0s in its binary representation after
+ // the 8th bit, so this multipliy is actually producing 12 bits of precision.
+
+ x7, x5 = x7+x5, x7-x5
+ x4, x6 = x4+x6, x4-x6
+ // x[4567] now Q8.9 in [-218, 218].
+
+ x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12))
+ x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12))
+ // x[4567] now Q9.19 in [-303, 303].
+
+ // Stage 1.
+
+ x0, x7 = x0+x7, x0-x7
+ x1, x6 = x1+x6, x1-x6
+ x2, x5 = x2+x5, x2-x5
+ x3, x4 = x3+x4, x3-x4
+ // x[01234567] now Q9.20 in [-275, 275].
+
+ // Note: we don't need all 20 bits of “precision”,
+ // but it is faster to let idctCols shift it away as part
+ // of other operations rather than downshift here.
+
+ x[0] = x0
+ x[1] = x1
+ x[2] = x2
+ x[3] = x3
+ x[4] = x4
+ x[5] = x5
+ x[6] = x6
+ x[7] = x7
+ }
+}
+
+// idctCols applies the 1D IDCT to the columns of b.
+// Inputs are Q9.20.
+// Outputs are Q10.3. That is, the result is the IDCT*8.
+func idctCols(b *block) {
+ for i := range 8 {
+ x0 := b[0*8+i]
+ x7 := b[1*8+i]
+ x2 := b[2*8+i]
+ x5 := b[3*8+i]
+ x1 := b[4*8+i]
+ x6 := b[5*8+i]
+ x3 := b[6*8+i]
+ x4 := b[7*8+i]
+
+ // x[012345678] are Q9.20.
+
+ // Start by adding 0.5 to x0 (the incoming DC signal).
+ // The butterflies will add it to all the other values,
+ // and then the final shifts will round properly.
+ x0 += 1 << 19
+
+ // Stages 4, 3, 2: x0, x1, x2, x3.
+
+ x0, x1 = (x0+x1)>>2, (x0-x1)>>2
+ // x[01] now Q9.19.
+ // Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit.
+ x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12))
+ // x[0123] now Q9.19.
+
+ x1, x2 = x1+x2, x1-x2
+ x0, x3 = x0+x3, x0-x3
+ // x[0123] now Q9.20.
+
+ // Stages 4, 3, 2: x4, x5, x6, x7.
+
+ x7, x4 = x7+x4, x7-x4
+ // x[47] now Q9.21.
+
+ x5 = (x5 >> 13) * c(sqrt2inv, 14)
+ x6 = (x6 >> 13) * c(sqrt2inv, 14)
+ // x[56] now Q9.21.
+
+ x7, x5 = x7+x5, x7-x5
+ x4, x6 = x4+x6, x4-x6
+ // x[4567] now Q9.22.
+
+ x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12))
+ x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12))
+ // x[4567] now Q10.20.
+
+ x0, x7 = x0+x7, x0-x7
+ x1, x6 = x1+x6, x1-x6
+ x2, x5 = x2+x5, x2-x5
+ x3, x4 = x3+x4, x3-x4
+ // x[01234567] now Q10.21.
+
+ x0 >>= 18
+ x1 >>= 18
+ x2 >>= 18
+ x3 >>= 18
+ x4 >>= 18
+ x5 >>= 18
+ x6 >>= 18
+ x7 >>= 18
+ // x[01234567] now Q10.3.
+
+ b[0*8+i] = x0
+ b[1*8+i] = x1
+ b[2*8+i] = x2
+ b[3*8+i] = x3
+ b[4*8+i] = x4
+ b[5*8+i] = x5
+ b[6*8+i] = x6
+ b[7*8+i] = x7
+ }
+}
diff --git a/src/image/jpeg/dct_test.go b/src/image/jpeg/dct_test.go
index 11819db86e..0375a7113d 100644
--- a/src/image/jpeg/dct_test.go
+++ b/src/image/jpeg/dct_test.go
@@ -7,20 +7,18 @@ package jpeg
import (
"fmt"
"math"
+ "math/big"
"math/rand"
"strings"
"testing"
)
func benchmarkDCT(b *testing.B, f func(*block)) {
- b.StopTimer()
- blocks := make([]block, 0, b.N*len(testBlocks))
- for i := 0; i < b.N; i++ {
- blocks = append(blocks, testBlocks[:]...)
- }
- b.StartTimer()
- for i := range blocks {
- f(&blocks[i])
+ var blk block // avoid potential allocation in loop
+ for b.Loop() {
+ for _, blk = range testBlocks {
+ f(&blk)
+ }
}
}
@@ -32,11 +30,37 @@ func BenchmarkIDCT(b *testing.B) {
benchmarkDCT(b, idct)
}
+const testSlowVsBig = true
+
func TestDCT(t *testing.T) {
blocks := make([]block, len(testBlocks))
copy(blocks, testBlocks[:])
- // Append some randomly generated blocks of varying sparseness.
+ // All zeros
+ blocks = append(blocks, block{})
+
+ // Every possible unit impulse.
+ for i := range blockSize {
+ var b block
+ b[i] = 255
+ blocks = append(blocks, b)
+ }
+
+ // All ones.
+ var ones block
+ for i := range ones {
+ ones[i] = 255
+ }
+ blocks = append(blocks, ones)
+
+ // Every possible inverted unit impulse.
+ for i := range blockSize {
+ ones[i] = 0
+ blocks = append(blocks, ones)
+ ones[i] = 255
+ }
+
+ // Some randomly generated blocks of varying sparseness.
r := rand.New(rand.NewSource(123))
for i := 0; i < 100; i++ {
b := block{}
@@ -47,61 +71,84 @@ func TestDCT(t *testing.T) {
blocks = append(blocks, b)
}
- // Check that the FDCT and IDCT functions are inverses, after a scale and
- // level shift. Scaling reduces the rounding errors in the conversion from
- // floats to ints.
- for i, b := range blocks {
- got, want := b, b
- for j := range got {
- got[j] = (got[j] - 128) * 8
- }
- slowFDCT(&got)
- slowIDCT(&got)
- for j := range got {
- got[j] = got[j]/8 + 128
- }
- if differ(&got, &want) {
- t.Errorf("i=%d: IDCT(FDCT)\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+ // Check that the slow FDCT and IDCT functions are inverses,
+ // after a scale and level shift.
+ // Scaling reduces the rounding errors in the conversion.
+ // The “fast” ones are not inverses because the fast IDCT
+ // is optimized for 8-bit inputs, not full 16-bit ones.
+ slowRoundTrip := func(b *block) {
+ slowFDCT(b)
+ slowIDCT(b)
+ for j := range b {
+ b[j] = b[j]/8 + 128
}
}
+ nop := func(*block) {}
+ testDCT(t, "IDCT(FDCT)", blocks, slowRoundTrip, nop, 1, 8)
- // Check that the optimized and slow FDCT implementations agree.
- // The fdct function already does a scale and level shift.
- for i, b := range blocks {
- got, want := b, b
- fdct(&got)
- for j := range want {
- want[j] = (want[j] - 128) * 8
- }
- slowFDCT(&want)
- if differ(&got, &want) {
- t.Errorf("i=%d: FDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
- }
+ if testSlowVsBig {
+ testDCT(t, "slowFDCT", blocks, slowFDCT, slowerFDCT, 0, 64)
+ testDCT(t, "slowIDCT", blocks, slowIDCT, slowerIDCT, 0, 64)
}
- // Check that the optimized and slow IDCT implementations agree.
- for i, b := range blocks {
- got, want := b, b
- idct(&got)
- slowIDCT(&want)
- if differ(&got, &want) {
- t.Errorf("i=%d: IDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want)
+ // Check that the optimized and slow FDCT implementations agree.
+ testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 8)
+ testDCT(t, "IDCT", blocks, idct, slowIDCT, 1, 8)
+}
+
+func testDCT(t *testing.T, name string, blocks []block, fhave, fwant func(*block), tolerance int32, maxCloseCalls int) {
+ t.Run(name, func(t *testing.T) {
+ totalClose := 0
+ for i, b := range blocks {
+ have, want := b, b
+ fhave(&have)
+ fwant(&want)
+ d, n := differ(&have, &want, tolerance)
+ if d >= 0 || n > maxCloseCalls {
+ fail := ""
+ if d >= 0 {
+ fail = fmt.Sprintf("diff at %d,%d", d/8, d%8)
+ }
+ if n > maxCloseCalls {
+ if fail != "" {
+ fail += "; "
+ }
+ fail += fmt.Sprintf("%d close calls", n)
+ }
+ t.Errorf("i=%d: %s (%s)\nsrc\n%s\nhave\n%s\nwant\n%s\n",
+ i, name, fail, &b, &have, &want)
+ }
+ totalClose += n
}
- }
+ if tolerance > 0 {
+ t.Logf("%d/%d total close calls", totalClose, len(blocks)*blockSize)
+ }
+ })
}
-// differ reports whether any pair-wise elements in b0 and b1 differ by 2 or
-// more. That tolerance is because there isn't a single definitive decoding of
-// a given JPEG image, even before the YCbCr to RGB conversion; implementations
+// differ returns the index of the first pair-wise elements in b0 and b1
+// that differ by more than 'ok', along with the total number of elements
+// that differ by at least ok ("close calls").
+//
+// There isn't a single definitive decoding of a given JPEG image,
+// even before the YCbCr to RGB conversion; implementations
// can have different IDCT rounding errors.
-func differ(b0, b1 *block) bool {
+//
+// If there are no differences, differ returns -1, 0.
+func differ(b0, b1 *block, ok int32) (index, closeCalls int) {
+ index = -1
for i := range b0 {
delta := b0[i] - b1[i]
- if delta < -2 || +2 < delta {
- return true
+ if delta < -ok || ok < delta {
+ if index < 0 {
+ index = i
+ }
+ }
+ if delta <= -ok || ok <= delta {
+ closeCalls++
}
}
- return false
+ return
}
// alpha returns 1 if i is 0 and returns √2 otherwise.
@@ -112,6 +159,14 @@ func alpha(i int) float64 {
return math.Sqrt2
}
+// bigAlpha returns 1 if i is 0 and returns √2 otherwise.
+func bigAlpha(i int) *big.Float {
+ if i == 0 {
+ return bigFloat1
+ }
+ return bigFloatSqrt2
+}
+
var cosines = [32]float64{
+1.0000000000000000000000000000000000000000000000000000000000000000, // cos(π/16 * 0)
+0.9807852804032304491261822361342390369739337308933360950029160885, // cos(π/16 * 1)
@@ -150,6 +205,57 @@ var cosines = [32]float64{
+0.9807852804032304491261822361342390369739337308933360950029160885, // cos(π/16 * 31)
}
+func bigFloat(s string) *big.Float {
+ f, ok := new(big.Float).SetString(s)
+ if !ok {
+ panic("bad float")
+ }
+ return f
+}
+
+var (
+ bigFloat1 = big.NewFloat(1)
+ bigFloatSqrt2 = bigFloat("1.41421356237309504880168872420969807856967187537694807317667974")
+)
+
+var bigCosines = [32]*big.Float{
+ bigFloat("+1.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 0)
+ bigFloat("+0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 1)
+ bigFloat("+0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 2)
+ bigFloat("+0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 3)
+ bigFloat("+0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 4)
+ bigFloat("+0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 5)
+ bigFloat("+0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 6)
+ bigFloat("+0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 7)
+
+ bigFloat("-0.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 8)
+ bigFloat("-0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 9)
+ bigFloat("-0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 10)
+ bigFloat("-0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 11)
+ bigFloat("-0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 12)
+ bigFloat("-0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 13)
+ bigFloat("-0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 14)
+ bigFloat("-0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 15)
+
+ bigFloat("-1.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 16)
+ bigFloat("-0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 17)
+ bigFloat("-0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 18)
+ bigFloat("-0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 19)
+ bigFloat("-0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 20)
+ bigFloat("-0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 21)
+ bigFloat("-0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 22)
+ bigFloat("-0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 23)
+
+ bigFloat("+0.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 24)
+ bigFloat("+0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 25)
+ bigFloat("+0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 26)
+ bigFloat("+0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 27)
+ bigFloat("+0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 28)
+ bigFloat("+0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 29)
+ bigFloat("+0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 30)
+ bigFloat("+0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 31)
+}
+
// slowFDCT performs the 8*8 2-dimensional forward discrete cosine transform:
//
// dst[u,v] = (1/8) * Σ_x Σ_y alpha(u) * alpha(v) * src[x,y] *
@@ -160,24 +266,51 @@ var cosines = [32]float64{
//
// b acts as both dst and src.
func slowFDCT(b *block) {
- var dst [blockSize]float64
+ var dst block
for v := 0; v < 8; v++ {
for u := 0; u < 8; u++ {
sum := 0.0
for y := 0; y < 8; y++ {
for x := 0; x < 8; x++ {
- sum += alpha(u) * alpha(v) * float64(b[8*y+x]) *
+ sum += alpha(u) * alpha(v) * float64(b[8*y+x]-128) *
cosines[((2*x+1)*u)%32] *
cosines[((2*y+1)*v)%32]
}
}
- dst[8*v+u] = sum / 8
+ dst[8*v+u] = int32(math.Round(sum))
}
}
- // Convert from float64 to int32.
- for i := range dst {
- b[i] = int32(dst[i] + 0.5)
+ *b = dst
+}
+
+// slowerFDCT is slowFDCT but using big.Floats to validate slowFDCT.
+func slowerFDCT(b *block) {
+ var dst block
+ for v := 0; v < 8; v++ {
+ for u := 0; u < 8; u++ {
+ sum := big.NewFloat(0)
+ for y := 0; y < 8; y++ {
+ for x := 0; x < 8; x++ {
+ f := big.NewFloat(float64(b[8*y+x] - 128))
+ f = new(big.Float).Mul(f, bigAlpha(u))
+ f = new(big.Float).Mul(f, bigAlpha(v))
+ f = new(big.Float).Mul(f, bigCosines[((2*x+1)*u)%32])
+ f = new(big.Float).Mul(f, bigCosines[((2*y+1)*v)%32])
+ sum = new(big.Float).Add(sum, f)
+ }
+ }
+ // Int64 truncates toward zero, so add ±0.5
+ // as needed to round
+ if sum.Sign() > 0 {
+ sum = new(big.Float).Add(sum, big.NewFloat(+0.5))
+ } else {
+ sum = new(big.Float).Add(sum, big.NewFloat(-0.5))
+ }
+ i, _ := sum.Int64()
+ dst[8*v+u] = int32(i)
+ }
}
+ *b = dst
}
// slowIDCT performs the 8*8 2-dimensional inverse discrete cosine transform:
@@ -190,7 +323,7 @@ func slowFDCT(b *block) {
//
// b acts as both dst and src.
func slowIDCT(b *block) {
- var dst [blockSize]float64
+ var dst block
for y := 0; y < 8; y++ {
for x := 0; x < 8; x++ {
sum := 0.0
@@ -201,13 +334,41 @@ func slowIDCT(b *block) {
cosines[((2*y+1)*v)%32]
}
}
- dst[8*y+x] = sum / 8
+ dst[8*y+x] = int32(math.Round(sum / 8))
}
}
- // Convert from float64 to int32.
- for i := range dst {
- b[i] = int32(dst[i] + 0.5)
+ *b = dst
+}
+
+// slowerIDCT is slowIDCT but using big.Floats to validate slowIDCT.
+func slowerIDCT(b *block) {
+ var dst block
+ for y := 0; y < 8; y++ {
+ for x := 0; x < 8; x++ {
+ sum := big.NewFloat(0)
+ for v := 0; v < 8; v++ {
+ for u := 0; u < 8; u++ {
+ f := big.NewFloat(float64(b[8*v+u]))
+ f = new(big.Float).Mul(f, bigAlpha(u))
+ f = new(big.Float).Mul(f, bigAlpha(v))
+ f = new(big.Float).Mul(f, bigCosines[((2*x+1)*u)%32])
+ f = new(big.Float).Mul(f, bigCosines[((2*y+1)*v)%32])
+ f = new(big.Float).Quo(f, big.NewFloat(8))
+ sum = new(big.Float).Add(sum, f)
+ }
+ }
+ // Int64 truncates toward zero, so add ±0.5
+ // as needed to round
+ if sum.Sign() > 0 {
+ sum = new(big.Float).Add(sum, big.NewFloat(+0.5))
+ } else {
+ sum = new(big.Float).Add(sum, big.NewFloat(-0.5))
+ }
+ i, _ := sum.Int64()
+ dst[8*y+x] = int32(i)
+ }
}
+ *b = dst
}
func (b *block) String() string {
diff --git a/src/image/jpeg/fdct.go b/src/image/jpeg/fdct.go
deleted file mode 100644
index c7a973ec3c..0000000000
--- a/src/image/jpeg/fdct.go
+++ /dev/null
@@ -1,192 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package jpeg
-
-// This file implements a Forward Discrete Cosine Transformation.
-
-/*
-It is based on the code in jfdctint.c from the Independent JPEG Group,
-found at http://www.ijg.org/files/jpegsrc.v8c.tar.gz.
-
-The "LEGAL ISSUES" section of the README in that archive says:
-
-In plain English:
-
-1. We don't promise that this software works. (But if you find any bugs,
- please let us know!)
-2. You can use this software for whatever you want. You don't have to pay us.
-3. You may not pretend that you wrote this software. If you use it in a
- program, you must acknowledge somewhere in your documentation that
- you've used the IJG code.
-
-In legalese:
-
-The authors make NO WARRANTY or representation, either express or implied,
-with respect to this software, its quality, accuracy, merchantability, or
-fitness for a particular purpose. This software is provided "AS IS", and you,
-its user, assume the entire risk as to its quality and accuracy.
-
-This software is copyright (C) 1991-2011, Thomas G. Lane, Guido Vollbeding.
-All Rights Reserved except as specified below.
-
-Permission is hereby granted to use, copy, modify, and distribute this
-software (or portions thereof) for any purpose, without fee, subject to these
-conditions:
-(1) If any part of the source code for this software is distributed, then this
-README file must be included, with this copyright and no-warranty notice
-unaltered; and any additions, deletions, or changes to the original files
-must be clearly indicated in accompanying documentation.
-(2) If only executable code is distributed, then the accompanying
-documentation must state that "this software is based in part on the work of
-the Independent JPEG Group".
-(3) Permission for use of this software is granted only if the user accepts
-full responsibility for any undesirable consequences; the authors accept
-NO LIABILITY for damages of any kind.
-
-These conditions apply to any software derived from or based on the IJG code,
-not just to the unmodified library. If you use our work, you ought to
-acknowledge us.
-
-Permission is NOT granted for the use of any IJG author's name or company name
-in advertising or publicity relating to this software or products derived from
-it. This software may be referred to only as "the Independent JPEG Group's
-software".
-
-We specifically permit and encourage the use of this software as the basis of
-commercial products, provided that all warranty or liability claims are
-assumed by the product vendor.
-*/
-
-// Trigonometric constants in 13-bit fixed point format.
-const (
- fix_0_298631336 = 2446
- fix_0_390180644 = 3196
- fix_0_541196100 = 4433
- fix_0_765366865 = 6270
- fix_0_899976223 = 7373
- fix_1_175875602 = 9633
- fix_1_501321110 = 12299
- fix_1_847759065 = 15137
- fix_1_961570560 = 16069
- fix_2_053119869 = 16819
- fix_2_562915447 = 20995
- fix_3_072711026 = 25172
-)
-
-const (
- constBits = 13
- pass1Bits = 2
- centerJSample = 128
-)
-
-// fdct performs a forward DCT on an 8x8 block of coefficients, including a
-// level shift.
-func fdct(b *block) {
- // Pass 1: process rows.
- for y := 0; y < 8; y++ {
- y8 := y * 8
- s := b[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
- x0 := s[0]
- x1 := s[1]
- x2 := s[2]
- x3 := s[3]
- x4 := s[4]
- x5 := s[5]
- x6 := s[6]
- x7 := s[7]
-
- tmp0 := x0 + x7
- tmp1 := x1 + x6
- tmp2 := x2 + x5
- tmp3 := x3 + x4
-
- tmp10 := tmp0 + tmp3
- tmp12 := tmp0 - tmp3
- tmp11 := tmp1 + tmp2
- tmp13 := tmp1 - tmp2
-
- tmp0 = x0 - x7
- tmp1 = x1 - x6
- tmp2 = x2 - x5
- tmp3 = x3 - x4
-
- s[0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
- s[4] = (tmp10 - tmp11) << pass1Bits
- z1 := (tmp12 + tmp13) * fix_0_541196100
- z1 += 1 << (constBits - pass1Bits - 1)
- s[2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
- s[6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
-
- tmp10 = tmp0 + tmp3
- tmp11 = tmp1 + tmp2
- tmp12 = tmp0 + tmp2
- tmp13 = tmp1 + tmp3
- z1 = (tmp12 + tmp13) * fix_1_175875602
- z1 += 1 << (constBits - pass1Bits - 1)
- tmp0 *= fix_1_501321110
- tmp1 *= fix_3_072711026
- tmp2 *= fix_2_053119869
- tmp3 *= fix_0_298631336
- tmp10 *= -fix_0_899976223
- tmp11 *= -fix_2_562915447
- tmp12 *= -fix_0_390180644
- tmp13 *= -fix_1_961570560
-
- tmp12 += z1
- tmp13 += z1
- s[1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
- s[3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
- s[5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
- s[7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
- }
- // Pass 2: process columns.
- // We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
- for x := 0; x < 8; x++ {
- tmp0 := b[0*8+x] + b[7*8+x]
- tmp1 := b[1*8+x] + b[6*8+x]
- tmp2 := b[2*8+x] + b[5*8+x]
- tmp3 := b[3*8+x] + b[4*8+x]
-
- tmp10 := tmp0 + tmp3 + 1<<(pass1Bits-1)
- tmp12 := tmp0 - tmp3
- tmp11 := tmp1 + tmp2
- tmp13 := tmp1 - tmp2
-
- tmp0 = b[0*8+x] - b[7*8+x]
- tmp1 = b[1*8+x] - b[6*8+x]
- tmp2 = b[2*8+x] - b[5*8+x]
- tmp3 = b[3*8+x] - b[4*8+x]
-
- b[0*8+x] = (tmp10 + tmp11) >> pass1Bits
- b[4*8+x] = (tmp10 - tmp11) >> pass1Bits
-
- z1 := (tmp12 + tmp13) * fix_0_541196100
- z1 += 1 << (constBits + pass1Bits - 1)
- b[2*8+x] = (z1 + tmp12*fix_0_765366865) >> (constBits + pass1Bits)
- b[6*8+x] = (z1 - tmp13*fix_1_847759065) >> (constBits + pass1Bits)
-
- tmp10 = tmp0 + tmp3
- tmp11 = tmp1 + tmp2
- tmp12 = tmp0 + tmp2
- tmp13 = tmp1 + tmp3
- z1 = (tmp12 + tmp13) * fix_1_175875602
- z1 += 1 << (constBits + pass1Bits - 1)
- tmp0 *= fix_1_501321110
- tmp1 *= fix_3_072711026
- tmp2 *= fix_2_053119869
- tmp3 *= fix_0_298631336
- tmp10 *= -fix_0_899976223
- tmp11 *= -fix_2_562915447
- tmp12 *= -fix_0_390180644
- tmp13 *= -fix_1_961570560
-
- tmp12 += z1
- tmp13 += z1
- b[1*8+x] = (tmp0 + tmp10 + tmp12) >> (constBits + pass1Bits)
- b[3*8+x] = (tmp1 + tmp11 + tmp13) >> (constBits + pass1Bits)
- b[5*8+x] = (tmp2 + tmp11 + tmp12) >> (constBits + pass1Bits)
- b[7*8+x] = (tmp3 + tmp10 + tmp13) >> (constBits + pass1Bits)
- }
-}
diff --git a/src/image/jpeg/idct.go b/src/image/jpeg/idct.go
deleted file mode 100644
index a3957c8ada..0000000000
--- a/src/image/jpeg/idct.go
+++ /dev/null
@@ -1,194 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package jpeg
-
-// This is a Go translation of idct.c from
-//
-// http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
-//
-// which carries the following notice:
-
-/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
-
-/*
- * Disclaimer of Warranty
- *
- * These software programs are available to the user without any license fee or
- * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
- * any and all warranties, whether express, implied, or statuary, including any
- * implied warranties or merchantability or of fitness for a particular
- * purpose. In no event shall the copyright-holder be liable for any
- * incidental, punitive, or consequential damages of any kind whatsoever
- * arising from the use of these programs.
- *
- * This disclaimer of warranty extends to the user of these programs and user's
- * customers, employees, agents, transferees, successors, and assigns.
- *
- * The MPEG Software Simulation Group does not represent or warrant that the
- * programs furnished hereunder are free of infringement of any third-party
- * patents.
- *
- * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
- * are subject to royalty fees to patent holders. Many of these patents are
- * general enough such that they are unavoidable regardless of implementation
- * design.
- *
- */
-
-const blockSize = 64 // A DCT block is 8x8.
-
-type block [blockSize]int32
-
-const (
- w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
- w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
- w3 = 2408 // 2048*sqrt(2)*cos(3*pi/16)
- w5 = 1609 // 2048*sqrt(2)*cos(5*pi/16)
- w6 = 1108 // 2048*sqrt(2)*cos(6*pi/16)
- w7 = 565 // 2048*sqrt(2)*cos(7*pi/16)
-
- w1pw7 = w1 + w7
- w1mw7 = w1 - w7
- w2pw6 = w2 + w6
- w2mw6 = w2 - w6
- w3pw5 = w3 + w5
- w3mw5 = w3 - w5
-
- r2 = 181 // 256/sqrt(2)
-)
-
-// idct performs a 2-D Inverse Discrete Cosine Transformation.
-//
-// The input coefficients should already have been multiplied by the
-// appropriate quantization table. We use fixed-point computation, with the
-// number of bits for the fractional component varying over the intermediate
-// stages.
-//
-// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
-// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
-// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
-func idct(src *block) {
- // Horizontal 1-D IDCT.
- for y := 0; y < 8; y++ {
- y8 := y * 8
- s := src[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
- // If all the AC components are zero, then the IDCT is trivial.
- if s[1] == 0 && s[2] == 0 && s[3] == 0 &&
- s[4] == 0 && s[5] == 0 && s[6] == 0 && s[7] == 0 {
- dc := s[0] << 3
- s[0] = dc
- s[1] = dc
- s[2] = dc
- s[3] = dc
- s[4] = dc
- s[5] = dc
- s[6] = dc
- s[7] = dc
- continue
- }
-
- // Prescale.
- x0 := (s[0] << 11) + 128
- x1 := s[4] << 11
- x2 := s[6]
- x3 := s[2]
- x4 := s[1]
- x5 := s[7]
- x6 := s[5]
- x7 := s[3]
-
- // Stage 1.
- x8 := w7 * (x4 + x5)
- x4 = x8 + w1mw7*x4
- x5 = x8 - w1pw7*x5
- x8 = w3 * (x6 + x7)
- x6 = x8 - w3mw5*x6
- x7 = x8 - w3pw5*x7
-
- // Stage 2.
- x8 = x0 + x1
- x0 -= x1
- x1 = w6 * (x3 + x2)
- x2 = x1 - w2pw6*x2
- x3 = x1 + w2mw6*x3
- x1 = x4 + x6
- x4 -= x6
- x6 = x5 + x7
- x5 -= x7
-
- // Stage 3.
- x7 = x8 + x3
- x8 -= x3
- x3 = x0 + x2
- x0 -= x2
- x2 = (r2*(x4+x5) + 128) >> 8
- x4 = (r2*(x4-x5) + 128) >> 8
-
- // Stage 4.
- s[0] = (x7 + x1) >> 8
- s[1] = (x3 + x2) >> 8
- s[2] = (x0 + x4) >> 8
- s[3] = (x8 + x6) >> 8
- s[4] = (x8 - x6) >> 8
- s[5] = (x0 - x4) >> 8
- s[6] = (x3 - x2) >> 8
- s[7] = (x7 - x1) >> 8
- }
-
- // Vertical 1-D IDCT.
- for x := 0; x < 8; x++ {
- // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
- // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
- // we do not bother to check for the all-zero case.
- s := src[x : x+57 : x+57] // Small cap improves performance, see https://golang.org/issue/27857
-
- // Prescale.
- y0 := (s[8*0] << 8) + 8192
- y1 := s[8*4] << 8
- y2 := s[8*6]
- y3 := s[8*2]
- y4 := s[8*1]
- y5 := s[8*7]
- y6 := s[8*5]
- y7 := s[8*3]
-
- // Stage 1.
- y8 := w7*(y4+y5) + 4
- y4 = (y8 + w1mw7*y4) >> 3
- y5 = (y8 - w1pw7*y5) >> 3
- y8 = w3*(y6+y7) + 4
- y6 = (y8 - w3mw5*y6) >> 3
- y7 = (y8 - w3pw5*y7) >> 3
-
- // Stage 2.
- y8 = y0 + y1
- y0 -= y1
- y1 = w6*(y3+y2) + 4
- y2 = (y1 - w2pw6*y2) >> 3
- y3 = (y1 + w2mw6*y3) >> 3
- y1 = y4 + y6
- y4 -= y6
- y6 = y5 + y7
- y5 -= y7
-
- // Stage 3.
- y7 = y8 + y3
- y8 -= y3
- y3 = y0 + y2
- y0 -= y2
- y2 = (r2*(y4+y5) + 128) >> 8
- y4 = (r2*(y4-y5) + 128) >> 8
-
- // Stage 4.
- s[8*0] = (y7 + y1) >> 14
- s[8*1] = (y3 + y2) >> 14
- s[8*2] = (y0 + y4) >> 14
- s[8*3] = (y8 + y6) >> 14
- s[8*4] = (y8 - y6) >> 14
- s[8*5] = (y0 - y4) >> 14
- s[8*6] = (y3 - y2) >> 14
- s[8*7] = (y7 - y1) >> 14
- }
-}
diff --git a/src/image/jpeg/writer_test.go b/src/image/jpeg/writer_test.go
index 341477044a..197b49ec13 100644
--- a/src/image/jpeg/writer_test.go
+++ b/src/image/jpeg/writer_test.go
@@ -154,8 +154,8 @@ func TestWriter(t *testing.T) {
continue
}
// Compare the average delta to the tolerance level.
- if averageDelta(m0, m1) > tc.tolerance {
- t.Errorf("%s, quality=%d: average delta is too high", tc.filename, tc.quality)
+ if d := averageDelta(m0, m1); d > tc.tolerance {
+ t.Errorf("%s, quality=%d: average delta is too high (%d > %d)", tc.filename, tc.quality, d, tc.tolerance)
continue
}
}