diff options
| author | Cherry Mui <cherryyz@google.com> | 2025-09-25 13:33:58 -0400 |
|---|---|---|
| committer | Cherry Mui <cherryyz@google.com> | 2025-09-25 13:33:59 -0400 |
| commit | a693ae1e9aebac896f6634583dbdd1cd319f3983 (patch) | |
| tree | 44ef04e84afe5ef8652222c5500ab6c779d09650 /src/image | |
| parent | 5a78e1a4a1c79185e86b5c18efffba2a9b9d3739 (diff) | |
| parent | d70ad4e740e24b4b76961c4b56d698fa23668aa2 (diff) | |
| download | go-a693ae1e9aebac896f6634583dbdd1cd319f3983.tar.xz | |
[dev.simd] all: merge master (d70ad4e) into dev.simd
Conflicts:
- src/cmd/compile/internal/types2/stdlib_test.go
- src/go/types/stdlib_test.go
Merge List:
+ 2025-09-25 d70ad4e740 sync/atomic: correct Uintptr.Or return doc
+ 2025-09-25 d7abfe4f0d runtime: acquire/release C TSAN lock when calling cgo symbolizer/tracebacker
+ 2025-09-25 393d91aea0 cmd/fix: remove all functionality
+ 2025-09-25 6dceff8bad cmd/link: handle -w flag in external linking mode
+ 2025-09-25 76d088eb74 cmd/internal/obj/riscv: remove ACFLWSP/ACFSWSP and ACFLW/ACFSW
+ 2025-09-25 5225e9dc49 doc/next: document new image/jpeg DCT in release notes
+ 2025-09-25 81a83bba21 cmd: update x/tools@4df13e3
+ 2025-09-25 6b32c613ca go/types: make typeset return an iterator
+ 2025-09-25 fbba930271 image/jpeg: replace fdct.go and idct.go with new implementation in dct.go
+ 2025-09-25 92e093467f image/jpeg: correct and test reference slowFDCT and slowIDCT
+ 2025-09-25 27c7bbc51c image/jpeg: prepare for new FDCT/IDCT implementations
+ 2025-09-24 f15cd63ec4 cmd/compile: don't rely on loop info when there are irreducible loops
+ 2025-09-24 371c1d2fcb cmd/internal/obj/riscv: add support for vector unit-stride fault-only-first load instructions
+ 2025-09-23 411c250d64 runtime: add specialized malloc functions for sizes up to 512 bytes
+ 2025-09-23 d7a38adf4c runtime: eliminate global span queue [green tea]
+ 2025-09-23 7bc1935db5 cmd/compile/internal: support new(expr)
+ 2025-09-23 eb78f13c9f doc/go_spec.html: document new(expr)
+ 2025-09-23 74cc463f9e go/token: add TestRemovedFileFileReturnsNil test
+ 2025-09-23 902dc27ae9 go/token: clear cache after grabbing the mutex in RemoveFile
+ 2025-09-23 a13d085a5b cmd/cgo: don't hardcode section name in TestNumberOfExportedFunctions
+ 2025-09-23 61bf26a9ee cmd/link: fix Macho-O X86_64_RELOC_SUBTRACTOR in internal linking
+ 2025-09-23 4b787c8c2b reflect: remove stale comment in unpackEface
+ 2025-09-23 3df27cd21a cmd/compile: fix typo in comment
+ 2025-09-23 684e8d3363 reflect: allocate memory in TypeAssert[I] only when the assertion succeeds
+ 2025-09-23 a5866ebe40 cmd/compile: prevent shapifying of pointer shape type
+ 2025-09-23 a27261c42f go/types,types2: allow new(expr)
+ 2025-09-23 e93f439ac4 runtime/cgo: retry when CreateThread fails with ERROR_ACCESS_DENIED
+ 2025-09-23 69e74b0aac runtime: deduplicate pMask resize code
+ 2025-09-23 fde10c4ce7 runtime: split gcMarkWorkAvailable into two separate conditions
+ 2025-09-23 5d040df092 runtime: use scan kernels in scanSpan [green tea]
+ 2025-09-23 7e0251bf58 runtime: don't report non-blocked goroutines as "(durable)" in stacks
+ 2025-09-23 22ac328856 cmd/link: make -w behavior consistent on Windows
Change-Id: Id76b5a30a3b6f6669437f97e3320c9bca65a1e96
Diffstat (limited to 'src/image')
| -rw-r--r-- | src/image/decode_example_test.go | 32 | ||||
| -rw-r--r-- | src/image/jpeg/dct.go | 521 | ||||
| -rw-r--r-- | src/image/jpeg/dct_test.go | 285 | ||||
| -rw-r--r-- | src/image/jpeg/fdct.go | 192 | ||||
| -rw-r--r-- | src/image/jpeg/idct.go | 194 | ||||
| -rw-r--r-- | src/image/jpeg/writer_test.go | 4 |
6 files changed, 762 insertions, 466 deletions
diff --git a/src/image/decode_example_test.go b/src/image/decode_example_test.go index 526c03f3c1..252ac868c0 100644 --- a/src/image/decode_example_test.go +++ b/src/image/decode_example_test.go @@ -70,22 +70,22 @@ func Example() { } // Output: // bin red green blue alpha - // 0x0000-0x0fff: 364 790 7242 0 - // 0x1000-0x1fff: 645 2967 1039 0 - // 0x2000-0x2fff: 1072 2299 979 0 - // 0x3000-0x3fff: 820 2266 980 0 - // 0x4000-0x4fff: 537 1305 541 0 - // 0x5000-0x5fff: 319 962 261 0 - // 0x6000-0x6fff: 322 375 177 0 - // 0x7000-0x7fff: 601 279 214 0 - // 0x8000-0x8fff: 3478 227 273 0 - // 0x9000-0x9fff: 2260 234 329 0 - // 0xa000-0xafff: 921 282 373 0 - // 0xb000-0xbfff: 321 335 397 0 - // 0xc000-0xcfff: 229 388 298 0 - // 0xd000-0xdfff: 260 414 277 0 - // 0xe000-0xefff: 516 428 298 0 - // 0xf000-0xffff: 2785 1899 1772 15450 + // 0x0000-0x0fff: 362 793 7245 0 + // 0x1000-0x1fff: 648 2963 1036 0 + // 0x2000-0x2fff: 1072 2301 977 0 + // 0x3000-0x3fff: 819 2266 982 0 + // 0x4000-0x4fff: 537 1303 541 0 + // 0x5000-0x5fff: 321 964 261 0 + // 0x6000-0x6fff: 321 375 177 0 + // 0x7000-0x7fff: 599 278 213 0 + // 0x8000-0x8fff: 3478 228 275 0 + // 0x9000-0x9fff: 2260 233 328 0 + // 0xa000-0xafff: 921 282 374 0 + // 0xb000-0xbfff: 322 335 395 0 + // 0xc000-0xcfff: 228 388 299 0 + // 0xd000-0xdfff: 261 415 277 0 + // 0xe000-0xefff: 516 423 297 0 + // 0xf000-0xffff: 2785 1903 1773 15450 } const data = ` diff --git a/src/image/jpeg/dct.go b/src/image/jpeg/dct.go new file mode 100644 index 0000000000..b2bf27d826 --- /dev/null +++ b/src/image/jpeg/dct.go @@ -0,0 +1,521 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package jpeg + +// Discrete Cosine Transformation (DCT) implementations using the algorithm from +// Christoph Loeffler, Adriaan Lightenberg, and George S. Mostchytz, +// “Practical Fast 1-D DCT Algorithms with 11 Multiplications,” ICASSP 1989. +// https://ieeexplore.ieee.org/document/266596 +// +// Since the paper is paywalled, the rest of this comment gives a summary. +// +// A 1-dimensional forward DCT (1D FDCT) takes as input 8 values x0..x7 +// and transforms them in place into the result values. +// +// The mathematical definition of the N-point 1D FDCT is: +// +// X[k] = α_k Σ_n x[n] * cos (2n+1)*k*π/2N +// +// where α₀ = √2 and α_k = 1 for k > 0. +// +// For our purposes, N=8, so the angles end up being multiples of π/16. +// The most direct implementation of this definition would require 64 multiplications. +// +// Loeffler's paper presents a more efficient computation that requires only +// 11 multiplications and works in terms of three basic operations: +// +// - A “butterfly” x0, x1 = x0+x1, x0-x1. +// The inverse is x0, x1 = (x0+x1)/2, (x0-x1)/2. +// +// - A scaling of x0 by k: x0 *= k. The inverse is scaling by 1/k. +// +// - A rotation of x0, x1 by θ, defined as: +// x0, x1 = x0 cos θ + x1 sin θ, -x0 sin θ + x1 cos θ. +// The inverse is rotation by -θ. +// +// The algorithm proceeds in four stages: +// +// Stage 1: +// - butterfly x0, x7; x1, x6; x2, x5; x3, x4. +// +// Stage 2: +// - butterfly x0, x3; x1, x2 +// - rotate x4, x7 by 3π/16 +// - rotate x5, x6 by π/16. +// +// Stage 3: +// - butterfly x0, x1; x4, x6; x7, x5 +// - rotate x2, x3 by 6π/16 and scale by √2. +// +// Stage 4: +// - butterfly x7, x4 +// - scale x5, x6 by √2. +// +// Finally, the values are permuted. The permutation can be read as either: +// - x0, x4, x2, x6, x7, x3, x5, x1 = x0, x1, x2, x3, x4, x5, x6, x7 (paper's form) +// - x0, x1, x2, x3, x4, x5, x6, x7 = x0, x7, x2, x5, x1, x6, x3, x4 (sorted by LHS) +// The code below uses the second form to make it easier to merge adjacent stores. +// (Note that unlike in recursive FFT implementations, the permutation here is +// not always mapping indexes to their bit reversals.) +// +// As written above, the rotation requires four multiplications, but it can be +// reduced to three by refactoring (see [dctBox] below), and the scaling in +// stage 3 can be merged into the rotation constants, so the overall cost +// of a 1D FDCT is 11 multiplies. +// +// The 1D inverse DCT (IDCT) is the 1D FDCT run backward +// with all the basic operations inverted. + +// dctBox implements a 3-multiply, 3-add rotation+scaling. +// Given x0, x1, k*cos θ, and k*sin θ, dctBox returns the +// rotated and scaled coordinates. +// (It is called dctBox because the rotate+scale operation +// is drawn as a box in Figures 1 and 2 in the paper.) +func dctBox(x0, x1, kcos, ksin int32) (y0, y1 int32) { + // y0 = x0*kcos + x1*ksin + // y1 = -x0*ksin + x1*kcos + ksum := kcos * (x0 + x1) + y0 = ksum + (ksin-kcos)*x1 + y1 = ksum - (kcos+ksin)*x0 + return y0, y1 +} + +// A block is an 8x8 input to a 2D DCT (either the FDCT or IDCT). +// The input is actually only 8x8 uint8 values, and the outputs are 8x8 int16, +// but it is convenient to use int32s for intermediate storage, +// so we define only a single block type of [8*8]int32. +// +// A 2D DCT is implemented as 1D DCTs over the rows and columns. +// +// dct_test.go defines a String method for nice printing in tests. +type block [blockSize]int32 + +const blockSize = 8 * 8 + +// Note on Numerical Precision +// +// The inputs to both the FDCT and IDCT are uint8 values stored in a block, +// and the outputs are int16s in the same block, but the overall operation +// uses int32 values as fixed-point intermediate values. +// In the code comments below, the notation “QN.M” refers to a +// signed value of 1+N+M significant bits, one of which is the sign bit, +// and M of which hold fractional (sub-integer) precision. +// For example, 255 as a Q8.0 value is stored as int32(255), +// while 255 as a Q8.1 value is stored as int32(510), +// and 255.5 as a Q8.1 value is int32(511). +// The notation UQN.M refers to an unsigned value of N+M significant bits. +// See https://en.wikipedia.org/wiki/Q_(number_format) for more. +// +// In general we only need to keep about 16 significant bits, but it is more +// efficient and somewhat more precise to let unnecessary fractional bits +// accumulate and shift them away in bulk rather than after every operation. +// As such, it is important to keep track of the number of fractional bits +// in each variable at different points in the code, to avoid mistakes like +// adding numbers with different fractional precisions, as well as to keep +// track of the total number of bits, to avoid overflow. A comment like: +// +// // x[123] now Q8.2. +// +// means that x1, x2, and x3 are all Q8.2 (11-bit) values. +// Keeping extra precision bits also reduces the size of the errors introduced +// by using right shift to approximate rounded division. + +// Constants needed for the implementation. +// These are all 60-bit precision fixed-point constants. +// The function c(val, b) rounds the constant to b bits. +// c is simple enough that calls to it with constant args +// are inlined and constant-propagated down to an inline constant. +// Each constant is commented with its Ivy definition (see robpike.io/ivy), +// using this scaling helper function: +// +// op fix x = floor 0.5 + x * 2**60 +const ( + cos1 = 1130768441178740757 // fix cos 1*pi/16 + sin1 = 224923827593068887 // fix sin 1*pi/16 + cos3 = 958619196450722178 // fix cos 3*pi/16 + sin3 = 640528868967736374 // fix sin 3*pi/16 + sqrt2 = 1630477228166597777 // fix sqrt 2 + sqrt2_cos6 = 623956622067911264 // fix (sqrt 2)*cos 6*pi/16 + sqrt2_sin6 = 1506364539328854985 // fix (sqrt 2)*sin 6*pi/16 + sqrt2inv = 815238614083298888 // fix 1/sqrt 2 + sqrt2inv_cos6 = 311978311033955632 // fix (1/sqrt 2)*cos 6*pi/16 + sqrt2inv_sin6 = 753182269664427492 // fix (1/sqrt 2)*sin 6*pi/16 +) + +func c(x uint64, bits int) int32 { + return int32((x + (1 << (59 - bits))) >> (60 - bits)) +} + +// fdct implements the forward DCT. +// Inputs are UQ8.0; outputs are Q13.0. +func fdct(b *block) { + fdctCols(b) + fdctRows(b) +} + +// fdctCols applies the 1D DCT to the columns of b. +// Inputs are UQ8.0 in [0,255] but interpreted as [-128,127]. +// Outputs are Q10.18. +func fdctCols(b *block) { + for i := range 8 { + x0 := b[0*8+i] + x1 := b[1*8+i] + x2 := b[2*8+i] + x3 := b[3*8+i] + x4 := b[4*8+i] + x5 := b[5*8+i] + x6 := b[6*8+i] + x7 := b[7*8+i] + + // x[01234567] are UQ8.0 in [0,255]. + + // Stage 1: four butterflies. + // In general a butterfly of QN.M inputs produces Q(N+1).M outputs. + // A butterfly of UQN.M inputs produces a UQ(N+1).M sum and a QN.M difference. + + x0, x7 = x0+x7, x0-x7 + x1, x6 = x1+x6, x1-x6 + x2, x5 = x2+x5, x2-x5 + x3, x4 = x3+x4, x3-x4 + // x[0123] now UQ9.0 in [0, 510]. + // x[4567] now Q8.0 in [-255,255]. + + // Stage 2: two boxes and two butterflies. + // A box on QN.M inputs with B-bit constants + // produces Q(N+1).(M+B) outputs. + // (The +1 is from the addition.) + + x4, x7 = dctBox(x4, x7, c(cos3, 18), c(sin3, 18)) + x5, x6 = dctBox(x5, x6, c(cos1, 18), c(sin1, 18)) + // x[47] now Q9.18 in [-354, 354]. + // x[56] now Q9.18 in [-300, 300]. + + x0, x3 = x0+x3, x0-x3 + x1, x2 = x1+x2, x1-x2 + // x[01] now UQ10.0 in [0, 1020]. + // x[23] now Q9.0 in [-510, 510]. + + // Stage 3: one box and three butterflies. + + x2, x3 = dctBox(x2, x3, c(sqrt2_cos6, 18), c(sqrt2_sin6, 18)) + // x[23] now Q10.18 in [-943, 943]. + + x0, x1 = x0+x1, x0-x1 + // x0 now UQ11.0 in [0, 2040]. + // x1 now Q10.0 in [-1020, 1020]. + + // Store x0, x1, x2, x3 to their permuted targets. + // The original +128 in every input value + // has cancelled out except in the “DC signal” x0. + // Subtracting 128*8 here is equivalent to subtracting 128 + // from every input before we started, but cheaper. + // It also converts x0 from UQ11.18 to Q10.18. + b[0*8+i] = (x0 - 128*8) << 18 + b[4*8+i] = x1 << 18 + b[2*8+i] = x2 + b[6*8+i] = x3 + + x4, x6 = x4+x6, x4-x6 + x7, x5 = x7+x5, x7-x5 + // x[4567] now Q10.18 in [-654, 654]. + + // Stage 4: two √2 scalings and one butterfly. + + x5 = (x5 >> 12) * c(sqrt2, 12) + x6 = (x6 >> 12) * c(sqrt2, 12) + // x[56] still Q10.18 in [-925, 925] (= 654√2). + x7, x4 = x7+x4, x7-x4 + // x[47] still Q10.18 in [-925, 925] (not Q11.18!). + // This is not obvious at all! See “Note on 925” below. + + // Store x4 x5 x6 x7 to their permuted targets. + b[1*8+i] = x7 + b[3*8+i] = x5 + b[5*8+i] = x6 + b[7*8+i] = x4 + } +} + +// fdctRows applies the 1D DCT to the rows of b. +// Inputs are Q10.18; outputs are Q13.0. +func fdctRows(b *block) { + for i := range 8 { + x := b[8*i : 8*i+8 : 8*i+8] + x0 := x[0] + x1 := x[1] + x2 := x[2] + x3 := x[3] + x4 := x[4] + x5 := x[5] + x6 := x[6] + x7 := x[7] + + // x[01234567] are Q10.18 [-1020, 1020]. + + // Stage 1: four butterflies. + + x0, x7 = x0+x7, x0-x7 + x1, x6 = x1+x6, x1-x6 + x2, x5 = x2+x5, x2-x5 + x3, x4 = x3+x4, x3-x4 + // x[01234567] now Q11.18 in [-2040, 2040]. + + // Stage 2: two boxes and two butterflies. + + x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 14), c(sin3, 14)) + x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 14), c(sin1, 14)) + // x[47] now Q12.18 in [-2830, 2830]. + // x[56] now Q12.18 in [-2400, 2400]. + x0, x3 = x0+x3, x0-x3 + x1, x2 = x1+x2, x1-x2 + // x[01234567] now Q12.18 in [-4080, 4080]. + + // Stage 3: one box and three butterflies. + + x2, x3 = dctBox(x2>>14, x3>>14, c(sqrt2_cos6, 14), c(sqrt2_sin6, 14)) + // x[23] now Q13.18 in [-7539, 7539]. + x0, x1 = x0+x1, x0-x1 + // x[01] now Q13.18 in [-8160, 8160]. + x4, x6 = x4+x6, x4-x6 + x7, x5 = x7+x5, x7-x5 + // x[4567] now Q13.18 in [-5230, 5230]. + + // Stage 4: two √2 scalings and one butterfly. + + x5 = (x5 >> 14) * c(sqrt2, 14) + x6 = (x6 >> 14) * c(sqrt2, 14) + // x[56] still Q13.18 in [-7397, 7397] (= 5230√2). + x7, x4 = x7+x4, x7-x4 + // x[47] still Q13.18 in [-7395, 7395] (= 2040*3.6246). + // See “Note on 925” below. + + // Cut from Q13.18 to Q13.0. + x0 = (x0 + 1<<17) >> 18 + x1 = (x1 + 1<<17) >> 18 + x2 = (x2 + 1<<17) >> 18 + x3 = (x3 + 1<<17) >> 18 + x4 = (x4 + 1<<17) >> 18 + x5 = (x5 + 1<<17) >> 18 + x6 = (x6 + 1<<17) >> 18 + x7 = (x7 + 1<<17) >> 18 + + // Note: Unlike in fdctCols, saved all stores for the end + // because they are adjacent memory locations and some systems + // can use multiword stores. + x[0] = x0 + x[1] = x7 + x[2] = x2 + x[3] = x5 + x[4] = x1 + x[5] = x6 + x[6] = x3 + x[7] = x4 + } +} + +// “Note on 925”, deferred from above to avoid interrupting code. +// +// In fdctCols, heading into stage 2, the values x4, x5, x6, x7 are in [-255, 255]. +// Let's call those specific values b4, b5, b6, b7, and trace how x[4567] evolve: +// +// Stage 2: +// x4 = b4*cos3 + b7*sin3 +// x7 = -b4*sin3 + b7*cos3 +// x5 = b5*cos1 + b6*sin1 +// x6 = -b5*sin1 + b6*cos1 +// +// Stage 3: +// +// x4 = x4+x6 = b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1 +// x6 = x4-x6 = b4*cos3 + b7*sin3 + b5*sin1 - b6*cos1 +// x7 = x7+x5 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 +// x5 = x7-x5 = -b4*sin3 + b7*cos3 - b5*cos1 - b6*sin1 +// +// Stage 4: +// +// x7 = x7+x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 + b4*cos3 + b7*sin3 - b5*sin1 + b6*cos1 +// = b4*(cos3-sin3) + b5*(cos1-sin1) + b6*(cos1+sin1) + b7*(cos3+sin3) +// < 255*(0.2759 + 0.7857 + 1.1759 + 1.3871) = 255*3.6246 < 925. +// +// x4 = x7-x4 = -b4*sin3 + b7*cos3 + b5*cos1 + b6*sin1 - b4*cos3 - b7*sin3 + b5*sin1 - b6*cos1 +// = -b4*(cos3+sin3) + b5*(cos1+sin1) + b6*(sin1-cos1) + b7*(cos3-sin3) +// < same 925. +// +// The fact that x5, x6 are also at most 925 is not a coincidence: we are computing +// the same kinds of numbers for all four, just with different paths to them. +// +// In fdctRows, the same analysis applies, but the initial values are +// in [-2040, 2040] instead of [-255, 255], so the bound is 2040*3.6246 < 7395. + +// idct implements the inverse DCT. +// Inputs are UQ8.0; outputs are Q10.3. +func idct(b *block) { + // A 2D IDCT is a 1D IDCT on rows followed by columns. + idctRows(b) + idctCols(b) +} + +// idctRows applies the 1D IDCT to the rows of b. +// Inputs are UQ8.0; outputs are Q9.20. +func idctRows(b *block) { + for i := range 8 { + x := b[8*i : 8*i+8 : 8*i+8] + x0 := x[0] + x7 := x[1] + x2 := x[2] + x5 := x[3] + x1 := x[4] + x6 := x[5] + x3 := x[6] + x4 := x[7] + + // Run FDCT backward. + // Independent operations have been reordered somewhat + // to make precision tracking easier. + // + // Note that “x0, x1 = x0+x1, x0-x1” is now a reverse butterfly + // and carries with it an implicit divide by two: the extra bit + // is added to the precision, not the value size. + + // x[01234567] are UQ8.0 in [0, 255]. + + // Stages 4, 3, 2: x0, x1, x2, x3. + + x0 <<= 17 + x1 <<= 17 + // x0, x1 now UQ8.17. + x0, x1 = x0+x1, x0-x1 + // x0 now UQ8.18 in [0, 255]. + // x1 now Q7.18 in [-127½, 127½]. + + // Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 0.924, so no new high bit. + x2, x3 = dctBox(x2, x3, c(sqrt2inv_cos6, 18), -c(sqrt2inv_sin6, 18)) + // x[23] now Q8.18 in [-236, 236]. + x1, x2 = x1+x2, x1-x2 + x0, x3 = x0+x3, x0-x3 + // x[0123] now Q8.19 in [-246, 246]. + + // Stages 4, 3, 2: x4, x5, x6, x7. + + x4 <<= 7 + x7 <<= 7 + // x[47] now UQ8.7 + x7, x4 = x7+x4, x7-x4 + // x7 now UQ8.8 in [0, 255]. + // x4 now Q7.8 in [-127½, 127½]. + + x6 = x6 * c(sqrt2inv, 8) + x5 = x5 * c(sqrt2inv, 8) + // x[56] now UQ8.8 in [0, 181]. + // Note that 1/√2 has five 0s in its binary representation after + // the 8th bit, so this multipliy is actually producing 12 bits of precision. + + x7, x5 = x7+x5, x7-x5 + x4, x6 = x4+x6, x4-x6 + // x[4567] now Q8.9 in [-218, 218]. + + x4, x7 = dctBox(x4>>2, x7>>2, c(cos3, 12), -c(sin3, 12)) + x5, x6 = dctBox(x5>>2, x6>>2, c(cos1, 12), -c(sin1, 12)) + // x[4567] now Q9.19 in [-303, 303]. + + // Stage 1. + + x0, x7 = x0+x7, x0-x7 + x1, x6 = x1+x6, x1-x6 + x2, x5 = x2+x5, x2-x5 + x3, x4 = x3+x4, x3-x4 + // x[01234567] now Q9.20 in [-275, 275]. + + // Note: we don't need all 20 bits of “precision”, + // but it is faster to let idctCols shift it away as part + // of other operations rather than downshift here. + + x[0] = x0 + x[1] = x1 + x[2] = x2 + x[3] = x3 + x[4] = x4 + x[5] = x5 + x[6] = x6 + x[7] = x7 + } +} + +// idctCols applies the 1D IDCT to the columns of b. +// Inputs are Q9.20. +// Outputs are Q10.3. That is, the result is the IDCT*8. +func idctCols(b *block) { + for i := range 8 { + x0 := b[0*8+i] + x7 := b[1*8+i] + x2 := b[2*8+i] + x5 := b[3*8+i] + x1 := b[4*8+i] + x6 := b[5*8+i] + x3 := b[6*8+i] + x4 := b[7*8+i] + + // x[012345678] are Q9.20. + + // Start by adding 0.5 to x0 (the incoming DC signal). + // The butterflies will add it to all the other values, + // and then the final shifts will round properly. + x0 += 1 << 19 + + // Stages 4, 3, 2: x0, x1, x2, x3. + + x0, x1 = (x0+x1)>>2, (x0-x1)>>2 + // x[01] now Q9.19. + // Note: (1/sqrt 2)*((cos 6*pi/16)+(sin 6*pi/16)) < 1, so no new high bit. + x2, x3 = dctBox(x2>>13, x3>>13, c(sqrt2inv_cos6, 12), -c(sqrt2inv_sin6, 12)) + // x[0123] now Q9.19. + + x1, x2 = x1+x2, x1-x2 + x0, x3 = x0+x3, x0-x3 + // x[0123] now Q9.20. + + // Stages 4, 3, 2: x4, x5, x6, x7. + + x7, x4 = x7+x4, x7-x4 + // x[47] now Q9.21. + + x5 = (x5 >> 13) * c(sqrt2inv, 14) + x6 = (x6 >> 13) * c(sqrt2inv, 14) + // x[56] now Q9.21. + + x7, x5 = x7+x5, x7-x5 + x4, x6 = x4+x6, x4-x6 + // x[4567] now Q9.22. + + x4, x7 = dctBox(x4>>14, x7>>14, c(cos3, 12), -c(sin3, 12)) + x5, x6 = dctBox(x5>>14, x6>>14, c(cos1, 12), -c(sin1, 12)) + // x[4567] now Q10.20. + + x0, x7 = x0+x7, x0-x7 + x1, x6 = x1+x6, x1-x6 + x2, x5 = x2+x5, x2-x5 + x3, x4 = x3+x4, x3-x4 + // x[01234567] now Q10.21. + + x0 >>= 18 + x1 >>= 18 + x2 >>= 18 + x3 >>= 18 + x4 >>= 18 + x5 >>= 18 + x6 >>= 18 + x7 >>= 18 + // x[01234567] now Q10.3. + + b[0*8+i] = x0 + b[1*8+i] = x1 + b[2*8+i] = x2 + b[3*8+i] = x3 + b[4*8+i] = x4 + b[5*8+i] = x5 + b[6*8+i] = x6 + b[7*8+i] = x7 + } +} diff --git a/src/image/jpeg/dct_test.go b/src/image/jpeg/dct_test.go index 11819db86e..0375a7113d 100644 --- a/src/image/jpeg/dct_test.go +++ b/src/image/jpeg/dct_test.go @@ -7,20 +7,18 @@ package jpeg import ( "fmt" "math" + "math/big" "math/rand" "strings" "testing" ) func benchmarkDCT(b *testing.B, f func(*block)) { - b.StopTimer() - blocks := make([]block, 0, b.N*len(testBlocks)) - for i := 0; i < b.N; i++ { - blocks = append(blocks, testBlocks[:]...) - } - b.StartTimer() - for i := range blocks { - f(&blocks[i]) + var blk block // avoid potential allocation in loop + for b.Loop() { + for _, blk = range testBlocks { + f(&blk) + } } } @@ -32,11 +30,37 @@ func BenchmarkIDCT(b *testing.B) { benchmarkDCT(b, idct) } +const testSlowVsBig = true + func TestDCT(t *testing.T) { blocks := make([]block, len(testBlocks)) copy(blocks, testBlocks[:]) - // Append some randomly generated blocks of varying sparseness. + // All zeros + blocks = append(blocks, block{}) + + // Every possible unit impulse. + for i := range blockSize { + var b block + b[i] = 255 + blocks = append(blocks, b) + } + + // All ones. + var ones block + for i := range ones { + ones[i] = 255 + } + blocks = append(blocks, ones) + + // Every possible inverted unit impulse. + for i := range blockSize { + ones[i] = 0 + blocks = append(blocks, ones) + ones[i] = 255 + } + + // Some randomly generated blocks of varying sparseness. r := rand.New(rand.NewSource(123)) for i := 0; i < 100; i++ { b := block{} @@ -47,61 +71,84 @@ func TestDCT(t *testing.T) { blocks = append(blocks, b) } - // Check that the FDCT and IDCT functions are inverses, after a scale and - // level shift. Scaling reduces the rounding errors in the conversion from - // floats to ints. - for i, b := range blocks { - got, want := b, b - for j := range got { - got[j] = (got[j] - 128) * 8 - } - slowFDCT(&got) - slowIDCT(&got) - for j := range got { - got[j] = got[j]/8 + 128 - } - if differ(&got, &want) { - t.Errorf("i=%d: IDCT(FDCT)\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want) + // Check that the slow FDCT and IDCT functions are inverses, + // after a scale and level shift. + // Scaling reduces the rounding errors in the conversion. + // The “fast” ones are not inverses because the fast IDCT + // is optimized for 8-bit inputs, not full 16-bit ones. + slowRoundTrip := func(b *block) { + slowFDCT(b) + slowIDCT(b) + for j := range b { + b[j] = b[j]/8 + 128 } } + nop := func(*block) {} + testDCT(t, "IDCT(FDCT)", blocks, slowRoundTrip, nop, 1, 8) - // Check that the optimized and slow FDCT implementations agree. - // The fdct function already does a scale and level shift. - for i, b := range blocks { - got, want := b, b - fdct(&got) - for j := range want { - want[j] = (want[j] - 128) * 8 - } - slowFDCT(&want) - if differ(&got, &want) { - t.Errorf("i=%d: FDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want) - } + if testSlowVsBig { + testDCT(t, "slowFDCT", blocks, slowFDCT, slowerFDCT, 0, 64) + testDCT(t, "slowIDCT", blocks, slowIDCT, slowerIDCT, 0, 64) } - // Check that the optimized and slow IDCT implementations agree. - for i, b := range blocks { - got, want := b, b - idct(&got) - slowIDCT(&want) - if differ(&got, &want) { - t.Errorf("i=%d: IDCT\nsrc\n%s\ngot\n%s\nwant\n%s\n", i, &b, &got, &want) + // Check that the optimized and slow FDCT implementations agree. + testDCT(t, "FDCT", blocks, fdct, slowFDCT, 1, 8) + testDCT(t, "IDCT", blocks, idct, slowIDCT, 1, 8) +} + +func testDCT(t *testing.T, name string, blocks []block, fhave, fwant func(*block), tolerance int32, maxCloseCalls int) { + t.Run(name, func(t *testing.T) { + totalClose := 0 + for i, b := range blocks { + have, want := b, b + fhave(&have) + fwant(&want) + d, n := differ(&have, &want, tolerance) + if d >= 0 || n > maxCloseCalls { + fail := "" + if d >= 0 { + fail = fmt.Sprintf("diff at %d,%d", d/8, d%8) + } + if n > maxCloseCalls { + if fail != "" { + fail += "; " + } + fail += fmt.Sprintf("%d close calls", n) + } + t.Errorf("i=%d: %s (%s)\nsrc\n%s\nhave\n%s\nwant\n%s\n", + i, name, fail, &b, &have, &want) + } + totalClose += n } - } + if tolerance > 0 { + t.Logf("%d/%d total close calls", totalClose, len(blocks)*blockSize) + } + }) } -// differ reports whether any pair-wise elements in b0 and b1 differ by 2 or -// more. That tolerance is because there isn't a single definitive decoding of -// a given JPEG image, even before the YCbCr to RGB conversion; implementations +// differ returns the index of the first pair-wise elements in b0 and b1 +// that differ by more than 'ok', along with the total number of elements +// that differ by at least ok ("close calls"). +// +// There isn't a single definitive decoding of a given JPEG image, +// even before the YCbCr to RGB conversion; implementations // can have different IDCT rounding errors. -func differ(b0, b1 *block) bool { +// +// If there are no differences, differ returns -1, 0. +func differ(b0, b1 *block, ok int32) (index, closeCalls int) { + index = -1 for i := range b0 { delta := b0[i] - b1[i] - if delta < -2 || +2 < delta { - return true + if delta < -ok || ok < delta { + if index < 0 { + index = i + } + } + if delta <= -ok || ok <= delta { + closeCalls++ } } - return false + return } // alpha returns 1 if i is 0 and returns √2 otherwise. @@ -112,6 +159,14 @@ func alpha(i int) float64 { return math.Sqrt2 } +// bigAlpha returns 1 if i is 0 and returns √2 otherwise. +func bigAlpha(i int) *big.Float { + if i == 0 { + return bigFloat1 + } + return bigFloatSqrt2 +} + var cosines = [32]float64{ +1.0000000000000000000000000000000000000000000000000000000000000000, // cos(π/16 * 0) +0.9807852804032304491261822361342390369739337308933360950029160885, // cos(π/16 * 1) @@ -150,6 +205,57 @@ var cosines = [32]float64{ +0.9807852804032304491261822361342390369739337308933360950029160885, // cos(π/16 * 31) } +func bigFloat(s string) *big.Float { + f, ok := new(big.Float).SetString(s) + if !ok { + panic("bad float") + } + return f +} + +var ( + bigFloat1 = big.NewFloat(1) + bigFloatSqrt2 = bigFloat("1.41421356237309504880168872420969807856967187537694807317667974") +) + +var bigCosines = [32]*big.Float{ + bigFloat("+1.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 0) + bigFloat("+0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 1) + bigFloat("+0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 2) + bigFloat("+0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 3) + bigFloat("+0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 4) + bigFloat("+0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 5) + bigFloat("+0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 6) + bigFloat("+0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 7) + + bigFloat("-0.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 8) + bigFloat("-0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 9) + bigFloat("-0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 10) + bigFloat("-0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 11) + bigFloat("-0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 12) + bigFloat("-0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 13) + bigFloat("-0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 14) + bigFloat("-0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 15) + + bigFloat("-1.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 16) + bigFloat("-0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 17) + bigFloat("-0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 18) + bigFloat("-0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 19) + bigFloat("-0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 20) + bigFloat("-0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 21) + bigFloat("-0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 22) + bigFloat("-0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 23) + + bigFloat("+0.0000000000000000000000000000000000000000000000000000000000000000"), // cos(π/16 * 24) + bigFloat("+0.1950903220161282678482848684770222409276916177519548077545020894"), // cos(π/16 * 25) + bigFloat("+0.3826834323650897717284599840303988667613445624856270414338006356"), // cos(π/16 * 26) + bigFloat("+0.5555702330196022247428308139485328743749371907548040459241535282"), // cos(π/16 * 27) + bigFloat("+0.7071067811865475244008443621048490392848359376884740365883398689"), // cos(π/16 * 28) + bigFloat("+0.8314696123025452370787883776179057567385608119872499634461245902"), // cos(π/16 * 29) + bigFloat("+0.9238795325112867561281831893967882868224166258636424861150977312"), // cos(π/16 * 30) + bigFloat("+0.9807852804032304491261822361342390369739337308933360950029160885"), // cos(π/16 * 31) +} + // slowFDCT performs the 8*8 2-dimensional forward discrete cosine transform: // // dst[u,v] = (1/8) * Σ_x Σ_y alpha(u) * alpha(v) * src[x,y] * @@ -160,24 +266,51 @@ var cosines = [32]float64{ // // b acts as both dst and src. func slowFDCT(b *block) { - var dst [blockSize]float64 + var dst block for v := 0; v < 8; v++ { for u := 0; u < 8; u++ { sum := 0.0 for y := 0; y < 8; y++ { for x := 0; x < 8; x++ { - sum += alpha(u) * alpha(v) * float64(b[8*y+x]) * + sum += alpha(u) * alpha(v) * float64(b[8*y+x]-128) * cosines[((2*x+1)*u)%32] * cosines[((2*y+1)*v)%32] } } - dst[8*v+u] = sum / 8 + dst[8*v+u] = int32(math.Round(sum)) } } - // Convert from float64 to int32. - for i := range dst { - b[i] = int32(dst[i] + 0.5) + *b = dst +} + +// slowerFDCT is slowFDCT but using big.Floats to validate slowFDCT. +func slowerFDCT(b *block) { + var dst block + for v := 0; v < 8; v++ { + for u := 0; u < 8; u++ { + sum := big.NewFloat(0) + for y := 0; y < 8; y++ { + for x := 0; x < 8; x++ { + f := big.NewFloat(float64(b[8*y+x] - 128)) + f = new(big.Float).Mul(f, bigAlpha(u)) + f = new(big.Float).Mul(f, bigAlpha(v)) + f = new(big.Float).Mul(f, bigCosines[((2*x+1)*u)%32]) + f = new(big.Float).Mul(f, bigCosines[((2*y+1)*v)%32]) + sum = new(big.Float).Add(sum, f) + } + } + // Int64 truncates toward zero, so add ±0.5 + // as needed to round + if sum.Sign() > 0 { + sum = new(big.Float).Add(sum, big.NewFloat(+0.5)) + } else { + sum = new(big.Float).Add(sum, big.NewFloat(-0.5)) + } + i, _ := sum.Int64() + dst[8*v+u] = int32(i) + } } + *b = dst } // slowIDCT performs the 8*8 2-dimensional inverse discrete cosine transform: @@ -190,7 +323,7 @@ func slowFDCT(b *block) { // // b acts as both dst and src. func slowIDCT(b *block) { - var dst [blockSize]float64 + var dst block for y := 0; y < 8; y++ { for x := 0; x < 8; x++ { sum := 0.0 @@ -201,13 +334,41 @@ func slowIDCT(b *block) { cosines[((2*y+1)*v)%32] } } - dst[8*y+x] = sum / 8 + dst[8*y+x] = int32(math.Round(sum / 8)) } } - // Convert from float64 to int32. - for i := range dst { - b[i] = int32(dst[i] + 0.5) + *b = dst +} + +// slowerIDCT is slowIDCT but using big.Floats to validate slowIDCT. +func slowerIDCT(b *block) { + var dst block + for y := 0; y < 8; y++ { + for x := 0; x < 8; x++ { + sum := big.NewFloat(0) + for v := 0; v < 8; v++ { + for u := 0; u < 8; u++ { + f := big.NewFloat(float64(b[8*v+u])) + f = new(big.Float).Mul(f, bigAlpha(u)) + f = new(big.Float).Mul(f, bigAlpha(v)) + f = new(big.Float).Mul(f, bigCosines[((2*x+1)*u)%32]) + f = new(big.Float).Mul(f, bigCosines[((2*y+1)*v)%32]) + f = new(big.Float).Quo(f, big.NewFloat(8)) + sum = new(big.Float).Add(sum, f) + } + } + // Int64 truncates toward zero, so add ±0.5 + // as needed to round + if sum.Sign() > 0 { + sum = new(big.Float).Add(sum, big.NewFloat(+0.5)) + } else { + sum = new(big.Float).Add(sum, big.NewFloat(-0.5)) + } + i, _ := sum.Int64() + dst[8*y+x] = int32(i) + } } + *b = dst } func (b *block) String() string { diff --git a/src/image/jpeg/fdct.go b/src/image/jpeg/fdct.go deleted file mode 100644 index c7a973ec3c..0000000000 --- a/src/image/jpeg/fdct.go +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package jpeg - -// This file implements a Forward Discrete Cosine Transformation. - -/* -It is based on the code in jfdctint.c from the Independent JPEG Group, -found at http://www.ijg.org/files/jpegsrc.v8c.tar.gz. - -The "LEGAL ISSUES" section of the README in that archive says: - -In plain English: - -1. We don't promise that this software works. (But if you find any bugs, - please let us know!) -2. You can use this software for whatever you want. You don't have to pay us. -3. You may not pretend that you wrote this software. If you use it in a - program, you must acknowledge somewhere in your documentation that - you've used the IJG code. - -In legalese: - -The authors make NO WARRANTY or representation, either express or implied, -with respect to this software, its quality, accuracy, merchantability, or -fitness for a particular purpose. This software is provided "AS IS", and you, -its user, assume the entire risk as to its quality and accuracy. - -This software is copyright (C) 1991-2011, Thomas G. Lane, Guido Vollbeding. -All Rights Reserved except as specified below. - -Permission is hereby granted to use, copy, modify, and distribute this -software (or portions thereof) for any purpose, without fee, subject to these -conditions: -(1) If any part of the source code for this software is distributed, then this -README file must be included, with this copyright and no-warranty notice -unaltered; and any additions, deletions, or changes to the original files -must be clearly indicated in accompanying documentation. -(2) If only executable code is distributed, then the accompanying -documentation must state that "this software is based in part on the work of -the Independent JPEG Group". -(3) Permission for use of this software is granted only if the user accepts -full responsibility for any undesirable consequences; the authors accept -NO LIABILITY for damages of any kind. - -These conditions apply to any software derived from or based on the IJG code, -not just to the unmodified library. If you use our work, you ought to -acknowledge us. - -Permission is NOT granted for the use of any IJG author's name or company name -in advertising or publicity relating to this software or products derived from -it. This software may be referred to only as "the Independent JPEG Group's -software". - -We specifically permit and encourage the use of this software as the basis of -commercial products, provided that all warranty or liability claims are -assumed by the product vendor. -*/ - -// Trigonometric constants in 13-bit fixed point format. -const ( - fix_0_298631336 = 2446 - fix_0_390180644 = 3196 - fix_0_541196100 = 4433 - fix_0_765366865 = 6270 - fix_0_899976223 = 7373 - fix_1_175875602 = 9633 - fix_1_501321110 = 12299 - fix_1_847759065 = 15137 - fix_1_961570560 = 16069 - fix_2_053119869 = 16819 - fix_2_562915447 = 20995 - fix_3_072711026 = 25172 -) - -const ( - constBits = 13 - pass1Bits = 2 - centerJSample = 128 -) - -// fdct performs a forward DCT on an 8x8 block of coefficients, including a -// level shift. -func fdct(b *block) { - // Pass 1: process rows. - for y := 0; y < 8; y++ { - y8 := y * 8 - s := b[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857 - x0 := s[0] - x1 := s[1] - x2 := s[2] - x3 := s[3] - x4 := s[4] - x5 := s[5] - x6 := s[6] - x7 := s[7] - - tmp0 := x0 + x7 - tmp1 := x1 + x6 - tmp2 := x2 + x5 - tmp3 := x3 + x4 - - tmp10 := tmp0 + tmp3 - tmp12 := tmp0 - tmp3 - tmp11 := tmp1 + tmp2 - tmp13 := tmp1 - tmp2 - - tmp0 = x0 - x7 - tmp1 = x1 - x6 - tmp2 = x2 - x5 - tmp3 = x3 - x4 - - s[0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits - s[4] = (tmp10 - tmp11) << pass1Bits - z1 := (tmp12 + tmp13) * fix_0_541196100 - z1 += 1 << (constBits - pass1Bits - 1) - s[2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits) - s[6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits) - - tmp10 = tmp0 + tmp3 - tmp11 = tmp1 + tmp2 - tmp12 = tmp0 + tmp2 - tmp13 = tmp1 + tmp3 - z1 = (tmp12 + tmp13) * fix_1_175875602 - z1 += 1 << (constBits - pass1Bits - 1) - tmp0 *= fix_1_501321110 - tmp1 *= fix_3_072711026 - tmp2 *= fix_2_053119869 - tmp3 *= fix_0_298631336 - tmp10 *= -fix_0_899976223 - tmp11 *= -fix_2_562915447 - tmp12 *= -fix_0_390180644 - tmp13 *= -fix_1_961570560 - - tmp12 += z1 - tmp13 += z1 - s[1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits) - s[3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits) - s[5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits) - s[7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits) - } - // Pass 2: process columns. - // We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8. - for x := 0; x < 8; x++ { - tmp0 := b[0*8+x] + b[7*8+x] - tmp1 := b[1*8+x] + b[6*8+x] - tmp2 := b[2*8+x] + b[5*8+x] - tmp3 := b[3*8+x] + b[4*8+x] - - tmp10 := tmp0 + tmp3 + 1<<(pass1Bits-1) - tmp12 := tmp0 - tmp3 - tmp11 := tmp1 + tmp2 - tmp13 := tmp1 - tmp2 - - tmp0 = b[0*8+x] - b[7*8+x] - tmp1 = b[1*8+x] - b[6*8+x] - tmp2 = b[2*8+x] - b[5*8+x] - tmp3 = b[3*8+x] - b[4*8+x] - - b[0*8+x] = (tmp10 + tmp11) >> pass1Bits - b[4*8+x] = (tmp10 - tmp11) >> pass1Bits - - z1 := (tmp12 + tmp13) * fix_0_541196100 - z1 += 1 << (constBits + pass1Bits - 1) - b[2*8+x] = (z1 + tmp12*fix_0_765366865) >> (constBits + pass1Bits) - b[6*8+x] = (z1 - tmp13*fix_1_847759065) >> (constBits + pass1Bits) - - tmp10 = tmp0 + tmp3 - tmp11 = tmp1 + tmp2 - tmp12 = tmp0 + tmp2 - tmp13 = tmp1 + tmp3 - z1 = (tmp12 + tmp13) * fix_1_175875602 - z1 += 1 << (constBits + pass1Bits - 1) - tmp0 *= fix_1_501321110 - tmp1 *= fix_3_072711026 - tmp2 *= fix_2_053119869 - tmp3 *= fix_0_298631336 - tmp10 *= -fix_0_899976223 - tmp11 *= -fix_2_562915447 - tmp12 *= -fix_0_390180644 - tmp13 *= -fix_1_961570560 - - tmp12 += z1 - tmp13 += z1 - b[1*8+x] = (tmp0 + tmp10 + tmp12) >> (constBits + pass1Bits) - b[3*8+x] = (tmp1 + tmp11 + tmp13) >> (constBits + pass1Bits) - b[5*8+x] = (tmp2 + tmp11 + tmp12) >> (constBits + pass1Bits) - b[7*8+x] = (tmp3 + tmp10 + tmp13) >> (constBits + pass1Bits) - } -} diff --git a/src/image/jpeg/idct.go b/src/image/jpeg/idct.go deleted file mode 100644 index a3957c8ada..0000000000 --- a/src/image/jpeg/idct.go +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package jpeg - -// This is a Go translation of idct.c from -// -// http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz -// -// which carries the following notice: - -/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */ - -/* - * Disclaimer of Warranty - * - * These software programs are available to the user without any license fee or - * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims - * any and all warranties, whether express, implied, or statuary, including any - * implied warranties or merchantability or of fitness for a particular - * purpose. In no event shall the copyright-holder be liable for any - * incidental, punitive, or consequential damages of any kind whatsoever - * arising from the use of these programs. - * - * This disclaimer of warranty extends to the user of these programs and user's - * customers, employees, agents, transferees, successors, and assigns. - * - * The MPEG Software Simulation Group does not represent or warrant that the - * programs furnished hereunder are free of infringement of any third-party - * patents. - * - * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware, - * are subject to royalty fees to patent holders. Many of these patents are - * general enough such that they are unavoidable regardless of implementation - * design. - * - */ - -const blockSize = 64 // A DCT block is 8x8. - -type block [blockSize]int32 - -const ( - w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16) - w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16) - w3 = 2408 // 2048*sqrt(2)*cos(3*pi/16) - w5 = 1609 // 2048*sqrt(2)*cos(5*pi/16) - w6 = 1108 // 2048*sqrt(2)*cos(6*pi/16) - w7 = 565 // 2048*sqrt(2)*cos(7*pi/16) - - w1pw7 = w1 + w7 - w1mw7 = w1 - w7 - w2pw6 = w2 + w6 - w2mw6 = w2 - w6 - w3pw5 = w3 + w5 - w3mw5 = w3 - w5 - - r2 = 181 // 256/sqrt(2) -) - -// idct performs a 2-D Inverse Discrete Cosine Transformation. -// -// The input coefficients should already have been multiplied by the -// appropriate quantization table. We use fixed-point computation, with the -// number of bits for the fractional component varying over the intermediate -// stages. -// -// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the -// discrete W transform and for the discrete Fourier transform", IEEE Trans. on -// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984. -func idct(src *block) { - // Horizontal 1-D IDCT. - for y := 0; y < 8; y++ { - y8 := y * 8 - s := src[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857 - // If all the AC components are zero, then the IDCT is trivial. - if s[1] == 0 && s[2] == 0 && s[3] == 0 && - s[4] == 0 && s[5] == 0 && s[6] == 0 && s[7] == 0 { - dc := s[0] << 3 - s[0] = dc - s[1] = dc - s[2] = dc - s[3] = dc - s[4] = dc - s[5] = dc - s[6] = dc - s[7] = dc - continue - } - - // Prescale. - x0 := (s[0] << 11) + 128 - x1 := s[4] << 11 - x2 := s[6] - x3 := s[2] - x4 := s[1] - x5 := s[7] - x6 := s[5] - x7 := s[3] - - // Stage 1. - x8 := w7 * (x4 + x5) - x4 = x8 + w1mw7*x4 - x5 = x8 - w1pw7*x5 - x8 = w3 * (x6 + x7) - x6 = x8 - w3mw5*x6 - x7 = x8 - w3pw5*x7 - - // Stage 2. - x8 = x0 + x1 - x0 -= x1 - x1 = w6 * (x3 + x2) - x2 = x1 - w2pw6*x2 - x3 = x1 + w2mw6*x3 - x1 = x4 + x6 - x4 -= x6 - x6 = x5 + x7 - x5 -= x7 - - // Stage 3. - x7 = x8 + x3 - x8 -= x3 - x3 = x0 + x2 - x0 -= x2 - x2 = (r2*(x4+x5) + 128) >> 8 - x4 = (r2*(x4-x5) + 128) >> 8 - - // Stage 4. - s[0] = (x7 + x1) >> 8 - s[1] = (x3 + x2) >> 8 - s[2] = (x0 + x4) >> 8 - s[3] = (x8 + x6) >> 8 - s[4] = (x8 - x6) >> 8 - s[5] = (x0 - x4) >> 8 - s[6] = (x3 - x2) >> 8 - s[7] = (x7 - x1) >> 8 - } - - // Vertical 1-D IDCT. - for x := 0; x < 8; x++ { - // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial. - // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so - // we do not bother to check for the all-zero case. - s := src[x : x+57 : x+57] // Small cap improves performance, see https://golang.org/issue/27857 - - // Prescale. - y0 := (s[8*0] << 8) + 8192 - y1 := s[8*4] << 8 - y2 := s[8*6] - y3 := s[8*2] - y4 := s[8*1] - y5 := s[8*7] - y6 := s[8*5] - y7 := s[8*3] - - // Stage 1. - y8 := w7*(y4+y5) + 4 - y4 = (y8 + w1mw7*y4) >> 3 - y5 = (y8 - w1pw7*y5) >> 3 - y8 = w3*(y6+y7) + 4 - y6 = (y8 - w3mw5*y6) >> 3 - y7 = (y8 - w3pw5*y7) >> 3 - - // Stage 2. - y8 = y0 + y1 - y0 -= y1 - y1 = w6*(y3+y2) + 4 - y2 = (y1 - w2pw6*y2) >> 3 - y3 = (y1 + w2mw6*y3) >> 3 - y1 = y4 + y6 - y4 -= y6 - y6 = y5 + y7 - y5 -= y7 - - // Stage 3. - y7 = y8 + y3 - y8 -= y3 - y3 = y0 + y2 - y0 -= y2 - y2 = (r2*(y4+y5) + 128) >> 8 - y4 = (r2*(y4-y5) + 128) >> 8 - - // Stage 4. - s[8*0] = (y7 + y1) >> 14 - s[8*1] = (y3 + y2) >> 14 - s[8*2] = (y0 + y4) >> 14 - s[8*3] = (y8 + y6) >> 14 - s[8*4] = (y8 - y6) >> 14 - s[8*5] = (y0 - y4) >> 14 - s[8*6] = (y3 - y2) >> 14 - s[8*7] = (y7 - y1) >> 14 - } -} diff --git a/src/image/jpeg/writer_test.go b/src/image/jpeg/writer_test.go index 341477044a..197b49ec13 100644 --- a/src/image/jpeg/writer_test.go +++ b/src/image/jpeg/writer_test.go @@ -154,8 +154,8 @@ func TestWriter(t *testing.T) { continue } // Compare the average delta to the tolerance level. - if averageDelta(m0, m1) > tc.tolerance { - t.Errorf("%s, quality=%d: average delta is too high", tc.filename, tc.quality) + if d := averageDelta(m0, m1); d > tc.tolerance { + t.Errorf("%s, quality=%d: average delta is too high (%d > %d)", tc.filename, tc.quality, d, tc.tolerance) continue } } |
