vrshoot
diff libs/libjpeg/jidctint.c @ 0:b2f14e535253
initial commit
author | John Tsiombikas <nuclear@member.fsf.org> |
---|---|
date | Sat, 01 Feb 2014 19:58:19 +0200 |
parents | |
children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/libs/libjpeg/jidctint.c Sat Feb 01 19:58:19 2014 +0200 1.3 @@ -0,0 +1,389 @@ 1.4 +/* 1.5 + * jidctint.c 1.6 + * 1.7 + * Copyright (C) 1991-1998, Thomas G. Lane. 1.8 + * This file is part of the Independent JPEG Group's software. 1.9 + * For conditions of distribution and use, see the accompanying README file. 1.10 + * 1.11 + * This file contains a slow-but-accurate integer implementation of the 1.12 + * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine 1.13 + * must also perform dequantization of the input coefficients. 1.14 + * 1.15 + * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT 1.16 + * on each row (or vice versa, but it's more convenient to emit a row at 1.17 + * a time). Direct algorithms are also available, but they are much more 1.18 + * complex and seem not to be any faster when reduced to code. 1.19 + * 1.20 + * This implementation is based on an algorithm described in 1.21 + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT 1.22 + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, 1.23 + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. 1.24 + * The primary algorithm described there uses 11 multiplies and 29 adds. 1.25 + * We use their alternate method with 12 multiplies and 32 adds. 1.26 + * The advantage of this method is that no data path contains more than one 1.27 + * multiplication; this allows a very simple and accurate implementation in 1.28 + * scaled fixed-point arithmetic, with a minimal number of shifts. 1.29 + */ 1.30 + 1.31 +#define JPEG_INTERNALS 1.32 +#include "jinclude.h" 1.33 +#include "jpeglib.h" 1.34 +#include "jdct.h" /* Private declarations for DCT subsystem */ 1.35 + 1.36 +#ifdef DCT_ISLOW_SUPPORTED 1.37 + 1.38 + 1.39 +/* 1.40 + * This module is specialized to the case DCTSIZE = 8. 1.41 + */ 1.42 + 1.43 +#if DCTSIZE != 8 1.44 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ 1.45 +#endif 1.46 + 1.47 + 1.48 +/* 1.49 + * The poop on this scaling stuff is as follows: 1.50 + * 1.51 + * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) 1.52 + * larger than the true IDCT outputs. The final outputs are therefore 1.53 + * a factor of N larger than desired; since N=8 this can be cured by 1.54 + * a simple right shift at the end of the algorithm. The advantage of 1.55 + * this arrangement is that we save two multiplications per 1-D IDCT, 1.56 + * because the y0 and y4 inputs need not be divided by sqrt(N). 1.57 + * 1.58 + * We have to do addition and subtraction of the integer inputs, which 1.59 + * is no problem, and multiplication by fractional constants, which is 1.60 + * a problem to do in integer arithmetic. We multiply all the constants 1.61 + * by CONST_SCALE and convert them to integer constants (thus retaining 1.62 + * CONST_BITS bits of precision in the constants). After doing a 1.63 + * multiplication we have to divide the product by CONST_SCALE, with proper 1.64 + * rounding, to produce the correct output. This division can be done 1.65 + * cheaply as a right shift of CONST_BITS bits. We postpone shifting 1.66 + * as long as possible so that partial sums can be added together with 1.67 + * full fractional precision. 1.68 + * 1.69 + * The outputs of the first pass are scaled up by PASS1_BITS bits so that 1.70 + * they are represented to better-than-integral precision. These outputs 1.71 + * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word 1.72 + * with the recommended scaling. (To scale up 12-bit sample data further, an 1.73 + * intermediate INT32 array would be needed.) 1.74 + * 1.75 + * To avoid overflow of the 32-bit intermediate results in pass 2, we must 1.76 + * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis 1.77 + * shows that the values given below are the most effective. 1.78 + */ 1.79 + 1.80 +#if BITS_IN_JSAMPLE == 8 1.81 +#define CONST_BITS 13 1.82 +#define PASS1_BITS 2 1.83 +#else 1.84 +#define CONST_BITS 13 1.85 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ 1.86 +#endif 1.87 + 1.88 +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus 1.89 + * causing a lot of useless floating-point operations at run time. 1.90 + * To get around this we use the following pre-calculated constants. 1.91 + * If you change CONST_BITS you may want to add appropriate values. 1.92 + * (With a reasonable C compiler, you can just rely on the FIX() macro...) 1.93 + */ 1.94 + 1.95 +#if CONST_BITS == 13 1.96 +#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */ 1.97 +#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */ 1.98 +#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */ 1.99 +#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */ 1.100 +#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */ 1.101 +#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */ 1.102 +#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */ 1.103 +#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */ 1.104 +#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */ 1.105 +#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */ 1.106 +#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */ 1.107 +#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */ 1.108 +#else 1.109 +#define FIX_0_298631336 FIX(0.298631336) 1.110 +#define FIX_0_390180644 FIX(0.390180644) 1.111 +#define FIX_0_541196100 FIX(0.541196100) 1.112 +#define FIX_0_765366865 FIX(0.765366865) 1.113 +#define FIX_0_899976223 FIX(0.899976223) 1.114 +#define FIX_1_175875602 FIX(1.175875602) 1.115 +#define FIX_1_501321110 FIX(1.501321110) 1.116 +#define FIX_1_847759065 FIX(1.847759065) 1.117 +#define FIX_1_961570560 FIX(1.961570560) 1.118 +#define FIX_2_053119869 FIX(2.053119869) 1.119 +#define FIX_2_562915447 FIX(2.562915447) 1.120 +#define FIX_3_072711026 FIX(3.072711026) 1.121 +#endif 1.122 + 1.123 + 1.124 +/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result. 1.125 + * For 8-bit samples with the recommended scaling, all the variable 1.126 + * and constant values involved are no more than 16 bits wide, so a 1.127 + * 16x16->32 bit multiply can be used instead of a full 32x32 multiply. 1.128 + * For 12-bit samples, a full 32-bit multiplication will be needed. 1.129 + */ 1.130 + 1.131 +#if BITS_IN_JSAMPLE == 8 1.132 +#define MULTIPLY(var,const) MULTIPLY16C16(var,const) 1.133 +#else 1.134 +#define MULTIPLY(var,const) ((var) * (const)) 1.135 +#endif 1.136 + 1.137 + 1.138 +/* Dequantize a coefficient by multiplying it by the multiplier-table 1.139 + * entry; produce an int result. In this module, both inputs and result 1.140 + * are 16 bits or less, so either int or short multiply will work. 1.141 + */ 1.142 + 1.143 +#define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval)) 1.144 + 1.145 + 1.146 +/* 1.147 + * Perform dequantization and inverse DCT on one block of coefficients. 1.148 + */ 1.149 + 1.150 +GLOBAL(void) 1.151 +jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, 1.152 + JCOEFPTR coef_block, 1.153 + JSAMPARRAY output_buf, JDIMENSION output_col) 1.154 +{ 1.155 + INT32 tmp0, tmp1, tmp2, tmp3; 1.156 + INT32 tmp10, tmp11, tmp12, tmp13; 1.157 + INT32 z1, z2, z3, z4, z5; 1.158 + JCOEFPTR inptr; 1.159 + ISLOW_MULT_TYPE * quantptr; 1.160 + int * wsptr; 1.161 + JSAMPROW outptr; 1.162 + JSAMPLE *range_limit = IDCT_range_limit(cinfo); 1.163 + int ctr; 1.164 + int workspace[DCTSIZE2]; /* buffers data between passes */ 1.165 + SHIFT_TEMPS 1.166 + 1.167 + /* Pass 1: process columns from input, store into work array. */ 1.168 + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ 1.169 + /* furthermore, we scale the results by 2**PASS1_BITS. */ 1.170 + 1.171 + inptr = coef_block; 1.172 + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; 1.173 + wsptr = workspace; 1.174 + for (ctr = DCTSIZE; ctr > 0; ctr--) { 1.175 + /* Due to quantization, we will usually find that many of the input 1.176 + * coefficients are zero, especially the AC terms. We can exploit this 1.177 + * by short-circuiting the IDCT calculation for any column in which all 1.178 + * the AC terms are zero. In that case each output is equal to the 1.179 + * DC coefficient (with scale factor as needed). 1.180 + * With typical images and quantization tables, half or more of the 1.181 + * column DCT calculations can be simplified this way. 1.182 + */ 1.183 + 1.184 + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && 1.185 + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && 1.186 + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && 1.187 + inptr[DCTSIZE*7] == 0) { 1.188 + /* AC terms all zero */ 1.189 + int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; 1.190 + 1.191 + wsptr[DCTSIZE*0] = dcval; 1.192 + wsptr[DCTSIZE*1] = dcval; 1.193 + wsptr[DCTSIZE*2] = dcval; 1.194 + wsptr[DCTSIZE*3] = dcval; 1.195 + wsptr[DCTSIZE*4] = dcval; 1.196 + wsptr[DCTSIZE*5] = dcval; 1.197 + wsptr[DCTSIZE*6] = dcval; 1.198 + wsptr[DCTSIZE*7] = dcval; 1.199 + 1.200 + inptr++; /* advance pointers to next column */ 1.201 + quantptr++; 1.202 + wsptr++; 1.203 + continue; 1.204 + } 1.205 + 1.206 + /* Even part: reverse the even part of the forward DCT. */ 1.207 + /* The rotator is sqrt(2)*c(-6). */ 1.208 + 1.209 + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); 1.210 + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); 1.211 + 1.212 + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); 1.213 + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); 1.214 + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); 1.215 + 1.216 + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); 1.217 + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); 1.218 + 1.219 + tmp0 = (z2 + z3) << CONST_BITS; 1.220 + tmp1 = (z2 - z3) << CONST_BITS; 1.221 + 1.222 + tmp10 = tmp0 + tmp3; 1.223 + tmp13 = tmp0 - tmp3; 1.224 + tmp11 = tmp1 + tmp2; 1.225 + tmp12 = tmp1 - tmp2; 1.226 + 1.227 + /* Odd part per figure 8; the matrix is unitary and hence its 1.228 + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 1.229 + */ 1.230 + 1.231 + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); 1.232 + tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); 1.233 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); 1.234 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); 1.235 + 1.236 + z1 = tmp0 + tmp3; 1.237 + z2 = tmp1 + tmp2; 1.238 + z3 = tmp0 + tmp2; 1.239 + z4 = tmp1 + tmp3; 1.240 + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ 1.241 + 1.242 + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ 1.243 + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ 1.244 + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ 1.245 + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ 1.246 + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ 1.247 + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ 1.248 + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ 1.249 + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ 1.250 + 1.251 + z3 += z5; 1.252 + z4 += z5; 1.253 + 1.254 + tmp0 += z1 + z3; 1.255 + tmp1 += z2 + z4; 1.256 + tmp2 += z2 + z3; 1.257 + tmp3 += z1 + z4; 1.258 + 1.259 + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 1.260 + 1.261 + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); 1.262 + wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); 1.263 + wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); 1.264 + wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); 1.265 + wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); 1.266 + wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); 1.267 + wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); 1.268 + wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); 1.269 + 1.270 + inptr++; /* advance pointers to next column */ 1.271 + quantptr++; 1.272 + wsptr++; 1.273 + } 1.274 + 1.275 + /* Pass 2: process rows from work array, store into output array. */ 1.276 + /* Note that we must descale the results by a factor of 8 == 2**3, */ 1.277 + /* and also undo the PASS1_BITS scaling. */ 1.278 + 1.279 + wsptr = workspace; 1.280 + for (ctr = 0; ctr < DCTSIZE; ctr++) { 1.281 + outptr = output_buf[ctr] + output_col; 1.282 + /* Rows of zeroes can be exploited in the same way as we did with columns. 1.283 + * However, the column calculation has created many nonzero AC terms, so 1.284 + * the simplification applies less often (typically 5% to 10% of the time). 1.285 + * On machines with very fast multiplication, it's possible that the 1.286 + * test takes more time than it's worth. In that case this section 1.287 + * may be commented out. 1.288 + */ 1.289 + 1.290 +#ifndef NO_ZERO_ROW_TEST 1.291 + if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && 1.292 + wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { 1.293 + /* AC terms all zero */ 1.294 + JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) 1.295 + & RANGE_MASK]; 1.296 + 1.297 + outptr[0] = dcval; 1.298 + outptr[1] = dcval; 1.299 + outptr[2] = dcval; 1.300 + outptr[3] = dcval; 1.301 + outptr[4] = dcval; 1.302 + outptr[5] = dcval; 1.303 + outptr[6] = dcval; 1.304 + outptr[7] = dcval; 1.305 + 1.306 + wsptr += DCTSIZE; /* advance pointer to next row */ 1.307 + continue; 1.308 + } 1.309 +#endif 1.310 + 1.311 + /* Even part: reverse the even part of the forward DCT. */ 1.312 + /* The rotator is sqrt(2)*c(-6). */ 1.313 + 1.314 + z2 = (INT32) wsptr[2]; 1.315 + z3 = (INT32) wsptr[6]; 1.316 + 1.317 + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); 1.318 + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); 1.319 + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); 1.320 + 1.321 + tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; 1.322 + tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; 1.323 + 1.324 + tmp10 = tmp0 + tmp3; 1.325 + tmp13 = tmp0 - tmp3; 1.326 + tmp11 = tmp1 + tmp2; 1.327 + tmp12 = tmp1 - tmp2; 1.328 + 1.329 + /* Odd part per figure 8; the matrix is unitary and hence its 1.330 + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 1.331 + */ 1.332 + 1.333 + tmp0 = (INT32) wsptr[7]; 1.334 + tmp1 = (INT32) wsptr[5]; 1.335 + tmp2 = (INT32) wsptr[3]; 1.336 + tmp3 = (INT32) wsptr[1]; 1.337 + 1.338 + z1 = tmp0 + tmp3; 1.339 + z2 = tmp1 + tmp2; 1.340 + z3 = tmp0 + tmp2; 1.341 + z4 = tmp1 + tmp3; 1.342 + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ 1.343 + 1.344 + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ 1.345 + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ 1.346 + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ 1.347 + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ 1.348 + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ 1.349 + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ 1.350 + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ 1.351 + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ 1.352 + 1.353 + z3 += z5; 1.354 + z4 += z5; 1.355 + 1.356 + tmp0 += z1 + z3; 1.357 + tmp1 += z2 + z4; 1.358 + tmp2 += z2 + z3; 1.359 + tmp3 += z1 + z4; 1.360 + 1.361 + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 1.362 + 1.363 + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, 1.364 + CONST_BITS+PASS1_BITS+3) 1.365 + & RANGE_MASK]; 1.366 + outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, 1.367 + CONST_BITS+PASS1_BITS+3) 1.368 + & RANGE_MASK]; 1.369 + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, 1.370 + CONST_BITS+PASS1_BITS+3) 1.371 + & RANGE_MASK]; 1.372 + outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, 1.373 + CONST_BITS+PASS1_BITS+3) 1.374 + & RANGE_MASK]; 1.375 + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, 1.376 + CONST_BITS+PASS1_BITS+3) 1.377 + & RANGE_MASK]; 1.378 + outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, 1.379 + CONST_BITS+PASS1_BITS+3) 1.380 + & RANGE_MASK]; 1.381 + outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, 1.382 + CONST_BITS+PASS1_BITS+3) 1.383 + & RANGE_MASK]; 1.384 + outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, 1.385 + CONST_BITS+PASS1_BITS+3) 1.386 + & RANGE_MASK]; 1.387 + 1.388 + wsptr += DCTSIZE; /* advance pointer to next row */ 1.389 + } 1.390 +} 1.391 + 1.392 +#endif /* DCT_ISLOW_SUPPORTED */