dbf-halloween2015

diff libs/libjpeg/jfdctfst.c @ 1:c3f5c32cb210

barfed all the libraries in the source tree to make porting easier
author John Tsiombikas <nuclear@member.fsf.org>
date Sun, 01 Nov 2015 00:36:56 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libs/libjpeg/jfdctfst.c	Sun Nov 01 00:36:56 2015 +0200
     1.3 @@ -0,0 +1,224 @@
     1.4 +/*
     1.5 + * jfdctfst.c
     1.6 + *
     1.7 + * Copyright (C) 1994-1996, Thomas G. Lane.
     1.8 + * This file is part of the Independent JPEG Group's software.
     1.9 + * For conditions of distribution and use, see the accompanying README file.
    1.10 + *
    1.11 + * This file contains a fast, not so accurate integer implementation of the
    1.12 + * forward DCT (Discrete Cosine Transform).
    1.13 + *
    1.14 + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
    1.15 + * on each column.  Direct algorithms are also available, but they are
    1.16 + * much more complex and seem not to be any faster when reduced to code.
    1.17 + *
    1.18 + * This implementation is based on Arai, Agui, and Nakajima's algorithm for
    1.19 + * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
    1.20 + * Japanese, but the algorithm is described in the Pennebaker & Mitchell
    1.21 + * JPEG textbook (see REFERENCES section in file README).  The following code
    1.22 + * is based directly on figure 4-8 in P&M.
    1.23 + * While an 8-point DCT cannot be done in less than 11 multiplies, it is
    1.24 + * possible to arrange the computation so that many of the multiplies are
    1.25 + * simple scalings of the final outputs.  These multiplies can then be
    1.26 + * folded into the multiplications or divisions by the JPEG quantization
    1.27 + * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
    1.28 + * to be done in the DCT itself.
    1.29 + * The primary disadvantage of this method is that with fixed-point math,
    1.30 + * accuracy is lost due to imprecise representation of the scaled
    1.31 + * quantization values.  The smaller the quantization table entry, the less
    1.32 + * precise the scaled value, so this implementation does worse with high-
    1.33 + * quality-setting files than with low-quality ones.
    1.34 + */
    1.35 +
    1.36 +#define JPEG_INTERNALS
    1.37 +#include "jinclude.h"
    1.38 +#include "jpeglib.h"
    1.39 +#include "jdct.h"		/* Private declarations for DCT subsystem */
    1.40 +
    1.41 +#ifdef DCT_IFAST_SUPPORTED
    1.42 +
    1.43 +
    1.44 +/*
    1.45 + * This module is specialized to the case DCTSIZE = 8.
    1.46 + */
    1.47 +
    1.48 +#if DCTSIZE != 8
    1.49 +  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
    1.50 +#endif
    1.51 +
    1.52 +
    1.53 +/* Scaling decisions are generally the same as in the LL&M algorithm;
    1.54 + * see jfdctint.c for more details.  However, we choose to descale
    1.55 + * (right shift) multiplication products as soon as they are formed,
    1.56 + * rather than carrying additional fractional bits into subsequent additions.
    1.57 + * This compromises accuracy slightly, but it lets us save a few shifts.
    1.58 + * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
    1.59 + * everywhere except in the multiplications proper; this saves a good deal
    1.60 + * of work on 16-bit-int machines.
    1.61 + *
    1.62 + * Again to save a few shifts, the intermediate results between pass 1 and
    1.63 + * pass 2 are not upscaled, but are represented only to integral precision.
    1.64 + *
    1.65 + * A final compromise is to represent the multiplicative constants to only
    1.66 + * 8 fractional bits, rather than 13.  This saves some shifting work on some
    1.67 + * machines, and may also reduce the cost of multiplication (since there
    1.68 + * are fewer one-bits in the constants).
    1.69 + */
    1.70 +
    1.71 +#define CONST_BITS  8		/* fractional bits carried by the FIX_* constants; MULTIPLY shifts them back out */
    1.72 +
    1.73 +
    1.74 +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
    1.75 + * causing a lot of useless floating-point operations at run time.
    1.76 + * To get around this we use the following pre-calculated constants.
    1.77 + * If you change CONST_BITS you may want to add appropriate values.
    1.78 + * (With a reasonable C compiler, you can just rely on the FIX() macro...)
    1.79 + */
    1.80 +
    1.81 +#if CONST_BITS == 8	/* literals below are x * 2^8, rounded to the nearest integer */
    1.82 +#define FIX_0_382683433  ((INT32)   98)		/* FIX(0.382683433) */
    1.83 +#define FIX_0_541196100  ((INT32)  139)		/* FIX(0.541196100) */
    1.84 +#define FIX_0_707106781  ((INT32)  181)		/* FIX(0.707106781) */
    1.85 +#define FIX_1_306562965  ((INT32)  334)		/* FIX(1.306562965) */
    1.86 +#else	/* CONST_BITS != 8: fall back to FIX(), which is not defined in this file */
    1.87 +#define FIX_0_382683433  FIX(0.382683433)
    1.88 +#define FIX_0_541196100  FIX(0.541196100)
    1.89 +#define FIX_0_707106781  FIX(0.707106781)
    1.90 +#define FIX_1_306562965  FIX(1.306562965)
    1.91 +#endif	/* CONST_BITS == 8 */
    1.92 +
    1.93 +
    1.94 +/* We can gain a little more speed, with a further compromise in accuracy,
    1.95 + * by omitting the addition in a descaling shift.  This yields an incorrectly
    1.96 + * rounded result half the time...
    1.97 + */
    1.98 +
    1.99 +#ifndef USE_ACCURATE_ROUNDING	/* default build: replace any earlier DESCALE with a bare shift */
   1.100 +#undef DESCALE
   1.101 +#define DESCALE(x,n)  RIGHT_SHIFT(x, n)	/* no rounding term is added before the shift */
   1.102 +#endif	/* !USE_ACCURATE_ROUNDING */
   1.103 +
   1.104 +
   1.105 +/* Multiply a DCTELEM variable by an INT32 constant, and immediately
   1.106 + * descale to yield a DCTELEM result: the product is handed to DESCALE with CONST_BITS, then cast to DCTELEM.
   1.107 + */
   1.108 +
   1.109 +#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
   1.110 +
   1.111 +
   1.112 +/* Perform the forward DCT on one block of samples.
   1.113 + * "data" holds DCTSIZE*DCTSIZE elements, stored as rows of DCTSIZE consecutive values, and is overwritten in place.
   1.114 + * Pass 1 transforms every row; pass 2 then transforms every column of the pass-1 results.  Nothing is returned. */
   1.115 +
   1.116 +GLOBAL(void)
   1.117 +jpeg_fdct_ifast (DCTELEM * data)
   1.118 +{
   1.119 +  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;	/* sums (0..3) and differences (4..7) of mirrored inputs */
   1.120 +  DCTELEM tmp10, tmp11, tmp12, tmp13;	/* scratch; tmp10..tmp12 are reused by the odd part */
   1.121 +  DCTELEM z1, z2, z3, z4, z5, z11, z13;	/* descaled products and their combinations */
   1.122 +  DCTELEM *dataptr;	/* start of the row (pass 1) or column (pass 2) being processed */
   1.123 +  int ctr;	/* counts the remaining rows/columns */
   1.124 +  SHIFT_TEMPS	/* defined outside this file; presumably declares any temporary RIGHT_SHIFT needs -- TODO confirm */
   1.125 +
   1.126 +  /* Pass 1: process rows.  Each iteration transforms DCTSIZE consecutive elements in place. */
   1.127 +
   1.128 +  dataptr = data;
   1.129 +  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
   1.130 +    tmp0 = dataptr[0] + dataptr[7];
   1.131 +    tmp7 = dataptr[0] - dataptr[7];
   1.132 +    tmp1 = dataptr[1] + dataptr[6];
   1.133 +    tmp6 = dataptr[1] - dataptr[6];
   1.134 +    tmp2 = dataptr[2] + dataptr[5];
   1.135 +    tmp5 = dataptr[2] - dataptr[5];
   1.136 +    tmp3 = dataptr[3] + dataptr[4];
   1.137 +    tmp4 = dataptr[3] - dataptr[4];
   1.138 +    /* All eight inputs are now held in temporaries, so the stores below cannot disturb a later read. */
   1.139 +    /* Even part: uses only the sums tmp0..tmp3; produces outputs 0, 4, 2 and 6. */
   1.140 +    
   1.141 +    tmp10 = tmp0 + tmp3;	/* phase 2 */
   1.142 +    tmp13 = tmp0 - tmp3;
   1.143 +    tmp11 = tmp1 + tmp2;
   1.144 +    tmp12 = tmp1 - tmp2;
   1.145 +    
   1.146 +    dataptr[0] = tmp10 + tmp11; /* phase 3 */
   1.147 +    dataptr[4] = tmp10 - tmp11;
   1.148 +    
   1.149 +    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
   1.150 +    dataptr[2] = tmp13 + z1;	/* phase 5 */
   1.151 +    dataptr[6] = tmp13 - z1;
   1.152 +    
   1.153 +    /* Odd part: uses only the differences tmp4..tmp7; produces outputs 5, 3, 1 and 7. */
   1.154 +
   1.155 +    tmp10 = tmp4 + tmp5;	/* phase 2 */
   1.156 +    tmp11 = tmp5 + tmp6;
   1.157 +    tmp12 = tmp6 + tmp7;
   1.158 +
   1.159 +    /* The rotator is modified from fig 4-8 to avoid extra negations. */
   1.160 +    z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6; term shared by z2 and z4 */
   1.161 +    z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
   1.162 +    z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
   1.163 +    z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
   1.164 +
   1.165 +    z11 = tmp7 + z3;		/* phase 5 */
   1.166 +    z13 = tmp7 - z3;
   1.167 +
   1.168 +    dataptr[5] = z13 + z2;	/* phase 6 */
   1.169 +    dataptr[3] = z13 - z2;
   1.170 +    dataptr[1] = z11 + z4;
   1.171 +    dataptr[7] = z11 - z4;
   1.172 +
   1.173 +    dataptr += DCTSIZE;		/* advance pointer to next row */
   1.174 +  }
   1.175 +
   1.176 +  /* Pass 2: process columns.  Same arithmetic as pass 1, applied to elements DCTSIZE apart. */
   1.177 +
   1.178 +  dataptr = data;
   1.179 +  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
   1.180 +    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
   1.181 +    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
   1.182 +    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
   1.183 +    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
   1.184 +    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
   1.185 +    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
   1.186 +    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
   1.187 +    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
   1.188 +    /* As in pass 1, the whole column is read into temporaries before any element is overwritten. */
   1.189 +    /* Even part: uses only the sums tmp0..tmp3. */
   1.190 +    
   1.191 +    tmp10 = tmp0 + tmp3;	/* phase 2 */
   1.192 +    tmp13 = tmp0 - tmp3;
   1.193 +    tmp11 = tmp1 + tmp2;
   1.194 +    tmp12 = tmp1 - tmp2;
   1.195 +    
   1.196 +    dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
   1.197 +    dataptr[DCTSIZE*4] = tmp10 - tmp11;
   1.198 +    
   1.199 +    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
   1.200 +    dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
   1.201 +    dataptr[DCTSIZE*6] = tmp13 - z1;
   1.202 +    
   1.203 +    /* Odd part: uses only the differences tmp4..tmp7. */
   1.204 +
   1.205 +    tmp10 = tmp4 + tmp5;	/* phase 2 */
   1.206 +    tmp11 = tmp5 + tmp6;
   1.207 +    tmp12 = tmp6 + tmp7;
   1.208 +
   1.209 +    /* The rotator is modified from fig 4-8 to avoid extra negations. */
   1.210 +    z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6; term shared by z2 and z4 */
   1.211 +    z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
   1.212 +    z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
   1.213 +    z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
   1.214 +
   1.215 +    z11 = tmp7 + z3;		/* phase 5 */
   1.216 +    z13 = tmp7 - z3;
   1.217 +
   1.218 +    dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
   1.219 +    dataptr[DCTSIZE*3] = z13 - z2;
   1.220 +    dataptr[DCTSIZE*1] = z11 + z4;
   1.221 +    dataptr[DCTSIZE*7] = z11 - z4;
   1.222 +
   1.223 +    dataptr++;			/* advance pointer to next column */
   1.224 +  }
   1.225 +}
   1.226 +
   1.227 +#endif /* DCT_IFAST_SUPPORTED */