/* ***** BEGIN LICENSE BLOCK ***** * Source last modified: $Id: dct4.c,v 1.1 2005/02/26 01:47:34 jrecker Exp $ * * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. * * The contents of this file, and the files included with this file, * are subject to the current version of the RealNetworks Public * Source License (the "RPSL") available at * http://www.helixcommunity.org/content/rpsl unless you have licensed * the file under the current version of the RealNetworks Community * Source License (the "RCSL") available at * http://www.helixcommunity.org/content/rcsl, in which case the RCSL * will apply. You may also obtain the license terms directly from * RealNetworks. You may not use this file except in compliance with * the RPSL or, if you have a valid RCSL with RealNetworks applicable * to this file, the RCSL. Please see the applicable RPSL or RCSL for * the rights, obligations and limitations governing use of the * contents of the file. * * This file is part of the Helix DNA Technology. RealNetworks is the * developer of the Original Code and owns the copyrights in the * portions it created. * * This file, and the files included with this file, is distributed * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET * ENJOYMENT OR NON-INFRINGEMENT. * * Technology Compatibility Kit Test Suite(s) Location: * http://www.helixcommunity.org/content/tck * * Contributor(s): * * ***** END LICENSE BLOCK ***** */ /************************************************************************************** * Fixed-point HE-AAC decoder * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) * February 2005 * * dct4.c - optimized DCT-IV **************************************************************************************/ #include "coder.h" #include "assembly.h" static const int nmdctTab[NUM_IMDCT_SIZES] PROGMEM = {128, 1024}; static const int postSkip[NUM_IMDCT_SIZES] PROGMEM = {15, 1}; /************************************************************************************** * Function: PreMultiply * * Description: pre-twiddle stage of DCT4 * * Inputs: table index (for transform size) * buffer of nmdct samples * * Outputs: processed samples in same buffer * * Return: none * * Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits * i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long) * normalization by -1/N is rolled into tables here (see trigtabs.c) * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add **************************************************************************************/ static void PreMultiply(int tabidx, int *zbuf1) { int i, nmdct, ar1, ai1, ar2, ai2, z1, z2; int t, cms2, cps2a, sin2a, cps2b, sin2b; int *zbuf2; const int *csptr; nmdct = nmdctTab[tabidx]; zbuf2 = zbuf1 + nmdct - 1; csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; /* whole thing should fit in registers - verify that compiler does this */ for (i = nmdct >> 2; i != 0; i--) { /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ cps2a = *csptr++; sin2a = *csptr++; cps2b = *csptr++; sin2b = *csptr++; ar1 = *(zbuf1 + 0); ai2 = *(zbuf1 + 1); ai1 = *(zbuf2 + 0); ar2 = *(zbuf2 - 1); /* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414 * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from * extra sign bits, and eat one in adding */ t = MULSHIFT32(sin2a, ar1 + ai1); z2 = MULSHIFT32(cps2a, ai1) - t; cms2 = cps2a - 2*sin2a; z1 = MULSHIFT32(cms2, ar1) + t; *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */ *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */ t = MULSHIFT32(sin2b, ar2 + ai2); z2 = MULSHIFT32(cps2b, ai2) - t; cms2 = cps2b - 2*sin2b; z1 = MULSHIFT32(cms2, ar2) + t; *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */ *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */ } } /************************************************************************************** * Function: PostMultiply * * Description: post-twiddle stage of DCT4 * * Inputs: table index (for transform size) * buffer of nmdct samples * * Outputs: processed samples in same buffer * * Return: none * * Notes: minimum 1 GB in, 2 GB out - gains 2 int bits * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add **************************************************************************************/ static void PostMultiply(int tabidx, int *fft1) { int i, nmdct, ar1, ai1, ar2, ai2, skipFactor; int t, cms2, cps2, sin2; int *fft2; const int *csptr; nmdct = nmdctTab[tabidx]; csptr = cos1sin1tab; skipFactor = postSkip[tabidx]; fft2 = fft1 + nmdct - 1; /* load coeffs for first pass * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ cps2 = *csptr++; sin2 = *csptr; csptr += skipFactor; cms2 = cps2 - 2*sin2; for (i = nmdct >> 2; i != 0; i--) { ar1 = *(fft1 + 0); ai1 = *(fft1 + 1); ar2 = *(fft2 - 1); ai2 = *(fft2 + 0); /* gain 2 ints bit from MULSHIFT32 by Q30 * max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414 * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from * extra sign bits, and eat one in adding */ t = MULSHIFT32(sin2, ar1 + ai1); *fft2-- = t - MULSHIFT32(cps2, ai1); /* sin*ar1 - cos*ai1 */ *fft1++ = t + MULSHIFT32(cms2, ar1); /* cos*ar1 + sin*ai1 */ cps2 = *csptr++; sin2 = *csptr; csptr += skipFactor; ai2 = -ai2; t = MULSHIFT32(sin2, ar2 + ai2); *fft2-- = t - MULSHIFT32(cps2, ai2); /* sin*ar1 - cos*ai1 */ cms2 = cps2 - 2*sin2; *fft1++ = t + MULSHIFT32(cms2, ar2); /* cos*ar1 + sin*ai1 */ } } /************************************************************************************** * Function: PreMultiplyRescale * * Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits * * Inputs: table index (for transform size) * buffer of nmdct samples * number of guard bits to add to input before processing * * Outputs: processed samples in same buffer * * Return: none * * Notes: see notes on PreMultiply(), above **************************************************************************************/ /* __attribute__ ((section (".data"))) */ static void PreMultiplyRescale(int tabidx, int *zbuf1, int es) { int i, nmdct, ar1, ai1, ar2, ai2, z1, z2; int t, cms2, cps2a, sin2a, cps2b, sin2b; int *zbuf2; const int *csptr; nmdct = nmdctTab[tabidx]; zbuf2 = zbuf1 + nmdct - 1; csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; /* whole thing should fit in registers - verify that compiler does this */ for (i = nmdct >> 2; i != 0; i--) { /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ cps2a = *csptr++; sin2a = *csptr++; cps2b = *csptr++; sin2b = *csptr++; ar1 = *(zbuf1 + 0) >> es; ai1 = *(zbuf2 + 0) >> es; ai2 = *(zbuf1 + 1) >> es; t = MULSHIFT32(sin2a, ar1 + ai1); z2 = MULSHIFT32(cps2a, ai1) - t; cms2 = cps2a - 2*sin2a; z1 = MULSHIFT32(cms2, ar1) + t; *zbuf1++ = z1; *zbuf1++ = z2; ar2 = *(zbuf2 - 1) >> es; /* do here to free up register used for es */ t = MULSHIFT32(sin2b, ar2 + ai2); z2 = MULSHIFT32(cps2b, ai2) - t; cms2 = cps2b - 2*sin2b; z1 = MULSHIFT32(cms2, ar2) + t; *zbuf2-- = z2; *zbuf2-- = z1; } } /************************************************************************************** * Function: PostMultiplyRescale * * Description: post-twiddle stage of DCT4, with rescaling for extra guard bits * * Inputs: table index (for transform size) * buffer of nmdct samples * number of guard bits to remove from output * * Outputs: processed samples in same buffer * * Return: none * * Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit * see notes on PostMultiply(), above **************************************************************************************/ /* __attribute__ ((section (".data"))) */ static void PostMultiplyRescale(int tabidx, int *fft1, int es) { int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z; int t, cs2, sin2; int *fft2; const int *csptr; nmdct = nmdctTab[tabidx]; csptr = cos1sin1tab; skipFactor = postSkip[tabidx]; fft2 = fft1 + nmdct - 1; /* load coeffs for first pass * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ cs2 = *csptr++; sin2 = *csptr; csptr += skipFactor; for (i = nmdct >> 2; i != 0; i--) { ar1 = *(fft1 + 0); ai1 = *(fft1 + 1); ai2 = *(fft2 + 0); t = MULSHIFT32(sin2, ar1 + ai1); z = t - MULSHIFT32(cs2, ai1); CLIP_2N_SHIFT(z, es); *fft2-- = z; cs2 -= 2*sin2; z = t + MULSHIFT32(cs2, ar1); CLIP_2N_SHIFT(z, es); *fft1++ = z; cs2 = *csptr++; sin2 = *csptr; csptr += skipFactor; ar2 = *fft2; ai2 = -ai2; t = MULSHIFT32(sin2, ar2 + ai2); z = t - MULSHIFT32(cs2, ai2); CLIP_2N_SHIFT(z, es); *fft2-- = z; cs2 -= 2*sin2; z = t + MULSHIFT32(cs2, ar2); CLIP_2N_SHIFT(z, es); *fft1++ = z; cs2 += 2*sin2; } } /************************************************************************************** * Function: DCT4 * * Description: type-IV DCT * * Inputs: table index (for transform size) * buffer of nmdct samples * number of guard bits in the input buffer * * Outputs: processed samples in same buffer * * Return: none * * Notes: operates in-place * if number of guard bits in input is < GBITS_IN_DCT4, the input is * scaled (>>) before the DCT4 and rescaled (<<, with clipping) after * the DCT4 (rare) * the output has FBITS_LOST_DCT4 fewer fraction bits than the input * the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4) * int bits gained per stage (PreMul + FFT + PostMul) * short blocks = (-5 + 4 + 2) = 1 total * long blocks = (-8 + 7 + 2) = 1 total **************************************************************************************/ void DCT4(int tabidx, int *coef, int gb) { int es; /* fast in-place DCT-IV - adds guard bits if necessary */ if (gb < GBITS_IN_DCT4) { es = GBITS_IN_DCT4 - gb; PreMultiplyRescale(tabidx, coef, es); R4FFT(tabidx, coef); PostMultiplyRescale(tabidx, coef, es); } else { PreMultiply(tabidx, coef); R4FFT(tabidx, coef); PostMultiply(tabidx, coef); } }