dct4.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. /* ***** BEGIN LICENSE BLOCK *****
  2. * Source last modified: $Id: dct4.c,v 1.1 2005/02/26 01:47:34 jrecker Exp $
  3. *
  4. * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
  5. *
  6. * The contents of this file, and the files included with this file,
  7. * are subject to the current version of the RealNetworks Public
  8. * Source License (the "RPSL") available at
  9. * http://www.helixcommunity.org/content/rpsl unless you have licensed
  10. * the file under the current version of the RealNetworks Community
  11. * Source License (the "RCSL") available at
  12. * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
  13. * will apply. You may also obtain the license terms directly from
  14. * RealNetworks. You may not use this file except in compliance with
  15. * the RPSL or, if you have a valid RCSL with RealNetworks applicable
  16. * to this file, the RCSL. Please see the applicable RPSL or RCSL for
  17. * the rights, obligations and limitations governing use of the
  18. * contents of the file.
  19. *
  20. * This file is part of the Helix DNA Technology. RealNetworks is the
  21. * developer of the Original Code and owns the copyrights in the
  22. * portions it created.
  23. *
  24. * This file, and the files included with this file, is distributed
  25. * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
  26. * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
  27. * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
  28. * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
  29. * ENJOYMENT OR NON-INFRINGEMENT.
  30. *
  31. * Technology Compatibility Kit Test Suite(s) Location:
  32. * http://www.helixcommunity.org/content/tck
  33. *
  34. * Contributor(s):
  35. *
  36. * ***** END LICENSE BLOCK ***** */
  37. /**************************************************************************************
  38. * Fixed-point HE-AAC decoder
  39. * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
  40. * February 2005
  41. *
  42. * dct4.c - optimized DCT-IV
  43. **************************************************************************************/
  44. #include "coder.h"
  45. #include "assembly.h"
  46. static const int nmdctTab[NUM_IMDCT_SIZES] PROGMEM = {128, 1024};
  47. static const int postSkip[NUM_IMDCT_SIZES] PROGMEM = {15, 1};
  48. /**************************************************************************************
  49. * Function: PreMultiply
  50. *
  51. * Description: pre-twiddle stage of DCT4
  52. *
  53. * Inputs: table index (for transform size)
  54. * buffer of nmdct samples
  55. *
  56. * Outputs: processed samples in same buffer
  57. *
  58. * Return: none
  59. *
  60. * Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits
  61. * i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long)
  62. * normalization by -1/N is rolled into tables here (see trigtabs.c)
  63. * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
  64. **************************************************************************************/
  65. static void PreMultiply(int tabidx, int *zbuf1)
  66. {
  67. int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
  68. int t, cms2, cps2a, sin2a, cps2b, sin2b;
  69. int *zbuf2;
  70. const int *csptr;
  71. nmdct = nmdctTab[tabidx];
  72. zbuf2 = zbuf1 + nmdct - 1;
  73. csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
  74. /* whole thing should fit in registers - verify that compiler does this */
  75. for (i = nmdct >> 2; i != 0; i--) {
  76. /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
  77. cps2a = *csptr++;
  78. sin2a = *csptr++;
  79. cps2b = *csptr++;
  80. sin2b = *csptr++;
  81. ar1 = *(zbuf1 + 0);
  82. ai2 = *(zbuf1 + 1);
  83. ai1 = *(zbuf2 + 0);
  84. ar2 = *(zbuf2 - 1);
  85. /* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M
  86. * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
  87. * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
  88. * extra sign bits, and eat one in adding
  89. */
  90. t = MULSHIFT32(sin2a, ar1 + ai1);
  91. z2 = MULSHIFT32(cps2a, ai1) - t;
  92. cms2 = cps2a - 2*sin2a;
  93. z1 = MULSHIFT32(cms2, ar1) + t;
  94. *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */
  95. *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */
  96. t = MULSHIFT32(sin2b, ar2 + ai2);
  97. z2 = MULSHIFT32(cps2b, ai2) - t;
  98. cms2 = cps2b - 2*sin2b;
  99. z1 = MULSHIFT32(cms2, ar2) + t;
  100. *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */
  101. *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */
  102. }
  103. }
  104. /**************************************************************************************
  105. * Function: PostMultiply
  106. *
  107. * Description: post-twiddle stage of DCT4
  108. *
  109. * Inputs: table index (for transform size)
  110. * buffer of nmdct samples
  111. *
  112. * Outputs: processed samples in same buffer
  113. *
  114. * Return: none
  115. *
  116. * Notes: minimum 1 GB in, 2 GB out - gains 2 int bits
  117. * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
  118. **************************************************************************************/
  119. static void PostMultiply(int tabidx, int *fft1)
  120. {
  121. int i, nmdct, ar1, ai1, ar2, ai2, skipFactor;
  122. int t, cms2, cps2, sin2;
  123. int *fft2;
  124. const int *csptr;
  125. nmdct = nmdctTab[tabidx];
  126. csptr = cos1sin1tab;
  127. skipFactor = postSkip[tabidx];
  128. fft2 = fft1 + nmdct - 1;
  129. /* load coeffs for first pass
  130. * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
  131. */
  132. cps2 = *csptr++;
  133. sin2 = *csptr;
  134. csptr += skipFactor;
  135. cms2 = cps2 - 2*sin2;
  136. for (i = nmdct >> 2; i != 0; i--) {
  137. ar1 = *(fft1 + 0);
  138. ai1 = *(fft1 + 1);
  139. ar2 = *(fft2 - 1);
  140. ai2 = *(fft2 + 0);
  141. /* gain 2 ints bit from MULSHIFT32 by Q30
  142. * max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414
  143. * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
  144. * extra sign bits, and eat one in adding
  145. */
  146. t = MULSHIFT32(sin2, ar1 + ai1);
  147. *fft2-- = t - MULSHIFT32(cps2, ai1); /* sin*ar1 - cos*ai1 */
  148. *fft1++ = t + MULSHIFT32(cms2, ar1); /* cos*ar1 + sin*ai1 */
  149. cps2 = *csptr++;
  150. sin2 = *csptr;
  151. csptr += skipFactor;
  152. ai2 = -ai2;
  153. t = MULSHIFT32(sin2, ar2 + ai2);
  154. *fft2-- = t - MULSHIFT32(cps2, ai2); /* sin*ar1 - cos*ai1 */
  155. cms2 = cps2 - 2*sin2;
  156. *fft1++ = t + MULSHIFT32(cms2, ar2); /* cos*ar1 + sin*ai1 */
  157. }
  158. }
  159. /**************************************************************************************
  160. * Function: PreMultiplyRescale
  161. *
  162. * Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits
  163. *
  164. * Inputs: table index (for transform size)
  165. * buffer of nmdct samples
  166. * number of guard bits to add to input before processing
  167. *
  168. * Outputs: processed samples in same buffer
  169. *
  170. * Return: none
  171. *
  172. * Notes: see notes on PreMultiply(), above
  173. **************************************************************************************/
  174. /* __attribute__ ((section (".data"))) */ static void PreMultiplyRescale(int tabidx, int *zbuf1, int es)
  175. {
  176. int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
  177. int t, cms2, cps2a, sin2a, cps2b, sin2b;
  178. int *zbuf2;
  179. const int *csptr;
  180. nmdct = nmdctTab[tabidx];
  181. zbuf2 = zbuf1 + nmdct - 1;
  182. csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
  183. /* whole thing should fit in registers - verify that compiler does this */
  184. for (i = nmdct >> 2; i != 0; i--) {
  185. /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
  186. cps2a = *csptr++;
  187. sin2a = *csptr++;
  188. cps2b = *csptr++;
  189. sin2b = *csptr++;
  190. ar1 = *(zbuf1 + 0) >> es;
  191. ai1 = *(zbuf2 + 0) >> es;
  192. ai2 = *(zbuf1 + 1) >> es;
  193. t = MULSHIFT32(sin2a, ar1 + ai1);
  194. z2 = MULSHIFT32(cps2a, ai1) - t;
  195. cms2 = cps2a - 2*sin2a;
  196. z1 = MULSHIFT32(cms2, ar1) + t;
  197. *zbuf1++ = z1;
  198. *zbuf1++ = z2;
  199. ar2 = *(zbuf2 - 1) >> es; /* do here to free up register used for es */
  200. t = MULSHIFT32(sin2b, ar2 + ai2);
  201. z2 = MULSHIFT32(cps2b, ai2) - t;
  202. cms2 = cps2b - 2*sin2b;
  203. z1 = MULSHIFT32(cms2, ar2) + t;
  204. *zbuf2-- = z2;
  205. *zbuf2-- = z1;
  206. }
  207. }
  208. /**************************************************************************************
  209. * Function: PostMultiplyRescale
  210. *
  211. * Description: post-twiddle stage of DCT4, with rescaling for extra guard bits
  212. *
  213. * Inputs: table index (for transform size)
  214. * buffer of nmdct samples
  215. * number of guard bits to remove from output
  216. *
  217. * Outputs: processed samples in same buffer
  218. *
  219. * Return: none
  220. *
  221. * Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit
  222. * see notes on PostMultiply(), above
  223. **************************************************************************************/
  224. /* __attribute__ ((section (".data"))) */ static void PostMultiplyRescale(int tabidx, int *fft1, int es)
  225. {
  226. int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z;
  227. int t, cs2, sin2;
  228. int *fft2;
  229. const int *csptr;
  230. nmdct = nmdctTab[tabidx];
  231. csptr = cos1sin1tab;
  232. skipFactor = postSkip[tabidx];
  233. fft2 = fft1 + nmdct - 1;
  234. /* load coeffs for first pass
  235. * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
  236. */
  237. cs2 = *csptr++;
  238. sin2 = *csptr;
  239. csptr += skipFactor;
  240. for (i = nmdct >> 2; i != 0; i--) {
  241. ar1 = *(fft1 + 0);
  242. ai1 = *(fft1 + 1);
  243. ai2 = *(fft2 + 0);
  244. t = MULSHIFT32(sin2, ar1 + ai1);
  245. z = t - MULSHIFT32(cs2, ai1);
  246. CLIP_2N_SHIFT(z, es);
  247. *fft2-- = z;
  248. cs2 -= 2*sin2;
  249. z = t + MULSHIFT32(cs2, ar1);
  250. CLIP_2N_SHIFT(z, es);
  251. *fft1++ = z;
  252. cs2 = *csptr++;
  253. sin2 = *csptr;
  254. csptr += skipFactor;
  255. ar2 = *fft2;
  256. ai2 = -ai2;
  257. t = MULSHIFT32(sin2, ar2 + ai2);
  258. z = t - MULSHIFT32(cs2, ai2);
  259. CLIP_2N_SHIFT(z, es);
  260. *fft2-- = z;
  261. cs2 -= 2*sin2;
  262. z = t + MULSHIFT32(cs2, ar2);
  263. CLIP_2N_SHIFT(z, es);
  264. *fft1++ = z;
  265. cs2 += 2*sin2;
  266. }
  267. }
  268. /**************************************************************************************
  269. * Function: DCT4
  270. *
  271. * Description: type-IV DCT
  272. *
  273. * Inputs: table index (for transform size)
  274. * buffer of nmdct samples
  275. * number of guard bits in the input buffer
  276. *
  277. * Outputs: processed samples in same buffer
  278. *
  279. * Return: none
  280. *
  281. * Notes: operates in-place
  282. * if number of guard bits in input is < GBITS_IN_DCT4, the input is
  283. * scaled (>>) before the DCT4 and rescaled (<<, with clipping) after
  284. * the DCT4 (rare)
  285. * the output has FBITS_LOST_DCT4 fewer fraction bits than the input
  286. * the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4)
  287. * int bits gained per stage (PreMul + FFT + PostMul)
  288. * short blocks = (-5 + 4 + 2) = 1 total
  289. * long blocks = (-8 + 7 + 2) = 1 total
  290. **************************************************************************************/
  291. void DCT4(int tabidx, int *coef, int gb)
  292. {
  293. int es;
  294. /* fast in-place DCT-IV - adds guard bits if necessary */
  295. if (gb < GBITS_IN_DCT4) {
  296. es = GBITS_IN_DCT4 - gb;
  297. PreMultiplyRescale(tabidx, coef, es);
  298. R4FFT(tabidx, coef);
  299. PostMultiplyRescale(tabidx, coef, es);
  300. } else {
  301. PreMultiply(tabidx, coef);
  302. R4FFT(tabidx, coef);
  303. PostMultiply(tabidx, coef);
  304. }
  305. }