sbrqmf.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. /* ***** BEGIN LICENSE BLOCK *****
  2. * Source last modified: $Id: sbrqmf.c,v 1.2 2005/05/19 20:45:20 jrecker Exp $
  3. *
  4. * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
  5. *
  6. * The contents of this file, and the files included with this file,
  7. * are subject to the current version of the RealNetworks Public
  8. * Source License (the "RPSL") available at
  9. * http://www.helixcommunity.org/content/rpsl unless you have licensed
  10. * the file under the current version of the RealNetworks Community
  11. * Source License (the "RCSL") available at
  12. * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
  13. * will apply. You may also obtain the license terms directly from
  14. * RealNetworks. You may not use this file except in compliance with
  15. * the RPSL or, if you have a valid RCSL with RealNetworks applicable
  16. * to this file, the RCSL. Please see the applicable RPSL or RCSL for
  17. * the rights, obligations and limitations governing use of the
  18. * contents of the file.
  19. *
  20. * This file is part of the Helix DNA Technology. RealNetworks is the
  21. * developer of the Original Code and owns the copyrights in the
  22. * portions it created.
  23. *
  24. * This file, and the files included with this file, is distributed
  25. * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
  26. * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
  27. * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
  28. * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
  29. * ENJOYMENT OR NON-INFRINGEMENT.
  30. *
  31. * Technology Compatibility Kit Test Suite(s) Location:
  32. * http://www.helixcommunity.org/content/tck
  33. *
  34. * Contributor(s):
  35. *
  36. * ***** END LICENSE BLOCK ***** */
  37. /**************************************************************************************
  38. * Fixed-point HE-AAC decoder
  39. * Jon Recker (jrecker@real.com)
  40. * February 2005
  41. *
  42. * sbrqmf.c - analysis and synthesis QMF filters for SBR
  43. **************************************************************************************/
  44. #include "sbr.h"
  45. #include "assembly.h"
  46. /* PreMultiply64() table
  47. * format = Q30
  48. * reordered for sequential access
  49. *
  50. * for (i = 0; i < 64/4; i++) {
  51. * angle = (i + 0.25) * M_PI / nmdct;
  52. * x = (cos(angle) + sin(angle));
  53. * x = sin(angle);
  54. *
  55. * angle = (nmdct/2 - 1 - i + 0.25) * M_PI / nmdct;
  56. * x = (cos(angle) + sin(angle));
  57. * x = sin(angle);
  58. * }
  59. */
  60. static const int cos4sin4tab64[64] PROGMEM = {
  61. 0x40c7d2bd, 0x00c90e90, 0x424ff28f, 0x3ff4e5e0, 0x43cdd89a, 0x03ecadcf, 0x454149fc, 0x3fc395f9,
  62. 0x46aa0d6d, 0x070de172, 0x4807eb4b, 0x3f6af2e3, 0x495aada2, 0x0a2abb59, 0x4aa22036, 0x3eeb3347,
  63. 0x4bde1089, 0x0d415013, 0x4d0e4de2, 0x3e44a5ef, 0x4e32a956, 0x104fb80e, 0x4f4af5d1, 0x3d77b192,
  64. 0x50570819, 0x135410c3, 0x5156b6d9, 0x3c84d496, 0x5249daa2, 0x164c7ddd, 0x53304df6, 0x3b6ca4c4,
  65. 0x5409ed4b, 0x19372a64, 0x54d69714, 0x3a2fcee8, 0x55962bc0, 0x1c1249d8, 0x56488dc5, 0x38cf1669,
  66. 0x56eda1a0, 0x1edc1953, 0x57854ddd, 0x374b54ce, 0x580f7b19, 0x2192e09b, 0x588c1404, 0x35a5793c,
  67. 0x58fb0568, 0x2434f332, 0x595c3e2a, 0x33de87de, 0x59afaf4c, 0x26c0b162, 0x59f54bee, 0x31f79948,
  68. 0x5a2d0957, 0x29348937, 0x5a56deec, 0x2ff1d9c7, 0x5a72c63b, 0x2b8ef77d, 0x5a80baf6, 0x2dce88aa,
  69. };
  70. /* PostMultiply64() table
  71. * format = Q30
  72. * reordered for sequential access
  73. *
  74. * for (i = 0; i <= (32/2); i++) {
  75. * angle = i * M_PI / 64;
  76. * x = (cos(angle) + sin(angle));
  77. * x = sin(angle);
  78. * }
  79. */
  80. static const int cos1sin1tab64[34] PROGMEM = {
  81. 0x40000000, 0x00000000, 0x43103085, 0x0323ecbe, 0x45f704f7, 0x0645e9af, 0x48b2b335, 0x09640837,
  82. 0x4b418bbe, 0x0c7c5c1e, 0x4da1fab5, 0x0f8cfcbe, 0x4fd288dc, 0x1294062f, 0x51d1dc80, 0x158f9a76,
  83. 0x539eba45, 0x187de2a7, 0x553805f2, 0x1b5d100a, 0x569cc31b, 0x1e2b5d38, 0x57cc15bc, 0x20e70f32,
  84. 0x58c542c5, 0x238e7673, 0x5987b08a, 0x261feffa, 0x5a12e720, 0x2899e64a, 0x5a6690ae, 0x2afad269,
  85. 0x5a82799a, 0x2d413ccd,
  86. };
  87. /**************************************************************************************
  88. * Function: PreMultiply64
  89. *
  90. * Description: pre-twiddle stage of 64-point DCT-IV
  91. *
  92. * Inputs: buffer of 64 samples
  93. *
  94. * Outputs: processed samples in same buffer
  95. *
  96. * Return: none
  97. *
  98. * Notes: minimum 1 GB in, 2 GB out, gains 2 int bits
  99. * gbOut = gbIn + 1
  100. * output is limited to sqrt(2)/2 plus GB in full GB
  101. * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
  102. **************************************************************************************/
  103. static void PreMultiply64(int *zbuf1)
  104. {
  105. int i, ar1, ai1, ar2, ai2, z1, z2;
  106. int t, cms2, cps2a, sin2a, cps2b, sin2b;
  107. int *zbuf2;
  108. const int *csptr;
  109. zbuf2 = zbuf1 + 64 - 1;
  110. csptr = cos4sin4tab64;
  111. /* whole thing should fit in registers - verify that compiler does this */
  112. for (i = 64 >> 2; i != 0; i--) {
  113. /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
  114. cps2a = *csptr++;
  115. sin2a = *csptr++;
  116. cps2b = *csptr++;
  117. sin2b = *csptr++;
  118. ar1 = *(zbuf1 + 0);
  119. ai2 = *(zbuf1 + 1);
  120. ai1 = *(zbuf2 + 0);
  121. ar2 = *(zbuf2 - 1);
  122. /* gain 2 ints bit from MULSHIFT32 by Q30
  123. * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
  124. * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
  125. * extra sign bits, and eat one in adding
  126. */
  127. t = MULSHIFT32(sin2a, ar1 + ai1);
  128. z2 = MULSHIFT32(cps2a, ai1) - t;
  129. cms2 = cps2a - 2*sin2a;
  130. z1 = MULSHIFT32(cms2, ar1) + t;
  131. *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */
  132. *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */
  133. t = MULSHIFT32(sin2b, ar2 + ai2);
  134. z2 = MULSHIFT32(cps2b, ai2) - t;
  135. cms2 = cps2b - 2*sin2b;
  136. z1 = MULSHIFT32(cms2, ar2) + t;
  137. *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */
  138. *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */
  139. }
  140. }
  141. /**************************************************************************************
  142. * Function: PostMultiply64
  143. *
  144. * Description: post-twiddle stage of 64-point type-IV DCT
  145. *
  146. * Inputs: buffer of 64 samples
  147. * number of output samples to calculate
  148. *
  149. * Outputs: processed samples in same buffer
  150. *
  151. * Return: none
  152. *
  153. * Notes: minimum 1 GB in, 2 GB out, gains 2 int bits
  154. * gbOut = gbIn + 1
  155. * output is limited to sqrt(2)/2 plus GB in full GB
  156. * nSampsOut is rounded up to next multiple of 4, since we calculate
  157. * 4 samples per loop
  158. **************************************************************************************/
  159. static void PostMultiply64(int *fft1, int nSampsOut)
  160. {
  161. int i, ar1, ai1, ar2, ai2;
  162. int t, cms2, cps2, sin2;
  163. int *fft2;
  164. const int *csptr;
  165. csptr = cos1sin1tab64;
  166. fft2 = fft1 + 64 - 1;
  167. /* load coeffs for first pass
  168. * cps2 = (cos+sin)/2, sin2 = sin/2, cms2 = (cos-sin)/2
  169. */
  170. cps2 = *csptr++;
  171. sin2 = *csptr++;
  172. cms2 = cps2 - 2*sin2;
  173. for (i = (nSampsOut + 3) >> 2; i != 0; i--) {
  174. ar1 = *(fft1 + 0);
  175. ai1 = *(fft1 + 1);
  176. ar2 = *(fft2 - 1);
  177. ai2 = *(fft2 + 0);
  178. /* gain 2 int bits (multiplying by Q30), max gain = sqrt(2) */
  179. t = MULSHIFT32(sin2, ar1 + ai1);
  180. *fft2-- = t - MULSHIFT32(cps2, ai1);
  181. *fft1++ = t + MULSHIFT32(cms2, ar1);
  182. cps2 = *csptr++;
  183. sin2 = *csptr++;
  184. ai2 = -ai2;
  185. t = MULSHIFT32(sin2, ar2 + ai2);
  186. *fft2-- = t - MULSHIFT32(cps2, ai2);
  187. cms2 = cps2 - 2*sin2;
  188. *fft1++ = t + MULSHIFT32(cms2, ar2);
  189. }
  190. }
  191. /**************************************************************************************
  192. * Function: QMFAnalysisConv
  193. *
  194. * Description: convolution kernel for analysis QMF
  195. *
  196. * Inputs: pointer to coefficient table, reordered for sequential access
  197. * delay buffer of size 32*10 = 320 real-valued PCM samples
  198. * index for delay ring buffer (range = [0, 9])
  199. *
  200. * Outputs: 64 consecutive 32-bit samples
  201. *
  202. * Return: none
  203. *
  204. * Notes: this is carefully written to be efficient on ARM
  205. * use the assembly code version in sbrqmfak.s when building for ARM!
  206. **************************************************************************************/
  207. #if (defined (__arm) && defined (__ARMCC_VERSION)) || (defined (_WIN32) && defined (_WIN32_WCE) && defined (ARM)) || (defined(__GNUC__) && defined(__arm__))
  208. #ifdef __cplusplus
  209. extern "C"
  210. #endif
  211. void QMFAnalysisConv(int *cTab, int *delay, int dIdx, int *uBuf);
  212. #else
  213. void QMFAnalysisConv(int *cTab, int *delay, int dIdx, int *uBuf)
  214. {
  215. int k, dOff;
  216. int *cPtr0, *cPtr1;
  217. U64 u64lo, u64hi;
  218. dOff = dIdx*32 + 31;
  219. cPtr0 = cTab;
  220. cPtr1 = cTab + 33*5 - 1;
  221. /* special first pass since we need to flip sign to create cTab[384], cTab[512] */
  222. u64lo.w64 = 0;
  223. u64hi.w64 = 0;
  224. u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  225. u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  226. u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  227. u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  228. u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  229. u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  230. u64lo.w64 = MADD64(u64lo.w64, -(*cPtr1--), delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  231. u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  232. u64lo.w64 = MADD64(u64lo.w64, -(*cPtr1--), delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  233. u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  234. uBuf[0] = u64lo.r.hi32;
  235. uBuf[32] = u64hi.r.hi32;
  236. uBuf++;
  237. dOff--;
  238. /* max gain for any sample in uBuf, after scaling by cTab, ~= 0.99
  239. * so we can just sum the uBuf values with no overflow problems
  240. */
  241. for (k = 1; k <= 31; k++) {
  242. u64lo.w64 = 0;
  243. u64hi.w64 = 0;
  244. u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  245. u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  246. u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  247. u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  248. u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  249. u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  250. u64lo.w64 = MADD64(u64lo.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  251. u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  252. u64lo.w64 = MADD64(u64lo.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  253. u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); dOff -= 32; if (dOff < 0) {dOff += 320;}
  254. uBuf[0] = u64lo.r.hi32;
  255. uBuf[32] = u64hi.r.hi32;
  256. uBuf++;
  257. dOff--;
  258. }
  259. }
  260. #endif
  261. /**************************************************************************************
  262. * Function: QMFAnalysis
  263. *
  264. * Description: 32-subband analysis QMF (4.6.18.4.1)
  265. *
  266. * Inputs: 32 consecutive samples of decoded 32-bit PCM, format = Q(fBitsIn)
  267. * delay buffer of size 32*10 = 320 PCM samples
  268. * number of fraction bits in input PCM
  269. * index for delay ring buffer (range = [0, 9])
  270. * number of subbands to calculate (range = [0, 32])
  271. *
  272. * Outputs: qmfaBands complex subband samples, format = Q(FBITS_OUT_QMFA)
  273. * updated delay buffer
  274. * updated delay index
  275. *
  276. * Return: guard bit mask
  277. *
  278. * Notes: output stored as RE{X0}, IM{X0}, RE{X1}, IM{X1}, ... RE{X31}, IM{X31}
  279. * output stored in int buffer of size 64*2 = 128
  280. * (zero-filled from XBuf[2*qmfaBands] to XBuf[127])
  281. **************************************************************************************/
  282. int QMFAnalysis(int *inbuf, int *delay, int *XBuf, int fBitsIn, int *delayIdx, int qmfaBands)
  283. {
  284. int n, y, shift, gbMask;
  285. int *delayPtr, *uBuf, *tBuf;
  286. /* use XBuf[128] as temp buffer for reordering */
  287. uBuf = XBuf; /* first 64 samples */
  288. tBuf = XBuf + 64; /* second 64 samples */
  289. /* overwrite oldest PCM with new PCM
  290. * delay[n] has 1 GB after shifting (either << or >>)
  291. */
  292. delayPtr = delay + (*delayIdx * 32);
  293. if (fBitsIn > FBITS_IN_QMFA) {
  294. shift = MIN(fBitsIn - FBITS_IN_QMFA, 31);
  295. for (n = 32; n != 0; n--) {
  296. y = (*inbuf) >> shift;
  297. inbuf++;
  298. *delayPtr++ = y;
  299. }
  300. } else {
  301. shift = MIN(FBITS_IN_QMFA - fBitsIn, 30);
  302. for (n = 32; n != 0; n--) {
  303. y = *inbuf++;
  304. CLIP_2N_SHIFT30(y, shift);
  305. *delayPtr++ = y;
  306. }
  307. }
  308. QMFAnalysisConv((int *)cTabA, delay, *delayIdx, uBuf);
  309. /* uBuf has at least 2 GB right now (1 from clipping to Q(FBITS_IN_QMFA), one from
  310. * the scaling by cTab (MULSHIFT32(*delayPtr--, *cPtr++), with net gain of < 1.0)
  311. * TODO - fuse with QMFAnalysisConv to avoid separate reordering
  312. */
  313. tBuf[2*0 + 0] = uBuf[0];
  314. tBuf[2*0 + 1] = uBuf[1];
  315. for (n = 1; n < 31; n++) {
  316. tBuf[2*n + 0] = -uBuf[64-n];
  317. tBuf[2*n + 1] = uBuf[n+1];
  318. }
  319. tBuf[2*31 + 1] = uBuf[32];
  320. tBuf[2*31 + 0] = -uBuf[33];
  321. /* fast in-place DCT-IV - only need 2*qmfaBands output samples */
  322. PreMultiply64(tBuf); /* 2 GB in, 3 GB out */
  323. FFT32C(tBuf); /* 3 GB in, 1 GB out */
  324. PostMultiply64(tBuf, qmfaBands*2); /* 1 GB in, 2 GB out */
  325. /* TODO - roll into PostMultiply (if enough registers) */
  326. gbMask = 0;
  327. for (n = 0; n < qmfaBands; n++) {
  328. XBuf[2*n+0] = tBuf[ n + 0]; /* implicit scaling of 2 in our output Q format */
  329. gbMask |= FASTABS(XBuf[2*n+0]);
  330. XBuf[2*n+1] = -tBuf[63 - n];
  331. gbMask |= FASTABS(XBuf[2*n+1]);
  332. }
  333. /* fill top section with zeros for HF generation */
  334. for ( ; n < 64; n++) {
  335. XBuf[2*n+0] = 0;
  336. XBuf[2*n+1] = 0;
  337. }
  338. *delayIdx = (*delayIdx == NUM_QMF_DELAY_BUFS - 1 ? 0 : *delayIdx + 1);
  339. /* minimum of 2 GB in output */
  340. return gbMask;
  341. }
  /* Number of fraction bits left in the synthesis QMF accumulator, relative to FBITS_IN_QMFS:
   * lose FBITS_LOST_DCT4_64 in DCT4, gain 6 for implicit scaling by 1/64, lose 1 for cTab multiply (Q31)
   */
  #define FBITS_OUT_QMFS (FBITS_IN_QMFS - FBITS_LOST_DCT4_64 + 6 - 1)
  /* Half of one output LSB: added before the >> FBITS_OUT_QMFS in QMFSynthesisConv so the shift rounds */
  #define RND_VAL (1 << (FBITS_OUT_QMFS-1))
  345. /**************************************************************************************
  346. * Function: QMFSynthesisConv
  347. *
  348. * Description: final convolution kernel for synthesis QMF
  349. *
  350. * Inputs: pointer to coefficient table, reordered for sequential access
  351. * delay buffer of size 64*10 = 640 complex samples (1280 ints)
  352. * index for delay ring buffer (range = [0, 9])
  353. * number of QMF subbands to process (range = [0, 64])
  354. * number of channels
  355. *
  356. * Outputs: 64 consecutive 16-bit PCM samples, interleaved by factor of nChans
  357. *
  358. * Return: none
  359. *
  360. * Notes: this is carefully written to be efficient on ARM
  361. * use the assembly code version in sbrqmfsk.s when building for ARM!
  362. **************************************************************************************/
  363. #if (defined (__arm) && defined (__ARMCC_VERSION)) || (defined (_WIN32) && defined (_WIN32_WCE) && defined (ARM)) || (defined(__GNUC__) && defined(__arm__))
  364. #ifdef __cplusplus
  365. extern "C"
  366. #endif
  367. void QMFSynthesisConv(int *cPtr, int *delay, int dIdx, short *outbuf, int nChans);
  368. #else
  369. void QMFSynthesisConv(int *cPtr, int *delay, int dIdx, short *outbuf, int nChans)
  370. {
  371. int k, dOff0, dOff1;
  372. U64 sum64;
  373. dOff0 = (dIdx)*128;
  374. dOff1 = dOff0 - 1;
  375. if (dOff1 < 0)
  376. dOff1 += 1280;
  377. /* scaling note: total gain of coefs (cPtr[0]-cPtr[9] for any k) is < 2.0, so 1 GB in delay values is adequate */
  378. for (k = 0; k <= 63; k++) {
  379. sum64.w64 = 0;
  380. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); dOff0 -= 256; if (dOff0 < 0) {dOff0 += 1280;}
  381. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); dOff1 -= 256; if (dOff1 < 0) {dOff1 += 1280;}
  382. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); dOff0 -= 256; if (dOff0 < 0) {dOff0 += 1280;}
  383. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); dOff1 -= 256; if (dOff1 < 0) {dOff1 += 1280;}
  384. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); dOff0 -= 256; if (dOff0 < 0) {dOff0 += 1280;}
  385. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); dOff1 -= 256; if (dOff1 < 0) {dOff1 += 1280;}
  386. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); dOff0 -= 256; if (dOff0 < 0) {dOff0 += 1280;}
  387. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); dOff1 -= 256; if (dOff1 < 0) {dOff1 += 1280;}
  388. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); dOff0 -= 256; if (dOff0 < 0) {dOff0 += 1280;}
  389. sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); dOff1 -= 256; if (dOff1 < 0) {dOff1 += 1280;}
  390. dOff0++;
  391. dOff1--;
  392. *outbuf = CLIPTOSHORT((sum64.r.hi32 + RND_VAL) >> FBITS_OUT_QMFS);
  393. outbuf += nChans;
  394. }
  395. }
  396. #endif
  397. /**************************************************************************************
  398. * Function: QMFSynthesis
  399. *
  400. * Description: 64-subband synthesis QMF (4.6.18.4.2)
  401. *
  402. * Inputs: 64 consecutive complex subband QMF samples, format = Q(FBITS_IN_QMFS)
  403. * delay buffer of size 64*10 = 640 complex samples (1280 ints)
  404. * index for delay ring buffer (range = [0, 9])
  405. * number of QMF subbands to process (range = [0, 64])
  406. * number of channels
  407. *
  408. * Outputs: 64 consecutive 16-bit PCM samples, interleaved by factor of nChans
  409. * updated delay buffer
  410. * updated delay index
  411. *
  412. * Return: none
  413. *
  414. * Notes: assumes MIN_GBITS_IN_QMFS guard bits in input, either from
  415. * QMFAnalysis (if upsampling only) or from MapHF (if SBR on)
  416. **************************************************************************************/
  417. void QMFSynthesis(int *inbuf, int *delay, int *delayIdx, int qmfsBands, short *outbuf, int nChans)
  418. {
  419. int n, a0, a1, b0, b1, dOff0, dOff1, dIdx;
  420. int *tBufLo, *tBufHi;
  421. dIdx = *delayIdx;
  422. tBufLo = delay + dIdx*128 + 0;
  423. tBufHi = delay + dIdx*128 + 127;
  424. /* reorder inputs to DCT-IV, only use first qmfsBands (complex) samples
  425. * TODO - fuse with PreMultiply64 to avoid separate reordering steps
  426. */
  427. for (n = 0; n < qmfsBands >> 1; n++) {
  428. a0 = *inbuf++;
  429. b0 = *inbuf++;
  430. a1 = *inbuf++;
  431. b1 = *inbuf++;
  432. *tBufLo++ = a0;
  433. *tBufLo++ = a1;
  434. *tBufHi-- = b0;
  435. *tBufHi-- = b1;
  436. }
  437. if (qmfsBands & 0x01) {
  438. a0 = *inbuf++;
  439. b0 = *inbuf++;
  440. *tBufLo++ = a0;
  441. *tBufHi-- = b0;
  442. *tBufLo++ = 0;
  443. *tBufHi-- = 0;
  444. n++;
  445. }
  446. for ( ; n < 32; n++) {
  447. *tBufLo++ = 0;
  448. *tBufHi-- = 0;
  449. *tBufLo++ = 0;
  450. *tBufHi-- = 0;
  451. }
  452. tBufLo = delay + dIdx*128 + 0;
  453. tBufHi = delay + dIdx*128 + 64;
  454. /* 2 GB in, 3 GB out */
  455. PreMultiply64(tBufLo);
  456. PreMultiply64(tBufHi);
  457. /* 3 GB in, 1 GB out */
  458. FFT32C(tBufLo);
  459. FFT32C(tBufHi);
  460. /* 1 GB in, 2 GB out */
  461. PostMultiply64(tBufLo, 64);
  462. PostMultiply64(tBufHi, 64);
  463. /* could fuse with PostMultiply64 to avoid separate pass */
  464. dOff0 = dIdx*128;
  465. dOff1 = dIdx*128 + 64;
  466. for (n = 32; n != 0; n--) {
  467. a0 = (*tBufLo++);
  468. a1 = (*tBufLo++);
  469. b0 = (*tBufHi++);
  470. b1 = -(*tBufHi++);
  471. delay[dOff0++] = (b0 - a0);
  472. delay[dOff0++] = (b1 - a1);
  473. delay[dOff1++] = (b0 + a0);
  474. delay[dOff1++] = (b1 + a1);
  475. }
  476. QMFSynthesisConv((int *)cTabS, delay, dIdx, outbuf, nChans);
  477. *delayIdx = (*delayIdx == NUM_QMF_DELAY_BUFS - 1 ? 0 : *delayIdx + 1);
  478. }