assembly.h 16 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728
  1. /* ***** BEGIN LICENSE BLOCK *****
  2. * Source last modified: $Id: assembly.h,v 1.7 2005/11/10 00:04:40 margotm Exp $
  3. *
  4. * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
  5. *
  6. * The contents of this file, and the files included with this file,
  7. * are subject to the current version of the RealNetworks Public
  8. * Source License (the "RPSL") available at
  9. * http://www.helixcommunity.org/content/rpsl unless you have licensed
  10. * the file under the current version of the RealNetworks Community
  11. * Source License (the "RCSL") available at
  12. * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
  13. * will apply. You may also obtain the license terms directly from
  14. * RealNetworks. You may not use this file except in compliance with
  15. * the RPSL or, if you have a valid RCSL with RealNetworks applicable
  16. * to this file, the RCSL. Please see the applicable RPSL or RCSL for
  17. * the rights, obligations and limitations governing use of the
  18. * contents of the file.
  19. *
  20. * This file is part of the Helix DNA Technology. RealNetworks is the
  21. * developer of the Original Code and owns the copyrights in the
  22. * portions it created.
  23. *
  24. * This file, and the files included with this file, is distributed
  25. * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
  26. * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
  27. * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
  28. * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
  29. * ENJOYMENT OR NON-INFRINGEMENT.
  30. *
  31. * Technology Compatibility Kit Test Suite(s) Location:
  32. * http://www.helixcommunity.org/content/tck
  33. *
  34. * Contributor(s):
  35. *
  36. * ***** END LICENSE BLOCK ***** */
/**************************************************************************************
 * Fixed-point HE-AAC decoder
 * Jon Recker (jrecker@real.com)
 * February 2005
 *
 * assembly.h - inline assembly language functions and prototypes
 *
 * MULSHIFT32(x, y)        signed multiply of two 32-bit integers (x and y),
 *                           returns top 32 bits of the 64-bit result
 * CLIPTOSHORT(x)          convert 32-bit integer to 16-bit short,
 *                           clipping to [-32768, 32767]
 * FASTABS(x)              branchless absolute value of signed integer x
 * CLZ(x)                  count leading zeros of signed integer x
 * MADD64(sum64, x, y)     64-bit multiply-accumulate: sum64 += (x * y)
 **************************************************************************************/
  52. #ifndef _ASSEMBLY_H
  53. #define _ASSEMBLY_H
  54. /* toolchain: MSFT Visual C++
  55. * target architecture: x86
  56. */
  57. #if (defined (_WIN32) && !defined (_WIN32_WCE)) || (defined (__WINS__) && defined (_SYMBIAN)) || (defined (WINCE_EMULATOR)) || (defined (_OPENWAVE_SIMULATOR))
  58. #pragma warning( disable : 4035 ) /* complains about inline asm not returning a value */
  59. static __inline int MULSHIFT32(int x, int y)
  60. {
  61. __asm {
  62. mov eax, x
  63. imul y
  64. mov eax, edx
  65. }
  66. }
  67. static __inline short CLIPTOSHORT(int x)
  68. {
  69. int sign;
  70. /* clip to [-32768, 32767] */
  71. sign = x >> 31;
  72. if (sign != (x >> 15))
  73. x = sign ^ ((1 << 15) - 1);
  74. return (short)x;
  75. }
  76. static __inline int FASTABS(int x)
  77. {
  78. int sign;
  79. sign = x >> (sizeof(int) * 8 - 1);
  80. x ^= sign;
  81. x -= sign;
  82. return x;
  83. }
  84. static __inline int CLZ(int x)
  85. {
  86. int numZeros;
  87. if (!x)
  88. return 32;
  89. /* count leading zeros with binary search */
  90. numZeros = 1;
  91. if (!((unsigned int)x >> 16)) { numZeros += 16; x <<= 16; }
  92. if (!((unsigned int)x >> 24)) { numZeros += 8; x <<= 8; }
  93. if (!((unsigned int)x >> 28)) { numZeros += 4; x <<= 4; }
  94. if (!((unsigned int)x >> 30)) { numZeros += 2; x <<= 2; }
  95. numZeros -= ((unsigned int)x >> 31);
  96. return numZeros;
  97. }
  98. #ifdef __CW32__
  99. typedef long long Word64;
  100. #else
  101. typedef __int64 Word64;
  102. #endif
  103. typedef union _U64 {
  104. Word64 w64;
  105. struct {
  106. /* x86 = little endian */
  107. unsigned int lo32;
  108. signed int hi32;
  109. } r;
  110. } U64;
  111. /* returns 64-bit value in [edx:eax] */
  112. static __inline Word64 MADD64(Word64 sum64, int x, int y)
  113. {
  114. #if (defined (_SYMBIAN_61_) || defined (_SYMBIAN_70_)) && defined (__WINS__) && !defined (__CW32__)
  115. /* Workaround for the Symbian emulator because of non existing longlong.lib and
  116. * hence __allmul not defined. */
  117. __asm {
  118. mov eax, x
  119. imul y
  120. add dword ptr sum64, eax
  121. adc dword ptr sum64 + 4, edx
  122. }
  123. #else
  124. sum64 += (Word64)x * (Word64)y;
  125. #endif
  126. return sum64;
  127. }
  128. /* toolchain: MSFT Embedded Visual C++
  129. * target architecture: ARM v.4 and above (require 'M' type processor for 32x32->64 multiplier)
  130. */
  131. #elif defined (_WIN32) && defined (_WIN32_WCE) && defined (ARM)
  132. static __inline short CLIPTOSHORT(int x)
  133. {
  134. int sign;
  135. /* clip to [-32768, 32767] */
  136. sign = x >> 31;
  137. if (sign != (x >> 15))
  138. x = sign ^ ((1 << 15) - 1);
  139. return (short)x;
  140. }
  141. static __inline int FASTABS(int x)
  142. {
  143. int sign;
  144. sign = x >> (sizeof(int) * 8 - 1);
  145. x ^= sign;
  146. x -= sign;
  147. return x;
  148. }
  149. static __inline int CLZ(int x)
  150. {
  151. int numZeros;
  152. if (!x)
  153. return 32;
  154. /* count leading zeros with binary search (function should be 17 ARM instructions total) */
  155. numZeros = 1;
  156. if (!((unsigned int)x >> 16)) { numZeros += 16; x <<= 16; }
  157. if (!((unsigned int)x >> 24)) { numZeros += 8; x <<= 8; }
  158. if (!((unsigned int)x >> 28)) { numZeros += 4; x <<= 4; }
  159. if (!((unsigned int)x >> 30)) { numZeros += 2; x <<= 2; }
  160. numZeros -= ((unsigned int)x >> 31);
  161. return numZeros;
  162. }
  163. /* implemented in asmfunc.s */
  164. #ifdef __cplusplus
  165. extern "C" {
  166. #endif
  167. typedef __int64 Word64;
  168. typedef union _U64 {
  169. Word64 w64;
  170. struct {
  171. /* ARM WinCE = little endian */
  172. unsigned int lo32;
  173. signed int hi32;
  174. } r;
  175. } U64;
  176. /* manual name mangling for just this platform (must match labels in .s file) */
  177. #define MULSHIFT32 raac_MULSHIFT32
  178. #define MADD64 raac_MADD64
  179. int MULSHIFT32(int x, int y);
  180. Word64 MADD64(Word64 sum64, int x, int y);
  181. #ifdef __cplusplus
  182. }
  183. #endif
  184. /* toolchain: ARM ADS or RealView
  185. * target architecture: ARM v.4 and above (requires 'M' type processor for 32x32->64 multiplier)
  186. */
  187. #elif defined (XXX__arm) && defined (__ARMCC_VERSION)
  188. static __inline int MULSHIFT32(int x, int y)
  189. {
  190. /* rules for smull RdLo, RdHi, Rm, Rs:
  191. * RdHi != Rm
  192. * RdLo != Rm
  193. * RdHi != RdLo
  194. */
  195. int zlow;
  196. __asm {
  197. smull zlow,y,x,y
  198. }
  199. return y;
  200. }
  201. static __inline short CLIPTOSHORT(int x)
  202. {
  203. int sign;
  204. /* clip to [-32768, 32767] */
  205. sign = x >> 31;
  206. if (sign != (x >> 15))
  207. x = sign ^ ((1 << 15) - 1);
  208. return (short)x;
  209. }
  210. static __inline int FASTABS(int x)
  211. {
  212. int sign;
  213. sign = x >> (sizeof(int) * 8 - 1);
  214. x ^= sign;
  215. x -= sign;
  216. return x;
  217. }
  218. static __inline int CLZ(int x)
  219. {
  220. int numZeros;
  221. if (!x)
  222. return 32;
  223. /* count leading zeros with binary search (function should be 17 ARM instructions total) */
  224. numZeros = 1;
  225. if (!((unsigned int)x >> 16)) { numZeros += 16; x <<= 16; }
  226. if (!((unsigned int)x >> 24)) { numZeros += 8; x <<= 8; }
  227. if (!((unsigned int)x >> 28)) { numZeros += 4; x <<= 4; }
  228. if (!((unsigned int)x >> 30)) { numZeros += 2; x <<= 2; }
  229. numZeros -= ((unsigned int)x >> 31);
  230. return numZeros;
  231. /* ARM code would look like this, but do NOT use inline asm in ADS for this,
  232. because you can't safely use the status register flags intermixed with C code
  233. __asm {
  234. mov numZeros, #1
  235. tst x, 0xffff0000
  236. addeq numZeros, numZeros, #16
  237. moveq x, x, lsl #16
  238. tst x, 0xff000000
  239. addeq numZeros, numZeros, #8
  240. moveq x, x, lsl #8
  241. tst x, 0xf0000000
  242. addeq numZeros, numZeros, #4
  243. moveq x, x, lsl #4
  244. tst x, 0xc0000000
  245. addeq numZeros, numZeros, #2
  246. moveq x, x, lsl #2
  247. sub numZeros, numZeros, x, lsr #31
  248. }
  249. */
  250. /* reference:
  251. numZeros = 0;
  252. while (!(x & 0x80000000)) {
  253. numZeros++;
  254. x <<= 1;
  255. }
  256. */
  257. }
  258. typedef __int64 Word64;
  259. typedef union _U64 {
  260. Word64 w64;
  261. struct {
  262. /* ARM ADS = little endian */
  263. unsigned int lo32;
  264. signed int hi32;
  265. } r;
  266. } U64;
  267. static __inline Word64 MADD64(Word64 sum64, int x, int y)
  268. {
  269. U64 u;
  270. u.w64 = sum64;
  271. __asm {
  272. smlal u.r.lo32, u.r.hi32, x, y
  273. }
  274. return u.w64;
  275. }
  276. /* toolchain: ARM gcc
  277. * target architecture: ARM v.4 and above (requires 'M' type processor for 32x32->64 multiplier)
  278. */
  279. #elif defined(__GNUC__) && defined(XXXX__arm__)
  280. static inline int MULSHIFT32(int x, int y)
  281. {
  282. int zlow;
  283. asm ("smull %0,%1,%2,%3" : "=&r" (zlow), "=r" (y) : "r" (x), "1" (y) : "cc");
  284. return y;
  285. }
  286. /*
  287. static inline short CLIPTOSHORT(int x)
  288. {
  289. int sign;
  290. // clip to [-32768, 32767] //
  291. sign = x >> 31;
  292. if (sign != (x >> 15))
  293. x = sign ^ ((1 << 15) - 1);
  294. return (short)x;
  295. }
  296. */
  297. static inline short CLIPTOSHORT(int x)
  298. {
  299. asm ("ssat %0, #16, %1" : "=r" (x) : "r" (x));
  300. return x;
  301. }
  302. /* From coder.h, ORIGINAL:
  303. clip to [-2^n, 2^n-1], valid range of n = [1, 30]
  304. //TODO (FB) Is there a better way ?
  305. */
  306. #define CLIP_2N(y, n) { \
  307. int sign = (y) >> 31; \
  308. if (sign != (y) >> (n)) { \
  309. (y) = sign ^ ((1 << (n)) - 1); \
  310. } \
  311. }
  312. /* From coder.h, ORIGINAL:
  313. do y <<= n, clipping to range [-2^30, 2^30 - 1] (i.e. output has one guard bit)
  314. */
  315. //TODO (FB) Is there a better way ?
  316. #define CLIP_2N_SHIFT(y, n) { \
  317. int sign = (y) >> 31; \
  318. if (sign != (y) >> (30 - (n))) { \
  319. (y) = sign ^ (0x3fffffff); \
  320. } else { \
  321. (y) = (y) << (n); \
  322. } \
  323. }
  324. #define FASTABS(x) abs(x) //FB
  325. #define CLZ(x) __builtin_clz(x) //FB
  326. //Reverse byte order (16 bit) //FB
  327. static inline unsigned int REV16( unsigned int value)
  328. {
  329. asm ("rev16 %0, %1" : "=r" (value) : "r" (value) );
  330. return(value);
  331. }
  332. //Reverse byte order (32 bit) //FB
  333. static inline unsigned int REV32( unsigned int value)
  334. {
  335. asm ("rev %0, %1" : "=r" (value) : "r" (value) );
  336. return(value);
  337. }
  338. typedef long long Word64;
  339. typedef union _U64 {
  340. Word64 w64;
  341. struct {
  342. /* little endian */
  343. unsigned int lo32;
  344. signed int hi32;
  345. } r;
  346. } U64;
  347. static inline Word64 MADD64(Word64 sum64, int x, int y)
  348. {
  349. U64 u;
  350. u.w64 = sum64;
  351. asm ("smlal %0,%1,%2,%3" : "+&r" (u.r.lo32), "+&r" (u.r.hi32) : "r" (x), "r" (y) : "cc");
  352. return u.w64;
  353. }
  354. /* toolchain: x86 gcc
  355. * target architecture: x86
  356. */
  357. #elif defined(__APPLE__) || defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) || (defined (_SOLARIS) && !defined (__GNUC__) && defined(_SOLARISX86))
  358. typedef long long Word64;
  359. static __inline__ int MULSHIFT32(int x, int y)
  360. {
  361. int z;
  362. z = (Word64)x * (Word64)y >> 32;
  363. return z;
  364. }
  365. static __inline short CLIPTOSHORT(int x)
  366. {
  367. int sign;
  368. /* clip to [-32768, 32767] */
  369. sign = x >> 31;
  370. if (sign != (x >> 15))
  371. x = sign ^ ((1 << 15) - 1);
  372. return (short)x;
  373. }
  374. static __inline int FASTABS(int x)
  375. {
  376. int sign;
  377. sign = x >> (sizeof(int) * 8 - 1);
  378. x ^= sign;
  379. x -= sign;
  380. return x;
  381. }
  382. static __inline int CLZ(int x)
  383. {
  384. int numZeros;
  385. if (!x)
  386. return 32;
  387. /* count leading zeros with binary search (function should be 17 ARM instructions total) */
  388. numZeros = 1;
  389. if (!((unsigned int)x >> 16)) { numZeros += 16; x <<= 16; }
  390. if (!((unsigned int)x >> 24)) { numZeros += 8; x <<= 8; }
  391. if (!((unsigned int)x >> 28)) { numZeros += 4; x <<= 4; }
  392. if (!((unsigned int)x >> 30)) { numZeros += 2; x <<= 2; }
  393. numZeros -= ((unsigned int)x >> 31);
  394. return numZeros;
  395. }
  396. typedef union _U64 {
  397. Word64 w64;
  398. struct {
  399. /* x86 = little endian */
  400. unsigned int lo32;
  401. signed int hi32;
  402. } r;
  403. } U64;
  404. static __inline Word64 MADD64(Word64 sum64, int x, int y)
  405. {
  406. sum64 += (Word64)x * (Word64)y;
  407. return sum64;
  408. }
  409. #elif defined(__arm__)
  410. typedef long long Word64;
  411. typedef union _U64 {
  412. Word64 w64;
  413. struct {
  414. /* x86 = little endian */
  415. unsigned int lo32;
  416. signed int hi32;
  417. } r;
  418. } U64;
  419. static __inline Word64 MADD64(Word64 sum64, int x, int y)
  420. {
  421. sum64 += (Word64)x * (Word64)y;
  422. return sum64;
  423. }
  424. static __inline short CLIPTOSHORT(int x)
  425. {
  426. int sign;
  427. /* clip to [-32768, 32767] */
  428. sign = x >> 31;
  429. if (sign != (x >> 15))
  430. x = sign ^ ((1 << 15) - 1);
  431. return (short)x;
  432. }
  433. #if defined(ARM7DI)
  434. static __inline int MULSHIFT32(int x, int y) {
  435. return x * y;
  436. }
  437. #else
  438. static __inline int MULSHIFT32(int x, int y)
  439. {
  440. /* important rules for smull RdLo, RdHi, Rm, Rs:
  441. * RdHi and Rm can't be the same register
  442. * RdLo and Rm can't be the same register
  443. * RdHi and RdLo can't be the same register
  444. * Note: Rs determines early termination (leading sign bits) so if you want to specify
  445. * which operand is Rs, put it in the SECOND argument (y)
  446. * For inline assembly, x and y are not assumed to be R0, R1 so it shouldn't matter
  447. * which one is returned. (If this were a function call, returning y (R1) would
  448. * require an extra "mov r0, r1")
  449. */
  450. int zlow;
  451. __asm__ volatile ("smull %0,%1,%2,%3" : "=&r" (zlow), "=r" (y) : "r" (x), "1" (y)) ;
  452. return y;
  453. }
  454. #endif
  455. static __inline int FASTABS(int x)
  456. {
  457. int t=0; /*Really is not necessary to initialiaze only to avoid warning*/
  458. __asm__ volatile (
  459. "eor %0,%2,%2, asr #31;"
  460. "sub %0,%1,%2, asr #31;"
  461. : "=&r" (t)
  462. : "0" (t), "r" (x)
  463. );
  464. return t;
  465. }
  466. static __inline int CLZ(int x)
  467. {
  468. int numZeros;
  469. if (!x)
  470. return (sizeof(int) * 8);
  471. numZeros = 0;
  472. while (!(x & 0x80000000)) {
  473. numZeros++;
  474. x <<= 1;
  475. }
  476. return numZeros;
  477. }
  478. #elif defined(ESP_PLATFORM) || defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__)) || (defined (_SOLARIS) && !defined (__GNUC__) && !defined (_SOLARISX86))
  479. typedef long long Word64;
  480. static __inline__ int MULSHIFT32(int x, int y)
  481. {
  482. int z;
  483. z = (Word64)x * (Word64)y >> 32;
  484. return z;
  485. }
  486. static __inline short CLIPTOSHORT(int x)
  487. {
  488. int sign;
  489. /* clip to [-32768, 32767] */
  490. sign = x >> 31;
  491. if (sign != (x >> 15))
  492. x = sign ^ ((1 << 15) - 1);
  493. return (short)x;
  494. }
  495. static __inline int FASTABS(int x)
  496. {
  497. int sign;
  498. sign = x >> (sizeof(int) * 8 - 1);
  499. x ^= sign;
  500. x -= sign;
  501. return x;
  502. }
  503. static __inline int CLZ(int x)
  504. {
  505. int numZeros;
  506. if (!x)
  507. return 32;
  508. /* count leading zeros with binary search (function should be 17 ARM instructions total) */
  509. numZeros = 1;
  510. if (!((unsigned int)x >> 16)) { numZeros += 16; x <<= 16; }
  511. if (!((unsigned int)x >> 24)) { numZeros += 8; x <<= 8; }
  512. if (!((unsigned int)x >> 28)) { numZeros += 4; x <<= 4; }
  513. if (!((unsigned int)x >> 30)) { numZeros += 2; x <<= 2; }
  514. numZeros -= ((unsigned int)x >> 31);
  515. return numZeros;
  516. }
  517. typedef union _U64 {
  518. Word64 w64;
  519. struct {
  520. #ifdef __XTENSA__
  521. unsigned int lo32;
  522. signed int hi32;
  523. #else
  524. /* PowerPC = big endian */
  525. signed int hi32;
  526. unsigned int lo32;
  527. #endif
  528. } r;
  529. } U64;
  530. static __inline Word64 MADD64(Word64 sum64, int x, int y)
  531. {
  532. sum64 += (Word64)x * (Word64)y;
  533. return sum64;
  534. }
  535. /* From coder.h, ORIGINAL:
  536. clip to [-2^n, 2^n-1], valid range of n = [1, 30]
  537. //TODO (FB) Is there a better way ?
  538. */
  539. #define CLIP_2N(y, n) { \
  540. int sign = (y) >> 31; \
  541. if (sign != (y) >> (n)) { \
  542. (y) = sign ^ ((1 << (n)) - 1); \
  543. } \
  544. }
  545. /* From coder.h, ORIGINAL:
  546. do y <<= n, clipping to range [-2^30, 2^30 - 1] (i.e. output has one guard bit)
  547. */
  548. //TODO (FB) Is there a better way ?
  549. #define CLIP_2N_SHIFT(y, n) { \
  550. int sign = (y) >> 31; \
  551. if (sign != (y) >> (30 - (n))) { \
  552. (y) = sign ^ (0x3fffffff); \
  553. } else { \
  554. (y) = (y) << (n); \
  555. } \
  556. }
  557. //#define FASTABS(x) abs(x) //FB
  558. //#define CLZ(x) __builtin_clz(x) //FB
  559. #else
  560. #error Unsupported platform in assembly.h
  561. #endif /* platforms */
  562. #ifndef CLIP_2N
  563. #define CLIP_2N(y, n) { \
  564. int sign = (y) >> 31; \
  565. if (sign != (y) >> (n)) { \
  566. (y) = sign ^ ((1 << (n)) - 1); \
  567. } \
  568. }
  569. #endif
  570. #ifndef CLIP_2N_SHIFT
  571. /* From coder.h, ORIGINAL:
  572. do y <<= n, clipping to range [-2^30, 2^30 - 1] (i.e. output has one guard bit)
  573. */
  574. //TODO (FB) Is there a better way ?
  575. #define CLIP_2N_SHIFT(y, n) { \
  576. int sign = (y) >> 31; \
  577. if (sign != (y) >> (30 - (n))) { \
  578. (y) = sign ^ (0x3fffffff); \
  579. } else { \
  580. (y) = (y) << (n); \
  581. } \
  582. }
  583. #endif
  584. #endif /* _ASSEMBLY_H */