소스 검색

first commit

Per Mårtensson 3 달 전
커밋
2f512584f1
90개의 변경된 파일, 21723개의 추가작업 그리고 0개의 파일을 삭제
  1. 35 0
      CMakeLists.txt
  2. 116 0
      Dockerfile
  3. 674 0
      LICENSE
  4. 333 0
      NEWS.md
  5. 98 0
      README.md
  6. 49 0
      config/basic_multichannel.conf
  7. 40 0
      config/basic_scanning.conf
  8. 454 0
      config/big_mixer.conf
  9. 142 0
      config/mixers.conf
  10. 144 0
      config/noaa.conf
  11. 121 0
      config/two_dongles_multiple_outputs.conf
  12. 118 0
      init.d/rtl_airband-debian.sh
  13. 16 0
      init.d/rtl_airband-freebsd.sh
  14. 40 0
      init.d/rtl_airband-gentoo.sh
  15. 16 0
      init.d/rtl_airband.service
  16. 21 0
      scripts/find_version
  17. 3 0
      scripts/reformat_code
  18. 1 0
      src/.gitignore
  19. 393 0
      src/CMakeLists.txt
  20. 19 0
      src/CMakeModules/FindBCM_VC.cmake
  21. 17 0
      src/CMakeModules/FindLame.cmake
  22. 35 0
      src/CMakeModules/FindMiriSDR.cmake
  23. 57 0
      src/CMakeModules/FindRTLSDR.cmake
  24. 11 0
      src/CMakeModules/version.cmake
  25. 886 0
      src/config.cpp
  26. 36 0
      src/config.h.in
  27. 172 0
      src/ctcss.cpp
  28. 98 0
      src/ctcss.h
  29. 163 0
      src/filters.cpp
  30. 63 0
      src/filters.h
  31. 86 0
      src/generate_signal.cpp
  32. 75 0
      src/generate_signal.h
  33. 21 0
      src/hello_fft/CMakeLists.txt
  34. 135 0
      src/hello_fft/gpu_fft.c
  35. 92 0
      src/hello_fft/gpu_fft.h
  36. 157 0
      src/hello_fft/gpu_fft.txt
  37. 137 0
      src/hello_fft/gpu_fft_base.c
  38. 84 0
      src/hello_fft/gpu_fft_shaders.c
  39. 40 0
      src/hello_fft/gpu_fft_trans.h
  40. 278 0
      src/hello_fft/gpu_fft_twiddles.c
  41. 707 0
      src/hello_fft/hex/shader_1024k.hex
  42. 605 0
      src/hello_fft/hex/shader_128k.hex
  43. 562 0
      src/hello_fft/hex/shader_16k.hex
  44. 447 0
      src/hello_fft/hex/shader_1k.hex
  45. 1103 0
      src/hello_fft/hex/shader_2048k.hex
  46. 321 0
      src/hello_fft/hex/shader_256.hex
  47. 698 0
      src/hello_fft/hex/shader_256k.hex
  48. 679 0
      src/hello_fft/hex/shader_2k.hex
  49. 538 0
      src/hello_fft/hex/shader_32k.hex
  50. 434 0
      src/hello_fft/hex/shader_4k.hex
  51. 450 0
      src/hello_fft/hex/shader_512.hex
  52. 781 0
      src/hello_fft/hex/shader_512k.hex
  53. 772 0
      src/hello_fft/hex/shader_64k.hex
  54. 516 0
      src/hello_fft/hex/shader_8k.hex
  55. 126 0
      src/hello_fft/hex/shader_trans.hex
  56. 248 0
      src/hello_fft/mailbox.c
  57. 47 0
      src/hello_fft/mailbox.h
  58. 86 0
      src/helper_functions.cpp
  59. 32 0
      src/helper_functions.h
  60. 131 0
      src/input-common.cpp
  61. 66 0
      src/input-common.h
  62. 181 0
      src/input-file.cpp
  63. 29 0
      src/input-file.h
  64. 63 0
      src/input-helpers.cpp
  65. 24 0
      src/input-helpers.h
  66. 239 0
      src/input-mirisdr.cpp
  67. 32 0
      src/input-mirisdr.h
  68. 254 0
      src/input-rtlsdr.cpp
  69. 32 0
      src/input-rtlsdr.h
  70. 366 0
      src/input-soapysdr.cpp
  71. 36 0
      src/input-soapysdr.h
  72. 71 0
      src/logging.cpp
  73. 57 0
      src/logging.h
  74. 261 0
      src/mixer.cpp
  75. 1005 0
      src/output.cpp
  76. 249 0
      src/pulse.cpp
  77. 1332 0
      src/rtl_airband.cpp
  78. 401 0
      src/rtl_airband.h
  79. 83 0
      src/rtl_airband_neon.s
  80. 635 0
      src/squelch.cpp
  81. 181 0
      src/squelch.h
  82. 138 0
      src/test_base_class.cpp
  83. 35 0
      src/test_base_class.h
  84. 155 0
      src/test_ctcss.cpp
  85. 41 0
      src/test_filters.cpp
  86. 280 0
      src/test_generate_signal.cpp
  87. 167 0
      src/test_helper_functions.cpp
  88. 281 0
      src/test_squelch.cpp
  89. 90 0
      src/udp_stream.cpp
  90. 180 0
      src/util.cpp

+ 35 - 0
CMakeLists.txt

@@ -0,0 +1,35 @@
+# Minimum CMake version and project declaration (only the CXX language is enabled here).
+cmake_minimum_required(VERSION 3.1)
+project(RTLSDR-Airband CXX)
+
+# Run scripts/find_version at configure time. Its stdout becomes
+# RTL_AIRBAND_VERSION; its stderr is captured so it can be shown in the
+# failure message below.
+execute_process(
+   COMMAND ${PROJECT_SOURCE_DIR}/scripts/find_version
+   OUTPUT_VARIABLE RTL_AIRBAND_VERSION
+   ERROR_VARIABLE RTL_AIRBAND_VERSION_ERROR
+   OUTPUT_STRIP_TRAILING_WHITESPACE
+   ERROR_STRIP_TRAILING_WHITESPACE
+)
+
+# RTL_AIRBAND_VERSION_UNSET is true when the script produced no output;
+# configuration is aborted in that case.
+string(COMPARE EQUAL "${RTL_AIRBAND_VERSION}" "" RTL_AIRBAND_VERSION_UNSET)
+if(RTL_AIRBAND_VERSION_UNSET)
+   message(FATAL_ERROR "Failed to detect RTL_AIRBAND_VERSION - \"${RTL_AIRBAND_VERSION_ERROR}\"")
+endif()
+
+# Compile as ISO C++11 with compiler-specific extensions disabled.
+set(CMAKE_CXX_STANDARD 11)
+# CMAKE_CXX_STANDARD_REQUIRED is the variable that initializes the
+# CXX_STANDARD_REQUIRED target property. A plain variable named
+# CXX_STANDARD_REQUIRED is ignored by CMake, which would let the standard
+# silently decay on compilers lacking C++11 support.
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+# Only honoured by CMake 3.24 and newer; older versions ignore this variable.
+set(CMAKE_COMPILE_WARNING_AS_ERROR ON)
+
+# Fall back to a Release build when no build type was given on the command line.
+if(NOT CMAKE_BUILD_TYPE)
+   message(STATUS "Build type not specified: defaulting to Release")
+   set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# TODO: flags to add: -Wfloat-equal -Wconversion -Wstrict-overflow=5 -Waggregate-return -Wpedantic -Wcast-align
+# TODO: these could be added except for gtest: -Wswitch-enum -Wundef -Wswitch-default
+# Warning flags appended to every C++ compilation; -Werror turns any warning into an error.
+set(RTL_AIRBAND_CXX_WARNING_FLAGS "-Wall -Wextra -Wshadow -Wdate-time -Wpointer-arith -Wwrite-strings -Wcast-qual  -Wunreachable-code -Werror")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTL_AIRBAND_CXX_WARNING_FLAGS}")
+
+# Debug configuration: -Og plus the DEBUG define, and additionally the
+# DEBUG_SQUELCH define when the DEBUG_SQUELCH variable is set to a true value.
+if(DEBUG_SQUELCH)
+   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -DDEBUG -DDEBUG_SQUELCH")
+else()
+   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -DDEBUG")
+endif()
+
+# The actual targets are defined in src/CMakeLists.txt.
+add_subdirectory(src)

+ 116 - 0
Dockerfile

@@ -0,0 +1,116 @@
+# ---- Stage 1: build container ----
+FROM debian:bookworm-slim AS build
+
+# Install the toolchain, library headers and packaging tools used by the
+# build steps below, then drop the apt caches to keep the layer small.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        debhelper \
+        git \
+        libconfig++-dev \
+        libfftw3-dev \
+        libmp3lame-dev \
+        libpulse-dev \
+        libshout3-dev \
+        libsoapysdr-dev \
+        libusb-1.0-0-dev \
+        pkg-config \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Dependencies built from source are cloned and compiled under this directory.
+WORKDIR /build_dependencies
+
+# rtl-sdr-blog fork of rtl-sdr (needed for v4 support): build the Debian
+# packages, which land in /build_dependencies, and install them.
+RUN git clone https://github.com/rtlsdrblog/rtl-sdr-blog && \
+    ( cd rtl-sdr-blog/ && dpkg-buildpackage -b --no-sign ) && \
+    dpkg -i librtlsdr0_*.deb && \
+    dpkg -i librtlsdr-dev_*.deb && \
+    dpkg -i rtl-sdr_*.deb
+
+# libmirisdr-4: out-of-tree CMake build, installed and registered with the
+# dynamic linker cache.
+RUN git clone https://github.com/f4exb/libmirisdr-4 && \
+    mkdir libmirisdr-4/build && \
+    cd libmirisdr-4/build && \
+    cmake ../ && \
+    VERBOSE=1 make install && \
+    ldconfig
+
+# TODO: build anything from source?
+
+# The project itself is configured and built under this directory.
+WORKDIR /rtl_airband_build
+
+# Copy only what the build needs: git metadata, sources, helper scripts and
+# the top-level CMakeLists.txt.
+# WARNING: not copying in the whole repo, this may need to be updated if build files are added outside of src/
+COPY ./.git/ .git/
+COPY ./src/ src/
+COPY ./scripts/ scripts/
+COPY ./CMakeLists.txt .
+
+# Print the machine architecture and compiler configuration to the build log,
+# then configure a generic-platform Release build with NFM and unit tests
+# enabled and compile it.
+# TODO: detect platforms
+RUN uname -m && \
+    echo | gcc -### -v -E - | tee compiler_native_info.txt && \
+    cmake -B build_dir -DPLATFORM=generic -DCMAKE_BUILD_TYPE=Release -DNFM=TRUE -DBUILD_UNITTESTS=TRUE && \
+    VERBOSE=1 cmake --build build_dir -j4
+
+# Fail the image build if the unit tests do not pass.
+RUN ./build_dir/src/unittests
+
+
+# ---- Stage 2: application container ----
+FROM debian:bookworm-slim
+
+# Install runtime dependencies, then drop the apt caches to keep the layer small.
+# libusb-1.0-0-dev is installed here as well because the librtlsdr-dev package
+# is installed with dpkg below (presumably it depends on it - confirm before removing).
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+        tini \
+        libc6 \
+        libmp3lame0 \
+        libshout3 \
+        libconfig++9v5 \
+        libfftw3-single3 \
+        libsoapysdr0.8 \
+        libpulse0 \
+        libusb-1.0-0-dev \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# install (from build container) rtl-sdr-blog version of rtl-sdr for v4 support,
+# then append blacklist entries for the DVB kernel drivers to
+# /etc/modprobe.d/rtl_sdr.conf.
+# The first entry uses the kernel module name dvb_usb_rtl28xxu; the previous
+# spelling "dvb_usb_rtl28xxun" did not match any module.
+COPY --from=build /build_dependencies/librtlsdr0_*.deb /build_dependencies/librtlsdr-dev_*.deb /build_dependencies/rtl-sdr_*.deb /tmp/
+RUN dpkg -i /tmp/librtlsdr0_*.deb && \
+    dpkg -i /tmp/librtlsdr-dev_*.deb && \
+    dpkg -i /tmp/rtl-sdr_*.deb && \
+    rm -rf /tmp/*.deb && \
+    echo '' | tee --append /etc/modprobe.d/rtl_sdr.conf && \
+    echo 'blacklist dvb_usb_rtl28xxu' | tee --append /etc/modprobe.d/rtl_sdr.conf && \
+    echo 'blacklist rtl2832' | tee --append /etc/modprobe.d/rtl_sdr.conf && \
+    echo 'blacklist rtl2830' | tee --append /etc/modprobe.d/rtl_sdr.conf
+
+# copy (from build container) libmirisdr-4 library
+COPY --from=build /usr/local/lib/libmirisdr.so.4 /usr/local/lib/
+
+# Copy the license plus the unittests and rtl_airband binaries from the build
+# container into /app and make both binaries executable.
+COPY LICENSE /app/
+COPY --from=build /rtl_airband_build/build_dir/src/unittests /app/
+COPY --from=build /rtl_airband_build/build_dir/src/rtl_airband /app/
+RUN chmod a+x /app/unittests /app/rtl_airband
+
+# Run the unit tests again inside the runtime image so the image build fails
+# if they do not pass here.
+RUN /app/unittests
+
+# Use tini as init and run rtl_airband from /app/
+# NOTE(review): /app/rtl_airband.conf is not copied by this Dockerfile;
+# presumably it is supplied at run time (e.g. mounted) - confirm.
+ENTRYPOINT ["/usr/bin/tini", "--"]
+WORKDIR /app/
+CMD ["/app/rtl_airband", "-F", "-e", "-c", "/app/rtl_airband.conf"]

+ 674 - 0
LICENSE

@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.

+ 333 - 0
NEWS.md

@@ -0,0 +1,333 @@
+# NEWS
+
+This file will no longer be updated with each release. For changes between releases, see the PRs merged to the repo.
+
+Version 5.0.0 (Jan 21, 2024):
+
+* NOTE: Going forward a release tag will be automatically created on each merge to `main`, and changes will not be reflected in this file.  For changes between versions see the repo's [release history](https://github.com/charlie-foxtrot/RTLSDR-Airband/releases).
+* NOTE: Going forward PRs will be opened directly against `main` and the `unstable` branch will no longer be used.
+* NOTE: This repo has significantly diverged from the original project [microtony/RTLSDR-Airband](https://github.com/microtony/RTLSDR-Airband) so it has been detached (ie no longer a fork).
+
+* Changes in this release, see [#444](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/444):
+  * build and publish docker containers
+  * changes to supported `cmake` platforms:
+    * deprecate `rpiv1`, `armv7-generic`, and `armv8-generic` build platforms
+    * change default build platform to `native`
+    * rename `default` to `generic`
+  * enable a series of compile warnings and cleanup code
+  * remove `SSE` specific code - let the compiler "do the right thing"
+  * remove some no longer supported windows `ifdef`'s
+  * fix CTCSS bug that could miss a tone when multiple tones have the same power (happens with less accurate floating point operations, ie i386)
+
+Version 4.2.0 (Oct 13, 2023):
+
+* Changes in this release:
+  * Add support for building with libshout v2.4.6, see [#382](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/382) and [#422](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/422)
+  * Add error checking for lowpass <= highpass, see [#399](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/399) and [#412](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/412)
+  * Remove limit on count of mixer inputs (thanks @cdknox), see [#408](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/408)
+  * Add `dated_subdirectories` config option for output files (thanks, @marcin-osowski), see [#413](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/413)
+
+Version 4.1.1 (May 1, 2023):
+
+* Changes in this release:
+  * Fix build issues when using VideoCore GPU, see [#378](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/378)
+
+Version 4.1.0 (April 23, 2023):
+
+* Changes in this release:
+  * Add `channel_dbfs_noise_level` and `channel_dbfs_signal_level` to the stats file, see [#355](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/355)
+  * Add squelch support for CTCSS, add `channel_ctcss_counter` and `channel_no_ctcss_counter` to the stats file, see [#368](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/368)
+  * Support `ampfactor` on a per-channel basis (in addition to mixer inputs), see [#369](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/369)
+  * Fix config error messages, see [#371](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/371)
+  * Multiple CI / workflow improvements, including:
+    * Addition of Dockerfiles and shell scripts for multiple build environments
+    * Addition of vscode devcontainer configuration
+    * Addition of gtest, code refactoring, addition of unit tests, running unit tests on each pull request
+    * Running more combinations of OSs, build types, and build options on each pull request
+
+Version 4.0.3 (Jan 10, 2023):
+
+* Changes in this release:
+  * Add `channel_squelch_level` to stats file, see [#332](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/332)
+  * Support "default" values in lists for `squelch_snr_threshold` and `notch_q`,
+   see [#334](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/334)
+  * Set cmake `ENABLE_EXPORTS` property, see [#339](https://github.com/charlie-foxtrot/RTLSDR-Airband/pull/339)
+
+* Other items to note:
+  * Repo maintainer has changed, see [#342](https://github.com/charlie-foxtrot/RTLSDR-Airband/discussions/342)
+  * Repo URL has moved to https://github.com/charlie-foxtrot/RTLSDR-Airband
+  * Default branch / Top of Tree has been renamed to `main`
+
+Version 4.0.2 (Dec 26, 2021):
+
+* Added a new `PLATFORM` value `default` (which, as the name says, is the new
+  default). It results in a portable binary without any architecture-specific
+  optimizations. This also allows the program to be built with compilers that
+  do not support `-march=native` option (notably Clang on Apple M1) (#303).
+
+Version 4.0.1 (Nov 14, 2021):
+
+* Fixed compilation error on RaspberryPi OS 11 (Bullseye)
+
+Version 4.0.0 (Oct 19, 2021):
+
+* RTLSDR-Airband is now built with CMake. Refer to the wiki for updated
+  compilation instructions.
+* When compiling the program, a new `PLATFORM` value `native` can now be
+  specified. It enables `-march=native -mtune=native` compilation options. This
+  causes the compiler to apply the most appropriate optimizations for the
+  hardware on which the app is being built (thx @charlie-foxtrot).
+* BACKWARDS-INCOMPATIBLE CHANGE: Signal level and noise level estimates
+  displayed in the textual waterfalls are now expressed in dBFS (decibels
+  related to the full scale of the analog-to-digital converter). The main
+  benefit of the new approach is that these values do not depend on the
+  `fft_size` value (thx @charlie-foxtrot).
+* BACKWARDS-INCOMPATIBLE CHANGE: Improved squelch algorithm with new
+  configuration parameters. `squelch` keyword has been replaced with
+  `squelch_threshold` which takes an absolute signal value in dBFS as an
+  argument. Alternatively, a minimum signal-to-noise ratio (in dB) that should
+  trigger the squelch might be configured using `squelch_snr_threshold` option
+  (thx @charlie-foxtrot).
+* BACKWARDS-INCOMPATIBLE CHANGE: `include_freq` config option for file outputs
+  now causes the frequency to be appended after the timestamp rather than
+  before it. This feature now works correctly in scan mode, when
+  `split_on_transmission` feature is enabled. (thx @charlie-foxtrot).
+* BACKWARDS-INCOMPATIBLE CHANGE: sample format in files produced by `rawfile`
+  outputs has been changed from CS16 to CF32. File name suffix is now `.cf32`.
+* Improved squelch indicator in the textual waterfalls. In addition to the `*`
+  character indicating that the squelch is open, there is also a `~` character
+  indicating that the channel has a signal that is being suppressed because it
+  is outside the band of the channel filter (thx @charlie-foxtrot).
+* New output type `udp_stream` for sending uncompressed audio to another host
+  via UDP/IP (thx @charlie-foxtrot).
+* Added `multiple_output_threads` global option. When set to `true`, a separate
+  output thread is spawned for each device (thx @charlie-foxtrot).
+* Modulation in scan mode is now configurable per channel (thx
+  @charlie-foxtrot).
+* SoapySDR errors like TIMEOUT or OVERFLOW are no longer treated as fatal. They
+  often appear intermittently, especially when the CPU usage is high. There is
+  no point in failing the input in this case.
+* Added `.tmp` suffix to the names of the output files currently being written
+  to. The suffix is removed when the file is closed. External applications that
+  consume recorded files can now figure out which files are not yet complete.
+* Added logging and statistics for output thread overruns and mixer
+  input/output overruns (thx @charlie-foxtrot).
+* The program can now be built on MacOS.
+* Miscellaneous bug fixes and code cleanups.
+
+Version 3.2.1 (Nov 13, 2020):
+
+* Fixed a compile error when using libshout older than 2.4.0
+
+Version 3.2.0 (Nov 08, 2020):
+
+* Added `split_on_transmission` output file option which allows creating
+  a new file for every transmission on the channel (thx @charlie-foxtrot).
+* Added `include_freq` output file option, which causes the channel frequency
+  to be appended to the file name (thx @charlie-foxtrot).
+* Added support for notch filters for eliminating narrowband interference,
+  like CTCSS tones (thx @charlie-foxtrot).
+* Added `bandwidth` channel option which causes the channelized I/Q signal
+  to be lowpass-filtered before demodulation. This might help in situations
+  where neighboring channels are closely spaced and interfere with the channel
+  of interest. It also reduces the bandwidth of the resulting audio signal,
+  and thus eliminates the high-frequency noise (thx @charlie-foxtrot).
+* Added support for multithreaded demodulation. Each device can now have its
+  own demodulation thread. This allows spreading the demodulation work across
+  multiple CPU cores. Enable with `multiple_demod_threads` global option
+  (thx @charlie-foxtrot).
+* Added support for highpass/lowpass MP3 filters for mixers (thx @charlie-foxtrot)
+* Added support for frequency usage statistics (thx @charlie-foxtrot).
+* Workaround for Fitipower tuner problem of not honoring the first gain
+  setting when the device is first used (thx @eshaz).
+* Finalize the MP3 file properly before opening a new one (thx @jratke).
+* Close the RTL device properly on program exit (thx @jratke).
+* Updated the SoapySDR input driver to reflect changes in SoapySDR library API.
+* Minor cleanups.
+
+Version 3.1.0 (Jan 19, 2020):
+
+* SoapySDR: added support for complex float 32-bit samples
+* SoapySDR: allow using AGC if the device supports it. Gain setting for
+  soapy devices is now optional - if it's not specified, the program will
+  try to enable AGC.
+* Use lowpass/highpass filters provided by LAME library to improve audio
+  quality of MP3 streams. Filter cutoff frequencies may be configured per
+  output, using `highpass` and `lowpass` config options. Credit: clydebarrow.
+* Added `log_scan_activity` global config option. When set to `true`, a
+  log message is written whenever a squelch opens on a scanned channel,
+  effectively producing a channel activity log. Credit: clam-i-am.
+* Improved squelch behaviour in some corner cases.
+* Fix for incorrect naming of pulseaudio context. Name set in the config
+  was not used as it should. Credit: Darryl Pogue.
+* Don't fail when the configured gain value is negative. Some SDRs support
+  this (eg. FC0012-based dongles).
+* Fix a bug which in some cases could prevent the icecast output from
+  reconnecting with the Icecast server after the connection has failed.
+
+Version 3.0.1 (Feb 16, 2018):
+
+* Fix for squelch staying constantly open when configured manually
+  with NFM=off (#84)
+
+Version 3.0.0 (Feb 10, 2018):
+
+* Major overhaul of the SDR input code - now it's modular and
+  hardware-agnostic (no longer tightly coupled with librtlsdr).
+* Support for SoapySDR vendor-neutral SDR library - any SDR which has
+  a plugin for SoapySDR shall now work in RTLSDR-Airband.
+* Support for Mirics DVB-T dongles via libmirisdr-4 library.
+* Support for RTLSDR is now optional and can be disabled at compilation
+  stage.
+* Removed the 8-channels-per-device limit in multichannel mode.
+* Configurable per-device sampling rate.
+* Configurable FFT size.
+* Support for multibyte input samples.
+* Support for rawfile outputs (ie. writing raw I/Q data from a
+  narrowband channel to a file for processing with other programs,
+  like GNUradio or csdr).
+* INCOMPATIBLE CHANGE: removed `rtlsdr_buffers` global configuration
+  option; buffer count can now be adjusted with a per-device
+  "buffers" option.
+* INCOMPATIBLE CHANGE: removed `syslog` global configuration option;
+  syslog logging is now enabled by default, both in foreground and
+  background mode. To force logging to standard error, use -e command
+  line option.
+* Added -F command line option for better cooperation with systemd.
+  Runs the program in foreground, but without textual waterfalls.
+  Together with -e it allows running rtl_airband as a service of type
+  "simple" under systemd. Example rtl_airband.service file has been
+  adjusted to reflect this change.
+* Added `type` device configuration option. It sets the device type
+  (ie. the input driver which shall be used to talk to the device).
+  "rtlsdr" is assumed as a default type for backward compatibility.
+  If RTLSDR support has been disabled at compilation stage, then
+  there is no default type - it must be set manually, or the program
+  will throw an error on startup.
+* Frequencies in the config can now be expressed in Hz, kHz, MHz or GHz
+  for improved readability.
+* Lots of bugfixes.
+* Rewritten documentation on [Github Wiki](https://github.com/szpajder/RTLSDR-Airband/wiki).
+
+Version 2.4.0 (Oct 15, 2017):
+
+* Support for PulseAudio output via new output type `pulse`. With this
+  feature you can eg. play the sound via the soundcard of the Raspberry
+  Pi you run RTLSDR-Airband on (you need to install and run pulseaudio
+  daemon on it, though). Or you can stream the audio from a Pi located
+  near the antenna (eg. in the attic) to speakers connected to the desktop
+  PC you are sitting at, without launching a local Icecast server,
+  as before. Because the audio stream is sent uncompressed, it is
+  not recommended to run it across the Internet - jitter or packet loss
+  will easily cause the audio to become choppy. However in a local network
+  PulseAudio is a good choice. And it gives much lower latency as compared
+  to Icecast (typically under 0.5 seconds). Thanks to Marcus Ströbel
+  for the idea and initial implementation.
+* Support for referring to RTL devices by their serial numbers in the
+  config file. Instead of `index = <dongle_index>` parameter, use `serial =
+  <dongle_serial_number>` to get consistent behavior across reboots
+  and hardware reconfigurations.
+* Set RTL gain to the nearest gain value supported by the device. This is
+  required for E4000 tuners, which do not round the given gain value to
+  the nearest supported setting, which causes the gain setting operation
+  to fail.
+* Improved squelch operation in scan mode. All squelch-related variables
+  (noise floor, AGC coefficients, etc) are now calculated and stored
+  separately for each scanned channel. Earlier their values were common
+  to all channels, which caused squelch problems in case when noise floor
+  varied considerably between channels. Thanks to @strix-technica.
+* Added build target for FreeBSD on x86. Use `PLATFORM=x86-freebsd` to
+  compile and `PLATFORM=x86-freebsd gmake install` to install. Thanks
+  to @nyammy.
+* Display squelch setting in waterfall in place of noise floor value when
+  squelch is set manually.
+* Bug fixes, performance improvements.
+* Decluttered and more understandable documentation.
+
+Version 2.3.0 (Jan 2, 2017):
+
+* Added support for mixers. It is now possible to produce audio streams
+  combined from several input channels. Both mono and stereo mixing is
+  supported. Usage example is provided in config/mixers.conf. All
+  mixer-related parameters are documented in config/reference.conf.
+* Added build options for 64-bit ARM architectures, like Odroid C2.
+  Please use PLATFORM=armv8-generic when compiling.
+* Fixed a long-standing bug in RTL sample processing, which caused some
+  samples to be processed twice. If you were annoyed by these regular
+  clicks in NFM audio every 125 ms, they are now gone.
+* Reduced CPU usage on x86
+* Some code restructuring and cleanups
+* Added several configuration file examples for typical real-life
+  scenarios. They are placed in config/ subdirectory. rtl_airband.conf.example
+  file has been moved to config/reference.conf. It is meant to be a reference
+  for all supported config knobs together with their description. This is
+  still an interim solution before some more readable and understandable
+  documentation gets written.
+
+Version 2.2.0 (Oct 8, 2016):
+
+* Support for Icecast stream metadata updates in scanning mode. When enabled,
+  every time the scanner stops on a channel, current frequency is written into
+  Icecast song title, which in turn is displayed in the player. Alternatively,
+  textual labels can be configured for each frequency. It is possible
+  to configure the amount of delay between the stream and metadata updates to
+  synchronize them with the audio. There are some caveats however - read
+  comments in rtl_airband.conf.example for details.
+* Added global option 'localtime'. When enabled, rtl_airband uses local time
+  instead of UTC time for output file names. (Credit: ScanOC).
+* Auto gain feature removed. RTL auto gain does not work well for narrowband
+  channels. Most often it sets the gain too high which causes problems for
+  auto squelch and audio bleeding between adjacent channels. Gain must be
+  configured manually from now on.
+* Dropped unmaintained Windows build.
+* Reverted to power level calculation algorithm from version 2.0.2. The new
+  algo didn't really do much to sensitivity, but introduced annoying clicks
+  on squelch open/close.
+* Improved DC offset estimator for AM mode. This one hardly ever clicks
+  on squelch opening.
+* Boosted AM audio volume.
+* Reduced squelch flapping in NFM mode.
+
+Version 2.1.0 (Aug 11, 2016):
+
+* Narrowband FM demodulation support
+* Automatic Frequency Control
+* Append mode for recording (enabled by default)
+* Dongles, channels and outputs can be individually enabled and disabled
+  by a simple config flag (no need to comment out or delete large
+  configuration sections)
+* Use VBR for MP3 encoding
+* Modified power level calculation algorithm (better sensitivity)
+* Support for manual squelch setting
+* Bug fixes
+
+Version 2.0.2 (Mar 26, 2016):
+
+* Fixed a problem with running three dongles or more, simultaneously
+
+Version 2.0.1 (Jan 24, 2016):
+
+* Fixed crash on output initialization
+
+Version 2.0.0 (Dec 27, 2015):
+
+* util/convert_cfg: can be used to convert old-style config.txt to the new format
+* Syslog logging (enabled by default)
+* Daemon mode
+* Reworked makefiles, added install rule
+* /dev/vcio is now used to access GPU on RPi; creating char_dev no longer necessary
+* Startup scripts for Debian and Gentoo
+* Support for auto gain setting
+* Support for multiple outputs per channel
+* Support for recording streams to local MP3 files
+* Support for ARMv7-based platforms other than RPi (eg. Cubieboard)
+* Updated documentation
+* Numerous bugfixes and stability improvements
+
+Version 1.0.0 (May 12, 2015):
+
+* Linux x86/x86_64 support (Windows build is currently unmaintained and might not work)
+* Raspberry Pi V2 support
+* Bundled hello_fft code (v2.0)
+* More robust interaction with Icecast servers
+* Important stability fixes

+ 98 - 0
README.md

@@ -0,0 +1,98 @@
+# RTLSDR-Airband
+
+![main](https://github.com/charlie-foxtrot/RTLSDR-Airband/actions/workflows/ci_build.yml/badge.svg?branch=main)
+![main](https://github.com/charlie-foxtrot/RTLSDR-Airband/actions/workflows/platform_build.yml/badge.svg?branch=main)
+![main](https://github.com/charlie-foxtrot/RTLSDR-Airband/actions/workflows/build_docker_containers.yml/badge.svg?branch=main)
+![main](https://github.com/charlie-foxtrot/RTLSDR-Airband/actions/workflows/code_formatting.yml/badge.svg?branch=main)
+
+NOTE: Changes as of v5.0.0:
+ - PRs will be opened directly against `main` and the `unstable` branch will no longer be used
+ - Version tags will be automatically created on each merge to `main`
+ - A release will be created on each `major` or `minor` version tag but not `patch` tags
+ - Checking out `main` is recommended over using a release artifact to stay on the latest version
+ - This repo has significantly diverged from the original project [microtony/RTLSDR-Airband](https://github.com/microtony/RTLSDR-Airband) so it has been detached (i.e. it is no longer a fork).
+ - Specific build support for `rpiv1`, `armv7-generic`, and `armv8-generic` has been deprecated in favor of the new default `native`, see [#447](https://github.com/charlie-foxtrot/RTLSDR-Airband/discussions/447)
+
+NOTE: Repo URL has moved to https://github.com/charlie-foxtrot/RTLSDR-Airband see [#342](https://github.com/charlie-foxtrot/RTLSDR-Airband/discussions/342) for info
+
+## Overview
+
+RTLSDR-Airband receives analog radio voice channels and produces
+audio streams which can be routed to various outputs, such as online
+streaming services like LiveATC.net. Originally the only SDR type
+supported by the program was Realtek DVB-T dongle (hence the project's
+name). However, thanks to SoapySDR vendor-neutral SDR library, other
+radios are now supported as well.
+
+## Documentation
+
+User's manual is now on the [wiki](https://github.com/charlie-foxtrot/RTLSDR-Airband/wiki).
+
+## Credits and thanks
+
+I hereby express my gratitude to everybody who helped with the development and testing
+of RTLSDR-Airband. Special thanks go to:
+
+* Dave Pascoe
+* SDR Guru
+* Marcus Ströbel
+* strix-technica
+* charlie-foxtrot
+
+## License
+
+Copyright (C) 2022-2024 charlie-foxtrot
+
+Copyright (C) 2015-2022 Tomasz Lemiech <szpajder@gmail.com>
+
+Based on original work by Wong Man Hang <microtony@gmail.com>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+## Open Source Licenses of bundled code
+
+### gpu_fft
+
+BCM2835 "GPU_FFT" release 2.0
+Copyright (c) 2014, Andrew Holme.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the
+  names of its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+### rtl-sdr
+
+* Copyright (C) 2012 by Steve Markgraf <steve@steve-m.de>
+* Copyright (C) 2015 by Kyle Keen <keenerd@gmail.com>
+* GNU General Public License Version 2

+ 49 - 0
config/basic_multichannel.conf

@@ -0,0 +1,49 @@
+# This is a minimalistic configuration file for RTLSDR-Airband.
+# Just a single RTL dongle with two AM channels in multichannel mode.
+# Each channel is sent to a single Icecast output.
+# Refer to https://github.com/charlie-foxtrot/RTLSDR-Airband/wiki
+# for description of keywords and config syntax.
+
+devices:
+({
+  type = "rtlsdr";
+  index = 0;
+  gain = 25;
+  centerfreq = 120.0;
+  correction = 80;
+  channels:
+  (
+    {
+      freq = 119.5;
+      outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "TWR.mp3";
+          name = "Tower";
+          genre = "ATC";
+          username = "source";
+          password = "mypassword";
+        }
+      );
+    },
+    {
+      freq = 120.225;
+      outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "GND.mp3";
+          name = "Ground";
+          genre = "ATC";
+          description = "My local airport - ground feed";
+          username = "source";
+          password = "mypassword";
+        }
+      );
+    }
+  );
+ }
+);

+ 40 - 0
config/basic_scanning.conf

@@ -0,0 +1,40 @@
+# Scanning mode example
+# Single dongle, three frequencies, output to Icecast server and to a file.
+# Refer to https://github.com/charlie-foxtrot/RTLSDR-Airband/wiki
+# for description of keywords and config syntax.
+
+devices:
+({
+  type = "rtlsdr";
+  index = 0;
+  gain = 25;
+  correction = 80;
+  mode = "scan";
+  channels:
+  (
+    {
+      freqs = ( 118.15, 124.7, 132.1 );
+      labels = ( "Tower", "Ground", "Approach" );
+      outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "stream.mp3";
+          name = "Tower + Ground + Approach";
+          genre = "ATC";
+          description = "My local airport - aggregated feed";
+          username = "source";
+          password = "mypassword";
+          send_scan_freq_tags = false;
+        },
+        {
+          type = "file";
+          directory = "/home/pi/recordings";
+          filename_template = "TWR+GND+APP";
+        }
+      );
+    }
+  );
+ }
+);

+ 454 - 0
config/big_mixer.conf

@@ -0,0 +1,454 @@
+mixers: {
+  big_mixer: {
+    outputs: (
+      {
+        type = "file";
+        directory = "./";
+        filename_template = "big_mixer";
+      }
+    );
+  }
+};
+
+devices:
+({
+  type = "rtlsdr";
+  index = 0;
+  gain = 25;
+  centerfreq = 156.7375;
+  channels:
+  (
+    {
+      freq = 156.050;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.175;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.250;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.275;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.300;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.325;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.350;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.375;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.400;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.425;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.450;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.475;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.500;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.525;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.550;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.575;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.600;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.625;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.650;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.675;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.700;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.725;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.750;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.800;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.850;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.875;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.900;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.925;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.950;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 156.975;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.000;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.025;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.050;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.075;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.100;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.125;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.150;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.175;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.200;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.225;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.250;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.275;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.300;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.325;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.350;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.375;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.400;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    },
+    {
+      freq = 157.425;
+      outputs: (
+        {
+          type = "mixer";
+          name = "big_mixer";
+        }
+      );
+    }
+  )
+});

+ 142 - 0
config/mixers.conf

@@ -0,0 +1,142 @@
+# This config file demonstrates the usage of mixers.
+# First, two mixers are defined:
+#
+# - mixer1: sends the mixed stream to Icecast and saves it to a file
+# - mixer2: sends the mixed stream to Icecast
+#
+# Two dongles are used, both in AM, multichannel mode:
+#
+# - dongle 1: 3 channels:
+#   - channel 1 goes to mixer1 (center, volume decreased to 30%)
+#   - channel 2 goes to mixer1 (full left)
+#   - channel 3 goes to mixer2 (85% right)
+#
+# - dongle 2: 2 channels:
+#   - channel 1 goes to mixer1 (full right)
+#   - channel 2 goes to mixer2 (85% left, volume set to 200%)
+#
+# Refer to https://github.com/charlie-foxtrot/RTLSDR-Airband/wiki
+# for description of keywords and config syntax.
+mixers: {
+  # mixer1: mixed stream is sent to Icecast and also saved to a file
+  mixer1: {
+    outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "mixer1.mp3";
+          name = "VOLMET + Approach + Director";
+          genre = "ATC";
+          username = "source";
+          password = "mypassword";
+        },
+        {
+          type = "file";
+          directory = "/home/pi/recordings";
+          filename_template = "mixer1";
+        }
+    );
+  },
+  # mixer2: mixed stream is sent to Icecast only
+  mixer2: {
+    outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "mixer2.mp3";
+          name = "Ground + Delivery";
+          genre = "ATC";
+          username = "source";
+          password = "mypassword";
+        }
+    );
+  }
+};
+
+devices:
+({
+  type = "rtlsdr";
+  index = 0;
+  gain = 25;
+  centerfreq = 121.2;
+  correction = 81;
+  channels:
+  (
+# VOLMET
+    {
+      freq = 120.875;
+# VOLMET/ATIS/AWOS channels often transmit continuously.
+# Auto squelch does not perform well in such cases, so it's best to set the
+# squelch threshold manually. squelch_threshold defines an absolute signal
+# level (in dBFS).
+      squelch_threshold = -40;
+      lowpass = 5;
+      highpass = 5;
+      outputs: (
+        {
+          type = "mixer";
+          name = "mixer1";
+          ampfactor = 0.3;
+        }
+      );
+    },
+# Approach
+    {
+      freq = 121.8;
+      outputs: (
+        {
+          type = "mixer";
+          name = "mixer1";
+          balance = -1.0;
+        }
+      );
+    },
+# Director
+    {
+      freq = 121.925;
+      outputs: (
+        {
+          type = "mixer";
+          name = "mixer2";
+          balance = 0.85;
+        }
+      );
+    }
+  );
+ },
+ {
+  type = "rtlsdr";
+  index = 1;
+  gain = 33;
+  centerfreq = 131.2;
+  correction = 48;
+  channels:
+  (
+# Ground
+    {
+      freq = 130.925;
+# Another way of tweaking the squelch is to specify custom SNR threshold (in dB)
+      squelch_snr_threshold = 5.0;
+      outputs: (
+        {
+          type = "mixer";
+          name = "mixer1";
+          balance = 1.0;
+        }
+      );
+    },
+# Delivery
+    {
+      freq = 131.4;
+      outputs: (
+        {
+          type = "mixer";
+          name = "mixer2";
+          balance = -0.85;
+          ampfactor = 2.0;
+        }
+      );
+    }
+  );
+ }
+);

+ 144 - 0
config/noaa.conf

@@ -0,0 +1,144 @@
+fft_size = 1024;
+localtime = true;
+multiple_demod_threads = true;
+multiple_output_threads = true;
+devices:
+(
+  {
+    type = "rtlsdr";
+    index = 0;
+    gain = 19.7;
+    centerfreq = 162.48200;
+    correction = 0;
+    sample_rate = 2.40;
+    channels:
+    (
+      {
+        freq = 162.40000;
+        label = "NOAA 162.400";
+        modulation = "nfm";
+        lowpass = -1;
+        highpass = -1;
+        bandwidth = 5000;
+        ampfactor = 2.00;
+        squelch_snr_threshold = 0.00;
+        outputs:
+        (
+          {
+            type = "file";
+            directory = "/recordings";
+            filename_template = "NOAA_162.400";
+          }
+        );
+      },
+      {
+        freq = 162.42500;
+        label = "NOAA 162.425";
+        modulation = "nfm";
+        lowpass = -1;
+        highpass = -1;
+        bandwidth = 5000;
+        ampfactor = 2.00;
+        squelch_snr_threshold = 0.00;
+        outputs:
+        (
+          {
+            type = "file";
+            directory = "/recordings";
+            filename_template = "NOAA_162.425";
+          }
+        );
+      },
+      {
+        freq = 162.45000;
+        label = "NOAA 162.450";
+        modulation = "nfm";
+        lowpass = -1;
+        highpass = -1;
+        bandwidth = 5000;
+        ampfactor = 2.00;
+        squelch_snr_threshold = 0.00;
+        outputs:
+        (
+          {
+            type = "file";
+            directory = "/recordings";
+            filename_template = "NOAA_162.450";
+          }
+        );
+      },
+      {
+        freq = 162.47500;
+        label = "NOAA 162.475";
+        modulation = "nfm";
+        lowpass = -1;
+        highpass = -1;
+        bandwidth = 5000;
+        ampfactor = 2.00;
+        squelch_snr_threshold = 0.00;
+        outputs:
+        (
+          {
+            type = "file";
+            directory = "/recordings";
+            filename_template = "NOAA_162.475";
+          }
+        );
+      },
+      {
+        freq = 162.50000;
+        label = "NOAA 162.500";
+        modulation = "nfm";
+        lowpass = -1;
+        highpass = -1;
+        bandwidth = 5000;
+        ampfactor = 2.00;
+        squelch_snr_threshold = 0.00;
+        outputs:
+        (
+          {
+            type = "file";
+            directory = "/recordings";
+            filename_template = "NOAA_162.500";
+          }
+        );
+      },
+      {
+        freq = 162.52500;
+        label = "NOAA 162.525";
+        modulation = "nfm";
+        lowpass = -1;
+        highpass = -1;
+        bandwidth = 5000;
+        ampfactor = 2.00;
+        squelch_snr_threshold = 0.00;
+        outputs:
+        (
+          {
+            type = "file";
+            directory = "/recordings";
+            filename_template = "NOAA_162.525";
+          }
+        );
+      },
+      {
+        freq = 162.55000;
+        label = "NOAA 162.550";
+        modulation = "nfm";
+        lowpass = -1;
+        highpass = -1;
+        bandwidth = 5000;
+        ampfactor = 2.00;
+        squelch_snr_threshold = 0.00;
+        outputs:
+        (
+          {
+            type = "file";
+            directory = "/recordings";
+            filename_template = "NOAA_162.550";
+          }
+        );
+      }
+    );
+  }
+);

+ 121 - 0
config/two_dongles_multiple_outputs.conf

@@ -0,0 +1,121 @@
+# Example configuration file for 2 dongles.
+# First dongle - scanning mode, NFM modulation, three frequencies,
+# output to Icecast stream, to a file and to PulseAudio server
+# on a local network.
+# Second dongle - multichannel mode, three channels:
+#
+# - channel 1: AM, goes to Icecast stream
+# - channel 2: AM, goes to two Icecast streams
+# - channel 3: NFM, goes to two files
+#
+# Dongles are specified with their serial numbers instead of
+# indexes, because the latter can change when devices are
+# reconnected into different USB ports.
+#
+# Refer to https://github.com/charlie-foxtrot/RTLSDR-Airband/wiki
+# for description of keywords and config syntax.
+
+devices:
+({
+  type = "rtlsdr";
+  serial = "777755221";
+  gain = 25;
+  correction = 80;
+  mode = "scan";
+  channels:
+  (
+    {
+      modulation = "nfm";
+      freqs = ( 152.1, 168.25, 168.375 );
+      outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "utility.mp3";
+          name = "Utility channels";
+          username = "source";
+          password = "mypassword";
+        },
+        {
+          type = "file";
+          directory = "/home/pi/recordings";
+          filename_template = "utility";
+        },
+        {
+          type = "pulse";
+          server = "192.168.11.10";
+          stream_name = "Utility channels";
+          continuous = false;
+        }
+      );
+    }
+  );
+ },
+ {
+  type = "rtlsdr";
+  serial = "33433123";
+  gain = 20;
+  centerfreq = 118.5;
+  correction = 43;
+  mode = "multichannel";
+  channels:
+  (
+    {
+      freq = 118.15;
+      outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "TWR.mp3";
+          name = "Tower";
+          genre = "ATC";
+          username = "source";
+          password = "mypassword";
+        }
+      );
+    },
+    {
+      freq = 119.425;
+      outputs: (
+        {
+          type = "icecast";
+          server = "icecast.server.example.org";
+          port = 8080;
+          mountpoint = "ACC.mp3";
+          name = "Radar";
+          genre = "ATC";
+          username = "source";
+          password = "mypassword";
+        },
+        {
+          type = "icecast";
+          server = "other.server.example.org";
+          port = 9999;
+          mountpoint = "feed.mp3";
+          username = "user";
+          password = "secretpass";
+        }
+      );
+    },
+    {
+      freq = 119.6;
+      modulation = "nfm";
+      outputs: (
+        {
+          type = "file";
+          directory = "/home/pi/recordings";
+          filename_template = "somechannel";
+        },
+        {
+          type = "file";
+          directory = "/home/pi/recordings";
+          filename_template = "somechannel_full";
+          continuous = true;
+        }
+      );
+    }
+  );
+ }
+);

+ 118 - 0
init.d/rtl_airband-debian.sh

@@ -0,0 +1,118 @@
+#! /bin/sh
+### BEGIN INIT INFO
+# Provides:          rtl_airband
+# Required-Start:    $remote_fs $syslog
+# Required-Stop:     $remote_fs $syslog
+# Default-Start:     2 3 4 5
+# Default-Stop:      0 1 6
+# Short-Description: rtl_airband initscript
+### END INIT INFO
+
+# Author: Tomasz Lemiech <szpajder@gmail.com>
+
+# Service identity and file locations used throughout this script.
+# These are assigned before /etc/default/$NAME is sourced below, so that
+# file can override any of them.
+PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
+DESC="RTLSDR airband receiver"
+NAME=rtl_airband
+DAEMON=/usr/local/bin/$NAME
+# Extra command line arguments handed to the daemon by do_start (none by default)
+DAEMON_ARGS=""
+PIDFILE=/run/$NAME.pid
+SCRIPTNAME=/etc/init.d/$NAME
+
+# Exit if the package is not installed
+[ -x "$DAEMON" ] || exit 0
+
+# Read configuration variable file if it is present
+[ -r /etc/default/$NAME ] && . /etc/default/$NAME
+
+# Load the VERBOSE setting and other rcS variables
+. /lib/init/vars.sh
+
+# Define LSB log_* functions.
+# Depend on lsb-base (>= 3.2-14) to ensure that this file is present
+# and status_of_proc is working.
+. /lib/lsb/init-functions
+
+#
+# Function that starts the daemon/service
+#
+do_start()
+{
+	# Return
+	#   0 if daemon has been started
+	#   1 if daemon was already running
+	#   2 if daemon could not be started
+
+	# Dry run (--test): a non-zero status here is reported to the caller
+	# as "already running".
+	start-stop-daemon --start --quiet --pidfile "$PIDFILE" --exec "$DAEMON" --test > /dev/null \
+		|| return 1
+	# Real start. $DAEMON_ARGS is intentionally left unquoted so that it is
+	# split into individual arguments.
+	start-stop-daemon --start --quiet --pidfile "$PIDFILE" --exec "$DAEMON" -- \
+		$DAEMON_ARGS \
+		|| return 2
+	# Nothing here waits for the daemon to become ready to serve requests
+	# from services started subsequently which depend on this one. If that
+	# ever becomes necessary, add the wait here; as a last resort, sleep
+	# for some time.
+}
+
+#
+# Function that stops the daemon/service
+#
+do_stop()
+{
+	# Return
+	#   0 if daemon has been stopped
+	#   1 if daemon was already stopped
+	#   2 if daemon could not be stopped
+	#   other if a failure occurred
+
+	# First pass: stop the process recorded in the pidfile (TERM, then KILL
+	# after 30 seconds). Its status is what this function finally returns.
+	start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 --pidfile "$PIDFILE" --name "$NAME"
+	RETVAL="$?"
+	[ "$RETVAL" = 2 ] && return 2
+	# Wait for children to finish too if this is a daemon that forks
+	# and if the daemon is only ever run from this initscript.
+	# If the above conditions are not satisfied then add some other code
+	# that waits for the process to drop all resources that could be
+	# needed by services started subsequently.  A last resort is to
+	# sleep for some time.
+	start-stop-daemon --stop --quiet --oknodo --retry=0/30/KILL/5 --exec "$DAEMON"
+	[ "$?" = 2 ] && return 2
+	# The pidfile is removed here, once both stop attempts are done.
+	rm -f "$PIDFILE"
+	return "$RETVAL"
+}
+
+# Dispatch on the action passed as the first argument of the script.
+case "$1" in
+  start)
+	[ "$VERBOSE" != no ] && log_daemon_msg "Starting $DESC" "$NAME"
+	do_start
+	# Status 0 (started) and 1 (already running) are both logged as success.
+	case "$?" in
+		0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;;
+		2) [ "$VERBOSE" != no ] && log_end_msg 1 ;;
+	esac
+	;;
+  stop)
+	[ "$VERBOSE" != no ] && log_daemon_msg "Stopping $DESC" "$NAME"
+	do_stop
+	# Status 0 (stopped) and 1 (already stopped) are both logged as success.
+	case "$?" in
+		0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;;
+		2) [ "$VERBOSE" != no ] && log_end_msg 1 ;;
+	esac
+	;;
+  status)
+	# Exit with the status reported by status_of_proc.
+	status_of_proc "$DAEMON" "$NAME" && exit 0 || exit $?
+	;;
+  restart|force-reload)
+	# Unlike start/stop, progress is logged here regardless of $VERBOSE.
+	log_daemon_msg "Restarting $DESC" "$NAME"
+	do_stop
+	case "$?" in
+	  0|1)
+		# Only attempt the start if the stop did not fail.
+		do_start
+		case "$?" in
+			0) log_end_msg 0 ;;
+			1) log_end_msg 1 ;; # Old process is still running
+			*) log_end_msg 1 ;; # Failed to start
+		esac
+		;;
+	  *)
+		# Failed to stop
+		log_end_msg 1
+		;;
+	esac
+	;;
+  *)
+	echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2
+	exit 3
+	;;
+esac
+
+# No-op as the final command: unless a branch above called exit, the script
+# ends with status 0 whatever the last command in the case statement returned.
+:

+ 16 - 0
init.d/rtl_airband-freebsd.sh

@@ -0,0 +1,16 @@
+#!/bin/sh
+
+# PROVIDE: rtl_airband
+# REQUIRE: DAEMON
+# BEFORE: LOGIN
+# KEYWORD: nojail shutdown
+
+. /etc/rc.subr
+
+name=rtl_airband
+rcvar=rtl_airband_enable
+
+command="/usr/local/bin/rtl_airband"
+
+load_rc_config ${name}
+run_rc_command "$1"

+ 40 - 0
init.d/rtl_airband-gentoo.sh

@@ -0,0 +1,40 @@
+#!/sbin/runscript
+# rtl_airband Gentoo startup script
+# (c) 2015 Tomasz Lemiech <szpajder@gmail.com>
+
+# Defaults for the settings used below. A variable that is already set to a
+# non-empty value keeps that value; otherwise the default is assigned.
+: "${RTLAIRBAND_CONFDIR:=/usr/local/etc}"
+: "${RTLAIRBAND_CONFIG:=${RTLAIRBAND_CONFDIR}/rtl_airband.conf}"
+: "${RTLAIRBAND_PIDFILE:=/run/${SVCNAME}.pid}"
+: "${RTLAIRBAND_BINARY:=/usr/local/bin/rtl_airband}"
+
+# Dependency declaration hook.
+# NOTE(review): `use` is provided by the init system; presumably it marks
+# logger and dns as optional dependencies - confirm against OpenRC docs.
+depend() {
+	use logger dns
+}
+
+checkconfig() {
+	if [ ! -e "${RTLAIRBAND_CONFIG}" ] ; then
+		eerror "You need an ${RTLAIRBAND_CONFIG} file to run rtl_airband"
+		return 1
+	fi
+}
+
+start() {
+	checkconfig || return 1
+
+	ebegin "Starting ${SVCNAME}"
+	start-stop-daemon --start --exec "${RTLAIRBAND_BINARY}" \
+	    --pidfile "${RTLAIRBAND_PIDFILE}" \
+	    -- ${RTLAIRBAND_OPTS}
+	eend $?
+}
+
+# Stops the daemon. When the requested command is "restart", the
+# configuration is checked first and a failed check returns 1 before
+# anything is stopped.
+stop() {
+	if [ "${RC_CMD}" = "restart" ] && ! checkconfig ; then
+		return 1
+	fi
+
+	ebegin "Stopping ${SVCNAME}"
+	start-stop-daemon --stop \
+	    --exec "${RTLAIRBAND_BINARY}" --pidfile "${RTLAIRBAND_PIDFILE}" \
+	    --quiet
+	eend $?
+}

+ 16 - 0
init.d/rtl_airband.service

@@ -0,0 +1,16 @@
+[Unit]
+Description=SDR AM/NFM demodulator
+Documentation=https://github.com/charlie-foxtrot/RTLSDR-Airband/wiki
+Wants=network.target
+After=network.target
+
+[Service]
+Type=simple
+ExecStart=/usr/local/bin/rtl_airband -Fe
+# The program may exit only due to startup failure (eg. misconfiguration)
+# or due to failure of all SDR devices (eg. disconnection). In either case,
+# there is no point to restart it, because it would fail once again.
+Restart=no
+
+[Install]
+WantedBy=multi-user.target

+ 21 - 0
scripts/find_version

@@ -0,0 +1,21 @@
+#!/bin/bash
+
+PROJECT_ROOT_PATH="$(cd $(dirname "$0")/../ ; pwd)"
+PROJECT_GIT_DIR_PATH="${PROJECT_ROOT_PATH}/.git"
+PROJECT_DIR_NAME="$(basename ${PROJECT_ROOT_PATH})"
+
+# if there is a .git directory at the project root then rely on git for the version string
+if [ -d "${PROJECT_GIT_DIR_PATH}" ] ; then
+    git describe --tags --abbrev --dirty --always
+    exit 0
+fi
+
+# if the proejct root directory matches the naming convetion of an extracted archive then
+# get the version number out of that
+if [[ "${PROJECT_DIR_NAME}" =~ ^RTLSDR-Airband-[0-9]*\.[0-9]*\.[0-9]*$ ]]; then
+    echo ${PROJECT_DIR_NAME} | cut -d '-' -f 3
+    exit 0
+fi
+
+# print an error string to stderr (any output to stdout is considered success)
+>&2 echo "did not find a git root directory at ${PROJECT_GIT_DIR_PATH} and failed to extract a version from ${PROJECT_DIR_NAME}"

+ 3 - 0
scripts/reformat_code

@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Run clang-format-14 in place on the sources and headers matched by the
+# globs below. The paths are relative, so run this from the project root.
+# File names are passed NUL-delimited so that names containing whitespace
+# are not split into several arguments.
+find src/*.h src/*.cpp src/hello_fft/*.h src/hello_fft/*.c -print0 | xargs -0 clang-format-14 -i

+ 1 - 0
src/.gitignore

@@ -0,0 +1 @@
+config.h

+ 393 - 0
src/CMakeLists.txt

@@ -0,0 +1,393 @@
+# Check modules used by this file. CheckSymbolExists is included explicitly
+# because check_symbol_exists() is called further down; relying on another
+# module to pull it in is an undocumented implementation detail.
+include(CheckCXXCompilerFlag)
+include(CheckCXXSymbolExists)
+include(CheckSymbolExists)
+
+# Make the project's own Find*.cmake modules visible to find_package().
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
+set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
+
+if(UNIX OR MINGW)
+	add_definitions(-D_FILE_OFFSET_BITS=64)
+	# isnormal()
+	add_definitions(-D_POSIX_C_SOURCE=200112L)
+endif()
+
+# Enable -pthread when the C++ compiler accepts it.
+CHECK_CXX_COMPILER_FLAG(-pthread CXX_HAS_PTHREAD)
+if(CXX_HAS_PTHREAD)
+    add_compile_options(-pthread)
+endif()
+# NOTE: -lgpiod / -lgpiodcxx are linker inputs, not compile options, so they
+# must not be passed to add_compile_options() (they are ignored or warned about
+# while compiling and never reach the link line). libgpiod is located with
+# find_library() and linked through target_link_libraries() later in this file.
+# Enable -ffast-math when the C++ compiler accepts it.
+CHECK_CXX_COMPILER_FLAG(-ffast-math CXX_HAS_FFAST_MATH)
+if(CXX_HAS_FFAST_MATH)
+    add_compile_options(-ffast-math)
+endif()
+
+# asprintf on MacOS
+if(APPLE)
+	add_definitions(-D_DARWIN_C_SOURCE)
+endif()
+
+# sincosf on linux vs __sincosf on MacOS
+# The checks run with _GNU_SOURCE defined and libm linked; the previous
+# CMAKE_REQUIRED_* values are saved first and restored afterwards.
+set(CMAKE_REQUIRED_DEFINITIONS_ORIG ${CMAKE_REQUIRED_DEFINITIONS})
+set(CMAKE_REQUIRED_LIBRARIES_ORIG ${CMAKE_REQUIRED_LIBRARIES})
+list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE")
+list(APPEND CMAKE_REQUIRED_LIBRARIES m)
+
+check_symbol_exists(sincosf math.h HAVE_SINCOSF)
+if(NOT HAVE_SINCOSF)
+	# only probe the alternative name when the primary one is missing
+	check_symbol_exists(__sincosf math.h HAVE___SINCOSF)
+endif()
+
+# SINCOSF holds the name of whichever variant is available
+if(HAVE_SINCOSF)
+	set(SINCOSF "sincosf")
+elseif(HAVE___SINCOSF)
+	set(SINCOSF "__sincosf")
+else()
+	message(FATAL_ERROR "Required function sincosf() is unavailable")
+endif()
+
+set(CMAKE_REQUIRED_DEFINITIONS ${CMAKE_REQUIRED_DEFINITIONS_ORIG})
+set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES_ORIG})
+
+# System libraries that must be present.
+find_library(LIBM m REQUIRED)
+find_library(LIBDL dl REQUIRED)
+find_library(LIBPTHREAD pthread REQUIRED)
+
+find_package(PkgConfig REQUIRED)
+
+# Mandatory third-party libraries: libconfig++, lame and libshout.
+pkg_check_modules(CONFIG REQUIRED libconfig++)
+# Can't use pkg_check_modules here, as some distros do not install lame.pc file
+find_package(Lame REQUIRED)
+pkg_check_modules(SHOUT REQUIRED shout)
+
+# Collect what the lookups above produced; the relative order of the entries
+# within each list is libconfig++, lame, libshout.
+list(APPEND rtl_airband_extra_libs
+	${CONFIG_LIBRARIES}
+	${LAME_LIBRARIES}
+	${SHOUT_LIBRARIES}
+)
+list(APPEND rtl_airband_include_dirs
+	${CONFIG_INCLUDE_DIRS}
+	${LAME_INCLUDE_DIR}
+	${SHOUT_INCLUDE_DIRS}
+)
+list(APPEND link_dirs
+	${CONFIG_LIBRARY_DIRS}
+	${SHOUT_LIBRARY_DIRS}
+)
+
+# Probe libshout for optional features.
+#
+# The CMAKE_REQUIRED_* variables are semicolon-separated lists, so the libshout
+# settings are appended as list items (joining them with a space would create a
+# single bogus entry with embedded whitespace). The previous values are saved
+# here and restored once the checks are done.
+set(CMAKE_REQUIRED_INCLUDES_SAVE ${CMAKE_REQUIRED_INCLUDES})
+set(CMAKE_REQUIRED_LIBRARIES_SAVE ${CMAKE_REQUIRED_LIBRARIES})
+set(CMAKE_REQUIRED_LINK_OPTIONS_SAVE ${CMAKE_REQUIRED_LINK_OPTIONS})
+list(APPEND CMAKE_REQUIRED_INCLUDES ${SHOUT_INCLUDE_DIRS})
+list(APPEND CMAKE_REQUIRED_LIBRARIES ${SHOUT_LIBRARIES})
+# one -L option per library directory (SHOUT_LIBRARY_DIRS may hold several)
+foreach(shout_library_dir IN LISTS SHOUT_LIBRARY_DIRS)
+	list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-L${shout_library_dir}")
+endforeach()
+set(LIBSHOUT_HEADER "shout/shout.h")
+CHECK_CXX_SYMBOL_EXISTS("SHOUT_TLS_AUTO" ${LIBSHOUT_HEADER}
+	HAVE_SHOUT_TLS_AUTO)
+CHECK_CXX_SYMBOL_EXISTS("SHOUT_TLS_AUTO_NO_PLAIN" ${LIBSHOUT_HEADER}
+	HAVE_SHOUT_TLS_AUTO_NO_PLAIN)
+CHECK_CXX_SYMBOL_EXISTS("SHOUT_TLS_RFC2818" ${LIBSHOUT_HEADER}
+	HAVE_SHOUT_TLS_RFC2818)
+CHECK_CXX_SYMBOL_EXISTS("SHOUT_TLS_RFC2817" ${LIBSHOUT_HEADER}
+	HAVE_SHOUT_TLS_RFC2817)
+CHECK_CXX_SYMBOL_EXISTS("SHOUT_TLS_DISABLED" ${LIBSHOUT_HEADER}
+	HAVE_SHOUT_TLS_DISABLED)
+CHECK_CXX_SYMBOL_EXISTS("shout_set_tls" ${LIBSHOUT_HEADER}
+	HAVE_SHOUT_SET_TLS)
+CHECK_CXX_SYMBOL_EXISTS("shout_set_content_format" ${LIBSHOUT_HEADER}
+	LIBSHOUT_HAS_CONTENT_FORMAT)
+
+# TLS support is reported only when every TLS-related symbol is available
+if(HAVE_SHOUT_TLS_AUTO AND HAVE_SHOUT_TLS_AUTO_NO_PLAIN AND
+		HAVE_SHOUT_TLS_RFC2818 AND HAVE_SHOUT_TLS_RFC2817 AND
+		HAVE_SHOUT_TLS_DISABLED AND HAVE_SHOUT_SET_TLS)
+	set(LIBSHOUT_HAS_TLS TRUE)
+else()
+	set(LIBSHOUT_HAS_TLS FALSE)
+endif()
+
+# check for shout_set_metadata_utf8() - introduced in libshout v2.4.6
+CHECK_CXX_SYMBOL_EXISTS("shout_set_metadata_utf8" ${LIBSHOUT_HEADER}
+	HAVE_SHOUT_SET_METADATA_UTF8)
+if(HAVE_SHOUT_SET_METADATA_UTF8)
+	set(SHOUT_SET_METADATA "shout_set_metadata_utf8")
+else()
+	set(SHOUT_SET_METADATA "shout_set_metadata")
+endif()
+
+set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES_SAVE})
+set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES_SAVE})
+set(CMAKE_REQUIRED_LINK_OPTIONS ${CMAKE_REQUIRED_LINK_OPTIONS_SAVE})
+
+# User-facing build switches.
+option(NFM "Enable support for narrow FM channels" OFF)
+option(RTLSDR "Enable RTL-SDR support" ON)
+option(MIRISDR "Enable Mirics support" ON)
+option(SOAPYSDR "Enable SoapySDR support" ON)
+option(PULSEAUDIO "Enable PulseAudio support" ON)
+option(PROFILING "Enable profiling with gperftools" OFF)
+
+set(PLATFORM "native" CACHE STRING "Optimize the build for the given hardware platform")
+
+# WITH_* record what actually got enabled; each starts as FALSE and is flipped
+# to TRUE once the corresponding dependency has been found.
+set(WITH_RTLSDR FALSE)
+set(WITH_MIRISDR FALSE)
+set(WITH_SOAPYSDR FALSE)
+set(WITH_PULSEAUDIO FALSE)
+set(WITH_PROFILING FALSE)
+
+# libgpiod is mandatory
+find_library(GPIOD_LIBRARY gpiod)
+if(GPIOD_LIBRARY)
+    message(STATUS "libgpiod found")
+else()
+    message(FATAL_ERROR "libgpiod not found")
+endif()
+# Optional components. Each one is looked up only when its option is enabled;
+# when found, its sources, libraries, include directories and library
+# directories are added to the build and the matching WITH_* flag is set.
+
+# RTL-SDR input
+if(RTLSDR)
+	find_package(RTLSDR)
+	if(RTLSDR_FOUND)
+		set(WITH_RTLSDR TRUE)
+		list(APPEND rtl_airband_extra_sources input-rtlsdr.cpp)
+		list(APPEND rtl_airband_extra_libs ${RTLSDR_LIBRARIES})
+		list(APPEND rtl_airband_include_dirs ${RTLSDR_INCLUDE_DIRS})
+		list(APPEND link_dirs ${RTLSDR_LIBRARY_DIRS})
+	endif()
+endif()
+
+# Mirics input
+if(MIRISDR)
+	find_package(MiriSDR)
+	if(MIRISDR_FOUND)
+		set(WITH_MIRISDR TRUE)
+		list(APPEND rtl_airband_extra_sources input-mirisdr.cpp)
+		list(APPEND rtl_airband_extra_libs ${MIRISDR_LIBRARIES})
+		list(APPEND rtl_airband_include_dirs ${MIRISDR_INCLUDE_DIRS})
+		list(APPEND link_dirs ${MIRISDR_LIBRARY_DIRS})
+	endif()
+endif()
+
+# SoapySDR input (config-mode package only)
+if(SOAPYSDR)
+	message(STATUS "Checking for SoapySDR")
+	find_package(SoapySDR NO_MODULE)
+	if(NOT SoapySDR_FOUND)
+		message(STATUS "  SoapySDR not found")
+	else()
+		message(STATUS "  SoapySDR found, ${SoapySDR_INCLUDE_DIRS}, ${SoapySDR_LIBRARIES}")
+		set(WITH_SOAPYSDR TRUE)
+		list(APPEND rtl_airband_extra_sources input-soapysdr.cpp)
+		list(APPEND rtl_airband_extra_libs ${SoapySDR_LIBRARIES})
+		list(APPEND rtl_airband_include_dirs ${SoapySDR_INCLUDE_DIRS})
+	endif()
+endif()
+
+# PulseAudio output
+if(PULSEAUDIO)
+	pkg_check_modules(PULSEAUDIO libpulse)
+	if(PULSEAUDIO_FOUND)
+		set(WITH_PULSEAUDIO TRUE)
+		list(APPEND rtl_airband_extra_sources pulse.cpp)
+		list(APPEND rtl_airband_extra_libs ${PULSEAUDIO_LIBRARIES})
+		list(APPEND rtl_airband_include_dirs ${PULSEAUDIO_INCLUDE_DIRS})
+		list(APPEND link_dirs ${PULSEAUDIO_LIBRARY_DIRS})
+	endif()
+endif()
+
+# gperftools profiler; also adds -g to the C++ flags
+if(PROFILING)
+	pkg_check_modules(PROFILING libprofiler)
+	if(PROFILING_FOUND)
+		set(WITH_PROFILING TRUE)
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
+		list(APPEND rtl_airband_extra_libs ${PROFILING_LIBRARIES})
+		list(APPEND rtl_airband_include_dirs ${PROFILING_INCLUDE_DIRS})
+		list(APPEND link_dirs ${PROFILING_LIBRARY_DIRS})
+	endif()
+endif()
+
+
+option(BCM_VC "Enable Broadcom Videocore 3 support" OFF)
+set(WITH_BCM_VC FALSE)
+
+# Select hardware-specific build settings from PLATFORM.
+
+# error out on deprecated PLATFORM values
+if(PLATFORM MATCHES "^(rpiv1|armv7-generic|armv8-generic)$")
+	message(FATAL_ERROR "platform '${PLATFORM}' has been deprecated, see https://github.com/charlie-foxtrot/RTLSDR-Airband/discussions/447")
+
+# rpiv2 - Raspberry Pi 2 or Raspberry Pi 3 using Broadcom VideoCore IV GPU for FFT
+# NOTE: use 'native' to not use the GPU for FFT
+elseif(PLATFORM STREQUAL "rpiv2")
+	set(BCM_VC ON)
+	enable_language(ASM)
+	list(APPEND rtl_airband_extra_sources rtl_airband_neon.s)
+	add_compile_options(-march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard)
+
+# native - let the compiler optimize to run on local hardware (default)
+elseif(PLATFORM STREQUAL "native")
+	CHECK_CXX_COMPILER_FLAG(-march=native CXX_HAS_MARCH_NATIVE)
+	if(NOT CXX_HAS_MARCH_NATIVE)
+		message(FATAL_ERROR "Cannot build with PLATFORM=native: the compiler does not support -march=native option")
+	else()
+		add_compile_options(-march=native)
+	endif()
+
+# generic - don't add any hardware related flags, used to build a "portable" binary
+elseif(PLATFORM STREQUAL "generic")
+	# NO-OP
+
+# error out on unrecognized PLATFORM value
+else()
+	message(FATAL_ERROR "Unknown platform '${PLATFORM}'. Valid options are: rpiv2, native, and generic")
+endif()
+
+# Try using VC GPU if enabled. Fallback to fftw3f if disabled or if VC lib not found
+if(BCM_VC)
+	find_package(BCM_VC)
+	if(BCM_VC_FOUND)
+		set(WITH_BCM_VC TRUE)
+		add_subdirectory(hello_fft)
+		list(APPEND rtl_airband_obj_files $<TARGET_OBJECTS:hello_fft>)
+		list(APPEND rtl_airband_extra_libs ${BCM_VC_LIBRARIES})
+	endif()
+endif()
+if(NOT BCM_VC_FOUND)
+	# REQUIRED makes a missing fftw3f a fatal error, so the variables below are
+	# only reached when the package has been found.
+	pkg_check_modules(FFTW3F REQUIRED fftw3f)
+	list(APPEND rtl_airband_extra_libs ${FFTW3F_LIBRARIES})
+	list(APPEND rtl_airband_include_dirs ${FFTW3F_INCLUDE_DIRS})
+	list(APPEND link_dirs ${FFTW3F_LIBRARY_DIRS})
+endif()
+
+# FreeBSD additionally links against libc++
+if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
+	list(APPEND rtl_airband_extra_libs c++)
+endif()
+
+# Normalize BUILD_UNITTESTS to a plain TRUE/FALSE value
+if(NOT BUILD_UNITTESTS)
+	set(BUILD_UNITTESTS FALSE)
+else()
+	set(BUILD_UNITTESTS TRUE)
+endif()
+
+# Print a summary of the resulting configuration.
+message(STATUS "RTLSDR-Airband configuration summary:\n")
+message(STATUS "- Version string:\t\t${RTL_AIRBAND_VERSION}")
+message(STATUS "- Build type:\t\t${CMAKE_BUILD_TYPE}")
+message(STATUS "- Operating system:\t\t${CMAKE_SYSTEM_NAME}")
+message(STATUS "- SDR drivers:")
+message(STATUS "  - librtlsdr:\t\trequested: ${RTLSDR}, enabled: ${WITH_RTLSDR}")
+message(STATUS "  - mirisdr:\t\t\trequested: ${MIRISDR}, enabled: ${WITH_MIRISDR}")
+message(STATUS "  - soapysdr:\t\trequested: ${SOAPYSDR}, enabled: ${WITH_SOAPYSDR}")
+message(STATUS "- Other options:")
+message(STATUS "  - Platform:\t\t${PLATFORM}")
+message(STATUS "  - Build Unit Tests:\t${BUILD_UNITTESTS}")
+message(STATUS "  - Broadcom VideoCore GPU:\t${WITH_BCM_VC}")
+message(STATUS "  - NFM support:\t\t${NFM}")
+message(STATUS "  - PulseAudio:\t\trequested: ${PULSEAUDIO}, enabled: ${WITH_PULSEAUDIO}")
+message(STATUS "  - Profiling:\t\trequested: ${PROFILING}, enabled: ${WITH_PROFILING}")
+message(STATUS "  - Icecast TLS support:\t${LIBSHOUT_HAS_TLS}")
+
+# config.h is generated into the build directory; a copy left in the source
+# directory is rejected.
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/config.h")
+	message(FATAL_ERROR "${CMAKE_CURRENT_SOURCE_DIR}/config.h nolonger used, delete before continuing")
+endif()
+
+configure_file(
+	"${CMAKE_CURRENT_SOURCE_DIR}/config.h.in"
+	"${CMAKE_CURRENT_BINARY_DIR}/config.h"
+	@ONLY
+)
+
+# Generate version.cpp by running CMakeModules/version.cmake in script mode.
+# _version.cpp is listed as an output but the script only writes version.cpp,
+# so the rule is always considered out of date and re-runs on every build.
+# VERBATIM and quoting keep the arguments intact when the version string or
+# the paths contain characters that are special to the shell.
+add_custom_command(
+	OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/version.cpp"
+		"${CMAKE_CURRENT_BINARY_DIR}/_version.cpp"
+	COMMAND "${CMAKE_COMMAND}" "-DRTL_AIRBAND_VERSION=${RTL_AIRBAND_VERSION}" -P
+	"${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/version.cmake"
+	VERBATIM
+)
+
+# Object library holding the application sources plus whichever optional
+# sources were collected in rtl_airband_extra_sources.
+add_library(rtl_airband_base OBJECT
+	config.cpp
+	input-common.cpp
+	input-file.cpp
+	input-helpers.cpp
+	mixer.cpp
+	output.cpp
+	rtl_airband.cpp
+	squelch.cpp
+	ctcss.cpp
+	util.cpp
+	udp_stream.cpp
+	logging.cpp
+	filters.cpp
+	helper_functions.cpp
+	${CMAKE_CURRENT_BINARY_DIR}/version.cpp
+	${rtl_airband_extra_sources}
+)
+
+target_include_directories(rtl_airband_base PUBLIC
+	${CMAKE_CURRENT_BINARY_DIR} # needed for config.h
+	${rtl_airband_include_dirs}
+)
+
+# can't do this per target with cmake <3.13
+link_directories(${link_dirs})
+
+# The executable is assembled from object files only.
+list(APPEND rtl_airband_obj_files $<TARGET_OBJECTS:rtl_airband_base>)
+
+add_executable(rtl_airband ${rtl_airband_obj_files})
+set_target_properties(rtl_airband PROPERTIES ENABLE_EXPORTS 1)
+
+# add include for config.h
+target_include_directories(rtl_airband PUBLIC
+	${CMAKE_CURRENT_BINARY_DIR}
+)
+
+target_link_libraries(rtl_airband
+	dl
+	m
+	pthread
+	${rtl_airband_extra_libs}
+	${GPIOD_LIBRARY}
+)
+
+install(TARGETS rtl_airband
+	RUNTIME DESTINATION bin
+)
+
+# TODO: install config if not present
+
+
+# Optional GoogleTest-based unit test executable.
+if(BUILD_UNITTESTS)
+	cmake_minimum_required(VERSION 3.14)
+
+	# GoogleTest requires at least C++14
+	set(CMAKE_CXX_STANDARD 14)
+
+	# set timestamps of URL extracted files to the extraction time
+	if(POLICY CMP0135)
+		cmake_policy(SET CMP0135 NEW)
+	endif()
+
+	# pull in GoogleTest as a dependency
+	include(FetchContent)
+	FetchContent_Declare(
+		googletest
+		URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
+	)
+	FetchContent_MakeAvailable(googletest)
+
+	enable_testing()
+
+	# Collect every test_*.cpp below this directory. CONFIGURE_DEPENDS makes the
+	# build system re-check the glob on each build, so a newly added test file
+	# is picked up without a manual re-configure.
+	file(GLOB_RECURSE TEST_FILES CONFIGURE_DEPENDS "test_*.cpp")
+	# sources under test that get compiled into the test executable
+	list(APPEND TEST_FILES
+		squelch.cpp
+		logging.cpp
+		filters.cpp
+		ctcss.cpp
+		generate_signal.cpp
+		helper_functions.cpp
+	)
+
+	add_executable(
+		unittests
+		${TEST_FILES}
+	)
+	target_link_libraries(
+		unittests
+		GTest::gtest_main
+		dl
+		${rtl_airband_extra_libs}
+	)
+
+	# add include for config.h
+	target_include_directories (unittests PUBLIC
+		${CMAKE_CURRENT_BINARY_DIR}
+	)
+
+	include(GoogleTest)
+	gtest_discover_tests(unittests)
+
+endif()

+ 19 - 0
src/CMakeModules/FindBCM_VC.cmake

@@ -0,0 +1,19 @@
+# Locate the Broadcom VideoCore host library (bcm_host).
+# Sets BCM_VC_FOUND, BCM_VC_LIBRARIES and BCM_VC_INCLUDE_DIRS.
+
+# nothing to do when a previous lookup has already succeeded
+if(BCM_VC_FOUND)
+	return()
+endif()
+
+set(BCM_VC_PATH "/opt/vc" CACHE STRING "List of paths to search for Broadcom VideoCore library")
+
+find_library(BCM_VC_LIBRARY NAMES bcm_host PATHS ${BCM_VC_PATH}/lib)
+find_path(BCM_VC_INCLUDE_DIR bcm_host.h PATHS ${BCM_VC_PATH}/include)
+
+set(BCM_VC_INCLUDE_DIRS ${BCM_VC_INCLUDE_DIR} )
+set(BCM_VC_LIBRARIES ${BCM_VC_LIBRARY} )
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set BCM_VC_FOUND to TRUE
+# if all listed variables are TRUE
+find_package_handle_standard_args(BCM_VC DEFAULT_MSG
+				  BCM_VC_LIBRARY BCM_VC_INCLUDE_DIR)
+
+mark_as_advanced(BCM_VC_INCLUDE_DIR BCM_VC_LIBRARY)

+ 17 - 0
src/CMakeModules/FindLame.cmake

@@ -0,0 +1,17 @@
+# Locate the LAME MP3 encoder (header lame/lame.h, library mp3lame).
+# Sets LAME_INCLUDE_DIR and LAME_LIBRARIES, and LAME_FOUND when both exist.
+find_path(LAME_INCLUDE_DIR lame/lame.h)
+find_library(LAME_LIBRARIES NAMES mp3lame)
+
+if(LAME_INCLUDE_DIR AND LAME_LIBRARIES)
+	set(LAME_FOUND TRUE)
+endif()
+
+if(NOT LAME_FOUND)
+	# a missing library is fatal only when the caller asked for REQUIRED
+	if(Lame_FIND_REQUIRED)
+		message(FATAL_ERROR "lame library required but not found")
+	endif()
+elseif(NOT Lame_FIND_QUIETLY)
+	message(STATUS "Found lame includes:	${LAME_INCLUDE_DIR}/lame/lame.h")
+	message(STATUS "Found lame library: ${LAME_LIBRARIES}")
+endif()

+ 35 - 0
src/CMakeModules/FindMiriSDR.cmake

@@ -0,0 +1,35 @@
+# - Try to find mirisdr - the hardware driver for Mirics chip in the dvb receivers
+# Once done this will define
+#  MIRISDR_FOUND - System has mirisdr
+#  MIRISDR_LIBRARIES - The mirisdr libraries
+#  MIRISDR_INCLUDE_DIRS - The mirisdr include directories
+#  MIRISDR_DEFINITIONS - Extra compiler flags reported by pkg-config (may be empty)
+#
+# The environment variable MIRISDR_DIR may point at an installation prefix;
+# its include/ and lib/ subdirectories are then used as search hints.
+
+if(NOT MIRISDR_FOUND)
+
+    find_package(PkgConfig)
+    pkg_check_modules (MIRISDR_PKG libmirisdr)
+    # pkg_check_modules() stores its results under the MIRISDR_PKG prefix
+    set(MIRISDR_DEFINITIONS ${MIRISDR_PKG_CFLAGS_OTHER})
+
+    find_path(MIRISDR_INCLUDE_DIR
+                NAMES mirisdr.h
+                HINTS ${MIRISDR_PKG_INCLUDE_DIRS} $ENV{MIRISDR_DIR}/include
+                PATHS /usr/local/include /usr/include /opt/include /opt/local/include)
+
+    # libraries live under lib/, not include/
+    find_library(MIRISDR_LIBRARY
+                NAMES mirisdr
+                HINTS ${MIRISDR_PKG_LIBRARY_DIRS} $ENV{MIRISDR_DIR}/lib
+                PATHS /usr/local/lib /usr/lib /opt/lib /opt/local/lib)
+
+    set(MIRISDR_LIBRARIES ${MIRISDR_LIBRARY} )
+    set(MIRISDR_INCLUDE_DIRS ${MIRISDR_INCLUDE_DIR} )
+
+    include(FindPackageHandleStandardArgs)
+    # handle the QUIETLY and REQUIRED arguments and set MIRISDR_FOUND to TRUE
+    # if all listed variables are TRUE
+    find_package_handle_standard_args(MiriSDR  DEFAULT_MSG
+                                      MIRISDR_LIBRARY MIRISDR_INCLUDE_DIR)
+
+    mark_as_advanced(MIRISDR_INCLUDE_DIR MIRISDR_LIBRARY)
+
+endif()

+ 57 - 0
src/CMakeModules/FindRTLSDR.cmake

@@ -0,0 +1,57 @@
+#
+# Copyright 2012-2013 The Iris Project Developers. See the
+# COPYRIGHT file at the top-level directory of this distribution
+# and at http://www.softwareradiosystems.com/iris/copyright.html.
+#
+# This file is part of the Iris Project.
+#
+# Iris is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of
+# the License, or (at your option) any later version.
+#
+# Iris is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# A copy of the GNU Lesser General Public License can be found in
+# the LICENSE file in the top-level directory of this distribution
+# and at http://www.gnu.org/licenses/.
+#
+
+# - Try to find rtlsdr - the hardware driver for the realtek chip in the dvb receivers
+# Once done this will define
+#  RTLSDR_FOUND - System has rtlsdr
+#  RTLSDR_LIBRARIES - The rtlsdr libraries
+#  RTLSDR_INCLUDE_DIRS - The rtlsdr include directories
+#  RTLSDR_DEFINITIONS - Extra compiler flags reported by pkg-config (may be empty)
+#
+# The environment variable RTLSDR_DIR may point at an installation prefix;
+# its include/ and lib/ subdirectories are then used as search hints.
+
+if(NOT RTLSDR_FOUND)
+
+    find_package(PkgConfig)
+    pkg_check_modules (RTLSDR_PKG librtlsdr)
+    # pkg_check_modules() stores its results under the RTLSDR_PKG prefix
+    set(RTLSDR_DEFINITIONS ${RTLSDR_PKG_CFLAGS_OTHER})
+
+    find_path(RTLSDR_INCLUDE_DIR
+                NAMES rtl-sdr.h
+                HINTS ${RTLSDR_PKG_INCLUDE_DIRS} $ENV{RTLSDR_DIR}/include
+                PATHS /usr/local/include /usr/include /opt/include /opt/local/include /usr/lib/aarch64-linux-gnu/ )
+
+    # libraries live under lib/, not include/
+    find_library(RTLSDR_LIBRARY
+                NAMES rtlsdr
+                HINTS ${RTLSDR_PKG_LIBRARY_DIRS} $ENV{RTLSDR_DIR}/lib
+                PATHS /usr/local/lib /usr/lib /opt/lib /opt/local/lib)
+
+    set(RTLSDR_LIBRARIES ${RTLSDR_LIBRARY} )
+    set(RTLSDR_INCLUDE_DIRS ${RTLSDR_INCLUDE_DIR} )
+
+    include(FindPackageHandleStandardArgs)
+    # handle the QUIETLY and REQUIRED arguments and set RTLSDR_FOUND to TRUE
+    # if all listed variables are TRUE
+    find_package_handle_standard_args(RTLSDR  DEFAULT_MSG
+                                      RTLSDR_LIBRARY RTLSDR_INCLUDE_DIR)
+
+    mark_as_advanced(RTLSDR_INCLUDE_DIR RTLSDR_LIBRARY)
+
+endif()

+ 11 - 0
src/CMakeModules/version.cmake

@@ -0,0 +1,11 @@
+# Write version.cpp, defining RTL_AIRBAND_VERSION, into the current binary
+# directory. The file is rewritten only when its content would change, which
+# leaves its timestamp alone otherwise.
+set(version_file "${CMAKE_CURRENT_BINARY_DIR}/version.cpp")
+set(wanted_content "char const *RTL_AIRBAND_VERSION=\"${RTL_AIRBAND_VERSION}\";\n")
+
+# content currently on disk; empty when the file does not exist yet
+set(current_content "")
+if(EXISTS ${version_file})
+	file(READ ${version_file} current_content)
+endif()
+
+if(NOT "${wanted_content}" STREQUAL "${current_content}")
+	file(WRITE ${version_file} "${wanted_content}")
+endif()

+ 886 - 0
src/config.cpp

@@ -0,0 +1,886 @@
+/*
+ * config.cpp
+ * Configuration parsing routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <stdint.h>  // uint32_t
+#include <syslog.h>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <libconfig.h++>
+#include "input-common.h"  // input_t
+#include "rtl_airband.h"
+
+using namespace std;
+
+/*
+ * Print the common "Configuration error: ..." prefix that identifies the
+ * output being parsed: "mixers.[i] outputs.[o]: " when parsing_mixers is true,
+ * "devices.[i] channels.[j] outputs.[o]: " otherwise. No newline is written;
+ * the caller appends the actual message.
+ */
+static void print_output_error_prefix(int i, int j, int o, bool parsing_mixers) {
+    if (parsing_mixers) {
+        cerr << "Configuration error: mixers.[" << i << "] outputs.[" << o << "]: ";
+    } else {
+        cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "] outputs.[" << o << "]: ";
+    }
+}
+
+/*
+ * Parse the "outputs" list of a channel or mixer into channel->outputs.
+ *
+ * outs           - libconfig list holding one group per output
+ * channel        - channel being configured; channel->output_count entries of
+ *                  outs are examined and channel->outputs is filled in
+ * i, j           - device (or mixer) index and channel index; used in
+ *                  diagnostics only
+ * parsing_mixers - true when the outputs belong to a mixer; "rawfile" and
+ *                  "mixer" outputs and split_on_transmission are rejected then
+ *
+ * Outputs with "disable = true" are skipped, so the entries of
+ * channel->outputs are packed without gaps. Every configuration problem is
+ * reported on stderr followed by a call to error().
+ *
+ * Returns the number of outputs actually configured.
+ */
+static int parse_outputs(libconfig::Setting& outs, channel_t* channel, int i, int j, bool parsing_mixers) {
+    int oo = 0;  // index of the next free slot in channel->outputs
+    for (int o = 0; o < channel->output_count; o++) {
+        if (outs[o].exists("disable") && (bool)outs[o]["disable"] == true) {
+            continue;
+        }
+        if (!strncmp(outs[o]["type"], "icecast", 7)) {
+            channel->outputs[oo].data = XCALLOC(1, sizeof(struct icecast_data));
+            channel->outputs[oo].type = O_ICECAST;
+            icecast_data* idata = (icecast_data*)(channel->outputs[oo].data);
+            idata->hostname = strdup(outs[o]["server"]);
+            idata->port = outs[o]["port"];
+            idata->mountpoint = strdup(outs[o]["mountpoint"]);
+            idata->username = strdup(outs[o]["username"]);
+            idata->password = strdup(outs[o]["password"]);
+            if (outs[o].exists("name"))
+                idata->name = strdup(outs[o]["name"]);
+            if (outs[o].exists("genre"))
+                idata->genre = strdup(outs[o]["genre"]);
+            if (outs[o].exists("description"))
+                idata->description = strdup(outs[o]["description"]);
+            if (outs[o].exists("send_scan_freq_tags"))
+                idata->send_scan_freq_tags = (bool)outs[o]["send_scan_freq_tags"];
+            else
+                idata->send_scan_freq_tags = 0;
+#ifdef LIBSHOUT_HAS_TLS
+            if (outs[o].exists("tls")) {
+                if (outs[o]["tls"].getType() == libconfig::Setting::TypeString) {
+                    if (!strcmp(outs[o]["tls"], "auto")) {
+                        idata->tls_mode = SHOUT_TLS_AUTO;
+                    } else if (!strcmp(outs[o]["tls"], "auto_no_plain")) {
+                        idata->tls_mode = SHOUT_TLS_AUTO_NO_PLAIN;
+                    } else if (!strcmp(outs[o]["tls"], "transport")) {
+                        idata->tls_mode = SHOUT_TLS_RFC2818;
+                    } else if (!strcmp(outs[o]["tls"], "upgrade")) {
+                        idata->tls_mode = SHOUT_TLS_RFC2817;
+                    } else if (!strcmp(outs[o]["tls"], "disabled")) {
+                        idata->tls_mode = SHOUT_TLS_DISABLED;
+                    } else {
+                        print_output_error_prefix(i, j, o, parsing_mixers);
+                        cerr << "invalid value for tls; must be one of: auto, auto_no_plain, transport, upgrade, disabled\n";
+                        error();
+                    }
+                } else {
+                    print_output_error_prefix(i, j, o, parsing_mixers);
+                    cerr << "tls value must be a string\n";
+                    error();
+                }
+            } else {
+                idata->tls_mode = SHOUT_TLS_DISABLED;
+            }
+#endif /* LIBSHOUT_HAS_TLS */
+            channel->need_mp3 = 1;
+        } else if (!strncmp(outs[o]["type"], "file", 4)) {
+            channel->outputs[oo].data = XCALLOC(1, sizeof(struct file_data));
+            channel->outputs[oo].type = O_FILE;
+            file_data* fdata = (file_data*)(channel->outputs[oo].data);
+
+            fdata->type = O_FILE;
+            if (!outs[o].exists("directory") || !outs[o].exists("filename_template")) {
+                print_output_error_prefix(i, j, o, parsing_mixers);
+                cerr << "both directory and filename_template required for file\n";
+                error();
+            }
+            fdata->basedir = outs[o]["directory"].c_str();
+            fdata->basename = outs[o]["filename_template"].c_str();
+            fdata->dated_subdirectories = outs[o].exists("dated_subdirectories") ? (bool)(outs[o]["dated_subdirectories"]) : false;
+            fdata->suffix = ".mp3";
+
+            fdata->continuous = outs[o].exists("continuous") ? (bool)(outs[o]["continuous"]) : false;
+            // append defaults to true when the setting is absent
+            fdata->append = (!outs[o].exists("append")) || (bool)(outs[o]["append"]);
+            fdata->split_on_transmission = outs[o].exists("split_on_transmission") ? (bool)(outs[o]["split_on_transmission"]) : false;
+            fdata->include_freq = outs[o].exists("include_freq") ? (bool)(outs[o]["include_freq"]) : false;
+            channel->need_mp3 = 1;
+
+            if (fdata->split_on_transmission) {
+                if (parsing_mixers) {
+                    print_output_error_prefix(i, j, o, true);
+                    cerr << "split_on_transmission is not allowed for mixers\n";
+                    error();
+                }
+                if (fdata->continuous) {
+                    print_output_error_prefix(i, j, o, false);
+                    cerr << "can't have both continuous and split_on_transmission\n";
+                    error();
+                }
+            }
+
+        } else if (!strncmp(outs[o]["type"], "rawfile", 7)) {
+            if (parsing_mixers) {  // rawfile outputs not allowed for mixers
+                print_output_error_prefix(i, j, o, true);
+                cerr << "rawfile output is not allowed for mixers\n";
+                error();
+            }
+            channel->outputs[oo].data = XCALLOC(1, sizeof(struct file_data));
+            channel->outputs[oo].type = O_RAWFILE;
+            file_data* fdata = (file_data*)(channel->outputs[oo].data);
+
+            fdata->type = O_RAWFILE;
+            if (!outs[o].exists("directory") || !outs[o].exists("filename_template")) {
+                print_output_error_prefix(i, j, o, false);
+                cerr << "both directory and filename_template required for file\n";
+                error();
+            }
+
+            fdata->basedir = outs[o]["directory"].c_str();
+            fdata->basename = outs[o]["filename_template"].c_str();
+            fdata->dated_subdirectories = outs[o].exists("dated_subdirectories") ? (bool)(outs[o]["dated_subdirectories"]) : false;
+            fdata->suffix = ".cf32";
+
+            fdata->continuous = outs[o].exists("continuous") ? (bool)(outs[o]["continuous"]) : false;
+            // append defaults to true when the setting is absent
+            fdata->append = (!outs[o].exists("append")) || (bool)(outs[o]["append"]);
+            fdata->split_on_transmission = outs[o].exists("split_on_transmission") ? (bool)(outs[o]["split_on_transmission"]) : false;
+            fdata->include_freq = outs[o].exists("include_freq") ? (bool)(outs[o]["include_freq"]) : false;
+            channel->needs_raw_iq = channel->has_iq_outputs = 1;
+
+            if (fdata->continuous && fdata->split_on_transmission) {
+                print_output_error_prefix(i, j, o, false);
+                cerr << "can't have both continuous and split_on_transmission\n";
+                error();
+            }
+        } else if (!strncmp(outs[o]["type"], "mixer", 5)) {
+            if (parsing_mixers) {  // mixer outputs not allowed for mixers
+                print_output_error_prefix(i, j, o, true);
+                cerr << "mixer output is not allowed for mixers\n";
+                error();
+            }
+            channel->outputs[oo].data = XCALLOC(1, sizeof(struct mixer_data));
+            channel->outputs[oo].type = O_MIXER;
+            mixer_data* mdata = (mixer_data*)(channel->outputs[oo].data);
+            const char* name = (const char*)outs[o]["name"];
+            if ((mdata->mixer = getmixerbyname(name)) == NULL) {
+                print_output_error_prefix(i, j, o, false);
+                cerr << "unknown mixer \"" << name << "\"\n";
+                error();
+            }
+            float ampfactor = outs[o].exists("ampfactor") ? (float)outs[o]["ampfactor"] : 1.0f;
+            float balance = outs[o].exists("balance") ? (float)outs[o]["balance"] : 0.0f;
+            if (balance < -1.0f || balance > 1.0f) {
+                print_output_error_prefix(i, j, o, false);
+                cerr << "balance out of allowed range <-1.0;1.0>\n";
+                error();
+            }
+            if ((mdata->input = mixer_connect_input(mdata->mixer, ampfactor, balance)) < 0) {
+                print_output_error_prefix(i, j, o, false);
+                cerr << "could not connect to mixer " << name << ": " << mixer_get_error() << "\n";
+                error();
+            }
+            debug_print("dev[%d].chan[%d].out[%d] connected to mixer %s as input %d (ampfactor=%.1f balance=%.1f)\n", i, j, o, name, mdata->input, ampfactor, balance);
+        // NOTE(review): only the first 6 characters ("udp_st") are compared here,
+        // unlike the other types which compare the whole literal; kept as-is so
+        // existing configurations keep working.
+        } else if (!strncmp(outs[o]["type"], "udp_stream", 6)) {
+            channel->outputs[oo].data = XCALLOC(1, sizeof(struct udp_stream_data));
+            channel->outputs[oo].type = O_UDP_STREAM;
+
+            udp_stream_data* sdata = (udp_stream_data*)channel->outputs[oo].data;
+
+            sdata->continuous = outs[o].exists("continuous") ? (bool)(outs[o]["continuous"]) : false;
+
+            if (outs[o].exists("dest_address")) {
+                sdata->dest_address = strdup(outs[o]["dest_address"]);
+            } else {
+                print_output_error_prefix(i, j, o, parsing_mixers);
+                cerr << "missing dest_address\n";
+                error();
+            }
+
+            if (outs[o].exists("dest_port")) {
+                // dest_port is stored as a string; an integer setting is formatted first
+                if (outs[o]["dest_port"].getType() == libconfig::Setting::TypeInt) {
+                    char buffer[12];
+                    snprintf(buffer, sizeof(buffer), "%d", (int)outs[o]["dest_port"]);
+                    sdata->dest_port = strdup(buffer);
+                } else {
+                    sdata->dest_port = strdup(outs[o]["dest_port"]);
+                }
+            } else {
+                print_output_error_prefix(i, j, o, parsing_mixers);
+                cerr << "missing dest_port\n";
+                error();
+            }
+#ifdef WITH_PULSEAUDIO
+        } else if (!strncmp(outs[o]["type"], "pulse", 5)) {
+            channel->outputs[oo].data = XCALLOC(1, sizeof(struct pulse_data));
+            channel->outputs[oo].type = O_PULSE;
+
+            pulse_data* pdata = (pulse_data*)(channel->outputs[oo].data);
+            pdata->continuous = outs[o].exists("continuous") ? (bool)(outs[o]["continuous"]) : false;
+            pdata->server = outs[o].exists("server") ? strdup(outs[o]["server"]) : NULL;
+            pdata->name = outs[o].exists("name") ? strdup(outs[o]["name"]) : "rtl_airband";
+            pdata->sink = outs[o].exists("sink") ? strdup(outs[o]["sink"]) : NULL;
+
+            if (outs[o].exists("stream_name")) {
+                pdata->stream_name = strdup(outs[o]["stream_name"]);
+            } else {
+                if (parsing_mixers) {
+                    print_output_error_prefix(i, j, o, true);
+                    cerr << "PulseAudio outputs of mixers must have stream_name defined\n";
+                    error();
+                }
+                // default stream name: first frequency of the channel, in MHz
+                char buf[1024];
+                snprintf(buf, sizeof(buf), "%.3f MHz", (float)channel->freqlist[0].frequency / 1000000.0f);
+                pdata->stream_name = strdup(buf);
+            }
+#endif /* WITH_PULSEAUDIO */
+        } else {
+            print_output_error_prefix(i, j, o, parsing_mixers);
+            cerr << "unknown output type\n";
+            error();
+        }
+        channel->outputs[oo].enabled = true;
+        channel->outputs[oo].active = false;
+        oo++;
+    }
+    return oo;
+}
+
+// Allocates a zero-filled array of n freq_t entries via XCALLOC and loads
+// every entry with the default per-frequency settings (AM modulation,
+// ampfactor 1.0, a default-constructed Squelch, no label).
+// A length below 1 is reported on stderr and handed to error().
+// Returns a pointer to the first entry of the new list.
+static struct freq_t* mk_freqlist(int n) {
+    if (n < 1) {
+        cerr << "mk_freqlist: invalid list length " << n << "\n";
+        error();
+    }
+    struct freq_t* const list = (struct freq_t*)XCALLOC(n, sizeof(struct freq_t));
+    struct freq_t* entry = list;
+    int remaining = n;
+    while (remaining-- > 0) {
+        entry->frequency = 0;
+        entry->label = NULL;
+        entry->agcavgfast = 0.5f;
+        entry->ampfactor = 1.0f;
+        entry->squelch = Squelch();
+        entry->active_counter = 0;
+        entry->modulation = MOD_AM;
+        entry++;
+    }
+    return list;
+}
+
+// Logs a warning when a channel frequency is too far from the device center
+// frequency. The limit is 90% of half the sample rate; a frequency whose
+// distance from centerfreq reaches or exceeds that limit triggers the warning.
+// devidx and chanidx only appear in the logged message.
+static void warn_if_freq_not_in_range(int devidx, int chanidx, int freq, int centerfreq, int sample_rate) {
+    static const float soft_bw_threshold = 0.9f;
+    const float bw_limit = (float)sample_rate / 2.f * soft_bw_threshold;
+    const float distance = (float)abs(freq - centerfreq);
+    if (distance < bw_limit) {
+        return;  // inside the usable bandwidth, nothing to report
+    }
+    const double freq_mhz = (double)freq / 1e6;
+    const double lower_mhz = (double)(centerfreq - bw_limit) / 1e6;
+    const double upper_mhz = (double)(centerfreq + bw_limit) / 1e6;
+    log(LOG_WARNING, "Warning: dev[%d].channel[%d]: frequency %.3f MHz is outside of SDR operating bandwidth (%.3f-%.3f MHz)\n", devidx, chanidx, freq_mhz, lower_mhz, upper_mhz);
+}
+
+// Converts a numeric configuration setting to an int.
+// - integer settings are returned unchanged,
+// - floating point settings are multiplied by 1e6 and truncated,
+// - string settings are converted with atofs() on a private copy of the text
+//   (NOTE(review): the copy suggests atofs() may modify its argument - confirm),
+// - any other setting type yields 0.
+static int parse_anynum2int(libconfig::Setting& f) {
+    switch (f.getType()) {
+        case libconfig::Setting::TypeInt:
+            return (int)f;
+        case libconfig::Setting::TypeFloat:
+            return (int)((double)f * 1e6);
+        case libconfig::Setting::TypeString: {
+            char* text = strdup((char const*)f);
+            const int value = (int)atofs(text);
+            free(text);
+            return value;
+        }
+        default:
+            return 0;
+    }
+}
+
+// Parses the "channels" list of one device into dev->channels, dev->bins and
+// dev->base_bins.
+//
+// chans - the libconfig list of channel sections of the device
+// dev   - device being configured; its channel arrays must already be
+//         allocated with room for chans.getLength() entries
+// i     - index of the device in the configuration; used in diagnostics and
+//         passed on to parse_outputs()
+//
+// Channels with "disable" set to true are skipped, so the enabled channels are
+// stored contiguously starting at index 0. Configuration errors are reported
+// on stderr and handed to error().
+// Returns the number of enabled channels that were parsed.
+static int parse_channels(libconfig::Setting& chans, device_t* dev, int i) {
+    int jj = 0;  // index of the next enabled channel slot
+    for (int j = 0; j < chans.getLength(); j++) {
+        if (chans[j].exists("disable") && (bool)chans[j]["disable"] == true) {
+            continue;
+        }
+        channel_t* channel = dev->channels + jj;
+        for (int k = 0; k < AGC_EXTRA; k++) {
+            channel->wavein[k] = 20;
+            channel->waveout[k] = 0.5;
+        }
+        channel->axcindicate = NO_SIGNAL;
+        channel->mode = MM_MONO;
+        channel->need_mp3 = 0;
+        channel->freq_count = 1;
+        channel->freq_idx = 0;
+        channel->highpass = chans[j].exists("highpass") ? (int)chans[j]["highpass"] : 100;
+        channel->lowpass = chans[j].exists("lowpass") ? (int)chans[j]["lowpass"] : 2500;
+        channel->lame = NULL;
+        channel->lamebuf = NULL;
+#ifdef NFM
+        channel->pr = 0;
+        channel->pj = 0;
+        channel->prev_waveout = 0.5;
+        channel->alpha = dev->alpha;
+#endif /* NFM */
+
+        // Make sure lowpass / highpass aren't flipped.
+        // If lowpass is enabled (greater than zero) it must be larger than highpass
+        if (channel->lowpass > 0 && channel->lowpass < channel->highpass) {
+            cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: lowpass (" << channel->lowpass << ") must be greater than or equal to highpass (" << channel->highpass << ")\n";
+            error();
+        }
+
+        // Channel-wide modulation; per-frequency "modulations" (scan mode) may override it below.
+        modulations channel_modulation = MOD_AM;
+        if (chans[j].exists("modulation")) {
+#ifdef NFM
+            if (strncmp(chans[j]["modulation"], "nfm", 3) == 0) {
+                channel_modulation = MOD_NFM;
+            } else
+#endif /* NFM */
+                if (strncmp(chans[j]["modulation"], "am", 2) != 0) {
+                    cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: unknown modulation\n";
+                    error();
+                }
+        }
+        channel->afc = chans[j].exists("afc") ? (unsigned char)(unsigned int)chans[j]["afc"] : 0;
+        if (dev->mode == R_MULTICHANNEL) {
+            channel->freqlist = mk_freqlist(1);
+            channel->freqlist[0].frequency = parse_anynum2int(chans[j]["freq"]);
+            warn_if_freq_not_in_range(i, j, channel->freqlist[0].frequency, dev->input->centerfreq, dev->input->sample_rate);
+            if (chans[j].exists("label")) {
+                channel->freqlist[0].label = strdup(chans[j]["label"]);
+            }
+            channel->freqlist[0].modulation = channel_modulation;
+        } else { /* R_SCAN */
+            channel->freq_count = chans[j]["freqs"].getLength();
+            if (channel->freq_count < 1) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: freqs should be a list with at least one element\n";
+                error();
+            }
+            channel->freqlist = mk_freqlist(channel->freq_count);
+            // Every per-frequency list must provide at least one element per scanned frequency.
+            if (chans[j].exists("labels") && chans[j]["labels"].getLength() < channel->freq_count) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: labels should be a list with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+            if (chans[j].exists("squelch_threshold") && libconfig::Setting::TypeList == chans[j]["squelch_threshold"].getType() && chans[j]["squelch_threshold"].getLength() < channel->freq_count) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: squelch_threshold should be an int or a list of ints with at least " << channel->freq_count
+                     << " elements\n";
+                error();
+            }
+            if (chans[j].exists("squelch_snr_threshold") && libconfig::Setting::TypeList == chans[j]["squelch_snr_threshold"].getType() &&
+                chans[j]["squelch_snr_threshold"].getLength() < channel->freq_count) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j
+                     << "]: squelch_snr_threshold should be an int, a float or a list of "
+                        "ints or floats with at least "
+                     << channel->freq_count << " elements\n";
+                error();
+            }
+            if (chans[j].exists("notch") && libconfig::Setting::TypeList == chans[j]["notch"].getType() && chans[j]["notch"].getLength() < channel->freq_count) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: notch should be an float or a list of floats with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+            if (chans[j].exists("notch_q") && libconfig::Setting::TypeList == chans[j]["notch_q"].getType() && chans[j]["notch_q"].getLength() < channel->freq_count) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: notch_q should be a float or a list of floats with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+            if (chans[j].exists("ctcss") && libconfig::Setting::TypeList == chans[j]["ctcss"].getType() && chans[j]["ctcss"].getLength() < channel->freq_count) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: ctcss should be an float or a list of floats with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+            if (chans[j].exists("modulation") && chans[j].exists("modulations")) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: can't set both modulation and modulations\n";
+                error();
+            }
+            if (chans[j].exists("modulations") && chans[j]["modulations"].getLength() < channel->freq_count) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: modulations should be a list with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+
+            for (int f = 0; f < channel->freq_count; f++) {
+                channel->freqlist[f].frequency = parse_anynum2int((chans[j]["freqs"][f]));
+                if (chans[j].exists("labels")) {
+                    channel->freqlist[f].label = strdup(chans[j]["labels"][f]);
+                }
+                if (chans[j].exists("modulations")) {
+#ifdef NFM
+                    if (strncmp(chans[j]["modulations"][f], "nfm", 3) == 0) {
+                        channel->freqlist[f].modulation = MOD_NFM;
+                    } else
+#endif /* NFM */
+                        if (strncmp(chans[j]["modulations"][f], "am", 2) == 0) {
+                            channel->freqlist[f].modulation = MOD_AM;
+                        } else {
+                            cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "] modulations.[" << f << "]: unknown modulation\n";
+                            error();
+                        }
+                } else {
+                    channel->freqlist[f].modulation = channel_modulation;
+                }
+            }
+            // Set initial frequency for scanning
+            // We tune 20 FFT bins higher to avoid DC spike
+            dev->input->centerfreq = channel->freqlist[0].frequency + 20 * (double)(dev->input->sample_rate / fft_size);
+        }
+        if (chans[j].exists("squelch")) {
+            cerr << "Warning: 'squelch' no longer supported and will be ignored, use 'squelch_threshold' or 'squelch_snr_threshold' instead\n";
+        }
+        if (chans[j].exists("squelch_threshold") && chans[j].exists("squelch_snr_threshold")) {
+            cerr << "Warning: Both 'squelch_threshold' and 'squelch_snr_threshold' are set and may conflict\n";
+        }
+        if (chans[j].exists("squelch_threshold")) {
+            // Value is dBFS, zero disables manual threshold (ie use auto squelch), negative is valid, positive is invalid
+            if (libconfig::Setting::TypeList == chans[j]["squelch_threshold"].getType()) {
+                // New-style array of per-frequency squelch settings
+                for (int f = 0; f < channel->freq_count; f++) {
+                    int threshold_dBFS = (int)chans[j]["squelch_threshold"][f];
+                    if (threshold_dBFS > 0) {
+                        cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: squelch_threshold must be less than or equal to 0\n";
+                        error();
+                    } else if (threshold_dBFS == 0) {
+                        channel->freqlist[f].squelch.set_squelch_level_threshold(0);
+                    } else {
+                        channel->freqlist[f].squelch.set_squelch_level_threshold(dBFS_to_level(threshold_dBFS));
+                    }
+                }
+            } else if (libconfig::Setting::TypeInt == chans[j]["squelch_threshold"].getType()) {
+                // Legacy (single squelch for all frequencies)
+                int threshold_dBFS = (int)chans[j]["squelch_threshold"];
+                float level;
+                if (threshold_dBFS > 0) {
+                    cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: squelch_threshold must be less than or equal to 0\n";
+                    error();
+                } else if (threshold_dBFS == 0) {
+                    level = 0;
+                } else {
+                    level = dBFS_to_level(threshold_dBFS);
+                }
+
+                for (int f = 0; f < channel->freq_count; f++) {
+                    channel->freqlist[f].squelch.set_squelch_level_threshold(level);
+                }
+            } else {
+                cerr << "Invalid value for squelch_threshold (should be int or list - use parentheses)\n";
+                error();
+            }
+        }
+        if (chans[j].exists("squelch_snr_threshold")) {
+            // Value is SNR in dB, zero disables squelch (ie always open), -1 uses default value, positive is valid, other negative values are invalid
+            if (libconfig::Setting::TypeList == chans[j]["squelch_snr_threshold"].getType()) {
+                // New-style array of per-frequency squelch settings
+                for (int f = 0; f < channel->freq_count; f++) {
+                    float snr = 0.f;
+                    if (libconfig::Setting::TypeFloat == chans[j]["squelch_snr_threshold"][f].getType()) {
+                        snr = (float)chans[j]["squelch_snr_threshold"][f];
+                    } else if (libconfig::Setting::TypeInt == chans[j]["squelch_snr_threshold"][f].getType()) {
+                        snr = (int)chans[j]["squelch_snr_threshold"][f];
+                    } else {
+                        cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: squelch_snr_threshold list must be of int or float\n";
+                        error();
+                    }
+
+                    if (snr == -1.0) {
+                        continue;  // "disable" for this channel in list
+                    } else if (snr < 0) {
+                        cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: squelch_snr_threshold must be greater than or equal to 0\n";
+                        error();
+                    } else {
+                        channel->freqlist[f].squelch.set_squelch_snr_threshold(snr);
+                    }
+                }
+            } else if (libconfig::Setting::TypeFloat == chans[j]["squelch_snr_threshold"].getType() || libconfig::Setting::TypeInt == chans[j]["squelch_snr_threshold"].getType()) {
+                // Legacy (single squelch for all frequencies)
+                float snr = (libconfig::Setting::TypeFloat == chans[j]["squelch_snr_threshold"].getType()) ? (float)chans[j]["squelch_snr_threshold"] : (int)chans[j]["squelch_snr_threshold"];
+
+                // -1 means "disable", so keep the default without an error message.
+                // There is no inner loop at this point: a `continue` here would
+                // continue the per-channel loop and silently drop the whole
+                // channel, so the value is skipped with a plain condition instead.
+                if (snr != -1.0) {
+                    if (snr < 0) {
+                        cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: squelch_snr_threshold must be greater than or equal to 0\n";
+                        error();
+                    }
+
+                    for (int f = 0; f < channel->freq_count; f++) {
+                        channel->freqlist[f].squelch.set_squelch_snr_threshold(snr);
+                    }
+                }
+            } else {
+                cerr << "Invalid value for squelch_snr_threshold (should be float, int, or list of int/float - use parentheses)\n";
+                error();
+            }
+        }
+        if (chans[j].exists("notch")) {
+            static const float default_q = 10.0;
+
+            if (chans[j].exists("notch_q") && chans[j]["notch"].getType() != chans[j]["notch_q"].getType()) {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: notch_q (if set) must be the same type as notch - "
+                     << "float or a list of floats with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+            if (libconfig::Setting::TypeList == chans[j]["notch"].getType()) {
+                for (int f = 0; f < channel->freq_count; f++) {
+                    float freq = (float)chans[j]["notch"][f];
+                    float q = chans[j].exists("notch_q") ? (float)chans[j]["notch_q"][f] : default_q;
+
+                    if (q == 0.0) {
+                        q = default_q;
+                    } else if (q <= 0.0) {
+                        cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "] freq.[" << f << "]: invalid value for notch_q: " << q << " (must be greater than 0.0)\n";
+                        error();
+                    }
+
+                    if (freq == 0) {
+                        continue;  // "disable" for this channel in list
+                    } else if (freq < 0) {
+                        cerr << "devices.[" << i << "] channels.[" << j << "] freq.[" << f << "]: invalid value for notch: " << freq << ", ignoring\n";
+                    } else {
+                        channel->freqlist[f].notch_filter = NotchFilter(freq, WAVE_RATE, q);
+                    }
+                }
+            } else if (libconfig::Setting::TypeFloat == chans[j]["notch"].getType()) {
+                float freq = (float)chans[j]["notch"];
+                float q = chans[j].exists("notch_q") ? (float)chans[j]["notch_q"] : default_q;
+                if (q <= 0.0) {
+                    cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: invalid value for notch_q: " << q << " (must be greater than 0.0)\n";
+                    error();
+                }
+                for (int f = 0; f < channel->freq_count; f++) {
+                    if (freq == 0) {
+                        continue;  // "disable" is default so ignore without error message
+                    } else if (freq < 0) {
+                        cerr << "devices.[" << i << "] channels.[" << j << "]: notch value '" << freq << "' invalid, ignoring\n";
+                    } else {
+                        channel->freqlist[f].notch_filter = NotchFilter(freq, WAVE_RATE, q);
+                    }
+                }
+            } else {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: notch should be an float or a list of floats with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+        }
+        if (chans[j].exists("ctcss")) {
+            if (libconfig::Setting::TypeList == chans[j]["ctcss"].getType()) {
+                for (int f = 0; f < channel->freq_count; f++) {
+                    float freq = (float)chans[j]["ctcss"][f];
+
+                    if (freq == 0) {
+                        continue;  // "disable" for this channel in list
+                    } else if (freq < 0) {
+                        cerr << "devices.[" << i << "] channels.[" << j << "] freq.[" << f << "]: invalid value for ctcss: " << freq << ", ignoring\n";
+                    } else {
+                        channel->freqlist[f].squelch.set_ctcss_freq(freq, WAVE_RATE);
+                    }
+                }
+            } else if (libconfig::Setting::TypeFloat == chans[j]["ctcss"].getType()) {
+                float freq = (float)chans[j]["ctcss"];
+                for (int f = 0; f < channel->freq_count; f++) {
+                    if (freq <= 0) {
+                        cerr << "devices.[" << i << "] channels.[" << j << "]: ctcss value '" << freq << "' invalid, ignoring\n";
+                    } else {
+                        channel->freqlist[f].squelch.set_ctcss_freq(freq, WAVE_RATE);
+                    }
+                }
+            } else {
+                cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: ctcss should be an float or a list of floats with at least " << channel->freq_count << " elements\n";
+                error();
+            }
+        }
+        if (chans[j].exists("bandwidth")) {
+            channel->needs_raw_iq = 1;
+
+            if (libconfig::Setting::TypeList == chans[j]["bandwidth"].getType()) {
+                for (int f = 0; f < channel->freq_count; f++) {
+                    int bandwidth = parse_anynum2int(chans[j]["bandwidth"][f]);
+
+                    if (bandwidth == 0) {
+                        continue;  // "disable" for this channel in list
+                    } else if (bandwidth < 0) {
+                        cerr << "devices.[" << i << "] channels.[" << j << "] freq.[" << f << "]: bandwidth value '" << bandwidth << "' invalid, ignoring\n";
+                    } else {
+                        channel->freqlist[f].lowpass_filter = LowpassFilter((float)bandwidth / 2, WAVE_RATE);
+                    }
+                }
+            } else {
+                int bandwidth = parse_anynum2int(chans[j]["bandwidth"]);
+                if (bandwidth == 0) {
+                    // "disable" is default so ignore without error message.
+                    // No `continue` here: it would continue the per-channel loop
+                    // and silently drop the whole channel.
+                } else if (bandwidth < 0) {
+                    cerr << "devices.[" << i << "] channels.[" << j << "]: bandwidth value '" << bandwidth << "' invalid, ignoring\n";
+                } else {
+                    for (int f = 0; f < channel->freq_count; f++) {
+                        channel->freqlist[f].lowpass_filter = LowpassFilter((float)bandwidth / 2, WAVE_RATE);
+                    }
+                }
+            }
+        }
+        if (chans[j].exists("ampfactor")) {
+            if (libconfig::Setting::TypeList == chans[j]["ampfactor"].getType()) {
+                for (int f = 0; f < channel->freq_count; f++) {
+                    float ampfactor = (float)chans[j]["ampfactor"][f];
+
+                    if (ampfactor < 0) {
+                        cerr << "devices.[" << i << "] channels.[" << j << "] freq.[" << f << "]: ampfactor '" << ampfactor << "' must not be negative\n";
+                        error();
+                    }
+
+                    channel->freqlist[f].ampfactor = ampfactor;
+                }
+            } else {
+                float ampfactor = (float)chans[j]["ampfactor"];
+
+                if (ampfactor < 0) {
+                    cerr << "devices.[" << i << "] channels.[" << j << "]: ampfactor '" << ampfactor << "' must not be negative\n";
+                    error();
+                }
+
+                for (int f = 0; f < channel->freq_count; f++) {
+                    channel->freqlist[f].ampfactor = ampfactor;
+                }
+            }
+        }
+
+#ifdef NFM
+        // A per-channel "tau" overrides the alpha inherited from the device above.
+        if (chans[j].exists("tau")) {
+            channel->alpha = ((int)chans[j]["tau"] == 0 ? 0.0f : exp(-1.0f / (WAVE_RATE * 1e-6 * (int)chans[j]["tau"])));
+        }
+#endif /* NFM */
+        libconfig::Setting& outputs = chans[j]["outputs"];
+        channel->output_count = outputs.getLength();
+        if (channel->output_count < 1) {
+            cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: no outputs defined\n";
+            error();
+        }
+        // Allocate for every configured output, then shrink to the number parse_outputs() reports.
+        channel->outputs = (output_t*)XCALLOC(channel->output_count, sizeof(struct output_t));
+        int outputs_enabled = parse_outputs(outputs, channel, i, j, false);
+        if (outputs_enabled < 1) {
+            cerr << "Configuration error: devices.[" << i << "] channels.[" << j << "]: no outputs defined\n";
+            error();
+        }
+        channel->outputs = (output_t*)XREALLOC(channel->outputs, outputs_enabled * sizeof(struct output_t));
+        channel->output_count = outputs_enabled;
+
+        dev->base_bins[jj] = dev->bins[jj] =
+            (size_t)ceil((channel->freqlist[0].frequency + dev->input->sample_rate - dev->input->centerfreq) / (double)(dev->input->sample_rate / fft_size) - 1.0) % fft_size;
+        debug_print("bins[%d]: %zu\n", jj, dev->bins[jj]);
+
+#ifdef NFM
+        for (int f = 0; f < channel->freq_count; f++) {
+            if (channel->freqlist[f].modulation == MOD_NFM) {
+                channel->needs_raw_iq = 1;
+                break;
+            }
+        }
+#endif /* NFM */
+
+        if (channel->needs_raw_iq) {
+            // Downmixing is done only for NFM and raw IQ outputs. It's not critical to have some residual
+            // freq offset in AM, as it doesn't affect sound quality significantly.
+            double dm_dphi = (double)(channel->freqlist[0].frequency - dev->input->centerfreq);  // downmix freq in Hz
+
+            // In general, sample_rate is not required to be an integer multiple of WAVE_RATE.
+            // However the FFT window may only slide by an integer number of input samples. A non-zero rounding error
+            // introduces additional phase rotation which we have to compensate in order to shift the channel of interest
+            // to the center of the spectrum of the output I/Q stream. This is important for correct NFM demodulation.
+            // The error value (in Hz):
+            // - has an absolute value 0..WAVE_RATE/2
+            // - is linear with the error introduced by rounding the value of sample_rate/WAVE_RATE to the nearest integer
+            //   (range of -0.5..0.5)
+            // - is linear with the distance between center frequency and the channel frequency, normalized to 0..1
+            double decimation_factor = ((double)dev->input->sample_rate / (double)WAVE_RATE);
+            double dm_dphi_correction = (double)WAVE_RATE / 2.0;
+            dm_dphi_correction *= (decimation_factor - round(decimation_factor));
+            dm_dphi_correction *= (double)(channel->freqlist[0].frequency - dev->input->centerfreq) / ((double)dev->input->sample_rate / 2.0);
+
+            debug_print("dev[%d].chan[%d]: dm_dphi: %f Hz dm_dphi_correction: %f Hz\n", i, jj, dm_dphi, dm_dphi_correction);
+            dm_dphi -= dm_dphi_correction;
+            debug_print("dev[%d].chan[%d]: dm_dphi_corrected: %f Hz\n", i, jj, dm_dphi);
+            // Normalize
+            dm_dphi /= (double)WAVE_RATE;
+            // Unalias it, to prevent overflow of int during cast
+            dm_dphi -= trunc(dm_dphi);
+            debug_print("dev[%d].chan[%d]: dm_dphi_normalized=%f\n", i, jj, dm_dphi);
+            // Translate this to uint32_t range 0x00000000-0x00ffffff
+            dm_dphi *= 256.0 * 65536.0;
+            // Cast it to signed int first, because casting negative float to uint is not portable
+            channel->dm_dphi = (uint32_t)((int)dm_dphi);
+            debug_print("dev[%d].chan[%d]: dm_dphi_scaled=%f cast=0x%x\n", i, jj, dm_dphi, channel->dm_dphi);
+            channel->dm_phi = 0.f;
+        }
+
+#ifdef DEBUG_SQUELCH
+        // Setup squelch debug file, if enabled
+        char tmp_filepath[1024];
+        for (int f = 0; f < channel->freq_count; f++) {
+            snprintf(tmp_filepath, sizeof(tmp_filepath), "./squelch_debug-%d-%d.dat", j, f);
+            channel->freqlist[f].squelch.set_debug_file(tmp_filepath);
+        }
+#endif /* DEBUG_SQUELCH */
+
+        jj++;
+    }
+    return jj;
+}
+
+// Parses the "devices" configuration list into the global devices array.
+//
+// For every device that is not disabled this creates the input driver,
+// reads sample_rate / mode / centerfreq (and tau when NFM is compiled in),
+// lets the input driver parse its own settings, allocates the input buffer
+// and finally parses the channels of the device via parse_channels().
+// Configuration errors are reported on stderr and handed to error().
+// Returns the number of enabled devices that were parsed.
+int parse_devices(libconfig::Setting& devs) {
+    int devcnt = 0;  // index of the next enabled device slot
+    for (int i = 0; i < devs.getLength(); i++) {
+        if (devs[i].exists("disable") && (bool)devs[i]["disable"] == true)
+            continue;
+        device_t* dev = devices + devcnt;
+        if (devs[i].exists("type")) {
+            dev->input = input_new(devs[i]["type"]);
+            if (dev->input == NULL) {
+                cerr << "Configuration error: devices.[" << i << "]: unsupported device type\n";
+                error();
+            }
+        } else {
+#ifdef WITH_RTLSDR
+            cerr << "Warning: devices.[" << i << "]: assuming device type \"rtlsdr\", please set \"type\" in the device section.\n";
+            dev->input = input_new("rtlsdr");
+#else
+            cerr << "Configuration error: devices.[" << i << "]: mandatory parameter missing: type\n";
+            error();
+#endif /* WITH_RTLSDR */
+        }
+        assert(dev->input != NULL);
+        if (devs[i].exists("sample_rate")) {
+            int sample_rate = parse_anynum2int(devs[i]["sample_rate"]);
+            // The rate must be strictly greater than WAVE_RATE: this matches the
+            // error message and the assert on dev->input->sample_rate below.
+            if (sample_rate <= WAVE_RATE) {
+                cerr << "Configuration error: devices.[" << i << "]: sample_rate must be greater than " << WAVE_RATE << "\n";
+                error();
+            }
+            dev->input->sample_rate = sample_rate;
+        }
+        if (devs[i].exists("mode")) {
+            if (!strncmp(devs[i]["mode"], "multichannel", 12)) {
+                dev->mode = R_MULTICHANNEL;
+            } else if (!strncmp(devs[i]["mode"], "scan", 4)) {
+                dev->mode = R_SCAN;
+            } else {
+                cerr << "Configuration error: devices.[" << i << "]: invalid mode (must be one of: \"scan\", \"multichannel\")\n";
+                error();
+            }
+        } else {
+            dev->mode = R_MULTICHANNEL;
+        }
+        if (dev->mode == R_MULTICHANNEL) {
+            dev->input->centerfreq = parse_anynum2int(devs[i]["centerfreq"]);
+        }  // centerfreq for R_SCAN will be set by parse_channels() after frequency list has been read
+#ifdef NFM
+        if (devs[i].exists("tau")) {
+            dev->alpha = ((int)devs[i]["tau"] == 0 ? 0.0f : exp(-1.0f / (WAVE_RATE * 1e-6 * (int)devs[i]["tau"])));
+        } else {
+            dev->alpha = alpha;
+        }
+#endif /* NFM */
+
+        // Parse hardware-dependent configuration parameters
+        if (input_parse_config(dev->input, devs[i]) < 0) {
+            // FIXME: get and display error string from input_parse_config
+            // Right now it exits the program on failure.
+        }
+        // Some basic sanity checks for crucial parameters which have to be set
+        // (or can be modified) by the input driver
+        assert(dev->input->sfmt != SFMT_UNDEF);
+        assert(dev->input->fullscale > 0);
+        assert(dev->input->bytes_per_sample > 0);
+        assert(dev->input->sample_rate > WAVE_RATE);
+
+        // For the input buffer size use a base value and round it up to the nearest multiple
+        // of FFT_BATCH blocks of input samples.
+        // ceil is required here because sample rate is not guaranteed to be an integer multiple of WAVE_RATE.
+        size_t fft_batch_len = FFT_BATCH * (2 * dev->input->bytes_per_sample * (size_t)ceil((double)dev->input->sample_rate / (double)WAVE_RATE));
+        dev->input->buf_size = MIN_BUF_SIZE;
+        if (dev->input->buf_size % fft_batch_len != 0)
+            dev->input->buf_size += fft_batch_len - dev->input->buf_size % fft_batch_len;
+        debug_print("dev->input->buf_size: %zu\n", dev->input->buf_size);
+        // The buffer is allocated with room for fft_size extra I/Q samples beyond buf_size.
+        dev->input->buffer = (unsigned char*)XCALLOC(sizeof(unsigned char), dev->input->buf_size + 2 * dev->input->bytes_per_sample * fft_size);
+        dev->input->bufs = dev->input->bufe = 0;
+        dev->input->overflow_count = 0;
+        dev->output_overrun_count = 0;
+        dev->waveend = dev->waveavail = dev->row = dev->tq_head = dev->tq_tail = 0;
+        dev->last_frequency = -1;
+
+        libconfig::Setting& chans = devs[i]["channels"];
+        if (chans.getLength() < 1) {
+            cerr << "Configuration error: devices.[" << i << "]: no channels configured\n";
+            error();
+        }
+        // Allocate for every configured channel, then shrink to the number parse_channels() reports.
+        dev->channels = (channel_t*)XCALLOC(chans.getLength(), sizeof(channel_t));
+        dev->bins = (size_t*)XCALLOC(chans.getLength(), sizeof(size_t));
+        dev->base_bins = (size_t*)XCALLOC(chans.getLength(), sizeof(size_t));
+        dev->channel_count = 0;
+        int channel_count = parse_channels(chans, dev, i);
+        if (channel_count < 1) {
+            cerr << "Configuration error: devices.[" << i << "]: no channels enabled\n";
+            error();
+        }
+        if (dev->mode == R_SCAN && channel_count > 1) {
+            cerr << "Configuration error: devices.[" << i << "]: only one channel is allowed in scan mode\n";
+            error();
+        }
+        dev->channels = (channel_t*)XREALLOC(dev->channels, channel_count * sizeof(channel_t));
+        dev->bins = (size_t*)XREALLOC(dev->bins, channel_count * sizeof(size_t));
+        dev->base_bins = (size_t*)XREALLOC(dev->base_bins, channel_count * sizeof(size_t));
+        dev->channel_count = channel_count;
+        devcnt++;
+    }
+    return devcnt;
+}
+
+// Parses the "mixers" configuration section into the global mixers array.
+// Entries with "disable" set to true are skipped. Every remaining entry must
+// have a name and at least one enabled output. Configuration errors are
+// reported on stderr and handed to error().
+// Returns the number of mixers that were set up.
+int parse_mixers(libconfig::Setting& mx) {
+    int enabled_mixers = 0;
+    for (int i = 0; i < mx.getLength(); i++) {
+        libconfig::Setting& cfg = mx[i];
+        if (cfg.exists("disable") && (bool)cfg["disable"] == true) {
+            continue;
+        }
+        const char* name = cfg.getName();
+        if (name == NULL) {
+            cerr << "Configuration error: mixers.[" << i << "]: undefined mixer name\n";
+            error();
+        }
+        debug_print("mm=%d name=%s\n", enabled_mixers, name);
+
+        // Reset the mixer bookkeeping to its initial state.
+        mixer_t* mixer = mixers + enabled_mixers;
+        mixer->name = strdup(name);
+        mixer->enabled = false;
+        mixer->interval = MIX_DIVISOR;
+        mixer->output_overrun_count = 0;
+        mixer->input_count = 0;
+        mixer->inputs = NULL;
+        mixer->inputs_todo = NULL;
+        mixer->input_mask = NULL;
+
+        // Filter settings of the channel embedded in the mixer, with defaults.
+        channel_t* channel = &mixer->channel;
+        channel->highpass = 100;
+        if (cfg.exists("highpass")) {
+            channel->highpass = (int)cfg["highpass"];
+        }
+        channel->lowpass = 2500;
+        if (cfg.exists("lowpass")) {
+            channel->lowpass = (int)cfg["lowpass"];
+        }
+        channel->mode = MM_MONO;
+
+        // Make sure lowpass / highpass aren't flipped.
+        // If lowpass is enabled (greater than zero) it must be larger than highpass
+        if (channel->lowpass > 0 && channel->lowpass < channel->highpass) {
+            cerr << "Configuration error: mixers.[" << i << "]: lowpass (" << channel->lowpass << ") must be greater than or equal to highpass (" << channel->highpass << ")\n";
+            error();
+        }
+
+        libconfig::Setting& outputs = cfg["outputs"];
+        channel->output_count = outputs.getLength();
+        if (channel->output_count < 1) {
+            cerr << "Configuration error: mixers.[" << i << "]: no outputs defined\n";
+            error();
+        }
+        // Allocate for every configured output, then shrink to the number parse_outputs() reports.
+        channel->outputs = (output_t*)XCALLOC(channel->output_count, sizeof(struct output_t));
+        const int outputs_enabled = parse_outputs(outputs, channel, i, 0, true);
+        if (outputs_enabled < 1) {
+            cerr << "Configuration error: mixers.[" << i << "]: no outputs defined\n";
+            error();
+        }
+        channel->outputs = (output_t*)XREALLOC(channel->outputs, outputs_enabled * sizeof(struct output_t));
+        channel->output_count = outputs_enabled;
+        enabled_mixers++;
+    }
+    return enabled_mixers;
+}
+
+// vim: ts=4

+ 36 - 0
src/config.h.in

@@ -0,0 +1,36 @@
+/*
+ * config.h.in
+ * Template for cmake-generated config.h
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#cmakedefine WITH_RTLSDR
+#cmakedefine WITH_MIRISDR
+#cmakedefine WITH_SOAPYSDR
+#cmakedefine WITH_PROFILING
+#cmakedefine WITH_PULSEAUDIO
+#cmakedefine NFM
+#cmakedefine WITH_BCM_VC
+#cmakedefine LIBSHOUT_HAS_TLS
+#cmakedefine LIBSHOUT_HAS_CONTENT_FORMAT
+#define SINCOSF @SINCOSF@
+
+#define SHOUT_SET_METADATA @SHOUT_SET_METADATA@
+
+#endif /* _CONFIG_H */

+ 172 - 0
src/ctcss.cpp

@@ -0,0 +1,172 @@
+/*
+ * ctcss.cpp
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>     // M_PI
+#include <algorithm>  // sort
+
+#include "logging.h"  // debug_print()
+
+#include "ctcss.h"
+
+using namespace std;
+
+// Goertzel-style single tone detector, following
+// https://www.embedded.com/detecting-ctcss-tones-with-goertzels-algorithm/
+// and https://www.embedded.com/the-goertzel-algorithm/
+ToneDetector::ToneDetector(float tone_freq, float sample_rate, int window_size)
+    : tone_freq_(tone_freq), magnitude_(0.0), window_size_(window_size) {
+    // integer index derived from the tone frequency; adding 0.5 rounds before the truncation to int
+    const int bin = (0.5 + window_size * tone_freq / sample_rate);
+    // angle that corresponds to that index over one window
+    const float bin_omega = (2.0 * M_PI * bin) / window_size;
+    coeff_ = 2.0 * cos(bin_omega);
+
+    // begin with a zeroed sample counter and recurrence state
+    reset();
+}
+
+// Feeds one sample into the recurrence. Once window_size_ samples have been consumed the
+// power estimate is refreshed and the sample counter starts over.
+void ToneDetector::process_sample(const float& sample) {
+    q0_ = coeff_ * q1_ - q2_ + sample;
+    q2_ = q1_;
+    q1_ = q0_;
+
+    // window not complete yet: keep accumulating
+    if (++count_ != window_size_) {
+        return;
+    }
+
+    magnitude_ = q1_ * q1_ + q2_ * q2_ - q1_ * q2_ * coeff_;
+    count_ = 0;
+}
+
+// Clears the recurrence state and the sample counter; the last computed magnitude is kept.
+void ToneDetector::reset(void) {
+    q2_ = 0.0;
+    q1_ = q2_;
+    q0_ = q1_;
+    count_ = 0;
+}
+
+// Adds a detector for tone_freq to the set.
+// A tone whose coefficient equals that of an already stored detector is not added.
+// Returns true when the detector was added, false when it was skipped.
+bool ToneDetectorSet::add(const float& tone_freq, const float& sample_rate, int window_size) {
+    ToneDetector new_tone = ToneDetector(tone_freq, sample_rate, window_size);
+
+    // iterate by reference: copying every stored detector just to compare coefficients is wasted work
+    for (const auto& tone : tones_) {
+        if (new_tone.coefficient() == tone.coefficient()) {
+            debug_print("Skipping tone %f, too close to other tones\n", tone_freq);
+            return false;
+        }
+    }
+
+    tones_.push_back(new_tone);
+    return true;
+}
+
+// Passes the sample on to every detector in the set.
+void ToneDetectorSet::process_sample(const float& sample) {
+    for (ToneDetector& detector : tones_) {
+        detector.process_sample(sample);
+    }
+}
+
+// Resets every detector in the set.
+void ToneDetectorSet::reset(void) {
+    for (ToneDetector& detector : tones_) {
+        detector.reset();
+    }
+}
+
+// Fills `powers` with one {power, freq} entry per detector, sorted by descending power.
+// Any previous content of `powers` is discarded.
+// Returns the average power over all detectors, or 0.0 when the set is empty.
+float ToneDetectorSet::sorted_powers(vector<ToneDetectorSet::PowerIndex>& powers) {
+    powers.clear();
+
+    // with no detectors the average would be 0/0 (NaN); report zero power instead
+    if (tones_.empty()) {
+        return 0.0;
+    }
+
+    powers.reserve(tones_.size());
+
+    float total_power = 0.0;
+    for (const auto& tone : tones_) {
+        powers.push_back({tone.relative_power(), tone.freq()});
+        total_power += tone.relative_power();
+    }
+
+    sort(powers.begin(), powers.end(), [](const PowerIndex& a, const PowerIndex& b) { return a.power > b.power; });
+
+    return total_power / tones_.size();
+}
+
+// Table of tone frequencies in Hz. The CTCSS constructor adds a detector for each entry that is
+// not within 5 Hz of the tone being searched for, so the target tone can be compared against them.
+vector<float> CTCSS::standard_tones = {67.0,  69.3,  71.9,  74.4,  77.0,  79.7,  82.5,  85.4,  88.5,  91.5,  94.8,  97.4,  100.0, 103.5, 107.2, 110.9, 114.8,
+                                       118.8, 123.0, 127.3, 131.8, 136.5, 141.3, 146.2, 150.0, 151.4, 156.7, 159.8, 162.2, 165.5, 167.9, 171.3, 173.8, 177.3,
+                                       179.9, 183.5, 186.2, 189.9, 192.8, 196.6, 199.5, 203.5, 206.5, 210.7, 218.1, 225.7, 229.1, 233.6, 241.8, 250.3, 254.1};
+
+// Builds an enabled detector for ctcss_freq (Hz).
+//   ctcss_freq  - tone to look for
+//   sample_rate - rate of the samples passed to process_audio_sample()
+//   window_size - number of samples evaluated per detection decision
+CTCSS::CTCSS(const float& ctcss_freq, const float& sample_rate, int window_size) : enabled_(true), ctcss_freq_(ctcss_freq), window_size_(window_size), found_count_(0), not_found_count_(0) {
+    debug_print("Adding CTCSS detector for %f Hz with a sample rate of %f and window %d\n", ctcss_freq, sample_rate, window_size_);
+
+    // Add the target CTCSS frequency first followed by the other "standard tones", except those
+    // within +/- 5 Hz
+    powers_.add(ctcss_freq, sample_rate, window_size_);
+
+    for (const float tone : standard_tones) {
+        // fabs() keeps the distance in floating point; an unqualified abs() can bind to the
+        // integer overload depending on which headers happen to be visible
+        if (fabs(ctcss_freq - tone) < 5) {
+            debug_print("Skipping tone %f, too close to other tones\n", tone);
+            continue;
+        }
+        powers_.add(tone, sample_rate, window_size_);
+    }
+
+    // clear all values to start. NOTE: reset() leaves has_tone_ false, so no tone is reported
+    // until the first full window of samples has been processed
+    reset();
+}
+
+// Feeds one audio sample to all tone detectors. After every window_size_ samples the detectors
+// are ranked and has_tone_ plus the found / not-found counters are updated.
+void CTCSS::process_audio_sample(const float& sample) {
+    if (!enabled_) {
+        return;
+    }
+
+    powers_.process_sample(sample);
+
+    // nothing to evaluate until a full window has been accumulated
+    if (++sample_count_ < window_size_) {
+        return;
+    }
+
+    enough_samples_ = true;
+
+    // This sample completes the window: rank the detectors by power and look up the power of the
+    // tone being searched for.  Several tones can tie for the highest power because of floating
+    // point math, so the decision compares powers rather than positions in the ranking.
+    vector<ToneDetectorSet::PowerIndex> ranked;
+    const float mean_power = powers_.sorted_powers(ranked);
+
+    float target_power = 0.0;
+    const auto match = find_if(ranked.begin(), ranked.end(), [this](const ToneDetectorSet::PowerIndex& entry) { return entry.freq == ctcss_freq_; });
+    if (match != ranked.end()) {
+        target_power = match->power;
+    }
+
+    const bool detected = (target_power == ranked[0].power) && (target_power > mean_power);
+    if (detected) {
+        debug_print("CTCSS tone of %f Hz detected\n", ctcss_freq_);
+        has_tone_ = true;
+        found_count_++;
+    } else {
+        debug_print("CTCSS tone of %f Hz not detected - highest power was %f Hz at %f vs %f\n", ctcss_freq_, ranked[0].freq, ranked[0].power, target_power);
+        has_tone_ = false;
+        not_found_count_++;
+    }
+
+    // start the next window from scratch
+    sample_count_ = 0;
+    powers_.reset();
+}
+
+// Discards the partially accumulated window and clears the detection flags.
+// The found / not-found counters are not touched.
+void CTCSS::reset(void) {
+    // a disabled detector is left exactly as it is
+    if (!enabled_) {
+        return;
+    }
+
+    has_tone_ = false;
+    sample_count_ = 0;
+    enough_samples_ = false;
+    powers_.reset();
+}

+ 98 - 0
src/ctcss.h

@@ -0,0 +1,98 @@
+/*
+ * ctcss.h
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _CTCSS_H
+#define _CTCSS_H 1
+
+#include <cstddef>  // size_t
+#include <vector>
+
+// Measures the power of a single tone frequency over fixed-size windows of samples.
+class ToneDetector {
+   public:
+    ToneDetector(float tone_freq, float sample_freq, int window_size);
+    // feed one sample; the value behind relative_power() is refreshed once per full window
+    void process_sample(const float& sample);
+    // clear the sample counter and recurrence state (the last computed power is kept)
+    void reset(void);
+
+    // power computed at the end of the most recent full window
+    const float& relative_power(void) const { return magnitude_; }
+    // tone frequency passed to the constructor
+    const float& freq(void) const { return tone_freq_; }
+    // recurrence coefficient derived from tone frequency, sample rate and window size
+    const float& coefficient(void) const { return coeff_; }
+
+   private:
+    float tone_freq_;
+    float magnitude_;
+
+    int window_size_;
+    float coeff_;
+
+    // samples consumed in the current window
+    int count_;
+    // recurrence state
+    float q0_;
+    float q1_;
+    float q2_;
+};
+
+// A collection of ToneDetector objects that are all fed the same samples.
+class ToneDetectorSet {
+   public:
+    // power / frequency pair reported by sorted_powers()
+    struct PowerIndex {
+        float power;
+        float freq;
+    };
+
+    ToneDetectorSet() {}
+
+    // returns whether a detector for tone_freq was added to the set
+    bool add(const float& tone_freq, const float& sample_freq, int window_size);
+    void process_sample(const float& sample);
+    void reset(void);
+
+    // overwrites `powers` with one entry per detector and returns a float summary value
+    float sorted_powers(std::vector<PowerIndex>& powers);
+
+   private:
+    std::vector<ToneDetector> tones_;
+};
+
+// Decides, window by window, whether a configured tone is present in the samples it is fed.
+// A default-constructed object is disabled and always reports has_tone() == true.
+class CTCSS {
+   public:
+    // Disabled detector. Every member gets a defined value so that accessors such as
+    // enough_samples() never read an indeterminate value.
+    CTCSS(void) : enabled_(false), ctcss_freq_(0.0), window_size_(0), found_count_(0), not_found_count_(0), enough_samples_(false), sample_count_(0), has_tone_(false) {}
+    // Enabled detector looking for ctcss_freq, evaluating window_size samples per decision.
+    CTCSS(const float& ctcss_freq, const float& sample_rate, int window_size);
+    void process_audio_sample(const float& sample);
+    void reset(void);
+
+    // running totals of per-window decisions
+    const size_t& found_count(void) const { return found_count_; }
+    const size_t& not_found_count(void) const { return not_found_count_; }
+
+    bool is_enabled(void) const { return enabled_; }
+    bool enough_samples(void) const { return enough_samples_; }
+    // a disabled detector never blocks: it reports the tone as present
+    bool has_tone(void) const { return !enabled_ || has_tone_; }
+
+    static std::vector<float> standard_tones;
+
+   private:
+    bool enabled_;
+    float ctcss_freq_;
+    int window_size_;
+    size_t found_count_;
+    size_t not_found_count_;
+
+    ToneDetectorSet powers_;
+
+    bool enough_samples_;
+    int sample_count_;
+    bool has_tone_;
+};
+
+#endif /* _CTCSS_H */

+ 163 - 0
src/filters.cpp

@@ -0,0 +1,163 @@
+/*
+ * filters.cpp
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "logging.h"  // debug_print()
+
+#include "filters.h"
+
+using namespace std;
+
+// Default constructor is no filter: enabled_ is false, so apply() returns without
+// touching the sample. The coefficient and history members are not initialized here.
+NotchFilter::NotchFilter(void) : enabled_(false) {}
+
+// Notch filter whose coefficient formulas follow https://www.dsprelated.com/showcode/173.php
+// The input and output histories start out zeroed.
+NotchFilter::NotchFilter(float notch_freq, float sample_freq, float q) : enabled_(true), x{0.0}, y{0.0} {
+    // a non-positive notch frequency cannot be used: fall back to the disabled state
+    if (notch_freq <= 0.0) {
+        enabled_ = false;
+        debug_print("Invalid frequency %f Hz, disabling notch filter\n", notch_freq);
+        return;
+    }
+
+    debug_print("Adding notch filter for %f Hz with parameters {%f, %f}\n", notch_freq, sample_freq, q);
+
+    // notch frequency relative to the sample rate, scaled by 2*pi
+    const float notch_omega = 2 * M_PI * (notch_freq / sample_freq);
+
+    p = cos(notch_omega);
+    e = 1 / (1 + tan(notch_omega / (q * 2)));
+
+    // coefficients consumed by apply()
+    d[0] = e;
+    d[2] = (2 * e - 1);
+    d[1] = 2 * e * p;
+
+    debug_print("wo:%f e:%f p:%f d:{%f,%f,%f}\n", notch_omega, e, p, d[0], d[1], d[2]);
+}
+
+// Filters one sample in place. Does nothing when the filter is disabled.
+void NotchFilter::apply(float& value) {
+    if (!enabled_) {
+        return;
+    }
+
+    // shift the input history; x[2] is the newest sample
+    x[0] = x[1];
+    x[1] = x[2];
+    x[2] = value;
+
+    // shift the output history, then compute the newest output from both histories.
+    // NOTE: after the shift y[1] and y[0] hold the two previous outputs.
+    y[0] = y[1];
+    y[1] = y[2];
+    y[2] = d[0] * x[2] - d[1] * x[1] + d[0] * x[0] + d[1] * y[1] - d[2] * y[0];
+
+    value = y[2];
+}
+
+// Default constructor is no filter: enabled_ is false, so apply() returns without
+// modifying its arguments. gain and ycoeffs are not initialized here.
+LowpassFilter::LowpassFilter(void) : enabled_(false) {}
+
+// 2nd order lowpass Bessel filter, based entirely on a simplification of https://www-users.cs.york.ac.uk/~fisher/mkfilter/
+// A non-positive freq disables the filter. Otherwise gain and ycoeffs are computed for apply().
+LowpassFilter::LowpassFilter(float freq, float sample_freq) : enabled_(true) {
+    if (freq <= 0.0) {
+        debug_print("Invalid frequency %f Hz, disabling lowpass filter\n", freq);
+        enabled_ = false;
+        return;
+    }
+
+    debug_print("Adding lowpass filter at %f Hz with a sample rate of %f\n", freq, sample_freq);
+
+    // cutoff as a fraction of the sample rate, then warped through tan()
+    double raw_alpha = (double)freq / sample_freq;
+    double warped_alpha = tan(M_PI * raw_alpha) / M_PI;
+
+    // both zeros sit at -1; the two poles are a complex-conjugate pair passed through blt()
+    complex<double> zeros[2] = {-1.0, -1.0};
+    complex<double> poles[2];
+    poles[0] = blt(M_PI * 2 * warped_alpha * complex<double>(-1.10160133059e+00, 6.36009824757e-01));
+    poles[1] = blt(M_PI * 2 * warped_alpha * conj(complex<double>(-1.10160133059e+00, 6.36009824757e-01)));
+
+    // expand zeros and poles into numerator / denominator polynomial coefficients
+    complex<double> topcoeffs[3];
+    complex<double> botcoeffs[3];
+    expand(zeros, 2, topcoeffs);
+    expand(poles, 2, botcoeffs);
+    // gain is the magnitude of the response evaluated at z = 1.0
+    complex<double> gain_complex = evaluate(topcoeffs, 2, botcoeffs, 2, 1.0);
+    gain = hypot(gain_complex.imag(), gain_complex.real());
+
+    // feedback coefficients, normalized by the highest-order denominator coefficient and negated
+    for (int i = 0; i <= 2; i++) {
+        ycoeffs[i] = -(botcoeffs[i].real() / botcoeffs[2].real());
+    }
+
+    debug_print("gain: %f, ycoeffs: {%f, %f}\n", gain, ycoeffs[0], ycoeffs[1]);
+}
+
+/* map a pole or zero pz to (2 + pz) / (2 - pz) */
+complex<double> LowpassFilter::blt(complex<double> pz) {
+    const complex<double> numerator = 2.0 + pz;
+    const complex<double> denominator = 2.0 - pz;
+    return numerator / denominator;
+}
+
+/* response at z: numerator polynomial (order nz) divided by denominator polynomial (order np) */
+complex<double> LowpassFilter::evaluate(complex<double> topco[], int nz, complex<double> botco[], int np, complex<double> z) {
+    const complex<double> numerator = eval(topco, nz, z);
+    const complex<double> denominator = eval(botco, np, z);
+    return numerator / denominator;
+}
+
+/* value of the polynomial with coefficients coeffs[0..npz] at z, highest order term first */
+complex<double> LowpassFilter::eval(complex<double> coeffs[], int npz, complex<double> z) {
+    complex<double> acc(0.0);
+    int order = npz;
+    while (order >= 0) {
+        acc = (acc * z) + coeffs[order];
+        --order;
+    }
+    return acc;
+}
+
+/* compute product of poles or zeros as a polynomial of z */
+void LowpassFilter::expand(complex<double> pz[], int npz, complex<double> coeffs[]) {
+    coeffs[0] = 1.0;
+    for (int i = 0; i < npz; i++) {
+        coeffs[i + 1] = 0.0;
+    }
+    for (int i = 0; i < npz; i++) {
+        multin(pz[i], npz, coeffs);
+    }
+    /* check computed coeffs of z^k are all real */
+    for (int i = 0; i < npz + 1; i++) {
+        if (fabs(coeffs[i].imag()) > 1e-10) {
+            log(LOG_ERR, "coeff of z^%d is not real; poles/zeros are not complex conjugates\n", i);
+            error();
+        }
+    }
+}
+
+/* multiply the polynomial held in coeffs[0..npz] by the factor (z - w), in place */
+void LowpassFilter::multin(complex<double> w, int npz, complex<double> coeffs[]) {
+    const complex<double> neg_w = -w;
+    /* walk from the highest order down so coeffs[k - 1] is still the old value when it is read */
+    int k = npz;
+    while (k >= 1) {
+        coeffs[k] = (neg_w * coeffs[k]) + coeffs[k - 1];
+        --k;
+    }
+    coeffs[0] = neg_w * coeffs[0];
+}
+
+// Filters one complex sample in place; r and j are its real and imaginary parts.
+// Does nothing when the filter is disabled.
+void LowpassFilter::apply(float& r, float& j) {
+    if (!enabled_) {
+        return;
+    }
+
+    complex<float> input(r, j);
+
+    // shift the input history; the newest input is stored already divided by gain
+    xv[0] = xv[1];
+    xv[1] = xv[2];
+    xv[2] = input / gain;
+
+    // shift the output history, then combine inputs with the two previous outputs
+    yv[0] = yv[1];
+    yv[1] = yv[2];
+    yv[2] = (xv[0] + xv[2]) + (2.0f * xv[1]) + (ycoeffs[0] * yv[0]) + (ycoeffs[1] * yv[1]);
+
+    r = yv[2].real();
+    j = yv[2].imag();
+}

+ 63 - 0
src/filters.h

@@ -0,0 +1,63 @@
+/*
+ * filters.h
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _FILTERS_H
+#define _FILTERS_H 1
+
+#include <complex>
+
+// Notch filter operating on single float samples. A default-constructed filter is disabled.
+class NotchFilter {
+   public:
+    NotchFilter(void);
+    NotchFilter(float notch_freq, float sample_freq, float q);
+    // filters `value` in place; no-op when the filter is disabled
+    void apply(float& value);
+    // const so it can be queried on const objects, matching LowpassFilter::enabled()
+    bool enabled(void) const { return enabled_; }
+
+   private:
+    bool enabled_;
+    // intermediate values computed by the constructor
+    float e;
+    float p;
+    // filter coefficients
+    float d[3];
+    // input and output histories, newest value at index 2
+    float x[3];
+    float y[3];
+};
+
+// Lowpass filter operating on complex samples passed as separate real / imaginary floats.
+// A default-constructed filter is disabled.
+class LowpassFilter {
+   public:
+    LowpassFilter(void);
+    LowpassFilter(float freq, float sample_freq);
+    // filters the sample (r, j) in place; no-op when the filter is disabled
+    void apply(float& r, float& j);
+    bool enabled(void) const { return enabled_; }
+
+   private:
+    // helpers used by the constructor to derive gain and ycoeffs
+    static std::complex<double> blt(std::complex<double> pz);
+    static void expand(std::complex<double> pz[], int npz, std::complex<double> coeffs[]);
+    static void multin(std::complex<double> w, int npz, std::complex<double> coeffs[]);
+    static std::complex<double> evaluate(std::complex<double> topco[], int nz, std::complex<double> botco[], int np, std::complex<double> z);
+    static std::complex<double> eval(std::complex<double> coeffs[], int npz, std::complex<double> z);
+
+    bool enabled_;
+    float ycoeffs[3];
+    float gain;
+
+    // input and output histories, newest value at index 2
+    std::complex<float> xv[3];
+    std::complex<float> yv[3];
+};
+
+#endif /* _FILTERS_H */

+ 86 - 0
src/generate_signal.cpp

@@ -0,0 +1,86 @@
+/*
+ * generate_signal.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cmath>
+
+#include "generate_signal.h"
+
+using namespace std;
+
+// Predefined amplitude values for tones.
+float Tone::WEAK = 0.05;
+float Tone::NORMAL = 0.2;
+float Tone::STRONG = 0.4;
+
+// Stores the tone parameters and starts the sample counter at zero.
+Tone::Tone(int sample_rate, const float& freq, const float& ampl) : sample_rate_(sample_rate), freq_(freq), ampl_(ampl), sample_count_(0) {}
+
+// Returns the next sample of the sine tone, scaled by the amplitude.
+// The counter is advanced first, so the first value returned corresponds to sample index 1.
+float Tone::get_sample(void) {
+    ++sample_count_;
+    const double phase = 2 * M_PI * sample_count_ * freq_ / sample_rate_;
+    return ampl_ * sin(phase);
+}
+
+// Predefined amplitude values for noise.
+float Noise::WEAK = 0.05;
+float Noise::NORMAL = 0.2;
+float Noise::STRONG = 0.5;
+
+// Sets up a noise source whose samples are scaled by ampl.
+Noise::Noise(const float& ampl) : ampl_(ampl) {
+    // create a seeded generator
+    // (eight values drawn from std::random_device form the seed sequence)
+    std::random_device r;
+    std::seed_seq s{r(), r(), r(), r(), r(), r(), r(), r()};
+    generator = std::mt19937(s);
+
+    // centered at 0.0, standard deviation of 0.1
+    distribution = normal_distribution<float>(0.0, 0.1);
+}
+// Draws one value from the distribution and scales it by the amplitude.
+float Noise::get_sample(void) {
+    const float draw = distribution(generator);
+    return ampl_ * draw;
+}
+
+// Creates a signal with no tones and no noise sources; sample_rate is handed to every tone added later.
+GenerateSignal::GenerateSignal(int sample_rate) : sample_rate_(sample_rate) {}
+
+// Appends a tone of the given frequency and amplitude, generated at this signal's sample rate.
+void GenerateSignal::add_tone(const float& freq, const float& ampl) {
+    tones_.emplace_back(sample_rate_, freq, ampl);
+}
+
+// Appends a noise source of the given amplitude.
+void GenerateSignal::add_noise(const float& ampl) {
+    noises_.emplace_back(ampl);
+}
+
+// Returns the next sample of the signal: the sum of one sample from every tone, then every noise source.
+float GenerateSignal::get_sample(void) {
+    float mixed = 0.0;
+
+    for (Tone& tone : tones_) {
+        mixed += tone.get_sample();
+    }
+
+    for (Noise& noise : noises_) {
+        mixed += noise.get_sample();
+    }
+
+    return mixed;
+}
+
+// Writes `seconds` worth of generated samples to filepath as raw floats in native byte order.
+// An existing file is overwritten. If the file cannot be opened a message is printed to
+// stderr and nothing is written.
+void GenerateSignal::write_file(const string& filepath, const float& seconds) {
+    FILE* fp = fopen(filepath.c_str(), "wb");
+    if (fp == NULL) {
+        // fopen failed: calling fwrite/fclose on a NULL stream would crash
+        fprintf(stderr, "GenerateSignal::write_file: unable to open %s for writing\n", filepath.c_str());
+        return;
+    }
+
+    for (int i = 0; i < sample_rate_ * seconds; ++i) {
+        float sample = get_sample();
+        fwrite(&sample, sizeof(float), 1, fp);
+    }
+    fclose(fp);
+}

+ 75 - 0
src/generate_signal.h

@@ -0,0 +1,75 @@
+/*
+ * generate_signal.h
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _GENERATE_SIGNAL_H
+#define _GENERATE_SIGNAL_H
+
+#include <random>
+#include <string>
+#include <vector>
+
+// A tone source described by sample rate, frequency and amplitude that hands out one sample per call.
+class Tone {
+   public:
+    // predefined amplitude values
+    static float WEAK;
+    static float NORMAL;
+    static float STRONG;
+
+    Tone(int sample_rate, const float& freq, const float& ampl);
+    // returns the next sample
+    float get_sample(void);
+
+   private:
+    int sample_rate_;
+    float freq_;
+    float ampl_;
+    // sample counter
+    size_t sample_count_;
+};
+
+// A random noise source with a configurable amplitude that hands out one sample per call.
+class Noise {
+   public:
+    // predefined amplitude values
+    static float WEAK;
+    static float NORMAL;
+    static float STRONG;
+
+    Noise(const float& ampl);
+    // returns the next sample
+    float get_sample(void);
+
+   private:
+    float ampl_;
+    // random engine and the distribution samples are drawn from
+    std::mt19937 generator;
+    std::normal_distribution<float> distribution;
+};
+
+// Builds a test signal from any number of tones and noise sources.
+class GenerateSignal {
+   public:
+    GenerateSignal(int sample_rate);
+
+    void add_tone(const float& freq, const float& ampl);
+    void add_noise(const float& ampl);
+
+    // returns the next sample of the combined signal
+    float get_sample(void);
+
+    // writes `seconds` worth of samples to the file at filepath
+    void write_file(const std::string& filepath, const float& seconds);
+
+   private:
+    int sample_rate_;
+    std::vector<Tone> tones_;
+    std::vector<Noise> noises_;
+};
+
+#endif /* _GENERATE_SIGNAL_H */

+ 21 - 0
src/hello_fft/CMakeLists.txt

@@ -0,0 +1,21 @@
+# Sources of the hello_fft object library.
+set(hello_fft_source_files
+	mailbox.c
+	gpu_fft.c
+	gpu_fft_twiddles.c
+	gpu_fft_shaders.c
+	gpu_fft_base.c
+)
+# Temp hack due to the fact that mailbox.c includes ../rtl_airband.h which
+# is a C++ header.
+set_source_files_properties(${hello_fft_source_files} PROPERTIES LANGUAGE CXX)
+add_library(hello_fft OBJECT
+	${hello_fft_source_files}
+)
+target_include_directories(hello_fft PUBLIC
+	".." # needed for rtl_airband.h
+	"${CMAKE_CURRENT_BINARY_DIR}/.." # needed for config.h
+	${BCM_VC_INCLUDE_DIRS}
+)
+
+# Disable -Wcast-qual for this target only. Target compile options are placed after
+# CMAKE_CXX_FLAGS and after directory-level compile options on the command line, so
+# this reliably overrides a -Wcast-qual enabled further up.
+target_compile_options(hello_fft PRIVATE -Wno-cast-qual)

+ 135 - 0
src/hello_fft/gpu_fft.c

@@ -0,0 +1,135 @@
+/*
+BCM2835 "GPU_FFT" release 2.0
+Copyright (c) 2014, Andrew Holme.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <string.h>
+
+#include "gpu_fft.h"
+
+#define GPU_FFT_BUSY_WAIT_LIMIT (5 << 12)  // ~1ms
+
+typedef struct GPU_FFT_COMPLEX COMPLEX;
+
+// Allocates one block through gpu_fft_alloc() and lays out everything a batch of FFTs needs
+// inside it: header, ping-pong data buffers, shader code, twiddles, uniforms and mailbox message.
+// Returns 0 on success and stores the header pointer in *fft.
+// Returns -2 when gpu_fft_twiddle_size() rejects log2_N, or the non-zero code from gpu_fft_alloc().
+int gpu_fft_prepare(int mb,         // mailbox file_desc
+                    int log2_N,     // log2(FFT_length) = 8...20
+                    int direction,  // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
+                    int jobs,       // number of transforms in batch
+                    struct GPU_FFT** fft) {
+    unsigned info_bytes, twid_bytes, data_bytes, code_bytes, unif_bytes, mail_bytes;
+    unsigned size, *uptr, vc_tw, vc_data;
+    int i, q, shared, unique, passes, ret;
+
+    struct GPU_FFT_BASE* base;
+    struct GPU_FFT_PTR ptr;
+    struct GPU_FFT* info;
+
+    if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes))
+        return -2;
+
+    // sizes of the individual regions
+    info_bytes = 4096;
+    // one transform's data, padded up to the next multiple of 4096 bytes
+    data_bytes = (1 + ((sizeof(COMPLEX) << log2_N) | 4095));
+    code_bytes = gpu_fft_shader_size(log2_N);
+    twid_bytes = sizeof(COMPLEX) * 16 * (shared + GPU_FFT_QPUS * unique);
+    unif_bytes = sizeof(int) * GPU_FFT_QPUS * (5 + jobs * 2);
+    mail_bytes = sizeof(int) * GPU_FFT_QPUS * 2;
+
+    size = info_bytes +             // header
+           data_bytes * jobs * 2 +  // ping-pong data, aligned
+           code_bytes +             // shader, aligned
+           twid_bytes +             // twiddles
+           unif_bytes +             // uniforms
+           mail_bytes;              // mailbox message
+
+    ret = gpu_fft_alloc(mb, size, &ptr);
+    if (ret)
+        return ret;
+
+    // From here on ptr is advanced region by region with gpu_fft_ptr_inc(), in the same
+    // order as the size calculation above.
+
+    // Header
+    info = (struct GPU_FFT*)ptr.arm.vptr;
+    base = (struct GPU_FFT_BASE*)info;
+    gpu_fft_ptr_inc(&ptr, info_bytes);
+
+    // For transpose
+    info->x = 1 << log2_N;
+    info->y = jobs;
+
+    // Ping-pong buffers leave results in or out of place
+    info->in = info->out = ptr.arm.cptr;
+    info->step = data_bytes / sizeof(COMPLEX);
+    if (passes & 1)
+        info->out += info->step * jobs;  // odd => out of place
+    vc_data = gpu_fft_ptr_inc(&ptr, data_bytes * jobs * 2);
+
+    // Shader code
+    memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes);
+    base->vc_code = gpu_fft_ptr_inc(&ptr, code_bytes);
+
+    // Twiddles
+    gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr);
+    vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes);
+
+    uptr = ptr.arm.uptr;
+
+    // Uniforms
+    // (5 + jobs * 2 words per QPU: two twiddle addresses, the QPU index, two buffer
+    // addresses per job, a zero word and the IRQ flag)
+    for (q = 0; q < GPU_FFT_QPUS; q++) {
+        *uptr++ = vc_tw;
+        *uptr++ = vc_tw + sizeof(COMPLEX) * 16 * (shared + q * unique);
+        *uptr++ = q;
+        for (i = 0; i < jobs; i++) {
+            *uptr++ = vc_data + data_bytes * i;
+            *uptr++ = vc_data + data_bytes * i + data_bytes * jobs;
+        }
+        *uptr++ = 0;
+        *uptr++ = (q == 0);  // For mailbox: IRQ enable, master only
+
+        base->vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int) * (5 + jobs * 2));
+    }
+
+    // small batches are run by direct register access, larger ones through the mailbox
+    if ((jobs << log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) {
+        // Direct register poking with busy wait
+        base->vc_msg = 0;
+    } else {
+        // Mailbox message
+        for (q = 0; q < GPU_FFT_QPUS; q++) {
+            *uptr++ = base->vc_unifs[q];
+            *uptr++ = base->vc_code;
+        }
+
+        base->vc_msg = ptr.vc;
+    }
+
+    *fft = info;
+    return 0;
+}
+
+// Runs the prepared batch on GPU_FFT_QPUS QPUs and returns the result of gpu_fft_base_exec().
+unsigned gpu_fft_execute(struct GPU_FFT* info) {
+    return gpu_fft_base_exec(&info->base, GPU_FFT_QPUS);
+}
+
+// Releases the resources of a prepared batch by delegating to gpu_fft_base_release().
+void gpu_fft_release(struct GPU_FFT* info) {
+    gpu_fft_base_release(&info->base);
+}

+ 92 - 0
src/hello_fft/gpu_fft.h

@@ -0,0 +1,92 @@
+/*
+BCM2835 "GPU_FFT" release 2.0
+Copyright (c) 2014, Andrew Holme.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __GPU_FFT__
+#define __GPU_FFT__
+
+#define GPU_FFT_QPUS 8
+
+#define GPU_FFT_PI 3.14159265358979323846
+
+#define GPU_FFT_FWD 0  // forward FFT
+#define GPU_FFT_REV 1  // inverse FFT
+
+// One complex value: single-precision real and imaginary parts.
+struct GPU_FFT_COMPLEX {
+    float re, im;
+};
+
+// A position inside an allocation, held as an unsigned value (vc) and as an ARM-side pointer.
+// NOTE(review): vc is presumably the GPU (VideoCore) bus address of the same location - confirm.
+struct GPU_FFT_PTR {
+    unsigned vc;
+    // the same ARM pointer viewed through several element types
+    union {
+        struct GPU_FFT_COMPLEX* cptr;
+        void* vptr;
+        char* bptr;
+        float* fptr;
+        unsigned* uptr;
+    } arm;
+};
+
+// Bookkeeping passed to the gpu_fft_base_* functions.
+struct GPU_FFT_BASE {
+    int mb;
+    // vc_msg: mailbox message value, vc_code: shader code value, vc_unifs: one uniforms value per QPU
+    unsigned handle, size, vc_msg, vc_code, vc_unifs[GPU_FFT_QPUS];
+    volatile unsigned* peri;
+};
+
+// Handle produced by gpu_fft_prepare() and consumed by gpu_fft_execute() / gpu_fft_release().
+struct GPU_FFT {
+    struct GPU_FFT_BASE base;
+    // input and output data pointers
+    struct GPU_FFT_COMPLEX *in, *out;
+    // dimensions and stride associated with the data buffers
+    int x, y, step;
+};
+
+int gpu_fft_prepare(int mb,         // mailbox file_desc
+                    int log2_N,     // log2(FFT_length) = 8...20
+                    int direction,  // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
+                    int jobs,       // number of transforms in batch
+                    struct GPU_FFT** fft);
+
+unsigned gpu_fft_execute(struct GPU_FFT* info);
+
+void gpu_fft_release(struct GPU_FFT* info);
+
+// private
+int gpu_fft_twiddle_size(int, int*, int*, int*);
+void gpu_fft_twiddle_data(int, int, float*);
+unsigned int gpu_fft_shader_size(int);
+unsigned int* gpu_fft_shader_code(int);
+
+// gpu_fft_base:
+
+unsigned gpu_fft_base_exec(struct GPU_FFT_BASE* base, unsigned num_qpus);
+
+int gpu_fft_alloc(int mb, unsigned size, struct GPU_FFT_PTR* ptr);
+
+void gpu_fft_base_release(struct GPU_FFT_BASE* base);
+
+unsigned gpu_fft_ptr_inc(struct GPU_FFT_PTR* ptr, int bytes);
+
+#endif  // __GPU_FFT__

+ 157 - 0
src/hello_fft/gpu_fft.txt

@@ -0,0 +1,157 @@
+BCM2835 "GPU_FFT" release 2.0 by Andrew Holme, 2014.
+
+GPU_FFT is an FFT library for the Raspberry Pi which exploits the BCM2835 SoC
+3D hardware to deliver ten times more data throughput than is possible on the
+700 MHz ARM.  Kernels are provided for all power-of-2 FFT lengths between 256
+and 2,097,152 points inclusive.  A transpose function, which also uses the 3D
+hardware, is provided to support 2-dimensional transforms.
+
+
+*** Accuracy ***
+
+GPU_FFT uses single-precision floats for data and twiddle factors.  The output
+is not scaled.  The relative root-mean-square (rms) error in parts-per-million
+(ppm) for different transform lengths (N) is typically:
+
+log2(N) |  8    | 9    | 10   |  11   |  12  |  13  |  14  |  15  |  16 |  17
+ppm rms |  0.27 | 0.42 | 0.50 |  0.70 |  2.3 |  4.4 |  7.6 |  9.2 |  18 |  70
+
+log2(N) |  18 |  19 |  20 |  21 |                8...17 batch of 10
+ppm rms | 100 | 180 | 360 | 720 |               18...21 batch of  1
+
+
+*** Throughput ***
+
+GPU_FFT 1.0 had to be invoked through a "mailbox" which added a 100us overhead
+on every call.  To mitigate this, batches of transforms could be submitted via
+a single call.  GPU_FFT 2.0 avoids this 100us overhead by poking GPU registers
+directly from the ARM if total batch runtime will be short; but still uses the
+mailbox for longer jobs to avoid busy waiting at 100% CPU for too long.
+
+Typical per-transform runtimes for batch sizes of 1 and 10; and comparative
+figures for FFTW (FFTW_MEASURE mode) are:
+
+log2(N) |   8   |   9   |  10   |  11   |  12  |  13  |  14  |  15  |
+      1 | 0.036 | 0.051 | 0.070 | 0.11  | 0.24 | 0.58 |  1.2 |  3.3 |
+     10 | 0.016 | 0.027 | 0.045 | 0.095 | 0.25 | 0.61 |  1.2 |  3.2 |
+   FFTW | 0.092 | 0.22  | 0.48  | 0.95  | 3.0  | 5.1  | 12   | 31   |
+
+log2(N) |  16  |  17 |  18 |  19 |   20 |   21 |       All times in
+      1 |  6.8 |  16 |  42 |  95 |  190 |  380 |       milliseconds
+   FFTW | 83   | 180 | 560 | 670 | 1600 | 3400 |       2 sig. figs.
+
+
+*** API functions ***
+
+    gpu_fft_prepare()       Call once to allocate memory and initialise data
+                            structures.  Returns 0 for success.
+
+    gpu_fft_execute()       Call one or more times to execute a previously
+                            prepared FFT batch.  Returns 0 for success.
+
+    gpu_fft_release()       Call once to release resources after use.
+                            GPU memory is permanently lost if not freed.
+
+
+*** Parameters ***
+
+    int mb          Mailbox file descriptor obtained by calling mbox_open()
+
+    int log2_N      log2(FFT length) = 8 to 21
+
+    int direction   FFT direction:  GPU_FFT_FWD for forward FFT
+                                    GPU_FFT_REV for inverse FFT
+
+    int jobs        Number of transforms in batch = 1 or more
+
+    GPU_FFT **      Output parameter from prepare: control structure.
+    GPU_FFT *       Input parameter to execute and release
+
+
+*** Data format ***
+
+Complex data arrays are stored as alternate real and imaginary parts:
+
+    struct GPU_FFT_COMPLEX {
+        float re, im;
+    };
+
+The GPU_FFT struct created by gpu_fft_prepare() contains pointers to the input
+and output arrays:
+
+    struct GPU_FFT {
+       struct GPU_FFT_COMPLEX *in, *out;
+
+When executing a batch of transforms, buffer pointers are obtained as follows:
+
+    struct GPU_FFT *fft;
+    gpu_fft_prepare( ... , jobs, &fft);
+    for (int j=0; j<jobs; j++) {
+       struct GPU_FFT_COMPLEX *in  = fft->in  + j*fft->step;
+       struct GPU_FFT_COMPLEX *out = fft->out + j*fft->step;
+
+GPU_FFT.step is greater than FFT length because a guard space is left between
+buffers for caching and alignment reasons.
+
+GPU_FFT performs multiple passes between ping-pong buffers.  The final output
+lands in the same buffer as input after an even number of passes.  Transforms
+where log2_N=12...16 use an odd number of passes and the final result is left
+out-of-place.  The input data is never preserved.
+
+
+*** Example program ***
+
+The code that produced the above accuracy and performance figures is included
+as a demo with the latest Raspbian distro.  Build and run it as follows:
+
+cd /opt/vc/src/hello_pi/hello_fft
+make
+sudo mknod char_dev c 100 0
+sudo ./hello_fft.bin 12
+
+It accepts three optional command-line arguments: <log2_N> <batch> <loops>
+
+The special character device is required for the ioctl mailbox through which
+the ARM communicates with the Videocore GPU.
+
+
+*** With Open GL ***
+
+GPU_FFT and Open GL will run concurrently if the GPU_FFT_MEM_* defines in
+file gpu_fft.c are changed as follows:
+
+#define GPU_FFT_MEM_FLG 0x4        // cached=0xC; direct=0x4
+#define GPU_FFT_MEM_MAP 0x20000000 // cached=0x0; direct=0x20000000
+
+Overall performance will probably be higher if GPU_FFT and Open GL take turns
+at using the 3D hardware.  Since eglSwapBuffers() returns immediately without
+waiting for rendering, call glFlush() and glFinish() afterwards as follows:
+
+    for (;;) {
+        ....
+        eglSwapBuffers(....); // non-blocking call returns immediately
+        glFlush();
+        glFinish(); // wait until V3D hardware is idle
+        ....
+        gpu_fft_execute(....); // blocking call
+        ....
+    }
+
+
+*** 2-dimensional FFT ***
+
+Please study the hello_fft_2d demo source, which is built and executed thus:
+
+make hello_fft_2d.bin
+sudo ./hello_fft_2d.bin
+
+This generates a Windows BMP file: "hello_fft_2d.bmp"
+
+The demo uses a square 512x512 array; however, rectangular arrays are allowed.
+The following lines in gpu_fft_trans.c limit the transposed region to the smaller
+of the source and destination dimensions, so the operation is always safe:
+
+    ptr.arm.uptr[6] = src->x < dst->y? src->x : dst->y;
+    ptr.arm.uptr[7] = src->y < dst->x? src->y : dst->x;
+
+One may transpose the output from the second FFT pass back into the first pass
+input buffer, by preparing and executing a second transposition; however, this
+is probably unnecessary.  It depends on how the final output will be accessed.

+ 137 - 0
src/hello_fft/gpu_fft_base.c

@@ -0,0 +1,137 @@
+/*
+BCM2835 "GPU_FFT" release 2.0
+Copyright (c) 2014, Andrew Holme.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "bcm_host.h"
+#include "gpu_fft.h"
+#include "mailbox.h"
+
+#define BUS_TO_PHYS(x) ((x) & ~0xC0000000)  // clear the top two address bits before passing the address to mapmem()
+
+// V3D spec: http://www.broadcom.com/docs/support/videocore/VideoCoreIV-AG100-R.pdf
+#define V3D_L2CACTL (0xC00020 >> 2)  // each value is a byte offset >> 2, i.e. an index into the unsigned* peri mapping; this one clears the L2 cache
+#define V3D_SLCACTL (0xC00024 >> 2)  // clears the other caches
+#define V3D_SRQPC (0xC00430 >> 2)  // receives the shader code address (base->vc_code)
+#define V3D_SRQUA (0xC00434 >> 2)  // receives the uniforms address (base->vc_unifs[q])
+#define V3D_SRQCS (0xC0043c >> 2)  // request control/status; bits 16..23 are polled against num_qpus
+#define V3D_DBCFG (0xC00e00 >> 2)  // written with 0 to disallow IRQ
+#define V3D_DBQITE (0xC00e2c >> 2)  // written with 0 to disable IRQ
+#define V3D_DBQITC (0xC00e30 >> 2)  // written with -1 to reset IRQ flags
+
+#define GPU_FFT_MEM_MAP 0x0  // cached=0x0; direct=0x20000000
+
+#define GPU_FFT_NO_FLUSH 1  // passed to execute_qpu() as its fourth argument
+#define GPU_FFT_TIMEOUT 2000  // ms
+
+unsigned gpu_fft_base_exec_direct(struct GPU_FFT_BASE* base, unsigned num_qpus) {
+    // Launches num_qpus shader instances by writing the V3D registers directly,
+    // then spins until the hardware reports that many completions.
+    // Always returns 0; there is no timeout on this path.
+    volatile unsigned* const v3d = base->peri;
+    unsigned launched = 0;
+
+    // Interrupts are not used: completion is detected by polling below.
+    v3d[V3D_DBCFG] = 0;    // Disallow IRQ
+    v3d[V3D_DBQITE] = 0;   // Disable IRQ
+    v3d[V3D_DBQITC] = -1;  // Resets IRQ flags
+
+    v3d[V3D_L2CACTL] = 1 << 2;  // Clear L2 cache
+    v3d[V3D_SLCACTL] = -1;      // Clear other caches
+
+    v3d[V3D_SRQCS] = (1 << 7) | (1 << 8) | (1 << 16);  // Reset error bit and counts
+
+    // Queue one request per QPU: uniforms address first, then the code address.
+    while (launched < num_qpus) {
+        v3d[V3D_SRQUA] = base->vc_unifs[launched];
+        v3d[V3D_SRQPC] = base->vc_code;
+        launched++;
+    }
+
+    // Busy wait until bits 16..23 of SRQCS equal the number of requests issued.
+    while (((v3d[V3D_SRQCS] >> 16) & 0xff) != num_qpus) {
+    }
+
+    return 0;
+}
+
+unsigned gpu_fft_base_exec(struct GPU_FFT_BASE* base, unsigned num_qpus) {
+    // Runs the prepared job on num_qpus QPUs and returns the status of the chosen path.
+    // A zero vc_msg selects direct register access; any other value goes through the mailbox.
+    if (!base->vc_msg) {
+        // Direct register poking
+        return gpu_fft_base_exec_direct(base, num_qpus);
+    }
+
+    // Use mailbox
+    // Returns: 0x0 for success; 0x80000000 for timeout
+    return execute_qpu(base->mb, num_qpus, base->vc_msg, GPU_FFT_NO_FLUSH, GPU_FFT_TIMEOUT);
+}
+
+int gpu_fft_alloc(int mb, unsigned size, struct GPU_FFT_PTR* ptr) {
+    // Enables the QPUs, allocates and locks `size` bytes of GPU memory, maps the
+    // peripheral registers and the allocation into this process, and writes a
+    // GPU_FFT_BASE record at the start of the allocation.
+    //
+    //   mb    mailbox file descriptor
+    //   size  allocation size in bytes (must cover at least a GPU_FFT_BASE)
+    //   ptr   out: GPU address in ptr->vc, mapped ARM pointer in ptr->arm
+    //
+    // Returns 0 on success, -1 if the QPUs cannot be enabled, -3 if mem_alloc()
+    // fails, -4 if either mapping fails. On failure everything acquired so far
+    // is released again.
+    struct GPU_FFT_BASE* base;
+    volatile unsigned* peri;
+    unsigned handle;
+    unsigned mem_flg;
+
+    if (qpu_enable(mb, 1))
+        return -1;
+
+    // Shared memory : cached=0xC; direct=0x4
+    mem_flg = bcm_host_get_sdram_address() == 0x40000000 ? 0xC : 0x4;
+    handle = mem_alloc(mb, size, 4096, mem_flg);
+    if (!handle) {
+        qpu_enable(mb, 0);
+        return -3;
+    }
+
+    peri = (volatile unsigned*)mapmem(bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size());
+    if (!peri) {
+        mem_free(mb, handle);
+        qpu_enable(mb, 0);
+        return -4;
+    }
+
+    ptr->vc = mem_lock(mb, handle);
+    ptr->arm.vptr = mapmem(BUS_TO_PHYS(ptr->vc + GPU_FFT_MEM_MAP), size);
+    if (!ptr->arm.vptr) {
+        // The base record would be written through this pointer, so a failed
+        // mapping must not be dereferenced. Undo in the same order as
+        // gpu_fft_base_release().
+        unmapmem((void*)peri, bcm_host_get_peripheral_size());
+        mem_unlock(mb, handle);
+        mem_free(mb, handle);
+        qpu_enable(mb, 0);
+        return -4;
+    }
+
+    // The bookkeeping lives inside the allocation itself so that
+    // gpu_fft_base_release() needs nothing but the base pointer.
+    base = (struct GPU_FFT_BASE*)ptr->arm.vptr;
+    base->peri = peri;
+    base->mb = mb;
+    base->handle = handle;
+    base->size = size;
+
+    return 0;
+}
+
+void gpu_fft_base_release(struct GPU_FFT_BASE* base) {
+    // Releases everything gpu_fft_alloc() acquired. The fields are copied out
+    // first because `base` lives inside the mapping that is about to be removed.
+    const int mailbox = base->mb;
+    const unsigned mem_handle = base->handle;
+    const unsigned mapped_size = base->size;
+
+    unmapmem((void*)base->peri, bcm_host_get_peripheral_size());
+    unmapmem((void*)base, mapped_size);
+
+    mem_unlock(mailbox, mem_handle);
+    mem_free(mailbox, mem_handle);
+    qpu_enable(mailbox, 0);
+}
+
+unsigned gpu_fft_ptr_inc(struct GPU_FFT_PTR* ptr, int bytes) {
+    // Moves both the GPU address and the ARM pointer forward by `bytes` and
+    // returns the GPU address as it was before the move.
+    const unsigned previous = ptr->vc;
+
+    ptr->arm.bptr += bytes;
+    ptr->vc = previous + bytes;
+
+    return previous;
+}

+ 84 - 0
src/hello_fft/gpu_fft_shaders.c

@@ -0,0 +1,84 @@
+/*
+BCM2835 "GPU_FFT" release 2.0
+Copyright (c) 2014, Andrew Holme.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+static unsigned int shader_256[] = {  // shader words for log2_N = 8
+#include "hex/shader_256.hex"
+};
+static unsigned int shader_512[] = {  // shader words for log2_N = 9
+#include "hex/shader_512.hex"
+};
+static unsigned int shader_1k[] = {  // shader words for log2_N = 10
+#include "hex/shader_1k.hex"
+};
+static unsigned int shader_2k[] = {  // shader words for log2_N = 11
+#include "hex/shader_2k.hex"
+};
+static unsigned int shader_4k[] = {  // shader words for log2_N = 12
+#include "hex/shader_4k.hex"
+};
+static unsigned int shader_8k[] = {  // shader words for log2_N = 13
+#include "hex/shader_8k.hex"
+};
+static unsigned int shader_16k[] = {  // shader words for log2_N = 14
+#include "hex/shader_16k.hex"
+};
+static unsigned int shader_32k[] = {  // shader words for log2_N = 15
+#include "hex/shader_32k.hex"
+};
+static unsigned int shader_64k[] = {  // shader words for log2_N = 16
+#include "hex/shader_64k.hex"
+};
+static unsigned int shader_128k[] = {  // shader words for log2_N = 17
+#include "hex/shader_128k.hex"
+};
+static unsigned int shader_256k[] = {  // shader words for log2_N = 18
+#include "hex/shader_256k.hex"
+};
+static unsigned int shader_512k[] = {  // shader words for log2_N = 19
+#include "hex/shader_512k.hex"
+};
+static unsigned int shader_1024k[] = {  // shader words for log2_N = 20
+#include "hex/shader_1024k.hex"
+};
+static unsigned int shader_2048k[] = {  // shader words for log2_N = 21
+#include "hex/shader_2048k.hex"
+};
+
+// Lookup table indexed by (log2_N - 8): one entry per supported FFT length.
+static struct {
+    unsigned int size;   // sizeof the shader array
+    unsigned int* code;  // the shader array itself
+} shaders[] = {
+    {sizeof(shader_256), shader_256},      // log2_N = 8
+    {sizeof(shader_512), shader_512},      // log2_N = 9
+    {sizeof(shader_1k), shader_1k},        // log2_N = 10
+    {sizeof(shader_2k), shader_2k},        // log2_N = 11
+    {sizeof(shader_4k), shader_4k},        // log2_N = 12
+    {sizeof(shader_8k), shader_8k},        // log2_N = 13
+    {sizeof(shader_16k), shader_16k},      // log2_N = 14
+    {sizeof(shader_32k), shader_32k},      // log2_N = 15
+    {sizeof(shader_64k), shader_64k},      // log2_N = 16
+    {sizeof(shader_128k), shader_128k},    // log2_N = 17
+    {sizeof(shader_256k), shader_256k},    // log2_N = 18
+    {sizeof(shader_512k), shader_512k},    // log2_N = 19
+    {sizeof(shader_1024k), shader_1024k},  // log2_N = 20
+    {sizeof(shader_2048k), shader_2048k},  // log2_N = 21
+};
+
+unsigned int gpu_fft_shader_size(int log2_N) {
+    // Returns sizeof the shader array for the given log2(FFT length), or 0 when
+    // log2_N has no entry in the table (the table starts at log2_N = 8).
+    const int entries = (int)(sizeof(shaders) / sizeof(shaders[0]));
+
+    if (log2_N < 8 || log2_N - 8 >= entries)
+        return 0;  // reject instead of reading outside shaders[]
+
+    return shaders[log2_N - 8].size;
+}
+
+unsigned int* gpu_fft_shader_code(int log2_N) {
+    // Returns the shader array for the given log2(FFT length), or a null pointer
+    // when log2_N has no entry in the table (the table starts at log2_N = 8).
+    const int entries = (int)(sizeof(shaders) / sizeof(shaders[0]));
+
+    if (log2_N < 8 || log2_N - 8 >= entries)
+        return (unsigned int*)0;  // reject instead of reading outside shaders[]
+
+    return shaders[log2_N - 8].code;
+}

+ 40 - 0
src/hello_fft/gpu_fft_trans.h

@@ -0,0 +1,40 @@
+/*
+BCM2835 "GPU_FFT" release 2.0
+Copyright (c) 2014, Andrew Holme.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef GPU_FFT_TRANS_H
+#define GPU_FFT_TRANS_H
+
+#include "gpu_fft.h"
+
+// Control structure for a transpose job; it carries only the shared base record.
+struct GPU_FFT_TRANS {
+    struct GPU_FFT_BASE base;
+};
+
+// Prepares a transpose from src to dst and stores the control structure in *out.
+int gpu_fft_trans_prepare(int mb, struct GPU_FFT* src, struct GPU_FFT* dst, struct GPU_FFT_TRANS** out);
+
+unsigned gpu_fft_trans_execute(  // src->out ==> T ==> dst->in
+    struct GPU_FFT_TRANS* info);
+
+// Releases a control structure obtained from gpu_fft_trans_prepare().
+void gpu_fft_trans_release(struct GPU_FFT_TRANS* info);
+
+#endif  // GPU_FFT_TRANS_H

+ 278 - 0
src/hello_fft/gpu_fft_twiddles.c

@@ -0,0 +1,278 @@
+/*
+BCM2835 "GPU_FFT" release 2.0
+Copyright (c) 2014, Andrew Holme.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <math.h>
+
+#include "gpu_fft.h"
+
+#define ALPHA(dx) (2 * pow(sin((dx) / 2), 2))  // 2*sin^2(dx/2), which equals 1 - cos(dx)
+#define BETA(dx) (sin(dx))  // sin(dx); written next to ALPHA as a pair by the step generators
+
+static double k[16] = {0, 8, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1};  // per-slot angle multiplier used by the *_16 generators
+static double m[16] = {0, 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7};  // per-slot index, multiplied with k[i] in twiddles_base_16
+
+/****************************************************************************/
+
+static float* twiddles_base_16(double two_pi, float* out, double theta) {
+    // Writes 16 (cos, sin) pairs, one per slot of the k[]/m[] tables, with every
+    // slot's angle offset by theta * k[slot]. Returns the position after the
+    // last value written.
+    int slot = 0;
+
+    while (slot < 16) {
+        out[0] = cos(two_pi / 16 * k[slot] * m[slot] + theta * k[slot]);
+        out[1] = sin(two_pi / 16 * k[slot] * m[slot] + theta * k[slot]);
+        out += 2;
+        slot++;
+    }
+
+    return out;
+}
+
+static float* twiddles_base_32(double two_pi, float* out, double theta) {
+    // Writes 16 (cos, sin) pairs at angles two_pi/32 * n + theta, then appends
+    // a base-16 block generated with twice the offset. Returns the position
+    // after the last value written.
+    int n = 0;
+
+    while (n < 16) {
+        out[0] = cos(two_pi / 32 * n + theta);
+        out[1] = sin(two_pi / 32 * n + theta);
+        out += 2;
+        n++;
+    }
+
+    return twiddles_base_16(two_pi, out, 2 * theta);
+}
+
+static float* twiddles_base_64(double two_pi, float* out) {
+    // Writes 32 (cos, sin) pairs at angles two_pi/64 * n, then appends a
+    // base-32 block with zero offset. Returns the position after the last
+    // value written.
+    int n = 0;
+
+    while (n < 32) {
+        out[0] = cos(two_pi / 64 * n);
+        out[1] = sin(two_pi / 64 * n);
+        out += 2;
+        n++;
+    }
+
+    return twiddles_base_32(two_pi, out, 0);
+}
+
+/****************************************************************************/
+
+static float* twiddles_step_16(double two_pi, float* out, double theta) {
+    // Writes 16 (ALPHA, BETA) pairs for the angle theta * k[i], one per slot of
+    // the k[] table. Returns the position after the last value written.
+    //
+    // two_pi is accepted only so the signature matches the other generators.
+    // It is named and explicitly discarded because an unnamed parameter in a
+    // function definition is not valid C before C23.
+    int i;
+    (void)two_pi;
+    for (i = 0; i < 16; i++) {
+        *out++ = ALPHA(theta * k[i]);
+        *out++ = BETA(theta * k[i]);
+    }
+    return out;
+}
+
+static float* twiddles_step_32(double two_pi, float* out, double theta) {
+    // Writes the (ALPHA, BETA) pair for theta 16 times, then appends a step-16
+    // block generated with twice the angle. Returns the position after the
+    // last value written.
+    int n = 0;
+
+    while (n < 16) {
+        out[0] = ALPHA(theta);
+        out[1] = BETA(theta);
+        out += 2;
+        n++;
+    }
+
+    return twiddles_step_16(two_pi, out, 2 * theta);
+}
+
+/****************************************************************************/
+
+static void twiddles_256(double two_pi, float* out) {
+    // Twiddle table for N = 256: one base-16 block, one step-16 block, then a
+    // base-16 block per QPU.
+    const double unit = two_pi / 256;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_16(two_pi, out, 0);
+    out = twiddles_step_16(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_16(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_512(double two_pi, float* out) {
+    // Twiddle table for N = 512: one base-32 block, one step-16 block, then a
+    // base-16 block per QPU.
+    const double unit = two_pi / 512;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_16(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_16(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_1k(double two_pi, float* out) {
+    // Twiddle table for N = 1024: one base-32 block, one step-32 block, then a
+    // base-32 block per QPU.
+    const double unit = two_pi / 1024;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_2k(double two_pi, float* out) {
+    // Twiddle table for N = 2048: one base-64 block, one step-32 block, then a
+    // base-32 block per QPU.
+    const double unit = two_pi / 2048;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_64(two_pi, out);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_4k(double two_pi, float* out) {
+    // Twiddle table for N = 4096: one base-16 block, two step-16 blocks, then
+    // a base-16 block per QPU.
+    const double unit = two_pi / 4096;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_16(two_pi, out, 0);
+    out = twiddles_step_16(two_pi, out, unit * 16);
+    out = twiddles_step_16(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_16(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_8k(double two_pi, float* out) {
+    // Twiddle table for N = 8192: one base-32 block, two step-16 blocks, then
+    // a base-16 block per QPU.
+    const double unit = two_pi / 8192;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_16(two_pi, out, unit * 16);
+    out = twiddles_step_16(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_16(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_16k(double two_pi, float* out) {
+    // Twiddle table for N = 16384: one base-32 block, a step-32 block, a
+    // step-16 block, then a base-16 block per QPU.
+    const double unit = two_pi / 16384;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_32(two_pi, out, unit * 16);
+    out = twiddles_step_16(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_16(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_32k(double two_pi, float* out) {
+    // Twiddle table for N = 32768: one base-32 block, two step-32 blocks, then
+    // a base-32 block per QPU.
+    const double unit = two_pi / 32768;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_32(two_pi, out, unit * 32);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_64k(double two_pi, float* out) {
+    // Twiddle table for N = 65536: one base-64 block, two step-32 blocks, then
+    // a base-32 block per QPU.
+    const double unit = two_pi / 65536;  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_64(two_pi, out);
+    out = twiddles_step_32(two_pi, out, unit * 32);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_128k(double two_pi, float* out) {
+    // Twiddle table for N = 128 * 1024: one base-32 block, three step-16
+    // blocks, then a base-16 block per QPU.
+    const double unit = two_pi / (128 * 1024);  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_16(two_pi, out, unit * 16 * 16);
+    out = twiddles_step_16(two_pi, out, unit * 16);
+    out = twiddles_step_16(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_16(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_256k(double two_pi, float* out) {
+    // Twiddle table for N = 256 * 1024: one base-32 block, two step-16 blocks,
+    // one step-32 block, then a base-32 block per QPU.
+    const double unit = two_pi / (256 * 1024);  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_16(two_pi, out, unit * 32 * 16);
+    out = twiddles_step_16(two_pi, out, unit * 32);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_512k(double two_pi, float* out) {
+    // Twiddle table for N = 512 * 1024: one base-32 block, one step-16 block,
+    // two step-32 blocks, then a base-32 block per QPU.
+    const double unit = two_pi / (512 * 1024);  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_16(two_pi, out, unit * 32 * 32);
+    out = twiddles_step_32(two_pi, out, unit * 32);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_1024k(double two_pi, float* out) {
+    // Twiddle table for N = 1024 * 1024: one base-32 block, three step-32
+    // blocks, then a base-32 block per QPU.
+    const double unit = two_pi / (1024 * 1024);  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_32(two_pi, out, 0);
+    out = twiddles_step_32(two_pi, out, unit * 32 * 32);
+    out = twiddles_step_32(two_pi, out, unit * 32);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+static void twiddles_2048k(double two_pi, float* out) {
+    // Twiddle table for N = 2048 * 1024: one base-64 block, three step-32
+    // blocks, then a base-32 block per QPU.
+    const double unit = two_pi / (2048 * 1024);  // two_pi / N
+    int qpu;
+
+    out = twiddles_base_64(two_pi, out);
+    out = twiddles_step_32(two_pi, out, unit * 32 * 32);
+    out = twiddles_step_32(two_pi, out, unit * 32);
+    out = twiddles_step_32(two_pi, out, unit * GPU_FFT_QPUS);
+
+    for (qpu = 0; qpu < GPU_FFT_QPUS; ++qpu) {
+        out = twiddles_base_32(two_pi, out, unit * qpu);
+    }
+}
+
+/****************************************************************************/
+
+static struct {
+    int passes, shared, unique;
+    void (*twiddles)(double, float*);
+} shaders[] = {{2, 2, 1, twiddles_256}, {2, 3, 1, twiddles_512}, {2, 4, 2, twiddles_1k},   {2, 6, 2, twiddles_2k},   {3, 3, 1, twiddles_4k},   {3, 4, 1, twiddles_8k},    {3, 5, 1, twiddles_16k},
+               {3, 6, 2, twiddles_32k}, {3, 8, 2, twiddles_64k}, {4, 5, 1, twiddles_128k}, {4, 6, 2, twiddles_256k}, {4, 7, 2, twiddles_512k}, {4, 8, 2, twiddles_1024k}, {4, 10, 2, twiddles_2048k}};
+
+int gpu_fft_twiddle_size(int log2_N, int* shared, int* unique, int* passes) {
+    // Looks up the table entry for log2_N and copies its three counts to the
+    // caller. Returns 0 on success, or -1 (outputs untouched) when log2_N is
+    // outside 8...21.
+    int slot;
+
+    if (!(log2_N >= 8 && log2_N <= 21)) {
+        return -1;
+    }
+
+    slot = log2_N - 8;
+    *shared = shaders[slot].shared;
+    *unique = shaders[slot].unique;
+    *passes = shaders[slot].passes;
+    return 0;
+}
+
+void gpu_fft_twiddle_data(int log2_N, int direction, float* out) {
+    // Writes the twiddle table for the given log2(FFT length) to out.
+    //
+    //   log2_N     8...21; any other value is ignored and out is left untouched
+    //   direction  GPU_FFT_FWD uses -2*pi as the full-circle angle, anything
+    //              else uses +2*pi
+    //   out        destination buffer; the caller sizes it
+    //
+    // The range check mirrors gpu_fft_twiddle_size() so that a bad log2_N can
+    // never index outside shaders[] and call through a stray function pointer.
+    if (log2_N < 8 || log2_N > 21)
+        return;
+
+    shaders[log2_N - 8].twiddles((direction == GPU_FFT_FWD ? -2 : 2) * GPU_FFT_PI, out);
+}

+ 707 - 0
src/hello_fft/hex/shader_1024k.hex

@@ -0,0 +1,707 @@
+0x00000014, 0xe0021227, // mov rb_STAGES,  STAGES
+0x00000010, 0xe00216e7, // mov rb_0x10,    0x10
+0x00000040, 0xe0021727, // mov rb_0x40,    0x40
+0x00000080, 0xe0021767, // mov rb_0x80,    0x80
+0x000000f0, 0xe00217a7, // mov rb_0xF0,    0xF0
+0x00000100, 0xe00217e7, // mov rb_0x100,   0x100
+0x55555555, 0xe0020767, // mov rx_0x55555555, 0x55555555
+0x33333333, 0xe00207a7, // mov rx_0x33333333, 0x33333333
+0x0f0f0f0f, 0xe00207e7, // mov rx_0x0F0F0F0F, 0x0F0F0F0F
+0x00ff00ff, 0xe0021027, // mov rx_0x00FF00FF, 0x00FF00FF
+0x0000ffff, 0xe00216a7, // mov rx_0x0000FFFF, 0x0000FFFF
+0x80904000, 0xe0020727, // mov ra_vdw_32, vdw_setup_0(1, 16, dma_h32( 0,0))
+0x80905000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(1, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020467, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021467, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020527, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021527, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x100246a0, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100246e0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000002e8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156e7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x15727d80, 0x10020827, // mov r0, ra_vdw_32
+0x8c05cdf6, 0x10024061, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov r1, ra_save_ptr
+0x00000080, 0xe00208a7, // mov r2, vdw_setup_0(1, 16, dma_h32(1,0)) - vdw_setup_0(1, 16, dma_h32(0,0))
+0x00040000, 0xe00208e7, // mov r3, PASS32_STRIDE
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000050, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156e7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x000005d8, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x205a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d6039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22096cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x205a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x205e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d7039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22097cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x205e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20627030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d8039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22098cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20627031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20667030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d9039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22099cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20667031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149c01c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149c01c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9db1c0, 0x10020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119db3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c91c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffd78, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149c01c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149c01c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9db1c0, 0x10020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119db3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c91c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc30, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20567006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d500f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2056700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22095c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95682ff6, 0x10024682, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c7ff6, 0x100246c7, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb50, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x20567006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d500f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2056700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22095c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95682ff6, 0x10024682, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c7ff6, 0x100246c7, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x000008d0, 0xf00809e7, // brr.allz -, r:end
+0x952cbdbf, 0x10024555, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149c01c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149c01c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9db1c0, 0x10020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119db3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c91c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffa50, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffa28, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x952cbdbf, 0x10024555, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffbf0, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00007fff, 0xe0020827, // mov r0, 0x7FFF
+0x141e7c00, 0x100229e7, // and.setf -, ra_points, r0
+0xfffffbc0, 0xf01809e7, // brr.allnz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100601e7, // add.ifnz ra_points, ra_points, rb_0x100
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x10020567, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d5ec0, 0x10021567, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95659dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209ce017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209ce01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02667c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d9ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffab8, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x952cbdbf, 0x10024555, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff9b0, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffff990, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffff970, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffff950, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x203e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209cf017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209cf01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x213e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x10020567, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d5ec0, 0x10021567, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95659dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20427016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d0017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d001f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2142709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02667c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d9ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff848, 0xf00809e7, // brr.allz -, r:pass_3
+0x00000060, 0xe0020827, // mov r0, 3*4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x954d3dbf, 0x10024555, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95514dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff740, 0xf0f80227, // brr ra_link_1, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20467016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d1017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d101f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2146709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x10020567, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d5ec0, 0x10021567, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95659dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x204a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d2017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d201f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x214a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02667c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d9ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff638, 0xf00809e7, // brr.allz -, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff700, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 605 - 0
src/hello_fft/hex/shader_128k.hex

@@ -0,0 +1,605 @@
+0x00000011, 0xe0021227, // mov rb_STAGES,  STAGES
+0x00000010, 0xe00216a7, // mov rb_0x10,    0x10
+0x00000040, 0xe00216e7, // mov rb_0x40,    0x40
+0x00000080, 0xe0021727, // mov rb_0x80,    0x80
+0x000000f0, 0xe0021767, // mov rb_0xF0,    0xF0
+0x00000100, 0xe00217a7, // mov rb_0x100,   0x100
+0x00000fff, 0xe00217e7, // mov rb_0xFFF,   0xFFF
+0x55555555, 0xe0020767, // mov rx_0x55555555, 0x55555555
+0x33333333, 0xe00207a7, // mov rx_0x33333333, 0x33333333
+0x0f0f0f0f, 0xe00207e7, // mov rx_0x0F0F0F0F, 0x0F0F0F0F
+0x00ff00ff, 0xe0021627, // mov rx_0x00FF00FF, 0x00FF00FF
+0x0000ffff, 0xe0021667, // mov rx_0x0000FFFF, 0x0000FFFF
+0x88104000, 0xe00206e7, // mov ra_vdw_16, vdw_setup_0(16, 16, dma_h32( 0,0))
+0x88105000, 0xe0021027, // mov rb_vdw_16, vdw_setup_0(16, 16, dma_h32(32,0))
+0x90104000, 0xe0020727, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10024660, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100246a0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000b0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x156e7d80, 0x10021c67, // mov vw_setup, arg_vdw
+0xc000ffc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS16_STRIDE-16*4
+0x8c05bdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x000000c8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x15727d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc0007fc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05bdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000560, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d2039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22092cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d3039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22093cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20527030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d4039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22094cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20527031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20567030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d5039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22095cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20567031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d81c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d81c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9da1c0, 0x10020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119da3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9cc1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffd78, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15adf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d81c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d81c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9da1c0, 0x10020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119da3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9cc1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc30, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20467006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d100f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2046700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22091c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x00000000, 0xf0f489e7, // bra -, ra_save_16
+0x009e7000, 0x100009e7, // nop
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c0ff6, 0x100246c0, // mov ra_vdw_16, rb_vdw_16; mov rb_vdw_16, ra_vdw_16
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x000007b0, 0xf00809e7, // brr.allz -, r:end
+0x952cbdbf, 0x10024451, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15adf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d81c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d81c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9da1c0, 0x10020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119da3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9cc1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffac0, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffa98, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc68, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x141dfdc0, 0x100229e7, // and.setf -, ra_points, rb_0xFFF
+0xfffffc40, 0xf01809e7, // brr.allnz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100601e7, // add.ifnz ra_points, ra_points, rb_0x80
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d5ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffb78, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffa78, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0xfffffa58, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209ce017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209ce01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d5ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff990, 0xf00809e7, // brr.allz -, r:pass_3
+0x00000020, 0xe0020827, // mov r0, 4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95410dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff890, 0xf0f80227, // brr ra_link_1, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cf017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cf01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d5ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff7c8, 0xf00809e7, // brr.allz -, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff820, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 562 - 0
src/hello_fft/hex/shader_16k.hex

@@ -0,0 +1,562 @@
+0x00000010, 0xe00216e7, // mov rb_0x10,    0x10
+0x00000040, 0xe0021727, // mov rb_0x40,    0x40
+0x00000080, 0xe0021767, // mov rb_0x80,    0x80
+0x000000f0, 0xe00217a7, // mov rb_0xF0,    0xF0
+0x00000100, 0xe00217e7, // mov rb_0x100,   0x100
+0x00005555, 0xe0020767, // mov rx_0x5555,  0x5555
+0x00003333, 0xe00207a7, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207e7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00216a7, // mov rx_0x00FF,  0x00FF
+0x88104000, 0xe00206e7, // mov ra_vdw_16, vdw_setup_0(16, 16, dma_h32( 0,0))
+0x88105000, 0xe0021027, // mov rb_vdw_16, vdw_setup_0(16, 16, dma_h32(32,0))
+0x90104000, 0xe0020727, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10024660, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100246a0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000b0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x156e7d80, 0x10021c67, // mov vw_setup, arg_vdw
+0xc0001fc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS16_STRIDE-16*4
+0x8c05cdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x000000c8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x15727d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc0000fc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05cdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x000005f0, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d2039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22092cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d3039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22093cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20527030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d4039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22094cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20527031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20567030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d5039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22095cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20567031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c11c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffda0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c11c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc80, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20467006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d100f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2046700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22091c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffbf0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x20467006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d100f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2046700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22091c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb10, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x00000000, 0xf0f489e7, // bra -, ra_save_16
+0x009e7000, 0x100009e7, // nop
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c0ff6, 0x100246c0, // mov ra_vdw_16, rb_vdw_16; mov rb_vdw_16, ra_vdw_16
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x000005e0, 0xf00809e7, // brr.allz -, r:end
+0x952cbdbf, 0x10024451, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c11c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffa58, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x0e1cedc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffa30, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x952cbdbf, 0x10024451, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba8, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffffb88, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x95451dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02467c80, 0x10020467, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d1ec0, 0x10021467, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209ce017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209ce01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d5ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cedc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffa80, 0xf00809e7, // brr.allz -, r:pass_2
+0x00000020, 0xe0020827, // mov r0, 4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95410dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffa60, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x95555dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cf017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cf01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02567c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d5ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cedc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffff998, 0xf00809e7, // brr.allz -, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff9f0, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 447 - 0
src/hello_fft/hex/shader_1k.hex

@@ -0,0 +1,447 @@
+0x00000010, 0xe00216e7, // mov rb_0x10,    0x10
+0x00000040, 0xe0021727, // mov rb_0x40,    0x40
+0x000000f0, 0xe0021767, // mov rb_0xF0,    0xF0
+0x00005555, 0xe00207a7, // mov rx_0x5555,  0x5555
+0x00003333, 0xe00217a7, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207e7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00217e7, // mov rx_0x00FF,  0x00FF
+0x90104000, 0xe0020767, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x00000080, 0xe00208e7, // mov r3, 0x80
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x100246e0, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x10024720, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000c8, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15727d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x15767d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc00000c0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05cdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15727d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x156e7d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000588, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20467030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d1039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22091cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20467031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d2039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22092cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d3039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22093cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20527030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d4039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22094cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20527031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c31c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffda0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c31c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc80, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20427006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d000f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2042700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22090c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f489e7, // bra -, ra_save_32
+0x956c2ff6, 0x100246c2, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95707ff6, 0x10024707, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95741ff6, 0x10024741, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffbf0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x20427006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d000f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2042700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22090c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f489e7, // bra -, ra_save_32
+0x956c2ff6, 0x100246c2, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95707ff6, 0x10024707, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95741ff6, 0x10024741, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x000003f8, 0xf00809e7, // brr.allz -, r:end
+0x9528adbf, 0x10024410, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x952cbdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c31c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffac8, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x0e1cadc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffaa0, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9538edbf, 0x10024410, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x953cfdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc18, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x95410dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20327016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209cc017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209cc01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2132709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02427c80, 0x10020427, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d0ec0, 0x10021427, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95514dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02527c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d4ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cadc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffb10, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffffbd8, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 1103 - 0
src/hello_fft/hex/shader_2048k.hex

@@ -0,0 +1,1103 @@
+0x00000010, 0xe0021227, // mov rb_0x10,    0x10
+0x000001d0, 0xe0021967, // mov r5rep,      0x1D0
+0x00000080, 0xe00208e7, // mov r3, 0x80
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020467, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020527, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020567, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100205a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100205e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020627, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021467, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021527, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021567, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100215a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100215e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021627, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020667, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100206a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021667, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100216a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10025020, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x10025060, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000002e8, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15327d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x153a7d80, 0x10020827, // mov r0, ra_vdw_32
+0x8c04ddf6, 0x10024061, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov r1, ra_save_ptr
+0x00000080, 0xe00208a7, // mov r2, vdw_setup_0(1, 16, dma_h32(1,0)) - vdw_setup_0(1, 16, dma_h32(0,0))
+0x00080000, 0xe00208e7, // mov r3, PASS32_STRIDE
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000050, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15327d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000520, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x159c0fc0, 0x10021c67, // mov vw_setup, rb_vpm
+0x012cbdc0, 0x10020c27, // fadd vpm, ra_64+0, rb_64+0
+0x0130cdc0, 0x10020c27, // fadd vpm, ra_64+1, rb_64+1
+0x159c1fc0, 0x10021c67, // mov vw_setup, rb_vpm_16
+0x0134ddc0, 0x10020c27, // fadd vpm, ra_64+2, rb_64+2
+0x0138edc0, 0x10020c27, // fadd vpm, ra_64+3, rb_64+3
+0x159c2fc0, 0x10021c67, // mov vw_setup, rb_vpm_32
+0x022cbdc0, 0x10020c27, // fsub vpm, ra_64+0, rb_64+0
+0x0230cdc0, 0x10020c27, // fsub vpm, ra_64+1, rb_64+1
+0x159c7fc0, 0x10021c67, // mov vw_setup, rb_vpm_48
+0x0234ddc0, 0x10020c27, // fsub vpm, ra_64+2, rb_64+2
+0x0238edc0, 0x10020c27, // fsub vpm, ra_64+3, rb_64+3
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x80904000, 0xe0020827, // mov r0, vdw_setup_0(1, 16, dma_h32(0,0))
+0x00000040, 0xe0020867, // mov r1, 0x40
+0x8c067c76, 0x10024061, // add ra_save_ptr, ra_save_ptr, r1; mov r1, ra_save_ptr
+0x00000080, 0xe00208a7, // mov r2, vdw_setup_0(1, 16, dma_h32(1,0)) - vdw_setup_0(1, 16, dma_h32(0,0))
+0x00040000, 0xe00208e7, // mov r3, PASS64_STRIDE
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000002b8, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd00200a7, // shl ra_temp, r0, 5
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0fc49e7, // brr -, ra_temp
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000e0, 0xf0f809e7, // brr -, r:2f
+0x00000010, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000c0, 0xf0f809e7, // brr -, r:2f
+0x00000011, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000a0, 0xf0f809e7, // brr -, r:2f
+0x00000012, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000080, 0xf0f809e7, // brr -, r:2f
+0x00000013, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000060, 0xf0f809e7, // brr -, r:2f
+0x00000014, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000040, 0xf0f809e7, // brr -, r:2f
+0x00000015, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000020, 0xf0f809e7, // brr -, r:2f
+0x00000016, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f809e7, // brr -, r:2f
+0x00000017, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c0fc0, 0x10021c67, // mov vw_setup, rb_vpm
+0x012cbdc0, 0x10020c27, // fadd vpm, ra_64+0, rb_64+0
+0x0130cdc0, 0x10020c27, // fadd vpm, ra_64+1, rb_64+1
+0x159c1fc0, 0x10021c67, // mov vw_setup, rb_vpm_16
+0x0134ddc0, 0x10020c27, // fadd vpm, ra_64+2, rb_64+2
+0x0138edc0, 0x10020c27, // fadd vpm, ra_64+3, rb_64+3
+0x159c2fc0, 0x10021c67, // mov vw_setup, rb_vpm_32
+0x022cbdc0, 0x10020c27, // fsub vpm, ra_64+0, rb_64+0
+0x0230cdc0, 0x10020c27, // fsub vpm, ra_64+1, rb_64+1
+0x159c7fc0, 0x10021c67, // mov vw_setup, rb_vpm_48
+0x0234ddc0, 0x10020c27, // fsub vpm, ra_64+2, rb_64+2
+0x0238edc0, 0x10020c27, // fsub vpm, ra_64+3, rb_64+3
+0x00000000, 0xf0fc49e7, // brr -, ra_temp
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000008, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000009, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000a, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000b, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000c, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000d, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000e, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000f, 0xe80009e7, // mov -, srel(i+8)
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000998, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20727030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209dc039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209ccb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20727031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20767030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209dd039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209dcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20767031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x207a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209de039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209ecb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x207a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x207e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209df039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209fcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x207e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x55555555, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x33333333, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0f0f0f0f, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x00ff00ff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0000ffff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0x10020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c81c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffd50, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x55555555, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x33333333, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0f0f0f0f, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x00ff00ff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0000ffff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0x10020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c81c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffbe0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x206e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209db00f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x206e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x2209bc87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x01267c00, 0x100202e7, // fadd ra_64+0, ra_32_re, r0
+0x019c9e40, 0x10020327, // fadd ra_64+1, rb_32_im, r1
+0x02267c00, 0x10020367, // fsub ra_64+2, ra_32_re, r0
+0x029c9e40, 0x100203a7, // fsub ra_64+3, rb_32_im, r1
+0x8c167d76, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x55555555, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x33333333, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0f0f0f0f, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x00ff00ff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0000ffff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0x10020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c81c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffa30, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x55555555, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x33333333, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0f0f0f0f, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x00ff00ff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0000ffff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0x10020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c81c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffff8c0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x206e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209db00f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x206e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x2209bc87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x029c9e40, 0x100208e7, // fsub r3, rb_32_im, r1
+0x02267c00, 0x100208a7, // fsub r2, ra_32_re, r0
+0x019c9e40, 0x10020867, // fadd r1, rb_32_im, r1
+0x01267c00, 0x10020827, // fadd r0, ra_32_re, r0
+0x203e700e, 0x100049c9, // nop;                        fmul rb_32_im, r1, ra_tw_re+TW64_P1_BASE0
+0x209cf00f, 0x100059c9, // nop;                        fmul ra_32_re, r1, rb_tw_im+TW64_P1_BASE0
+0x209cf007, 0x100049e1, // nop;                        fmul r1,       r0, rb_tw_im+TW64_P1_BASE0
+0x213c93c6, 0x10025320, // fadd rb_64+1, r1, rb_32_im; fmul r0,       r0, ra_tw_re+TW64_P1_BASE0
+0x2225019f, 0x100252c9, // fsub rb_64+0, r0, ra_32_re; fmul ra_32_re, r3, rb_tw_im+TW64_P1_BASE1
+0x2042701e, 0x100049c9, // nop;                        fmul rb_32_im, r3, ra_tw_re+TW64_P1_BASE1
+0x00000000, 0xf0f549e7, // bra -, ra_save_64
+0x209d0017, 0x100049e3, // nop;                        fmul r3,       r2, rb_tw_im+TW64_P1_BASE1
+0x214097d6, 0x100253a2, // fadd rb_64+3, r3, rb_32_im; fmul r2,       r2, ra_tw_re+TW64_P1_BASE1
+0x02267580, 0x10021367, // fsub rb_64+2, r2, ra_32_re
+0x8c14cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff7e0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff790, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x206e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209db00f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x206e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x2209bc87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f489e7, // bra -, ra_save_32
+0x952c2ff6, 0x100242c2, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95307ff6, 0x10024307, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x9538eff6, 0x1002438e, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_32, rx_save_slave_32
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_64, rx_save_slave_64
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x00000940, 0xf00809e7, // brr.allz -, r:end
+0x95451dbf, 0x100246db, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c7e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d7e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c7a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d7a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c61c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x55555555, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x33333333, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0f0f0f0f, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x00ff00ff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0000ffff, 0xe00208a7, // mov r2, mask
+0x149e7080, 0x10020867, // and r1, r0, r2
+0x0e9c81c0, 0x10020827, // shr r0, r0, shift
+0x149e7080, 0x10020827, // and r0, r0, r2
+0x119c83c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c81c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff660, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000200, 0xe0020827, // mov r0, 0x200
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000015, 0xe0020867, // mov r1, STAGES
+0x0e1e7c40, 0x100229e7, // shr.setf -, ra_points, r1
+0xfffff630, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000200, 0xe0020827, // mov r0, 0x200
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159c0fc0, 0x100202e7, // mov ra_vpm_lo, rb_vpm
+0x159c1fc0, 0x10020327, // mov ra_vpm_hi, rb_vpm_16
+0x80904000, 0xe00203a7, // mov ra_vdw_32, vdw_setup_0(1, 16, dma_h32( 0,0))
+0x80905000, 0xe00213a7, // mov rb_vdw_32, vdw_setup_0(1, 16, dma_h32(32,0))
+0x00000015, 0xe00212e7, // mov rb_STAGES, STAGES
+0x000000f0, 0xe0021327, // mov rb_0xF0, 0xF0
+0x00000040, 0xe0021367, // mov rb_0x40, 0x40
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95451dbf, 0x100246db, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c7e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d7e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c7a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d7a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb80, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00007fff, 0xe0020827, // mov r0, 0x7FFF
+0x141e7c00, 0x100229e7, // and.setf -, ra_points, r0
+0xfffffb50, 0xf01809e7, // brr.allnz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100601e7, // add.ifnz ra_points, ra_points, r0
+0x956dbdbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x204e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d3017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d301f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x214e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x026e7c80, 0x100206e7, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029dbec0, 0x100216e7, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x957dfdbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20527016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d4017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d401f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2152709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x027e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029dfec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c7e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d7e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c7a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d7a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cbdc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffa48, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95451dbf, 0x100246db, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c7e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d7e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c7a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d7a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff940, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0xfffff920, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0xfffff900, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0xfffff8e0, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x956dbdbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20567016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d5017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d501f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2156709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x026e7c80, 0x100206e7, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029dbec0, 0x100216e7, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x957dfdbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x205a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d6017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d601f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x215a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x027e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029dfec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c7e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d7e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c7a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d7a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cbdc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0x00000100, 0xe0020827, // mov r0, 0x100
+0xfffff7d0, 0xf00809e7, // brr.allz -, r:pass_3
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000060, 0xe0020827, // mov r0, (4-1)*4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95659dbf, 0x100246db, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9569adbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c7e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d7e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c7a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d7a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff6c8, 0xf0f80227, // brr ra_link_1, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x956dbdbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x205e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d7017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d701f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x215e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x026e7c80, 0x100206e7, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029dbec0, 0x100216e7, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x957dfdbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20627016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d8017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d801f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2162709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x027e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029dfec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c7e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d7e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c7a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d7a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cbdc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff5c0, 0xf00809e7, // brr.allz -, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff690, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 321 - 0
src/hello_fft/hex/shader_256.hex

@@ -0,0 +1,321 @@
+0x00000040, 0xe00217a7, // mov rb_0x40,    0x40
+0x00000080, 0xe00217e7, // mov rb_0x80,    0x80
+0x00005555, 0xe0020767, // mov rx_0x5555,  0x5555
+0x00003333, 0xe00207a7, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207e7, // mov rx_0x0F0F,  0x0F0F
+0x88104000, 0xe0020727, // mov ra_vdw, vdw_setup_0(16, 16, dma_h32( 0,0))
+0x88104800, 0xe0021727, // mov rb_vdw, vdw_setup_0(16, 16, dma_h32(16,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9df1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020227, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9df1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020267, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9df3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021227, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9df3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021267, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9df1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9df3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x100246e0, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100256e0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100049e0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100009e7, // add out_3, r0, r2
+0x000000b0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156e7d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x15727d80, 0x10021c67, // mov vw_setup, arg_vdw
+0xc0000040, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS16_STRIDE-16*4
+0x8c05edf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156e7d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x156e7d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000248, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x202e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209cb039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208bcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x202e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20327030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209cc039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208ccb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20327031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20367030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209cd039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208dcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20367031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x203a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209ce039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208ecb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x203a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f489e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0xfffffe98, 0xf0f809e7, // brr -, r:fft_16
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x00000420, 0xf00809e7, // brr.allz -, r:end
+0x95208dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c2e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d2e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffdb0, 0xf0f80027, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x956dbff6, 0x100246db, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x9571cff6, 0x1002471c, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0xfffffd90, 0xf0f80027, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x956dbff6, 0x100246db, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x9571cff6, 0x1002471c, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x00000000, 0xf0f4c027, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9528adbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c2e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d2e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc68, 0xf0f80027, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x956dbff6, 0x100246db, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x9571cff6, 0x1002471c, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x9538edbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20267016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209c9017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209c901f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2126709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x023a7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029ceec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c2e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d2e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0xfffffba8, 0xf0f80027, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x956dbff6, 0x100246db, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x9571cff6, 0x1002471c, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x00000000, 0xf0f4c027, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0xfffffbb0, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 698 - 0
src/hello_fft/hex/shader_256k.hex

@@ -0,0 +1,698 @@
+0x00000012, 0xe0021227, // mov rb_STAGES,  STAGES
+0x00000010, 0xe00216a7, // mov rb_0x10,    0x10
+0x00000040, 0xe00216e7, // mov rb_0x40,    0x40
+0x00000080, 0xe0021727, // mov rb_0x80,    0x80
+0x000000f0, 0xe0021767, // mov rb_0xF0,    0xF0
+0x00000100, 0xe00217a7, // mov rb_0x100,   0x100
+0x00001fff, 0xe00217e7, // mov rb_0x1FFF,  0x1FFF
+0x55555555, 0xe0020767, // mov rx_0x55555555, 0x55555555
+0x33333333, 0xe00207a7, // mov rx_0x33333333, 0x33333333
+0x0f0f0f0f, 0xe00207e7, // mov rx_0x0F0F0F0F, 0x0F0F0F0F
+0x00ff00ff, 0xe0021627, // mov rx_0x00FF00FF, 0x00FF00FF
+0x0000ffff, 0xe0021667, // mov rx_0x0000FFFF, 0x0000FFFF
+0x80904000, 0xe00206e7, // mov ra_vdw_16, vdw_setup_0( 1, 16, dma_h32( 0,0))
+0x80905000, 0xe0021027, // mov rb_vdw_16, vdw_setup_0( 1, 16, dma_h32(32,0))
+0x90104000, 0xe0020727, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020467, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dc1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021467, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dc3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10024660, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100246a0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000001d0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x156e7d80, 0x10020827, // mov r0, arg_vdw
+0x8c05bdf6, 0x10024061, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov r1, ra_save_ptr
+0x00000080, 0xe00208a7, // mov r2, vdw_setup_0(1, 16, dma_h32(1,0)) - vdw_setup_0(1, 16, dma_h32(0,0))
+0x00020000, 0xe00208e7, // mov r3, PASS16_STRIDE
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x000000c8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x15727d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc000ffc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05bdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000640, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20527030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d4039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22094cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20527031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20567030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d5039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22095cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20567031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x205a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d6039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22096cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x205a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x205e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d7039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22097cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x205e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d81c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d81c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9da1c0, 0x10020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119da3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9cb1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffd78, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15adf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d81c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d81c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9da1c0, 0x10020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119da3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9cb1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc30, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x204e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d300f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x204e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22093c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x00000000, 0xf0f489e7, // bra -, ra_save_16
+0x009e7000, 0x100009e7, // nop
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c0ff6, 0x100246c0, // mov ra_vdw_16, rb_vdw_16; mov rb_vdw_16, ra_vdw_16
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb38, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15adf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffae8, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x204e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d300f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x204e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22093c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x00000838, 0xf00809e7, // brr.allz -, r:end
+0x952cbdbf, 0x100244d3, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15adf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d81c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d81c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9da1c0, 0x10020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119da3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9cb1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff9e0, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff9b8, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb88, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x141dfdc0, 0x100229e7, // and.setf -, ra_points, rb_0x1FFF
+0xfffffb60, 0xf01809e7, // brr.allnz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100601e7, // add.ifnz ra_points, ra_points, rb_0x80
+0x955d7dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x025e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d7ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffa98, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff998, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0xfffff978, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0xfffff958, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0xfffff938, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x955d7dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209ce017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209ce01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x025e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d7ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff870, 0xf00809e7, // brr.allz -, r:pass_3
+0x00000060, 0xe0020827, // mov r0, 3*4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x0c1dcdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95451dbf, 0x100244d3, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15adf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff7d0, 0xf0f80227, // brr ra_link_1, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x954d3dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x203e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209cf017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209cf01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x213e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x024e7c80, 0x100204e7, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d3ec0, 0x100214e7, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x955d7dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20427016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d0017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d001f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2142709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x025e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d7ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff6c8, 0xf00809e7, // brr.allz -, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff798, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 679 - 0
src/hello_fft/hex/shader_2k.hex

@@ -0,0 +1,679 @@
+0x00000010, 0xe0021727, // mov rb_0x10,    0x10
+0x00000040, 0xe0021767, // mov rb_0x40,    0x40
+0x000000f0, 0xe00217a7, // mov rb_0xF0,    0xF0
+0x000001d0, 0xe00217e7, // mov rb_0x1D0,   0x1D0
+0x00005555, 0xe0020727, // mov rx_0x5555,  0x5555
+0x00003333, 0xe0020767, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207a7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00207e7, // mov rx_0x00FF,  0x00FF
+0x00000080, 0xe00208e7, // mov r3, 0x80
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020467, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020527, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021467, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021527, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020567, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100205a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021567, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100215a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10025020, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x10025060, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000c8, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15327d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x15367d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc00001c0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05ddf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15327d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x000000f8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x159c0fc0, 0x10021c67, // mov vw_setup, rb_vpm
+0x012cbdc0, 0x10020c27, // fadd vpm, ra_64+0, rb_64+0
+0x0130cdc0, 0x10020c27, // fadd vpm, ra_64+1, rb_64+1
+0x159c1fc0, 0x10021c67, // mov vw_setup, rb_vpm_16
+0x0134ddc0, 0x10020c27, // fadd vpm, ra_64+2, rb_64+2
+0x0138edc0, 0x10020c27, // fadd vpm, ra_64+3, rb_64+3
+0x159c2fc0, 0x10021c67, // mov vw_setup, rb_vpm_32
+0x022cbdc0, 0x10020c27, // fsub vpm, ra_64+0, rb_64+0
+0x0230cdc0, 0x10020c27, // fsub vpm, ra_64+1, rb_64+1
+0x159c7fc0, 0x10021c67, // mov vw_setup, rb_vpm_48
+0x0234ddc0, 0x10020c27, // fsub vpm, ra_64+2, rb_64+2
+0x0238edc0, 0x10020c27, // fsub vpm, ra_64+3, rb_64+3
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0xa0104000, 0xe0021c67, // mov vw_setup, vdw_setup_0(64, 16, dma_h32(0,0))
+0xc00000c0, 0xe0021c67, // mov vw_setup, vdw_setup_1(PASS64_STRIDE-16*4)
+0x8c05ddf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, step; mov vw_addr, ra_save_ptr
+0x000002b8, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd00200a7, // shl ra_temp, r0, 5
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0fc49e7, // brr -, ra_temp
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000e0, 0xf0f809e7, // brr -, r:2f
+0x00000010, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000c0, 0xf0f809e7, // brr -, r:2f
+0x00000011, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000a0, 0xf0f809e7, // brr -, r:2f
+0x00000012, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000080, 0xf0f809e7, // brr -, r:2f
+0x00000013, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000060, 0xf0f809e7, // brr -, r:2f
+0x00000014, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000040, 0xf0f809e7, // brr -, r:2f
+0x00000015, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000020, 0xf0f809e7, // brr -, r:2f
+0x00000016, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f809e7, // brr -, r:2f
+0x00000017, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c0fc0, 0x10021c67, // mov vw_setup, rb_vpm
+0x012cbdc0, 0x10020c27, // fadd vpm, ra_64+0, rb_64+0
+0x0130cdc0, 0x10020c27, // fadd vpm, ra_64+1, rb_64+1
+0x159c1fc0, 0x10021c67, // mov vw_setup, rb_vpm_16
+0x0134ddc0, 0x10020c27, // fadd vpm, ra_64+2, rb_64+2
+0x0138edc0, 0x10020c27, // fadd vpm, ra_64+3, rb_64+3
+0x159c2fc0, 0x10021c67, // mov vw_setup, rb_vpm_32
+0x022cbdc0, 0x10020c27, // fsub vpm, ra_64+0, rb_64+0
+0x0230cdc0, 0x10020c27, // fsub vpm, ra_64+1, rb_64+1
+0x159c7fc0, 0x10021c67, // mov vw_setup, rb_vpm_48
+0x0234ddc0, 0x10020c27, // fsub vpm, ra_64+2, rb_64+2
+0x0238edc0, 0x10020c27, // fsub vpm, ra_64+3, rb_64+3
+0x00000000, 0xf0fc49e7, // brr -, ra_temp
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000008, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000009, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000a, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000b, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000c, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000d, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000e, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000f, 0xe80009e7, // mov -, srel(i+8)
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000858, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20627030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d8039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22098cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20627031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20667030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d9039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22099cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20667031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x206a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209da039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209acb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x206a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x206e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209db039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209bcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x206e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c21c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffda0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c21c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc80, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x205e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d700f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x205e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22097c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x01267c00, 0x100202e7, // fadd ra_64+0, ra_32_re, r0
+0x019c9e40, 0x10020327, // fadd ra_64+1, rb_32_im, r1
+0x02267c00, 0x10020367, // fsub ra_64+2, ra_32_re, r0
+0x029c9e40, 0x100203a7, // fsub ra_64+3, rb_32_im, r1
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c21c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffb20, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c21c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffa00, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x205e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d700f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x205e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22097c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x029c9e40, 0x100208e7, // fsub r3, rb_32_im, r1
+0x02267c00, 0x100208a7, // fsub r2, ra_32_re, r0
+0x019c9e40, 0x10020867, // fadd r1, rb_32_im, r1
+0x01267c00, 0x10020827, // fadd r0, ra_32_re, r0
+0x203e700e, 0x100049c9, // nop;                        fmul rb_32_im, r1, ra_tw_re+TW64_P1_BASE0
+0x209cf00f, 0x100059c9, // nop;                        fmul ra_32_re, r1, rb_tw_im+TW64_P1_BASE0
+0x209cf007, 0x100049e1, // nop;                        fmul r1,       r0, rb_tw_im+TW64_P1_BASE0
+0x213c93c6, 0x10025320, // fadd rb_64+1, r1, rb_32_im; fmul r0,       r0, ra_tw_re+TW64_P1_BASE0
+0x2225019f, 0x100252c9, // fsub rb_64+0, r0, ra_32_re; fmul ra_32_re, r3, rb_tw_im+TW64_P1_BASE1
+0x2042701e, 0x100049c9, // nop;                        fmul rb_32_im, r3, ra_tw_re+TW64_P1_BASE1
+0x00000000, 0xf0f549e7, // bra -, ra_save_64
+0x209d0017, 0x100049e3, // nop;                        fmul r3,       r2, rb_tw_im+TW64_P1_BASE1
+0x214097d6, 0x100253a2, // fadd rb_64+3, r3, rb_32_im; fmul r2,       r2, ra_tw_re+TW64_P1_BASE1
+0x02267580, 0x10021367, // fsub rb_64+2, r2, ra_32_re
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff920, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff8d0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x205e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d700f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x205e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22097c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f489e7, // bra -, ra_save_32
+0x952c2ff6, 0x100242c2, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95307ff6, 0x10024307, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x9534dff6, 0x1002434d, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_32, rx_save_slave_32
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_64, rx_save_slave_64
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x00000418, 0xf00809e7, // brr.allz -, r:end
+0x95451dbf, 0x100245d7, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c61c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c21c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff7f0, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000200, 0xe0020827, // mov r0, 0x200
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x0e1cbdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffff7c8, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000200, 0xe0020827, // mov r0, 0x200
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159c0fc0, 0x100202e7, // mov ra_vpm_lo, rb_vpm
+0x159c1fc0, 0x10020327, // mov ra_vpm_hi, rb_vpm_16
+0x90104000, 0xe0020367, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021367, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95555dbf, 0x100245d7, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95596dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffbf0, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x955d7dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x204e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d3017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d301f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x214e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x025e7c80, 0x100205e7, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d7ec0, 0x100215e7, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x956dbdbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20527016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d4017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d401f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2152709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x026e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029dbec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c662, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d663, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cbdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffae8, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffffbb8, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 538 - 0
src/hello_fft/hex/shader_32k.hex

@@ -0,0 +1,538 @@
+0x00000010, 0xe00216e7, // mov rb_0x10,    0x10
+0x00000040, 0xe0021727, // mov rb_0x40,    0x40
+0x00000080, 0xe0021767, // mov rb_0x80,    0x80
+0x000000f0, 0xe00217a7, // mov rb_0xF0,    0xF0
+0x00000100, 0xe00217e7, // mov rb_0x100,   0x100
+0x00005555, 0xe0020767, // mov rx_0x5555,  0x5555
+0x00003333, 0xe00207a7, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207e7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00216a7, // mov rx_0x00FF,  0x00FF
+0x90104000, 0xe0020727, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020467, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021467, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x100246a0, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100246e0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000c8, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156e7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x15727d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc0001fc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05cdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156e7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000588, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d3039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22093cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20527030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d4039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22094cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20527031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20567030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d5039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22095cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20567031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x205a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d6039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22096cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x205a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c21c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffda0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c21c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc80, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x204a7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d200f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x204a700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22092c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f489e7, // bra -, ra_save_32
+0x95682ff6, 0x10024682, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c7ff6, 0x100246c7, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffbf0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x204a7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d200f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x204a700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22092c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f489e7, // bra -, ra_save_32
+0x95682ff6, 0x10024682, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c7ff6, 0x100246c7, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x00000668, 0xf00809e7, // brr.allz -, r:end
+0x9528adbf, 0x10024492, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x952cbdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c21c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffac8, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x0e1cfdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffaa0, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9528adbf, 0x10024492, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x952cbdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc18, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffffbf8, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffffbd8, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffffbb8, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20327016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209cc017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209cc01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2132709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x024a7c80, 0x100204a7, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d2ec0, 0x100214a7, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95596dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x025a7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d6ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cfdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffab0, 0xf00809e7, // brr.allz -, r:pass_2
+0x00000060, 0xe0020827, // mov r0, 3*4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95410dbf, 0x10024492, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95451dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff9a8, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x203a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209ce017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209ce01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x213a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x024a7c80, 0x100204a7, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d2ec0, 0x100214a7, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95596dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cf017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cf01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x025a7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d6ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cfdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffff8a0, 0xf00809e7, // brr.allz -, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff968, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 434 - 0
src/hello_fft/hex/shader_4k.hex

@@ -0,0 +1,434 @@
+0x00000020, 0xe0021767, // mov rb_0x20,    0x20
+0x00000040, 0xe00217a7, // mov rb_0x40,    0x40
+0x00000080, 0xe00217e7, // mov rb_0x80,    0x80
+0x00005555, 0xe0020727, // mov rx_0x5555,  0x5555
+0x00003333, 0xe0020767, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207a7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00207e7, // mov rx_0x00FF,  0x00FF
+0x88104000, 0xe00206e7, // mov ra_vdw, vdw_setup_0(16, 16, dma_h32( 0,0))
+0x88104800, 0xe00216e7, // mov rb_vdw, vdw_setup_0(16, 16, dma_h32(16,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9df1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020227, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9df1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020267, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9df1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9df3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021227, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9df3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021267, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9df3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9df1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9df3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x100246a0, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100256a0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100049e0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100009e7, // add out_3, r0, r2
+0x000000b0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x156e7d80, 0x10021c67, // mov vw_setup, arg_vdw
+0xc00007c0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS16_STRIDE-16*4
+0x8c05edf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x156a7d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x000003e8, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f409e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20327030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209cc039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208ccb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20327031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20367030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209cd039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208dcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20367031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x203a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209ce039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208ecb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x203a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x203e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209cf039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2208fcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x203e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f489e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c11c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x000000cc, 0xe20229e7, // mov.setf  -, [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
+0x959fa000, 0xd002c8a0, // mov r2, r0; mov.ifnz r0, r0 << 6
+0x959fa249, 0xd002c8e1, // mov r3, r1; mov.ifnz r1, r1 << 6
+0x00003300, 0xe20229e7, // mov.setf  -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0]
+0x809f6012, 0xd000c9e0, // nop; mov.ifnz r0, r2 >> 6
+0x809f601b, 0xd000c9e1, // nop; mov.ifnz r1, r3 >> 6
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x000000cc, 0xe20229e7, // mov.setf  -, [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
+0x959fa000, 0xd002c8a0, // mov r2, r0; mov.ifnz r0, r0 << 6
+0x959fa249, 0xd002c8e1, // mov r3, r1; mov.ifnz r1, r1 << 6
+0x00003300, 0xe20229e7, // mov.setf  -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0]
+0x809f6012, 0xd000c9e0, // nop; mov.ifnz r0, r2 >> 6
+0x809f601b, 0xd000c9e1, // nop; mov.ifnz r1, r3 >> 6
+0xfffffd40, 0xf0f809e7, // brr -, r:fft_16
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffcf8, 0xf0f809e7, // brr -, r:fft_16
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x000005c8, 0xf00809e7, // brr.allz -, r:end
+0x95208dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c11c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x000000cc, 0xe20229e7, // mov.setf  -, [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
+0x959fa000, 0xd002c8a0, // mov r2, r0; mov.ifnz r0, r0 << 6
+0x959fa249, 0xd002c8e1, // mov r3, r1; mov.ifnz r1, r1 << 6
+0x00003300, 0xe20229e7, // mov.setf  -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0]
+0x809f6012, 0xd000c9e0, // nop; mov.ifnz r0, r2 >> 6
+0x809f601b, 0xd000c9e1, // nop; mov.ifnz r1, r3 >> 6
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc40, 0xf0f80027, // brr ra_link_1, r:pass_1
+0x9569aff6, 0x1002469a, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x956dbff6, 0x100246db, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x0e1ccdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffc18, 0xf00809e7, // brr.allz -, r:pass_1
+0x9569aff6, 0x1002469a, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x956dbff6, 0x100246db, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c027, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95208dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc90, 0xf0f80027, // brr ra_link_1, r:pass_2
+0x9569aff6, 0x1002469a, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x956dbff6, 0x100246db, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0xfffffc70, 0xf0f80027, // brr ra_link_1, r:pass_2
+0x9569aff6, 0x1002469a, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x956dbff6, 0x100246db, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x0d01ddc0, 0x10020027, // sub ra_link_1, ra_link_1, rb_0x20
+0x953cfdbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20267016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209c9017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209c901f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2126709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x023e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029cfec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1ccdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffba0, 0xf00809e7, // brr.allz -, r:pass_2
+0x9569aff6, 0x1002469a, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x956dbff6, 0x100246db, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c027, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x952cbdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffaa0, 0xf0f80027, // brr ra_link_1, r:pass_3
+0x9569aff6, 0x1002469a, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x956dbff6, 0x100246db, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x953cfdbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x202a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209ca017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209ca01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x212a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x023e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029cfec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c3e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d3e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c3a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d3a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c362, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d363, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c322, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d323, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1ccdc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffff9d8, 0xf00809e7, // brr.allz -, r:pass_3
+0x9569aff6, 0x1002469a, // mov ra_vpm, rb_vpm; mov rb_vpm, ra_vpm
+0x956dbff6, 0x100246db, // mov ra_vdw, rb_vdw; mov rb_vdw, ra_vdw
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c027, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffffa08, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 450 - 0
src/hello_fft/hex/shader_512.hex

@@ -0,0 +1,450 @@
+0x00000010, 0xe0021727, // mov rb_0x10,    0x10
+0x00000040, 0xe0021767, // mov rb_0x40,    0x40
+0x00000080, 0xe00217a7, // mov rb_0x80,    0x80
+0x000000f0, 0xe00217e7, // mov rb_0xF0,    0xF0
+0x00005555, 0xe0020727, // mov rx_0x5555,  0x5555
+0x00003333, 0xe0020767, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207a7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00207e7, // mov rx_0x00FF,  0x00FF
+0x88104000, 0xe00206a7, // mov ra_vdw_16, vdw_setup_0(16, 16, dma_h32( 0,0))
+0x88105000, 0xe0021027, // mov rb_vdw_16, vdw_setup_0(16, 16, dma_h32(32,0))
+0x90104000, 0xe00206e7, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9de1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9de1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9de1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9de3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9de3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9de3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9de1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9de3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10024620, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x10024660, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000b0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15627d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x156a7d80, 0x10021c67, // mov vw_setup, arg_vdw
+0xc00000c0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS16_STRIDE-16*4
+0x8c05ddf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15627d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15627d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x000000c8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15627d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x156e7d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc0000040, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05ddf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15627d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15627d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000510, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20427030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d0039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22090cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20427031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20467030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d1039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22091cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20467031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d2039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22092cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d3039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22093cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15fdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c41c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffda0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c41c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc80, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x203e7006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209cf00f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x203e700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x2208fc87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95602ff6, 0x10024602, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95647ff6, 0x10024647, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x956c1ff6, 0x100246c1, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffbf0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x00000000, 0xf0f489e7, // bra -, ra_save_16
+0x009e7000, 0x100009e7, // nop
+0x95602ff6, 0x10024602, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95680ff6, 0x10024680, // mov ra_vdw_16, rb_vdw_16; mov rb_vdw_16, ra_vdw_16
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x000003a8, 0xf00809e7, // brr.allz -, r:end
+0x952cbdbf, 0x100243cf, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c422, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d423, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14727180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14727180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9c41c0, 0xd0020827, // shr r0, r0, 13-STAGES
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb38, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0xfffffb18, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9538edbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c422, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d423, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc98, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x954d3dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x024e7c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d3ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c422, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d423, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c9dc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffbd0, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dedc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffffc28, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 781 - 0
src/hello_fft/hex/shader_512k.hex

@@ -0,0 +1,781 @@
+0x00000013, 0xe0021227, // mov rb_STAGES,  STAGES
+0x00000010, 0xe00216e7, // mov rb_0x10,    0x10
+0x00000040, 0xe0021727, // mov rb_0x40,    0x40
+0x00000080, 0xe0021767, // mov rb_0x80,    0x80
+0x000000f0, 0xe00217a7, // mov rb_0xF0,    0xF0
+0x00000100, 0xe00217e7, // mov rb_0x100,   0x100
+0x55555555, 0xe0020767, // mov rx_0x55555555, 0x55555555
+0x33333333, 0xe00207a7, // mov rx_0x33333333, 0x33333333
+0x0f0f0f0f, 0xe00207e7, // mov rx_0x0F0F0F0F, 0x0F0F0F0F
+0x00ff00ff, 0xe0021667, // mov rx_0x00FF00FF, 0x00FF00FF
+0x0000ffff, 0xe00216a7, // mov rx_0x0000FFFF, 0x0000FFFF
+0x80904000, 0xe00206e7, // mov ra_vdw_16, vdw_setup_0(1, 16, dma_h32( 0,0))
+0x80905000, 0xe0021027, // mov rb_vdw_16, vdw_setup_0(1, 16, dma_h32(32,0))
+0x80904000, 0xe0020727, // mov ra_vdw_32, vdw_setup_0(1, 16, dma_h32( 0,0))
+0x80905000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(1, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020467, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021467, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204e7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214e7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10024660, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100246a0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000001d0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x156e7d80, 0x10020827, // mov r0, arg_vdw
+0x8c05cdf6, 0x10024061, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov r1, ra_save_ptr
+0x00000080, 0xe00208a7, // mov r2, vdw_setup_0(1, 16, dma_h32(1,0)) - vdw_setup_0(1, 16, dma_h32(0,0))
+0x00040000, 0xe00208e7, // mov r3, PASS16_STRIDE
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x000002e8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x15727d80, 0x10020827, // mov r0, ra_vdw_32
+0x8c05cdf6, 0x10024061, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov r1, ra_save_ptr
+0x00000080, 0xe00208a7, // mov r2, vdw_setup_0(1, 16, dma_h32(1,0)) - vdw_setup_0(1, 16, dma_h32(0,0))
+0x00020000, 0xe00208e7, // mov r3, PASS32_STRIDE
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x8c9e7080, 0x10024831, // add r0, r0, r2; mov vw_setup, r0
+0x8c9e72c9, 0x10024872, // add r1, r1, r3; mov vw_addr,  r1
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000050, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000640, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20567030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d5039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22095cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20567031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x205a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d6039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22096cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x205a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x205e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d7039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22097cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x205e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20627030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d8039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22098cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20627031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9db1c0, 0x10020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119db3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9ca1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffd78, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9db1c0, 0x10020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119db3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9ca1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc30, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20527006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d400f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2052700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22094c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffba0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x00000000, 0xf0f489e7, // bra -, ra_save_16
+0x009e7000, 0x100009e7, // nop
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c0ff6, 0x100246c0, // mov ra_vdw_16, rb_vdw_16; mov rb_vdw_16, ra_vdw_16
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb38, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffae8, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x20527006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d400f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2052700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22094c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x00000888, 0xf00809e7, // brr.allz -, r:end
+0x952cbdbf, 0x10024514, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149d91c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149d91c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9db1c0, 0x10020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119db3c0, 0x10020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0e9ca1c0, 0xd0020827, // shr r0, r0, 32-STAGES-3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff9e0, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff9b8, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb88, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00003fff, 0xe0020827, // mov r0, 0x3FFF
+0x141e7c00, 0x100229e7, // and.setf -, ra_points, r0
+0xfffffb58, 0xf01809e7, // brr.allnz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100601e7, // add.ifnz ra_points, ra_points, rb_0x80
+0x95618dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02627c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d8ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffa90, 0xf00809e7, // brr.allz -, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x952cbdbf, 0x10024514, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff9f0, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffff9d0, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffff9b0, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0xfffff990, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x95514dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x203a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209ce017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209ce01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x213a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02527c80, 0x10020527, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d4ec0, 0x10021527, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95618dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cf017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cf01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02627c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d8ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff888, 0xf00809e7, // brr.allz -, r:pass_3
+0x00000060, 0xe0020827, // mov r0, 3*4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95492dbf, 0x10024514, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x954d3dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff780, 0xf0f80227, // brr ra_link_1, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x95514dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20427016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d0017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d001f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2142709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02527c80, 0x10020527, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d4ec0, 0x10021527, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x95618dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20467016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d1017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d101f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2146709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02627c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d8ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c622, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d623, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c5e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d5e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c5a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d5a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c562, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d563, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff678, 0xf00809e7, // brr.allz -, r:pass_4
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff748, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 772 - 0
src/hello_fft/hex/shader_64k.hex

@@ -0,0 +1,772 @@
+0x00000010, 0xe0021227, // mov rb_0x10,    0x10
+0x000001d0, 0xe0021967, // mov r5rep,      0x1D0
+0x00005555, 0xe00207a7, // mov rx_0x5555,  0x5555
+0x00003333, 0xe00217a7, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207e7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00217e7, // mov rx_0x00FF,  0x00FF
+0x00000080, 0xe00208e7, // mov r3, 0x80
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020427, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020467, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204a7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100204e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020527, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020567, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100205a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021427, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021467, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214a7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100214e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021527, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021567, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100215a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100205e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9e70c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020627, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100215e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9e72c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021627, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10025020, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x10025060, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000c8, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15327d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x153a7d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc0003fc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c04ddf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x15327d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x152e7d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000100, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000040, 0xe0020827, // mov r0, 0x40
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x159c0fc0, 0x10021c67, // mov vw_setup, rb_vpm
+0x012cbdc0, 0x10020c27, // fadd vpm, ra_64+0, rb_64+0
+0x0130cdc0, 0x10020c27, // fadd vpm, ra_64+1, rb_64+1
+0x159c1fc0, 0x10021c67, // mov vw_setup, rb_vpm_16
+0x0134ddc0, 0x10020c27, // fadd vpm, ra_64+2, rb_64+2
+0x0138edc0, 0x10020c27, // fadd vpm, ra_64+3, rb_64+3
+0x159c2fc0, 0x10021c67, // mov vw_setup, rb_vpm_32
+0x022cbdc0, 0x10020c27, // fsub vpm, ra_64+0, rb_64+0
+0x0230cdc0, 0x10020c27, // fsub vpm, ra_64+1, rb_64+1
+0x159c7fc0, 0x10021c67, // mov vw_setup, rb_vpm_48
+0x0234ddc0, 0x10020c27, // fsub vpm, ra_64+2, rb_64+2
+0x0238edc0, 0x10020c27, // fsub vpm, ra_64+3, rb_64+3
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0xa0104000, 0xe0021c67, // mov vw_setup, vdw_setup_0(64, 16, dma_h32(0,0))
+0xc0001fc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(PASS64_STRIDE-16*4)
+0x8c067c36, 0x10024072, // add ra_save_ptr, ra_save_ptr, step; mov vw_addr, ra_save_ptr
+0x000002b8, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd00200a7, // shl ra_temp, r0, 5
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0fc49e7, // brr -, ra_temp
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000e0, 0xf0f809e7, // brr -, r:2f
+0x00000010, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000c0, 0xf0f809e7, // brr -, r:2f
+0x00000011, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x000000a0, 0xf0f809e7, // brr -, r:2f
+0x00000012, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000080, 0xf0f809e7, // brr -, r:2f
+0x00000013, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000060, 0xf0f809e7, // brr -, r:2f
+0x00000014, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000040, 0xf0f809e7, // brr -, r:2f
+0x00000015, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000020, 0xf0f809e7, // brr -, r:2f
+0x00000016, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f809e7, // brr -, r:2f
+0x00000017, 0xe80009e7, // mov -, sacq(i)
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c0fc0, 0x10021c67, // mov vw_setup, rb_vpm
+0x012cbdc0, 0x10020c27, // fadd vpm, ra_64+0, rb_64+0
+0x0130cdc0, 0x10020c27, // fadd vpm, ra_64+1, rb_64+1
+0x159c1fc0, 0x10021c67, // mov vw_setup, rb_vpm_16
+0x0134ddc0, 0x10020c27, // fadd vpm, ra_64+2, rb_64+2
+0x0138edc0, 0x10020c27, // fadd vpm, ra_64+3, rb_64+3
+0x159c2fc0, 0x10021c67, // mov vw_setup, rb_vpm_32
+0x022cbdc0, 0x10020c27, // fsub vpm, ra_64+0, rb_64+0
+0x0230cdc0, 0x10020c27, // fsub vpm, ra_64+1, rb_64+1
+0x159c7fc0, 0x10021c67, // mov vw_setup, rb_vpm_48
+0x0234ddc0, 0x10020c27, // fsub vpm, ra_64+2, rb_64+2
+0x0238edc0, 0x10020c27, // fsub vpm, ra_64+3, rb_64+3
+0x00000000, 0xf0fc49e7, // brr -, ra_temp
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000008, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000009, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000a, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000b, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000c, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000d, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000e, 0xe80009e7, // mov -, srel(i+8)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x159c0fc0, 0x10020c67, // mov vr_setup, rb_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x0000000f, 0xe80009e7, // mov -, srel(i+8)
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000858, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x206a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209da039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209acb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x206a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x206e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209db039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209bcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x206e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20727030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209dc039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209ccb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20727031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20767030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209dd039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x2209dcb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20767031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c31c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffda0, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c31c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc80, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20667006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d900f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2066700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22099c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x01267c00, 0x100202e7, // fadd ra_64+0, ra_32_re, r0
+0x019c9e40, 0x10020327, // fadd ra_64+1, rb_32_im, r1
+0x02267c00, 0x10020367, // fsub ra_64+2, ra_32_re, r0
+0x029c9e40, 0x100203a7, // fsub ra_64+3, rb_32_im, r1
+0x8c167d76, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c31c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffb20, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c31c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffa00, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20667006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d900f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2066700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22099c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x029c9e40, 0x100208e7, // fsub r3, rb_32_im, r1
+0x02267c00, 0x100208a7, // fsub r2, ra_32_re, r0
+0x019c9e40, 0x10020867, // fadd r1, rb_32_im, r1
+0x01267c00, 0x10020827, // fadd r0, ra_32_re, r0
+0x203e700e, 0x100049c9, // nop;                        fmul rb_32_im, r1, ra_tw_re+TW64_P1_BASE0
+0x209cf00f, 0x100059c9, // nop;                        fmul ra_32_re, r1, rb_tw_im+TW64_P1_BASE0
+0x209cf007, 0x100049e1, // nop;                        fmul r1,       r0, rb_tw_im+TW64_P1_BASE0
+0x213c93c6, 0x10025320, // fadd rb_64+1, r1, rb_32_im; fmul r0,       r0, ra_tw_re+TW64_P1_BASE0
+0x2225019f, 0x100252c9, // fsub rb_64+0, r0, ra_32_re; fmul ra_32_re, r3, rb_tw_im+TW64_P1_BASE1
+0x2042701e, 0x100049c9, // nop;                        fmul rb_32_im, r3, ra_tw_re+TW64_P1_BASE1
+0x00000000, 0xf0f549e7, // bra -, ra_save_64
+0x209d0017, 0x100049e3, // nop;                        fmul r3,       r2, rb_tw_im+TW64_P1_BASE1
+0x214097d6, 0x100253a2, // fadd rb_64+3, r3, rb_32_im; fmul r2,       r2, ra_tw_re+TW64_P1_BASE1
+0x02267580, 0x10021367, // fsub rb_64+2, r2, ra_32_re
+0x8c14cdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff920, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff8d0, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x20667006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d900f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2066700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22099c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f489e7, // bra -, ra_save_32
+0x952c2ff6, 0x100242c2, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95307ff6, 0x10024307, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x9538eff6, 0x1002438e, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_32, rx_save_slave_32
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_64, rx_save_slave_64
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x000006a8, 0xf00809e7, // brr.allz -, r:end
+0x95451dbf, 0x10024659, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c61c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149de1c0, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x149de1c0, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149df1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149df1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x119c31c0, 0xd0020827, // shl r0, r0, STAGES-13
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff7f0, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000200, 0xe0020827, // mov r0, 0x200
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000010, 0xe0020867, // mov r1, STAGES
+0x0e1e7c40, 0x100229e7, // shr.setf -, ra_points, r1
+0xfffff7c0, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x00000200, 0xe0020827, // mov r0, 0x200
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159c0fc0, 0x100202e7, // mov ra_vpm_lo, rb_vpm
+0x159c1fc0, 0x10020327, // mov ra_vpm_hi, rb_vpm_16
+0x90104000, 0xe00203a7, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe00213a7, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x00000060, 0xe00212e7, // mov rb_3x4x8, 3*4*8
+0x000000f0, 0xe0021327, // mov rb_0xF0, 0xF0
+0x00000040, 0xe0021367, // mov rb_0x40, 0x40
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x95451dbf, 0x10024659, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95492dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffbd0, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0xfffffbb0, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0xfffffb90, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0xfffffb70, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x95659dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x204e7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d3017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d301f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x214e709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02667c80, 0x10020667, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d9ec0, 0x10021667, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x9575ddbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20527016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d4017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d401f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2152709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02767c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029ddec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffffa68, 0xf00809e7, // brr.allz -, r:pass_2
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x0d20bdc0, 0x10020227, // sub ra_link_1, ra_link_1, rb_3x4x8
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x955d7dbf, 0x10024659, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x95618dbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c148df6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffff960, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x95659dbf, 0x100248a3, // mov r2, ra_tw_re+TW32_ACTIVE; mov r3, rb_tw_im+TW32_ACTIVE
+0x20567016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw32
+0x209d5017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw32
+0x209d501f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw32
+0x2156709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw32
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02667c80, 0x10020667, // fsub ra_tw_re+TW32_ACTIVE, ra_tw_re+TW32_ACTIVE, r2
+0x029d9ec0, 0x10021667, // fsub rb_tw_im+TW32_ACTIVE, rb_tw_im+TW32_ACTIVE, r3
+0x9575ddbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x205a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209d6017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209d601f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x215a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02767c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029ddec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c762, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d763, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c722, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d723, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c6e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d6e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c6a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d6a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1c8dc0, 0x100229e7, // shr.setf -, ra_points, rb_STAGES
+0xfffff858, 0xf00809e7, // brr.allz -, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x00000100, 0xe0020827, // mov r0, 0x100
+0x0c1e7c00, 0x100201e7, // add ra_points, ra_points, r0
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffff928, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 516 - 0
src/hello_fft/hex/shader_8k.hex

@@ -0,0 +1,516 @@
+0x00000010, 0xe00216e7, // mov rb_0x10,    0x10
+0x00000040, 0xe0021727, // mov rb_0x40,    0x40
+0x00000080, 0xe0021767, // mov rb_0x80,    0x80
+0x000000f0, 0xe00217a7, // mov rb_0xF0,    0xF0
+0x00000100, 0xe00217e7, // mov rb_0x100,   0x100
+0x00005555, 0xe0020767, // mov rx_0x5555,  0x5555
+0x00003333, 0xe00207a7, // mov rx_0x3333,  0x3333
+0x00000f0f, 0xe00207e7, // mov rx_0x0F0F,  0x0F0F
+0x000000ff, 0xe00216a7, // mov rx_0x00FF,  0x00FF
+0x88104000, 0xe00206e7, // mov ra_vdw_16, vdw_setup_0(16, 16, dma_h32( 0,0))
+0x88105000, 0xe0021027, // mov rb_vdw_16, vdw_setup_0(16, 16, dma_h32(32,0))
+0x90104000, 0xe0020727, // mov ra_vdw_32, vdw_setup_0(32, 16, dma_h32( 0,0))
+0x90105000, 0xe0021067, // mov rb_vdw_32, vdw_setup_0(32, 16, dma_h32(32,0))
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100202e7, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020327, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x10020367, // mov ra_tw_re+off+i, r4
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203a7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100212e7, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021327, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x10021367, // mov rb_tw_im+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213a7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10020827, // mov r0, addr
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x11983dc0, 0xd00208a7, // shl r2, elem_num, 3
+0x0c9e7080, 0x10020e27, // add t0s, r0, r2
+0x0c9dd1c0, 0xa0020827, // add r0,  r0, stride; ldtmu0
+0x159e7900, 0x100203e7, // mov ra_tw_re+off+i, r4
+0x0c9e7280, 0x10020e27, // add t0s, r1, r2
+0x0c9dd3c0, 0xa0020867, // add r1,  r1, stride; ldtmu0
+0x159e7900, 0x100213e7, // mov rb_tw_im+off+i, r4
+0x15827d80, 0x10021167, // mov rb_inst, unif
+0x00101200, 0xe0020827, // mov r0, vpm_setup(1, 1, v32( 0,0))
+0x00000010, 0xe0020867, // mov r1, vpm_setup(1, 1, v32(16,0)) - vpm_setup(1, 1, v32(0,0))
+0x00000002, 0xe00208a7, // mov r2, vpm_setup(1, 1, v32( 0,2)) - vpm_setup(1, 1, v32(0,0))
+0x409c5017, 0x100049e2, // nop; mul24 r2, r2, in_inst
+0xcc9e7081, 0x10024660, // add out_0, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100246a0, // add out_1, r0, r2; v8adds r0, r0, r1
+0xcc9e7081, 0x100250a0, // add out_2, r0, r2; v8adds r0, r0, r1
+0x0c9e7080, 0x100211e7, // add out_3, r0, r2
+0x000000b0, 0xf0f80127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x156e7d80, 0x10021c67, // mov vw_setup, arg_vdw
+0xc0000fc0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS16_STRIDE-16*4
+0x8c05cdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000038, 0xf0f81127, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, arg
+0x159e7000, 0x10020c27, // mov vpm, r0
+0x159e7240, 0x10020c27, // mov vpm, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, arg_vpm
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x000000c8, 0xf0f802a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x15727d80, 0x10021c67, // mov vw_setup, ra_vdw_32
+0xc00007c0, 0xe0021c67, // mov vw_setup, vdw_setup_1(0) + PASS32_STRIDE-16*4
+0x8c05cdf6, 0x10024072, // add ra_save_ptr, ra_save_ptr, rb_0x40; mov vw_addr, ra_save_ptr
+0x00000050, 0xf0f812a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10021c67, // mov vw_setup, ra_vpm_lo
+0x01267c00, 0x10020c27, // fadd vpm, ra_32_re, r0
+0x019c9e40, 0x10020c27, // fadd vpm, rb_32_im, r1
+0x156a7d80, 0x10021c67, // mov vw_setup, ra_vpm_hi
+0x02267c00, 0x10020c27, // fsub vpm, ra_32_re, r0
+0x029c9e40, 0x10020c27, // fsub vpm, rb_32_im, r1
+0x00000000, 0xf0f4c9e7, // bra -, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x15667d80, 0x10020c67, // mov vr_setup, ra_vpm_lo
+0x15c27d80, 0x100009e7, // mov -, vpm
+0x00000080, 0xf0f801a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x00000019, 0xe80009e7, // mov -, sacq(i+9)
+0x00000001, 0xe80009e7, // mov -, srel(i+1)
+0x0000001a, 0xe80009e7, // mov -, sacq(i+9)
+0x00000002, 0xe80009e7, // mov -, srel(i+1)
+0x0000001b, 0xe80009e7, // mov -, sacq(i+9)
+0x00000003, 0xe80009e7, // mov -, srel(i+1)
+0x0000001c, 0xe80009e7, // mov -, sacq(i+9)
+0x00000004, 0xe80009e7, // mov -, srel(i+1)
+0x0000001d, 0xe80009e7, // mov -, sacq(i+9)
+0x00000005, 0xe80009e7, // mov -, srel(i+1)
+0x0000001e, 0xe80009e7, // mov -, sacq(i+9)
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000006, 0xe80009e7, // mov -, srel(i+1)
+0x0000001f, 0xe80009e7, // mov -, sacq(i+9)
+0x00000007, 0xe80009e7, // mov -, srel(i+1)
+0x00000500, 0xf0f811a7, // brr rx_ptr, label
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x00000009, 0xe80009e7, // mov -, srel(i+9)
+0x00000011, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000a, 0xe80009e7, // mov -, srel(i+9)
+0x00000012, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000b, 0xe80009e7, // mov -, srel(i+9)
+0x00000013, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000c, 0xe80009e7, // mov -, srel(i+9)
+0x00000014, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000d, 0xe80009e7, // mov -, srel(i+9)
+0x00000015, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000e, 0xe80009e7, // mov -, srel(i+9)
+0x00000016, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x00000000, 0xf0f509e7, // bra -, ra_link_1
+0x0000000f, 0xe80009e7, // mov -, srel(i+9)
+0x00000017, 0xe80009e7, // mov -, sacq(i+1)
+0x009e7000, 0x100009e7, // nop
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20467030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d1039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22091cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20467031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819ff2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f1400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829ff609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f1449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204a7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d2039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22092cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204a7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fe2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f2400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fe609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f2449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x204e7030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d3039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22093cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x204e7031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819fc2c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f4400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x829fc609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f4449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (1<<i)
+0x20527030, 0x1000d9c2, // nop;                       fmul.ifnz ra_temp, ra_tw_re+TW16_ACTIVE+i, r0
+0x209d4039, 0x1000c9e2, // nop;                       fmul.ifnz r2,      rb_tw_im+TW16_ACTIVE+i, r1
+0x22094cb8, 0x1006c823, // fsub.ifnz r0, ra_temp, r2; fmul.ifnz r3,      rb_tw_im+TW16_ACTIVE+i, r0
+0x20527031, 0x1000c9e1, // nop;                       fmul.ifnz r1,      ra_tw_re+TW16_ACTIVE+i, r1
+0x819f82c0, 0xd0064862, // fadd.ifnz r1, r1, r3; mov r2, r0 << (1<<i)
+0x819f8400, 0xd0044823, // fadd.ifz  r0, r2, r0; mov r3, r0 >> (1<<i)
+0x00000000, 0xf0f409e7, // bra -, ra_link_0
+0x829f8609, 0xd0064822, // fsub.ifnz r0, r3, r0; mov r2, r1 << (1<<i)
+0x819f8449, 0xd0044863, // fadd.ifz  r1, r2, r1; mov r3, r1 >> (1<<i)
+0x029e7640, 0x10060867, // fsub.ifnz r1, r3, r1
+0x8c15edf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffda8, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x959e7009, 0x10024249, // mov ra_32_re, r0; mov rb_32_im, r1
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0xfffffc90, 0xf0f80027, // brr ra_link_0, call
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x20427006, 0x100059c2, // nop;                  fmul ra_temp, r0, ra_tw_re+TW32_ACTIVE
+0x209d000f, 0x100049e2, // nop;                  fmul r2,      r1, rb_tw_im+TW32_ACTIVE
+0x2042700e, 0x100049e3, // nop;                  fmul r3,      r1, ra_tw_re+TW32_ACTIVE
+0x22090c87, 0x10024821, // fsub r0, ra_temp, r2; fmul r1,      r0, rb_tw_im+TW32_ACTIVE
+0x019e72c0, 0x10020867, // fadd r1, r1,      r3
+0x00000000, 0xf0f549e7, // bra -, ra_save_32
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x95687ff6, 0x10024687, // mov ra_vpm_hi, rb_vpm_hi; mov rb_vpm_hi, ra_vpm_hi
+0x95701ff6, 0x10024701, // mov ra_vdw_32, rb_vdw_32; mov rb_vdw_32, ra_vdw_32
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc00, 0xf0f80027, // brr ra_link_0, call
+0x009e7000, 0xa00009e7, // nop;        ldtmu0
+0x159e7900, 0xa0020827, // mov r0, r4; ldtmu0
+0x159e7900, 0x10020867, // mov r1, r4
+0x00000000, 0xf0f489e7, // bra -, ra_save_16
+0x009e7000, 0x100009e7, // nop
+0x95642ff6, 0x10024642, // mov ra_vpm_lo, rb_vpm_lo; mov rb_vpm_lo, ra_vpm_lo
+0x956c0ff6, 0x100246c0, // mov ra_vdw_16, rb_vdw_16; mov rb_vdw_16, ra_vdw_16
+0x159c5fc0, 0x10022827, // mov.setf r0, rb_inst
+0x0d9c11c0, 0xd0020827, // sub r0, r0, 1
+0x119c51c0, 0xd0020827, // shl r0, r0, 5
+0x0c9c6e00, 0x100601a7, // add.ifnz ra_sync, rx_sync_slave, r0
+0x159c4fc0, 0x10060127, // mov.ifnz ra_save_16, rx_save_slave_16
+0x159cafc0, 0x100602a7, // mov.ifnz ra_save_32, rx_save_slave_32
+0x15827d80, 0x100220e7, // mov.setf ra_addr_x, unif
+0x15827d80, 0x100210e7, // mov      rb_addr_y, unif
+0x00000590, 0xf00809e7, // brr.allz -, r:end
+0x952cbdbf, 0x10024410, // mov ra_tw_re+TW32_ACTIVE, ra_tw_re+tw32; mov rb_tw_im+TW32_ACTIVE, rb_tw_im+tw32
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c51c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15bdf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x14767180, 0x10020867, // and r1, r0, mask
+0x0e9c11c0, 0xd0020827, // shr r0, r0, shift
+0x14767180, 0x10020827, // and r0, r0, mask
+0x119c13c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147a7180, 0x10020867, // and r1, r0, mask
+0x0e9c21c0, 0xd0020827, // shr r0, r0, shift
+0x147a7180, 0x10020827, // and r0, r0, mask
+0x119c23c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x147e7180, 0x10020867, // and r1, r0, mask
+0x0e9c41c0, 0xd0020827, // shr r0, r0, shift
+0x147e7180, 0x10020827, // and r0, r0, mask
+0x119c43c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x149da1c0, 0x10020867, // and r1, r0, mask
+0x0e9c81c0, 0xd0020827, // shr r0, r0, shift
+0x149da1c0, 0x10020827, // and r0, r0, mask
+0x119c83c0, 0xd0020867, // shl r1, r1, shift
+0x159e7040, 0x10020827, // or  r0, r0, r1
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x94981dc0, 0xd00269e2, // and.setf -, elem_num, 1; mov r2, r0
+0x959f1489, 0xd004c820, // mov.ifz  r0, r2; mov.ifnz r0, r1 >> 1
+0x959ff252, 0xd0068861, // mov.ifnz r1, r1; mov.ifz  r1, r2 << 1
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffb50, 0xf0f80227, // brr ra_link_1, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x0e1cddc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffb28, 0xf00809e7, // brr.allz -, r:pass_1
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dfdc0, 0x100201e7, // add ra_points, ra_points, rb_0x100
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x9530cdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffc98, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0xfffffc78, 0xf0f80227, // brr ra_link_1, r:pass_2
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x95514dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x20367016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209cd017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209cd01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x2136709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02527c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d4ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cddc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffffbb0, 0xf00809e7, // brr.allz -, r:pass_2
+0x00000020, 0xe0020827, // mov r0, 4*8
+0x0d227c00, 0x10020227, // sub ra_link_1, ra_link_1, r0
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0x950c3dbf, 0x100250c3, // mov rb_addr_y, ra_addr_x; mov ra_addr_x, rb_addr_y
+0x953cfdbf, 0x100248a3, // mov r2, ra_tw_re+tw16; mov r3, rb_tw_im+tw16
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x159c5fc0, 0x10020827, // mov r0, rb_inst
+0x119c41c0, 0xd0020827, // shl r0, r0, m
+0x0c9a7180, 0x10020167, // add ra_load_idx, r0, elem_num
+0x00000000, 0xe00201e7, // mov ra_points, 0
+0x159c3fc0, 0x10020067, // mov ra_save_ptr, rb_addr_y
+0x8c15ddf6, 0x10024160, // add ra_load_idx, ra_load_idx, stride; mov r0, ra_load_idx
+0x119c31c0, 0xd0020827, // shl r0, r0, 3
+0x0c9c41c0, 0xd0020867, // add r1, r0, 4
+0x0c0e7c00, 0x10020e27, // add t0s, ra_addr_x, r0
+0x0c0e7c40, 0x10020e27, // add t0s, ra_addr_x, r1
+0xfffffab0, 0xf0f80227, // brr ra_link_1, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x95514dbf, 0x100248a3, // mov r2, ra_tw_re+TW16_ACTIVE+3; mov r3, rb_tw_im+TW16_ACTIVE+3
+0x203a7016, 0x100049e0, // nop;             fmul r0, r2, ra_tw_re+tw16
+0x209ce017, 0x100049e1, // nop;             fmul r1, r2, rb_tw_im+tw16
+0x209ce01f, 0x100049e2, // nop;             fmul r2, r3, rb_tw_im+tw16
+0x213a709e, 0x100248a3, // fadd r2, r0, r2; fmul r3, r3, ra_tw_re+tw16
+0x029e7640, 0x100208e7, // fsub r3, r3, r1
+0x02527c80, 0x100208a7, // fsub r2, ra_tw_re+TW16_ACTIVE+3, r2
+0x029d4ec0, 0x100208e7, // fsub r3, rb_tw_im+TW16_ACTIVE+3, r3
+0x14988dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f8492, 0xd002c522, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f86db, 0xd002d523, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14984dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f4492, 0xd002c4e2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f46db, 0xd002d4e3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14982dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f2492, 0xd002c4a2, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f26db, 0xd002d4a3, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x14981dc0, 0xd00229e7, // and.setf -, elem_num, (8>>i)
+0x959f1492, 0xd002c462, // mov ra_tw_re+TW16_ACTIVE+3-i, r2; mov.ifnz r2, r2 >> (8>>i)
+0x959f16db, 0xd002d463, // mov rb_tw_im+TW16_ACTIVE+3-i, r3; mov.ifnz r3, r3 >> (8>>i)
+0x0e1cddc0, 0xd00229e7, // shr.setf -, ra_points, STAGES
+0xfffff9e8, 0xf00809e7, // brr.allz -, r:pass_3
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c1dddc0, 0x100201e7, // add ra_points, ra_points, rb_0x80
+0x00000000, 0xf0f4c227, // bra ra_link_1, ra_sync
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0xa00009e7, // ldtmu0
+0x009e7000, 0xa00009e7, // ldtmu0
+0xfffffa40, 0xf0f809e7, // brr -, r:loop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x159c3fc0, 0x100209a7, // mov interrupt, flag
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 126 - 0
src/hello_fft/hex/shader_trans.hex

@@ -0,0 +1,126 @@
+0x15827d80, 0x10020e27, // mov t0s, unif
+0x009e7000, 0xa00009e7, // ldtmu0
+0x0c9cc9c0, 0xd0020e27, // add t0s, r4, 3*4
+0x009e7000, 0xa00009e7, // ldtmu0
+0x0c827980, 0x100200a7, // add ra_src_base, r4, unif
+0x15827d80, 0x10020e27, // mov t0s, unif
+0x009e7000, 0xa00009e7, // ldtmu0
+0x0c9cc9c0, 0xd0020e27, // add t0s, r4, 3*4
+0x009e7000, 0xa00009e7, // ldtmu0
+0x0c827980, 0x100200e7, // add ra_dst_base, r4, unif
+0x15827d80, 0x100214a7, // mov rb_Y_STRIDE_SRC, unif
+0x15827d80, 0x100214e7, // mov rb_Y_STRIDE_DST, unif
+0x15827d80, 0x10021527, // mov rb_NX,           unif
+0x15827d80, 0x10021567, // mov rb_NY,           unif
+0x00000008, 0xe0021467, // mov rb_X_STRIDE, 2*4
+0x00000010, 0xe0021427, // mov rb_0x10, 0x10
+0xc0000000, 0xe0020827, // mov r0, vdw_setup_1(0)
+0x0c9d31c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_DST
+0x00000040, 0xe0020867, // mov r1, 16*4
+0x0d9e7040, 0x100201a7, // sub ra_vdw_stride, r0, r1
+0x40991037, 0x100049e0, // nop; mul24 r0, elem_num, rb_X_STRIDE
+0x159e7000, 0x10021027, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd0021227, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x159e7000, 0x10021067, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd0021267, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x159e7000, 0x100210a7, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd00212a7, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x159e7000, 0x100210e7, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd00212e7, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x159e7000, 0x10021127, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd0021327, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x159e7000, 0x10021167, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd0021367, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x159e7000, 0x100211a7, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd00213a7, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x159e7000, 0x100211e7, // mov rb_offsets_re+i, r0
+0x0c9c41c0, 0xd00213e7, // add rb_offsets_im+i, r0, 4
+0x0c9d21c0, 0x10020827, // add r0, r0, rb_Y_STRIDE_SRC
+0x00000000, 0xe0020067, // mov ra_y, 0
+0x00000000, 0xe0020027, // mov ra_x, 0
+0x40052037, 0x100049e1, // nop; mul24 r1, ra_y, rb_Y_STRIDE_SRC
+0x40011037, 0x100049e0, // nop; mul24 r0, ra_x, rb_X_STRIDE
+0x0c9e7040, 0x10020827, // add r0, r0, r1
+0x0c0a7c00, 0x10020127, // add ra_src_cell, ra_src_base, r0
+0x40013037, 0x100049e1, // nop; mul24 r1, ra_x, rb_Y_STRIDE_DST
+0x40051037, 0x100049e0, // nop; mul24 r0, ra_y, rb_X_STRIDE
+0x0c9e7040, 0x10020827, // add r0, r0, r1
+0x0c0e7c00, 0x10020167, // add ra_dst_cell, ra_dst_base, r0
+0x00001200, 0xe0021c67, // mov vw_setup, vpm_setup(16, 1, v32(0,0))
+0x0c100dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re
+0x0c108dc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im
+0x0c101dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re+1+i
+0x0c109dc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im+1+i
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x0c102dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re+1+i
+0x0c10adc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im+1+i
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x0c103dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re+1+i
+0x0c10bdc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im+1+i
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x0c104dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re+1+i
+0x0c10cdc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im+1+i
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x0c105dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re+1+i
+0x0c10ddc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im+1+i
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x0c106dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re+1+i
+0x0c10edc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im+1+i
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x0c107dc0, 0x10020e27, // add t0s, ra_src_cell, rb_offsets_re+1+i
+0x0c10fdc0, 0x10020f27, // add t1s, ra_src_cell, rb_offsets_im+1+i
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xa00009e7, // ldtmu0
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x009e7000, 0xb00009e7, // ldtmu1
+0x159e7900, 0x10020c27, // mov vpm, r4
+0x88104000, 0xe0021c67, // mov vw_setup, vdw_setup_0(16, 16, dma_h32(0,0))
+0x151a7d80, 0x10021c67, // mov vw_setup, ra_vdw_stride
+0x15167d80, 0x10021ca7, // mov vw_addr, ra_dst_cell
+0x159f2fc0, 0x100009e7, // mov -, vw_wait
+0x0c010dc0, 0x10020027, // add ra_x, ra_x, rb_0x10
+0x009e7000, 0x100009e7, // nop
+0x0d014dc0, 0x100229e7, // sub.setf -, ra_x, rb_NX
+0xfffffde0, 0xf01809e7, // brr.allnz -, r:inner
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x0c048dc0, 0xd0020067, // add ra_y, ra_y, 8
+0x009e7000, 0x100009e7, // nop
+0x0d055dc0, 0x100229e7, // sub.setf -, ra_y, rb_NY
+0xfffffda0, 0xf01809e7, // brr.allnz -, r:outer
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop
+0x00000001, 0xe00209a7, // mov interrupt, 1
+0x009e7000, 0x300009e7, // nop; nop; thrend
+0x009e7000, 0x100009e7, // nop
+0x009e7000, 0x100009e7, // nop

+ 248 - 0
src/hello_fft/mailbox.c

@@ -0,0 +1,248 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <rtl_airband.h>
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <cerrno>
+
+#include "mailbox.h"
+
+#define PAGE_SIZE (4 * 1024)
+
+/*
+ * Map `size` bytes of physical memory starting at `base` through /dev/mem.
+ *
+ * `base` does not have to be page-aligned: the mapping starts at the
+ * enclosing PAGE_SIZE boundary and is extended by the same amount, and the
+ * returned pointer is advanced so that it refers to `base` itself.
+ *
+ * Returns a pointer to the first requested byte. Any failure is logged and
+ * terminates the process via exit(-1). Release with unmapmem(), passing the
+ * same `size`.
+ */
+void* mapmem(unsigned base, unsigned size) {
+    int mem_fd;
+    unsigned offset = base % PAGE_SIZE;
+    /* Round the start down to a page boundary and grow the length by the
+       same amount; otherwise the last `offset` bytes of the request could
+       fall outside the mapping. */
+    base = base - offset;
+    size = size + offset;
+    /* open /dev/mem */
+    if ((mem_fd = open("/dev/mem", O_RDWR | O_SYNC)) < 0) {
+        log(LOG_CRIT, "mapmem(): can't open /dev/mem: %s\n", strerror(errno));
+        exit(-1);
+    }
+    void* mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED /*|MAP_FIXED*/, mem_fd, base);
+#ifdef GPU_FFT_DEBUG
+    printf("base=0x%x, mem=%p\n", base, mem);
+#endif
+    if (mem == MAP_FAILED) {
+        log(LOG_CRIT, "mapmem(): mmap error: %s\n", strerror(errno));
+        exit(-1);
+    }
+    /* The descriptor is no longer needed once the mapping exists. */
+    close(mem_fd);
+    return (char*)mem + offset;
+}
+
+/*
+ * Undo a mapping created by mapmem().
+ *
+ * `addr` is the pointer mapmem() returned and `size` the size that was
+ * passed to it. mapmem() may have advanced the pointer past the page
+ * boundary, so the same adjustment is reversed here: munmap() rejects an
+ * address that is not page-aligned.
+ *
+ * A munmap() failure is logged and terminates the process via exit(-1).
+ */
+void unmapmem(void* addr, unsigned size) {
+    unsigned offset = (unsigned)((uintptr_t)addr % PAGE_SIZE);
+    int s = munmap((char*)addr - offset, size + offset);
+    if (s != 0) {
+        log(LOG_CRIT, "unmapmem(): munmap error: %s\n", strerror(errno));
+        exit(-1);
+    }
+}
+
+/*
+ * Use ioctl to send an mbox property message.
+ *
+ * file_desc: descriptor returned by mbox_open().
+ * buf:       message buffer of unsigned words; word 0 holds the total size
+ *            in bytes. The same buffer is read back by the callers after
+ *            the call.
+ *
+ * Returns the ioctl() result. A negative result is logged but not treated
+ * as fatal; callers decide what to do with it.
+ */
+static int mbox_property(int file_desc, void* buf) {
+    int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
+    if (ret_val < 0) {
+        log(LOG_ERR, "mbox_property(): ioctl_set_msg failed: %s\n", strerror(errno));
+    }
+
+#ifdef GPU_FFT_DEBUG
+    /* Dump the buffer word by word. The cast is explicit because an implicit
+       void* conversion is not accepted when this file is built as C++. */
+    unsigned* p = (unsigned*)buf;
+    unsigned words = p[0] / sizeof *p;
+    unsigned i;
+    for (i = 0; i < words; i++)
+        printf("%04zx: 0x%08x\n", i * sizeof *p, p[i]); /* %zx: the offset is a size_t */
+#endif
+    return ret_val;
+}
+
+/*
+ * Send a property message with tag 0x3000c carrying `size`, `align` and
+ * `flags`.
+ *
+ * Returns word 5 of the message buffer after the call (callers use it as
+ * the handle for mem_lock()/mem_free()), or 0 when the ioctl itself failed.
+ */
+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags) {
+    int i = 0;
+    unsigned p[32];
+    p[i++] = 0;           // size
+    p[i++] = 0x00000000;  // process request
+
+    p[i++] = 0x3000c;  // (the tag id)
+    p[i++] = 12;       // (size of the buffer)
+    p[i++] = 12;       // (size of the data)
+    p[i++] = size;     // (num bytes? or pages?)
+    p[i++] = align;    // (alignment)
+    p[i++] = flags;    // (MEM_FLAG_L1_NONALLOCATING)
+
+    p[i++] = 0x00000000;   // end tag
+    p[0] = i * sizeof *p;  // actual size
+
+    // If the ioctl failed, p[5] may still hold the request value (`size`);
+    // returning it would look like a valid non-zero handle.
+    if (mbox_property(file_desc, p) < 0) {
+        return 0;
+    }
+    return p[5];
+}
+
+/*
+ * Send a property message with tag 0x3000f for `handle` and return word 5
+ * of the message buffer as it stands after the call.
+ */
+unsigned mem_free(int file_desc, unsigned handle) {
+    unsigned msg[32] = {
+        0,           // total size in bytes, filled in below
+        0x00000000,  // process request
+        0x3000f,     // (the tag id)
+        4,           // (size of the buffer)
+        4,           // (size of the data)
+        handle,
+        0x00000000,  // end tag
+    };
+    msg[0] = 7 * sizeof *msg;  // seven words are in use
+
+    mbox_property(file_desc, msg);
+    return msg[5];
+}
+
+/*
+ * Send a property message with tag 0x3000d for `handle` and return word 5
+ * of the message buffer as it stands after the call.
+ */
+unsigned mem_lock(int file_desc, unsigned handle) {
+    unsigned msg[32] = {
+        0,           // total size in bytes, filled in below
+        0x00000000,  // process request
+        0x3000d,     // (the tag id)
+        4,           // (size of the buffer)
+        4,           // (size of the data)
+        handle,
+        0x00000000,  // end tag
+    };
+    msg[0] = 7 * sizeof *msg;  // seven words are in use
+
+    mbox_property(file_desc, msg);
+    return msg[5];
+}
+
+/*
+ * Send a property message with tag 0x3000e for `handle` and return word 5
+ * of the message buffer as it stands after the call.
+ */
+unsigned mem_unlock(int file_desc, unsigned handle) {
+    unsigned msg[32] = {
+        0,           // total size in bytes, filled in below
+        0x00000000,  // process request
+        0x3000e,     // (the tag id)
+        4,           // (size of the buffer)
+        4,           // (size of the data)
+        handle,
+        0x00000000,  // end tag
+    };
+    msg[0] = 7 * sizeof *msg;  // seven words are in use
+
+    mbox_property(file_desc, msg);
+    return msg[5];
+}
+
+/*
+ * Send a property message with tag 0x30010 carrying `code` followed by the
+ * six values r0..r5, and return word 5 of the message buffer as it stands
+ * after the call.
+ */
+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5) {
+    unsigned msg[32] = {
+        0,           // total size in bytes, filled in below
+        0x00000000,  // process request
+        0x30010,     // (the tag id)
+        28,          // (size of the buffer)
+        28,          // (size of the data)
+        code,
+        r0,
+        r1,
+        r2,
+        r3,
+        r4,
+        r5,
+        0x00000000,  // end tag
+    };
+    msg[0] = 13 * sizeof *msg;  // thirteen words are in use
+
+    mbox_property(file_desc, msg);
+    return msg[5];
+}
+
+/*
+ * Send a property message with tag 0x30012 carrying `enable` and return
+ * word 5 of the message buffer as it stands after the call.
+ */
+unsigned qpu_enable(int file_desc, unsigned enable) {
+    unsigned msg[32] = {
+        0,           // total size in bytes, filled in below
+        0x00000000,  // process request
+        0x30012,     // (the tag id)
+        4,           // (size of the buffer)
+        4,           // (size of the data)
+        enable,
+        0x00000000,  // end tag
+    };
+    msg[0] = 7 * sizeof *msg;  // seven words are in use
+
+    mbox_property(file_desc, msg);
+    return msg[5];
+}
+
+/*
+ * Send a property message with tag 0x30011 carrying `num_qpus`, `control`,
+ * `noflush` and `timeout`, and return word 5 of the message buffer as it
+ * stands after the call.
+ */
+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+    unsigned msg[32] = {
+        0,           // total size in bytes, filled in below
+        0x00000000,  // process request
+        0x30011,     // (the tag id)
+        16,          // (size of the buffer)
+        16,          // (size of the data)
+        num_qpus,
+        control,
+        noflush,
+        timeout,     // ms
+        0x00000000,  // end tag
+    };
+    msg[0] = 10 * sizeof *msg;  // ten words are in use
+
+    mbox_property(file_desc, msg);
+    return msg[5];
+}
+
+/*
+ * Open DEVICE_FILE_NAME, the char device used for communicating with the
+ * kernel mbox driver, and return its descriptor. A failed open is logged
+ * and terminates the process via exit(-1).
+ */
+int mbox_open() {
+    const int fd = open(DEVICE_FILE_NAME, 0);
+    if (fd >= 0) {
+        return fd;
+    }
+    log(LOG_CRIT, "Can't open device file %s: %s\n", DEVICE_FILE_NAME, strerror(errno));
+    exit(-1);
+}
+
+/* Close a descriptor obtained from mbox_open(); the close() result is not inspected. */
+void mbox_close(int file_desc) {
+    (void)close(file_desc);
+}

+ 47 - 0
src/hello_fft/mailbox.h

@@ -0,0 +1,47 @@
+/*
+Copyright (c) 2012, Broadcom Europe Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _MAILBOX_H
+#define _MAILBOX_H
+
+#include <linux/ioctl.h>
+
+/* ioctl request used to exchange property messages with the mailbox device. */
+#define MAJOR_NUM 100
+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char*)
+/* Character device opened by mbox_open(). */
+#define DEVICE_FILE_NAME "/dev/vcio"
+
+/* Open DEVICE_FILE_NAME and return its descriptor; exits the process on failure. */
+int mbox_open();
+/* Close a descriptor returned by mbox_open(). */
+void mbox_close(int file_desc);
+
+/* Memory management and mapping helpers, implemented in mailbox.c. */
+unsigned get_version(int file_desc);
+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+unsigned mem_free(int file_desc, unsigned handle);
+unsigned mem_lock(int file_desc, unsigned handle);
+unsigned mem_unlock(int file_desc, unsigned handle);
+void* mapmem(unsigned base, unsigned size);
+void unmapmem(void* addr, unsigned size);
+
+/*
+ * Property-message wrappers. Each one sends a single tag through the mailbox
+ * (0x30010, 0x30011 and 0x30012 respectively) and returns word 5 of the
+ * message buffer after the call.
+ */
+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+unsigned qpu_enable(int file_desc, unsigned enable);
+
+#endif /* _MAILBOX_H */

+ 86 - 0
src/helper_functions.cpp

@@ -0,0 +1,86 @@
+/*
+ * helper_functions.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/stat.h>  // struct stat, S_ISDIR
+#include <cstddef>     // size_t
+#include <cstring>     // strerror
+
+#include "helper_functions.h"
+#include "logging.h"
+
+using namespace std;
+
+// Returns true when stat() succeeds on dir_path and reports a directory;
+// any stat() failure (missing path, no permission, ...) yields false.
+bool dir_exists(const string& dir_path) {
+    struct stat info;
+    if (stat(dir_path.c_str(), &info) != 0) {
+        return false;
+    }
+    return S_ISDIR(info.st_mode);
+}
+
+// Returns true when stat() succeeds on file_path and reports a regular file;
+// directories, other file types and stat() failures all yield false.
+bool file_exists(const string& file_path) {
+    struct stat info;
+    if (stat(file_path.c_str(), &info) != 0) {
+        return false;
+    }
+    return S_ISREG(info.st_mode);
+}
+
+// Make sure dir_path exists as a directory, creating it with mode 0755 when
+// it does not. Only a single mkdir() is issued, so missing parent
+// directories are not created here.
+//
+// Returns true if the directory already existed or was created (by us or by
+// someone else in the meantime); otherwise logs the reason and returns false.
+bool make_dir(const string& dir_path) {
+    if (dir_exists(dir_path)) {
+        return true;
+    }
+
+    if (mkdir(dir_path.c_str(), 0755) != 0) {
+        // Save errno before any further call has a chance to overwrite it.
+        const int mkdir_errno = errno;
+
+        // The directory may have been created by another thread or process
+        // between the dir_exists() check above and mkdir(); that is success.
+        if (mkdir_errno == EEXIST && dir_exists(dir_path)) {
+            return true;
+        }
+
+        log(LOG_ERR, "Could not create directory %s: %s\n", dir_path.c_str(), strerror(mkdir_errno));
+        return false;
+    }
+    return true;
+}
+
+// Create basedir/subdirs one path component at a time.
+//
+// basedir itself is created first if it is missing, then every prefix of
+// subdirs that ends just before a '/', and finally the complete path.
+// Returns true when the complete path exists as a directory afterwards and
+// false as soon as any intermediate directory cannot be created.
+bool make_subdirs(const string& basedir, const string& subdirs) {
+    const string sep = "/";
+    const string target = basedir + sep + subdirs;
+
+    // nothing to do when the full path is already there
+    if (dir_exists(target)) {
+        return true;
+    }
+
+    // cut == 0 yields "basedir/" so that basedir gets created in case it does
+    // not exist; every later cut sits on the next slash inside subdirs
+    for (size_t cut = 0; cut != string::npos; cut = subdirs.find_first_of(sep, cut + 1)) {
+        const string partial = basedir + sep + subdirs.substr(0, cut);
+        if (!make_dir(partial)) {
+            return false;
+        }
+    }
+
+    // last component; the outcome is judged by the existence check below
+    make_dir(target);
+    return dir_exists(target);
+}
+
+// Create the dated directory basedir/YYYY/MM/DD for the given broken-down
+// time and return its full path. Returns an empty string if the date cannot
+// be formatted or any directory cannot be created.
+string make_dated_subdirs(const string& basedir, const struct tm* time) {
+    // use the time to build the date subdirectories
+    char date_path[11];  // "YYYY/MM/DD" plus the terminating NUL
+
+    // strftime() returns 0 when the result does not fit (e.g. a year with
+    // more than four digits); the buffer contents are then unspecified and
+    // must not be used.
+    if (strftime(date_path, sizeof(date_path), "%Y/%m/%d", time) == 0) {
+        return "";
+    }
+    const string date_path_str = string(date_path);
+
+    // make all the subdirectories, and return the full path if successful
+    if (make_subdirs(basedir, date_path_str)) {
+        return basedir + "/" + date_path_str;
+    }
+
+    // on any error return empty string
+    return "";
+}

+ 32 - 0
src/helper_functions.h

@@ -0,0 +1,32 @@
+/*
+ * helper_functions.h
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HELPER_FUNCTIONS_H
+#define _HELPER_FUNCTIONS_H
+
+#include <ctime>  // struct tm
+#include <string>
+
+// True when dir_path can be stat()ed and is a directory.
+bool dir_exists(const std::string& dir_path);
+// True when file_path can be stat()ed and is a regular file.
+bool file_exists(const std::string& file_path);
+// Create dir_path (mode 0755) unless it already exists; false on failure.
+bool make_dir(const std::string& dir_path);
+// Create basedir and then each component of subdirs below it; true when the
+// complete path exists as a directory afterwards.
+bool make_subdirs(const std::string& basedir, const std::string& subdirs);
+// Create basedir/YYYY/MM/DD for the given time and return that path, or an
+// empty string on error.
+std::string make_dated_subdirs(const std::string& basedir, const struct tm* time);
+
+#endif /* _HELPER_FUNCTIONS_H */

+ 131 - 0
src/input-common.cpp

@@ -0,0 +1,131 @@
+/*
+ * input-common.cpp
+ * common input handling routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "input-common.h"
+#include <assert.h>
+#include <dlfcn.h>  // dlopen, dlsym
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>   // asprintf
+#include <stdlib.h>  // free
+#include <string.h>
+#include <iostream>
+
+using namespace std;
+
+typedef input_t* (*input_new_func_t)(void);
+
+/*
+ * Instantiate an input driver by name.
+ *
+ * The constructor symbol "<type>_input_new" is looked up in the running
+ * program itself (dlopen(NULL)) and called to obtain a fresh input_t.
+ * Returns NULL when the symbol name cannot be built or no such symbol is
+ * exported. The returned driver is required (by assertion) to provide
+ * init, run_rx_thread and set_centerfreq callbacks.
+ */
+input_t* input_new(char const* const type) {
+    assert(type != NULL);
+    void* dlhandle = dlopen(NULL, RTLD_NOW);
+    assert(dlhandle != NULL);
+    char* fname = NULL;
+    int chars_written = asprintf(&fname, "%s_input_new", type);
+    if (chars_written <= 0) {
+        dlclose(dlhandle);
+        return NULL;
+    }
+    input_new_func_t fptr = (input_new_func_t)dlsym(dlhandle, fname);
+    free(fname);
+    // The handle refers to the main program, which stays loaded, so the
+    // function pointer remains valid after the handle is released.
+    dlclose(dlhandle);
+    if (fptr == NULL) {
+        return NULL;
+    }
+    input_t* input = (*fptr)();
+    // Fail with a clear assertion instead of dereferencing NULL below.
+    assert(input != NULL);
+    assert(input->init != NULL);
+    assert(input->run_rx_thread != NULL);
+    assert(input->set_centerfreq != NULL);
+    return input;
+}
+
+/*
+ * Run the driver's init callback and set up the buffer mutex.
+ *
+ * Returns 0 and moves the input to INPUT_INITIALIZED on success. On any
+ * failure the input is left in INPUT_FAILED and -1 is returned; when the
+ * mutex cannot be initialized errno carries the pthread error code.
+ */
+int input_init(input_t* const input) {
+    assert(input != NULL);
+    errno = 0;
+
+    if (input->init(input) < 0) {
+        input->state = INPUT_FAILED;
+        return -1;
+    }
+
+    const int mutex_err = pthread_mutex_init(&input->buffer_lock, NULL);
+    if (mutex_err != 0) {
+        errno = mutex_err;
+        input->state = INPUT_FAILED;
+        return -1;
+    }
+
+    input->state = INPUT_INITIALIZED;
+    return 0;
+}
+
+/*
+ * Launch the driver's receive thread for an input that has been
+ * initialized. Returns 0 on success; if the thread cannot be created,
+ * errno is set to the pthread error code and -1 is returned.
+ */
+int input_start(input_t* const input) {
+    assert(input != NULL);
+    assert(input->dev_data != NULL);
+    assert(input->state == INPUT_INITIALIZED);
+
+    const int rc = pthread_create(&input->rx_thread, NULL, input->run_rx_thread, (void*)input);
+    if (rc == 0) {
+        return 0;
+    }
+    errno = rc;
+    return -1;
+}
+
+/*
+ * Forward the configuration section to the driver's parse_config callback
+ * and return its result. Drivers without that callback are accepted and
+ * treated as successfully configured (0).
+ */
+int input_parse_config(input_t* const input, libconfig::Setting& cfg) {
+    assert(input != NULL);
+
+    // Very simple inputs (like stdin) might not necessarily have any
+    // configuration variables, so a missing parse_config is legal.
+    if (input->parse_config == NULL) {
+        return 0;
+    }
+    return input->parse_config(input, cfg);
+}
+
+/*
+ * Stop an input and wait for its receive thread.
+ *
+ * The driver's stop callback is invoked only when the input is currently
+ * running and the driver provides one; if it reports an error the input is
+ * marked INPUT_FAILED and -1 is returned without joining the thread.
+ * Otherwise the input is marked INPUT_STOPPED and the receive thread is
+ * joined; a join failure sets errno to the pthread error and returns -1.
+ */
+int input_stop(input_t* const input) {
+    assert(input != NULL);
+    assert(input->dev_data != NULL);
+    errno = 0;
+
+    const bool must_call_stop = (input->state == INPUT_RUNNING) && (input->stop != NULL);
+    if (must_call_stop && input->stop(input) != 0) {
+        input->state = INPUT_FAILED;
+        return -1;
+    }
+
+    input->state = INPUT_STOPPED;
+
+    const int join_err = pthread_join(input->rx_thread, NULL);
+    if (join_err != 0) {
+        errno = join_err;
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Retune a running input through its driver callback.
+ *
+ * Returns -1 without touching the device when the input is not in
+ * INPUT_RUNNING. If the driver reports an error the input is marked
+ * INPUT_FAILED and -1 is returned. On success the new frequency is stored
+ * in input->centerfreq and 0 is returned.
+ */
+int input_set_centerfreq(input_t* const input, int const centerfreq) {
+    assert(input != NULL);
+    assert(input->dev_data != NULL);
+
+    if (input->state != INPUT_RUNNING) {
+        return -1;
+    }
+
+    if (input->set_centerfreq(input, centerfreq) != 0) {
+        input->state = INPUT_FAILED;
+        return -1;
+    }
+
+    input->centerfreq = centerfreq;
+    return 0;
+}

+ 66 - 0
src/input-common.h

@@ -0,0 +1,66 @@
+/*
+ * input-common.h
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _INPUT_COMMON_H
+#define _INPUT_COMMON_H 1
+#include <pthread.h>
+#include <libconfig.h++>
+
+/*
+ * MODULE_EXPORT marks a driver constructor ("<type>_input_new") with C
+ * linkage so that its unmangled name can be found with dlsym(). With GCC 4
+ * or later it additionally forces default symbol visibility.
+ */
+#if __GNUC__ >= 4
+#define MODULE_EXPORT extern "C" __attribute__((visibility("default")))
+#else
+#define MODULE_EXPORT extern "C"
+#endif /* __GNUC__ */
+
+/* Sample formats an input driver can deliver; SAMPLE_FORMAT_CNT is the number of enumerators. */
+typedef enum { SFMT_UNDEF = 0, SFMT_U8, SFMT_S8, SFMT_S16, SFMT_F32 } sample_format_t;
+#define SAMPLE_FORMAT_CNT 5
+
+/* Lifecycle states of an input; INPUT_STATE_CNT is the number of enumerators. */
+typedef enum { INPUT_UNKNOWN = 0, INPUT_INITIALIZED, INPUT_RUNNING, INPUT_FAILED, INPUT_STOPPED, INPUT_DISABLED } input_state_t;
+#define INPUT_STATE_CNT 6
+
+typedef struct input_t input_t;
+
+/*
+ * State shared between the generic input layer and one input driver.
+ * Drivers allocate it in their "<type>_input_new" constructor and fill in
+ * the callbacks; the input_*() functions operate on it afterwards.
+ */
+struct input_t {
+    // circular sample buffer that circbuffer_append() writes into
+    unsigned char* buffer;
+    // driver-private data (e.g. file_dev_data_t, mirisdr_dev_data_t)
+    void* dev_data;
+    // buf_size: wrap-around length of buffer; bufe: offset where the next
+    // write lands; bufs: presumably the consumer's read offset - TODO confirm
+    size_t buf_size, bufs, bufe;
+    // incremented by circbuffer_append() when the write offset passes bufs
+    size_t overflow_count;
+    // current lifecycle state, maintained by input_*() and the rx thread
+    input_state_t state;
+    // format of the samples stored in buffer
+    sample_format_t sfmt;
+    // NOTE(review): looks like the full-scale sample magnitude used for
+    // normalization - confirm against the consumer
+    float fullscale;
+    int bytes_per_sample;
+    int sample_rate;
+    // last center frequency accepted by input_set_centerfreq()
+    int centerfreq;
+    // optional; may be NULL for drivers without configuration
+    int (*parse_config)(input_t* const input, libconfig::Setting& cfg);
+    // mandatory; a negative return value means failure
+    int (*init)(input_t* const input);
+    void* (*run_rx_thread)(void* input_ptr);  // to be launched via pthread_create()
+    // mandatory; non-zero return value means failure
+    int (*set_centerfreq)(input_t* const input, int const centerfreq);
+    // optional; non-zero return value means failure
+    int (*stop)(input_t* const input);
+    // receive thread created by input_start() and joined by input_stop()
+    pthread_t rx_thread;
+    // initialized by input_init(); held while bufs/bufe are read or updated
+    pthread_mutex_t buffer_lock;
+};
+
+// Look up "<type>_input_new" in the running program and call it; NULL if not found.
+input_t* input_new(char const* const type);
+// Run the driver's init callback and initialize buffer_lock; 0 on success, -1 on failure.
+int input_init(input_t* const input);
+// Pass cfg to the driver's parse_config callback, if it has one.
+int input_parse_config(input_t* const input, libconfig::Setting& cfg);
+// Start the driver's receive thread; 0 on success, -1 on failure (errno set).
+int input_start(input_t* const input);
+// Retune a running input; 0 on success, -1 if not running or on driver error.
+int input_set_centerfreq(input_t* const input, int const centerfreq);
+// Call the driver's stop callback (if running) and join the receive thread.
+int input_stop(input_t* const input);
+
+#endif /* _INPUT_COMMON_H */

+ 181 - 0
src/input-file.cpp

@@ -0,0 +1,181 @@
+/*
+ * input-file.cpp
+ * binary file specific routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "input-file.h"  // file_dev_data_t
+#include <assert.h>
+#include <limits.h>  // SCHAR_MAX
+#include <stdio.h>
+#include <string.h>
+#include <syslog.h>         // FIXME: get rid of this
+#include <unistd.h>         // usleep
+#include <libconfig.h++>    // Setting
+#include "input-common.h"   // input_t, sample_format_t, input_state_t, MODULE_EXPORT
+#include "input-helpers.h"  // circbuffer_append
+#include "rtl_airband.h"    // do_exit, fft_size, debug_print, XCALLOC, error()
+
+using namespace std;
+
+/*
+ * Read the file input settings from cfg into the driver's file_dev_data_t.
+ *
+ *   filepath        mandatory; path of the file to read (copied with strdup)
+ *   speedup_factor  optional int or float, must be strictly positive;
+ *                   defaults to 4 when absent
+ *
+ * Configuration problems are reported on stderr followed by error().
+ * Returns 0.
+ */
+int file_parse_config(input_t* const input, libconfig::Setting& cfg) {
+    assert(input != NULL);
+    file_dev_data_t* dev_data = (file_dev_data_t*)input->dev_data;
+    assert(dev_data != NULL);
+
+    if (cfg.exists("filepath")) {
+        dev_data->filepath = strdup(cfg["filepath"]);
+    } else {
+        cerr << "File configuration error: no 'filepath' given\n";
+        error();
+    }
+
+    if (cfg.exists("speedup_factor")) {
+        if (cfg["speedup_factor"].getType() == libconfig::Setting::TypeInt) {
+            dev_data->speedup_factor = (int)cfg["speedup_factor"];
+        } else if (cfg["speedup_factor"].getType() == libconfig::Setting::TypeFloat) {
+            dev_data->speedup_factor = (float)cfg["speedup_factor"];
+        } else {
+            cerr << "File configuration error: 'speedup_factor' must be a float or int if set\n";
+            error();
+        }
+        // Zero is rejected as well: the rx thread divides by this factor.
+        if (dev_data->speedup_factor <= 0.0) {
+            cerr << "File configuration error: 'speedup_factor' must be > 0.0\n";
+            error();
+        }
+    } else {
+        dev_data->speedup_factor = 4;
+    }
+
+    return 0;
+}
+
+/*
+ * Open the configured file for binary reading and store the stream in the
+ * driver data. A failure to open is reported on stderr followed by
+ * error(). Logs an informational message and returns 0.
+ */
+int file_init(input_t* const input) {
+    assert(input != NULL);
+    file_dev_data_t* const state = static_cast<file_dev_data_t*>(input->dev_data);
+    assert(state != NULL);
+
+    state->input_file = fopen(state->filepath, "rb");
+    if (state->input_file == NULL) {
+        cerr << "File input failed to open '" << state->filepath << "' - " << strerror(errno) << endl;
+        error();
+    }
+
+    log(LOG_INFO, "File input %s initialized\n", state->filepath);
+    return 0;
+}
+
+/*
+ * Receive thread for the file input.
+ *
+ * Reads the input file in chunks of (buf_size / 2) - 1 bytes and appends
+ * them to the input's circular buffer, sleeping between chunks so that the
+ * data is fed at sample_rate * speedup_factor. The loop ends when do_exit
+ * is set, or on end of file / read error, in which case the input is
+ * marked INPUT_FAILED. Always returns 0.
+ */
+void* file_rx_thread(void* ctx) {
+    input_t* input = (input_t*)ctx;
+    assert(input != NULL);
+    assert(input->sample_rate != 0);
+    file_dev_data_t* dev_data = (file_dev_data_t*)input->dev_data;
+    assert(dev_data != NULL);
+    assert(dev_data->input_file != NULL);
+    assert(dev_data->speedup_factor != 0.0);
+
+    size_t buf_len = (input->buf_size / 2) - 1;
+    unsigned char* buf = (unsigned char*)XCALLOC(1, buf_len);
+
+    // pacing: how long one byte of input represents at the sped-up rate
+    float time_per_byte_ms = 1000 / (input->sample_rate * input->bytes_per_sample * 2 * dev_data->speedup_factor);
+
+    log(LOG_DEBUG, "sample_rate: %d, bytes_per_sample: %d, speedup_factor: %f, time_per_byte_ms: %f\n", input->sample_rate, input->bytes_per_sample, dev_data->speedup_factor, time_per_byte_ms);
+
+    input->state = INPUT_RUNNING;
+
+    while (true) {
+        if (do_exit) {
+            break;
+        }
+        if (feof(dev_data->input_file)) {
+            // ftell() returns long, so it has to be printed with %ld
+            log(LOG_INFO, "File '%s': hit end of file at %ld, disabling\n", dev_data->filepath, ftell(dev_data->input_file));
+            input->state = INPUT_FAILED;
+            break;
+        }
+        if (ferror(dev_data->input_file)) {
+            log(LOG_ERR, "File '%s': read error (%d), disabling\n", dev_data->filepath, ferror(dev_data->input_file));
+            input->state = INPUT_FAILED;
+            break;
+        }
+
+        timeval start;
+        gettimeofday(&start, NULL);
+
+        // free space in the circular buffer, sampled under the buffer lock
+        size_t space_left;
+        pthread_mutex_lock(&input->buffer_lock);
+        if (input->bufe >= input->bufs) {
+            space_left = input->bufs + (input->buf_size - input->bufe);
+        } else {
+            space_left = input->bufs - input->bufe;
+        }
+        pthread_mutex_unlock(&input->buffer_lock);
+
+        if (space_left > buf_len) {
+            size_t len = fread(buf, sizeof(unsigned char), buf_len, dev_data->input_file);
+            circbuffer_append(input, buf, len);
+
+            timeval end;
+            gettimeofday(&end, NULL);
+
+            // sleep for whatever is left of the time this chunk represents
+            int time_taken_ms = delta_sec(&start, &end) * 1000;
+            int sleep_time_ms = len * time_per_byte_ms - time_taken_ms;
+
+            if (sleep_time_ms > 0) {
+                SLEEP(sleep_time_ms);
+            }
+        } else {
+            // not enough room for a whole chunk yet; retry shortly
+            SLEEP(10);
+        }
+    }
+
+    free(buf);
+    return 0;
+}
+
+// set_centerfreq callback of the file input: both arguments are ignored and
+// success (0) is always reported.
+int file_set_centerfreq(input_t* const /*input*/, int const /*centerfreq*/) {
+    return 0;
+}
+
+/*
+ * stop callback of the file input: closes the input stream and clears the
+ * stored pointer. Always returns 0.
+ */
+int file_stop(input_t* const input) {
+    assert(input != NULL);
+    file_dev_data_t* const state = static_cast<file_dev_data_t*>(input->dev_data);
+    assert(state != NULL);
+
+    fclose(state->input_file);
+    state->input_file = NULL;
+
+    return 0;
+}
+
+/*
+ * Constructor of the file input driver, located by input_new() under the
+ * name "file_input_new". Allocates the driver data and the input_t, fills
+ * in the defaults (unsigned 8-bit samples, sample rate not yet known) and
+ * wires up the driver callbacks.
+ */
+MODULE_EXPORT input_t* file_input_new() {
+    // driver-private state: no file opened, speedup factor not configured yet
+    file_dev_data_t* state = (file_dev_data_t*)XCALLOC(1, sizeof(file_dev_data_t));
+    state->input_file = NULL;
+    state->speedup_factor = 0.0;
+
+    input_t* result = (input_t*)XCALLOC(1, sizeof(input_t));
+
+    // callbacks
+    result->parse_config = &file_parse_config;
+    result->init = &file_init;
+    result->run_rx_thread = &file_rx_thread;
+    result->set_centerfreq = &file_set_centerfreq;
+    result->stop = &file_stop;
+
+    // sample description
+    result->sfmt = SFMT_U8;
+    result->bytes_per_sample = sizeof(unsigned char);
+    result->fullscale = (float)SCHAR_MAX - 0.5f;
+    result->sample_rate = 0;
+
+    result->state = INPUT_UNKNOWN;
+    result->dev_data = state;
+
+    return result;
+}

+ 29 - 0
src/input-file.h

@@ -0,0 +1,29 @@
+/*
+ *  input-file.h
+ *  File input specific declarations
+ *
+ *  Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _INPUT_FILE_H
+#define _INPUT_FILE_H
+
+#include <stdio.h>  // FILE
+#include <fstream>
+#include <iostream>
+#include <string>
+
+// Private state of the file input driver, stored in input_t::dev_data.
+typedef struct {
+    char* filepath;        // path of the input file, taken from the 'filepath' setting
+    FILE* input_file;      // stream opened by file_init(), NULL while closed
+    float speedup_factor;  // playback rate multiplier applied to sample_rate
+} file_dev_data_t;
+
+#endif /* _INPUT_FILE_H */

+ 63 - 0
src/input-helpers.cpp

@@ -0,0 +1,63 @@
+/*
+ * input-helpers.cpp
+ * Convenience functions to be called by input drivers
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <pthread.h>       // pthread_mutex_lock, unlock
+#include <string.h>        // memcpy
+#include <iostream>        // cerr
+#include "input-common.h"  // input_t
+#include "rtl_airband.h"   // debug_print
+
+/* Write input data into circular buffer input->buffer.
+ * In general, input->buf_size is not an exact multiple of len,
+ * so we have to take care about proper wrapping.
+ * input->buf_size is an exact multiple of FFT_BATCH * bps
+ * (input bytes per output audio sample) and input->buffer's real length
+ * is input->buf_size + 2 * input->bytes_per_sample * fft_size. On each
+ * wrap we copy up to 2 * input->bytes_per_sample * fft_size bytes from the
+ * start of input->buffer to its end, so that the signal windowing function
+ * could handle the whole FFT batch without wrapping.
+ */
+void circbuffer_append(input_t* const input, unsigned char* buf, size_t len) {
+    // nothing to append, so do not even take the lock
+    if (len == 0) {
+        return;
+    }
+
+    pthread_mutex_lock(&input->buffer_lock);
+
+    const size_t write_pos = input->bufe;
+    const size_t room_to_end = input->buf_size - write_pos;
+    // upper bound for the part of the buffer head mirrored past buf_size
+    const size_t mirror_max = 2 * input->bytes_per_sample * fft_size;
+
+    if (len <= room_to_end) {
+        // the whole chunk fits before the wrap point
+        memcpy(input->buffer + write_pos, buf, len);
+        if (write_pos == 0) {
+            // the buffer head has just been rewritten, so refresh its mirror
+            const size_t tail_len = std::min(len, mirror_max);
+            memcpy(input->buffer + input->buf_size, input->buffer, tail_len);
+            debug_print("tail_len=%zu bytes\n", tail_len);
+        }
+    } else {
+        // fill up to the wrap point, continue at the head, then refresh the mirror
+        const size_t wrap_len = len - room_to_end;
+        const size_t tail_len = std::min(wrap_len, mirror_max);
+        memcpy(input->buffer + write_pos, buf, room_to_end);
+        memcpy(input->buffer, buf + room_to_end, wrap_len);
+        memcpy(input->buffer + input->buf_size, input->buffer, tail_len);
+        debug_print("buf wrap: space_left=%zu len=%zu bufe=%zu wrap_len=%zu tail_len=%zu\n", room_to_end, len, write_pos, wrap_len, tail_len);
+    }
+
+    input->bufe = (write_pos + len) % input->buf_size;
+
+    // the write offset has moved from below bufs to bufs or beyond
+    if (write_pos < input->bufs && input->bufe >= input->bufs) {
+        std::cerr << "Warning: buffer overflow\n";
+        input->overflow_count++;
+    }
+
+    pthread_mutex_unlock(&input->buffer_lock);
+}

+ 24 - 0
src/input-helpers.h

@@ -0,0 +1,24 @@
+/*
+ * input-helpers.h
+ * Convenience functions to be called by input drivers
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "input-common.h"  // input_t
+
+// input-helpers.cpp
+// Append len bytes from buf to input's circular buffer under buffer_lock,
+// wrapping at buf_size and advancing bufe. A len of 0 is a no-op.
+void circbuffer_append(input_t* const input, unsigned char* buf, size_t len);

+ 239 - 0
src/input-mirisdr.cpp

@@ -0,0 +1,239 @@
+/*
+ *  input-mirisdr.cpp
+ *  MiriSDR-specific routines
+ *
+ *  Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "input-mirisdr.h"  // mirisdr_dev_data_t
+#include <assert.h>
+#include <limits.h>  // SCHAR_MAX
+#include <mirisdr.h>
+#include <stdint.h>  // uint32_t
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>  // FIXME: get rid of this
+#include <iostream>
+#include <libconfig.h++>    // Setting
+#include "input-common.h"   // input_t, sample_format_t, input_state_t, MODULE_EXPORT
+#include "input-helpers.h"  // circbuffer_append
+#include "rtl_airband.h"    // do_exit, fft_size, debug_print, XCALLOC, error()
+
+using namespace std;
+
+// Data callback passed to mirisdr_read_async(): appends each delivered
+// chunk to the input's circular buffer, unless shutdown has been requested.
+static void mirisdr_callback(unsigned char* buf, uint32_t len, void* ctx) {
+    if (!do_exit) {
+        input_t* const target = static_cast<input_t*>(ctx);
+        circbuffer_append(target, buf, (size_t)len);
+    }
+}
+
+/* based on librtlsdr-keenerd, (c) Kyle Keen */
+// Pick the supported tuner gain closest to target_gain and store it in
+// *nearest. When two gains are equally close, the one listed first wins.
+// Returns false when manual gain mode cannot be selected or the device
+// reports no gains; *nearest is left untouched in that case.
+static bool mirisdr_nearest_gain(mirisdr_dev_t* dev, int target_gain, int* nearest) {
+    assert(nearest != NULL);
+
+    if (mirisdr_set_tuner_gain_mode(dev, 1) < 0) {
+        return false;
+    }
+
+    // first call only asks for the number of gains
+    int count = mirisdr_get_tuner_gains(dev, NULL);
+    if (count <= 0) {
+        return false;
+    }
+
+    int* gains = (int*)XCALLOC(count, sizeof(int));
+    count = mirisdr_get_tuner_gains(dev, gains);
+
+    int best = gains[0];
+    for (int idx = 0; idx < count; idx++) {
+        const int best_err = abs(target_gain - best);
+        const int this_err = abs(target_gain - gains[idx]);
+        if (this_err < best_err) {
+            best = gains[idx];
+        }
+    }
+    *nearest = best;
+
+    free(gains);
+    return true;
+}
+
+// Return the index of the first MiriSDR device whose USB serial string
+// equals s, or -1 when there is no such device (or no device at all).
+static int mirisdr_find_device_by_serial(char const* const s) {
+    char vendor[256] = {0}, product[256] = {0}, serial[256] = {0};
+    const int num_devices = mirisdr_get_device_count();
+
+    // a device count below 1 simply leaves the loop body unexecuted
+    for (int idx = 0; idx < num_devices; idx++) {
+        mirisdr_get_device_usb_strings(idx, vendor, product, serial);
+        if (strcmp(s, serial) == 0) {
+            return idx;
+        }
+    }
+    return -1;
+}
+
+/*
+ * init callback of the MiriSDR driver.
+ *
+ * Resolves the device index from the configured serial number (if any),
+ * opens the device and applies transfer mode, sample rate, center
+ * frequency, gain and sample format, then resets the device buffer.
+ * Failures to open the device, select bulk transfer, read the gain list or
+ * set the sample format are followed by error(); failures to set the sample
+ * rate, center frequency or gain are only logged. Returns 0.
+ */
+int mirisdr_init(input_t* const input) {
+    mirisdr_dev_data_t* dev_data = (mirisdr_dev_data_t*)input->dev_data;
+    // a configured serial number takes precedence over the index
+    if (dev_data->serial != NULL) {
+        dev_data->index = mirisdr_find_device_by_serial(dev_data->serial);
+        if (dev_data->index < 0) {
+            cerr << "MiriSDR device with serial number " << dev_data->serial << " not found\n";
+            error();
+        }
+    }
+
+    // success is judged by whether mirisdr_open() filled in the handle
+    dev_data->dev = NULL;
+    mirisdr_open(&dev_data->dev, dev_data->index);
+    if (NULL == dev_data->dev) {
+        log(LOG_ERR, "Failed to open mirisdr device #%d.\n", dev_data->index);
+        error();
+    }
+
+    char transfer_str[] = "BULK";
+    char sample_format_str[] = "504_S8";
+
+    mirisdr_dev_t* miri = dev_data->dev;
+    int r = mirisdr_set_transfer(miri, transfer_str);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set bulk transfer mode for MiriSDR device #%d: error %d\n", dev_data->index, r);
+        error();
+    }
+    r = mirisdr_set_sample_rate(miri, input->sample_rate);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set sample rate for device #%d. Error %d.\n", dev_data->index, r);
+    }
+
+    // the configured correction (in Hz) is subtracted from the requested frequency
+    r = mirisdr_set_center_freq(miri, input->centerfreq - dev_data->correction);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set center freq for device #%d. Error %d.\n", dev_data->index, r);
+    }
+
+    // use the supported gain closest to the configured one
+    int ngain = 0;
+    if (mirisdr_nearest_gain(miri, dev_data->gain, &ngain) != true) {
+        log(LOG_ERR, "Failed to read supported gain list for device #%d\n", dev_data->index);
+        error();
+    }
+    r = mirisdr_set_tuner_gain_mode(miri, 1);
+    r |= mirisdr_set_tuner_gain(miri, ngain);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set gain to %d for device #%d: error %d\n", ngain, dev_data->index, r);
+    } else {
+        log(LOG_INFO, "Device #%d: gain set to %d dB\n", dev_data->index, mirisdr_get_tuner_gain(miri));
+    }
+    r = mirisdr_set_sample_format(miri, sample_format_str);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set sample format for device #%d: error %d\n", dev_data->index, r);
+        error();
+    }
+    mirisdr_reset_buffer(miri);
+    log(LOG_INFO, "MiriSDR device %d initialized\n", dev_data->index);
+    return 0;
+}
+
+/*
+ * Receive thread of the MiriSDR driver. Marks the input as running and
+ * hands control to mirisdr_read_async(), which delivers data through
+ * mirisdr_callback(). If the async read reports an error, the input is
+ * marked INPUT_FAILED. Always returns a null pointer.
+ */
+void* mirisdr_rx_thread(void* ctx) {
+    input_t* const input = static_cast<input_t*>(ctx);
+    mirisdr_dev_data_t* const state = static_cast<mirisdr_dev_data_t*>(input->dev_data);
+    assert(state->dev != NULL);
+
+    input->state = INPUT_RUNNING;
+
+    const int rc = mirisdr_read_async(state->dev, mirisdr_callback, ctx, state->bufcnt, MIRISDR_BUFSIZE);
+    if (rc < 0) {
+        log(LOG_ERR, "MiriSDR device #%d: async read failed, disabling\n", state->index);
+        input->state = INPUT_FAILED;
+    }
+    return NULL;
+}
+
+// stop callback of the MiriSDR driver: cancels the pending async read.
+// Returns 0 on success and -1 when the cancellation is refused.
+int mirisdr_stop(input_t* const input) {
+    mirisdr_dev_data_t* const state = static_cast<mirisdr_dev_data_t*>(input->dev_data);
+    assert(state->dev != NULL);
+
+    return (mirisdr_cancel_async(state->dev) < 0) ? -1 : 0;
+}
+
+// set_centerfreq callback of the MiriSDR driver: tunes the device to
+// centerfreq minus the configured correction. Returns 0 on success; on
+// failure the error is logged and -1 is returned.
+int mirisdr_set_centerfreq(input_t* const input, int const centerfreq) {
+    mirisdr_dev_data_t* const state = static_cast<mirisdr_dev_data_t*>(input->dev_data);
+    assert(state->dev != NULL);
+
+    const int r = mirisdr_set_center_freq(state->dev, centerfreq - state->correction);
+    if (r >= 0) {
+        return 0;
+    }
+
+    log(LOG_ERR, "Failed to set centerfreq for MiriSDR device #%d: error %d\n", state->index, r);
+    return -1;
+}
+
+/*
+ * parse_config callback of the MiriSDR driver.
+ *
+ *   serial / index  one of them is mandatory; serial wins when both are set
+ *   gain            mandatory
+ *   correction      optional
+ *   num_buffers     optional, must be greater than 0
+ *
+ * Configuration problems are reported on stderr followed by error().
+ * Returns 0.
+ */
+int mirisdr_parse_config(input_t* const input, libconfig::Setting& cfg) {
+    mirisdr_dev_data_t* const state = static_cast<mirisdr_dev_data_t*>(input->dev_data);
+
+    // device selection
+    if (cfg.exists("serial")) {
+        state->serial = strdup(cfg["serial"]);
+    } else if (cfg.exists("index")) {
+        state->index = (int)cfg["index"];
+    } else {
+        cerr << "MiriSDR configuration error: no index and no serial number given\n";
+        error();
+    }
+
+    // gain has no usable default
+    if (!cfg.exists("gain")) {
+        cerr << "MiriSDR configuration error: gain is not configured\n";
+        error();
+    } else {
+        state->gain = (int)cfg["gain"];
+    }
+
+    if (cfg.exists("correction")) {
+        state->correction = (int)cfg["correction"];
+    }
+
+    if (cfg.exists("num_buffers")) {
+        state->bufcnt = (int)(cfg["num_buffers"]);
+        if (state->bufcnt < 1) {
+            cerr << "MiriSDR configuration error: num_buffers must be greater than 0\n";
+            error();
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Constructor of the MiriSDR input driver, located by input_new() under
+ * the name "mirisdr_input_new". Allocates the driver data and the input_t,
+ * fills in the defaults (signed 8-bit samples, default sample rate and
+ * libusb buffer count) and wires up the driver callbacks.
+ */
+MODULE_EXPORT input_t* mirisdr_input_new() {
+    mirisdr_dev_data_t* state = (mirisdr_dev_data_t*)XCALLOC(1, sizeof(mirisdr_dev_data_t));
+    state->index = -1;  // invalid default receiver index
+    state->gain = -1;   // invalid default gain value
+    state->bufcnt = MIRISDR_DEFAULT_LIBUSB_BUFFER_COUNT;
+
+    input_t* result = (input_t*)XCALLOC(1, sizeof(input_t));
+
+    // callbacks
+    result->parse_config = &mirisdr_parse_config;
+    result->init = &mirisdr_init;
+    result->run_rx_thread = &mirisdr_rx_thread;
+    result->set_centerfreq = &mirisdr_set_centerfreq;
+    result->stop = &mirisdr_stop;
+
+    // sample description
+    result->sfmt = SFMT_S8;
+    result->bytes_per_sample = sizeof(char);
+    result->fullscale = (float)SCHAR_MAX - 0.5f;
+    result->sample_rate = MIRISDR_DEFAULT_SAMPLE_RATE;
+
+    result->state = INPUT_UNKNOWN;
+    result->dev_data = state;
+    return result;
+}

+ 32 - 0
src/input-mirisdr.h

@@ -0,0 +1,32 @@
+/*
+ *  input-mirisdr.h
+ *  MiriSDR-specific declarations
+ *
+ *  Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <mirisdr.h>  // mirisdr_dev_t
+// Receive buffer size (NOTE(review): presumably bytes per libusb transfer, by analogy with the RTLSDR driver - confirm)
+#define MIRISDR_BUFSIZE 320000
+// Default value of mirisdr_dev_data_t::bufcnt; the config parser accepts an override via "num_buffers"
+#define MIRISDR_DEFAULT_LIBUSB_BUFFER_COUNT 10
+// Default value of input_t::sample_rate for MiriSDR inputs
+#define MIRISDR_DEFAULT_SAMPLE_RATE 2560000
+
+// Driver-private state of a MiriSDR input; stored in input_t::dev_data.
+typedef struct {
+    mirisdr_dev_t* dev;  // pointer to libmirisdr device struct
+    char* serial;        // dongle serial number
+    int index;           // dongle index (-1 until configured)
+    int correction;      // correction in Hertz (PPM correction is not supported by libmirisdr)
+    int gain;            // gain in dB (-1 until configured)
+    int bufcnt;          // libusb buffer count (must be greater than 0)
+} mirisdr_dev_data_t;

+ 254 - 0
src/input-rtlsdr.cpp

@@ -0,0 +1,254 @@
+/*
+ * input-rtlsdr.cpp
+ * RTLSDR-specific routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "input-rtlsdr.h"  // rtlsdr_dev_data_t
+#include <assert.h>
+#include <limits.h>  // SCHAR_MAX
+#include <rtl-sdr.h>
+#include <stdint.h>  // uint32_t
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>  // FIXME: get rid of this
+#include <iostream>
+#include <libconfig.h++>    // Setting
+#include "input-common.h"   // input_t, sample_format_t, input_state_t, MODULE_EXPORT
+#include "input-helpers.h"  // circbuffer_append
+#include "rtl_airband.h"    // do_exit, fft_size, debug_print, XCALLOC, error()
+
+using namespace std;
+
+// Callback handed to rtlsdr_read_async(): stores each received buffer in
+// the circular buffer of the input passed as ctx. Buffers arriving after
+// shutdown has been requested (do_exit set) are dropped.
+static void rtlsdr_callback(unsigned char* buf, uint32_t len, void* ctx) {
+    if (!do_exit) {
+        circbuffer_append((input_t*)ctx, buf, (size_t)len);
+    }
+}
+
+/* based on librtlsdr-keenerd, (c) Kyle Keen */
+/*
+ * Finds the gain value supported by the tuner that is closest to
+ * target_gain (same units as the values reported by the device).
+ *
+ * Side effect: switches the tuner to manual gain mode, because the call
+ * to rtlsdr_set_tuner_gain_mode(dev, 1) is made before the list is read.
+ *
+ * dev         - opened RTLSDR device
+ * target_gain - requested gain
+ * nearest     - output; written only when the function returns true
+ *
+ * Returns true on success, false when the gain mode could not be set or
+ * the list of supported gains could not be read.
+ */
+static bool rtlsdr_nearest_gain(rtlsdr_dev_t* dev, int target_gain, int* nearest) {
+    assert(nearest != NULL);
+    if (rtlsdr_set_tuner_gain_mode(dev, 1) < 0) {
+        return false;
+    }
+    // A call with a NULL buffer only reports the number of supported gains
+    int count = rtlsdr_get_tuner_gains(dev, NULL);
+    if (count <= 0) {
+        return false;
+    }
+    int* gains = (int*)XCALLOC(count, sizeof(int));
+    // The second call fills the list and may fail on its own; without this
+    // check a zeroed list would be reported as a valid gain of 0
+    count = rtlsdr_get_tuner_gains(dev, gains);
+    if (count <= 0) {
+        free(gains);
+        return false;
+    }
+    int best = gains[0];
+    for (int i = 1; i < count; i++) {
+        if (abs(target_gain - gains[i]) < abs(target_gain - best)) {
+            best = gains[i];
+        }
+    }
+    *nearest = best;
+    free(gains);
+    return true;
+}
+
+/*
+ * Looks up the index of the RTLSDR device whose USB serial string equals s.
+ *
+ * s - serial number to look for
+ *
+ * Returns the device index, or -1 when s is NULL, no devices are present
+ * or none of them reports a matching serial number.
+ */
+static int rtlsdr_find_device_by_serial(char const* const s) {
+    if (s == NULL) {
+        return -1;
+    }
+    int count = rtlsdr_get_device_count();
+    for (int i = 0; i < count; i++) {
+        // Fresh buffers for every device, so that a failed query can never
+        // be matched against strings left over from the previous device
+        char vendor[256] = {0}, product[256] = {0}, serial[256] = {0};
+        if (rtlsdr_get_device_usb_strings(i, vendor, product, serial) != 0) {
+            continue;
+        }
+        if (strcmp(s, serial) == 0) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Opens the RTLSDR device described by input->dev_data and configures it:
+ * sample rate, center frequency, frequency correction, manual tuner gain
+ * (nearest supported value to the configured one) and AGC off.
+ *
+ * When a serial number is configured, it is resolved to a device index
+ * first. error() is called when the device cannot be found or opened, or
+ * when its gain list cannot be read; failures of individual settings are
+ * only logged.
+ *
+ * Returns 0.
+ */
+int rtlsdr_init(input_t* const input) {
+    rtlsdr_dev_data_t* dev_data = (rtlsdr_dev_data_t*)input->dev_data;
+    if (dev_data->serial != NULL) {
+        dev_data->index = rtlsdr_find_device_by_serial(dev_data->serial);
+        if (dev_data->index < 0) {
+            cerr << "RTLSDR device with serial number " << dev_data->serial << " not found\n";
+            error();
+        }
+    }
+
+    // Success of rtlsdr_open() is judged by the device pointer being set
+    dev_data->dev = NULL;
+    rtlsdr_open(&dev_data->dev, dev_data->index);
+    if (NULL == dev_data->dev) {
+        log(LOG_ERR, "Failed to open rtlsdr device #%d.\n", dev_data->index);
+        error();
+    }
+
+    rtlsdr_dev_t* rtl = dev_data->dev;
+    int r = rtlsdr_set_sample_rate(rtl, input->sample_rate);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set sample rate for device #%d. Error %d.\n", dev_data->index, r);
+    }
+
+    r = rtlsdr_set_center_freq(rtl, input->centerfreq);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set center freq for device #%d. Error %d.\n", dev_data->index, r);
+    }
+
+    // A result of -2 is tolerated here and not reported
+    r = rtlsdr_set_freq_correction(rtl, dev_data->correction);
+    if (r < 0 && r != -2) {
+        log(LOG_ERR, "Failed to set freq correction for device #%d. Error %d.\n", dev_data->index, r);
+    }
+
+    // Fitipower FC0012 gain needs to be initialized to its lowest value before setting it to the desired value
+    if (rtlsdr_get_tuner_type(rtl) == RTLSDR_TUNER_FC0012) {
+        int initialGain = 0;
+        if (rtlsdr_nearest_gain(rtl, -99, &initialGain) != true) {
+            log(LOG_ERR, "Failed to read supported gain list for device #%d\n", dev_data->index);
+            error();
+        }
+
+        // Plain assignment: the result of the frequency correction call above
+        // (possibly the tolerated -2) must not leak into this check
+        r = rtlsdr_set_tuner_gain(rtl, initialGain);
+        if (r < 0) {
+            log(LOG_ERR, "Failed to initialize gain to %0.2f for device #%d: error %d\n", (float)initialGain / 10.f, dev_data->index, r);
+        }
+    }
+
+    int ngain = 0;
+    if (rtlsdr_nearest_gain(rtl, dev_data->gain, &ngain) != true) {
+        log(LOG_ERR, "Failed to read supported gain list for device #%d\n", dev_data->index);
+        error();
+    }
+    // Both calls are always made; the first failing one provides the reported error code
+    int const mode_rc = rtlsdr_set_tuner_gain_mode(rtl, 1);
+    int const gain_rc = rtlsdr_set_tuner_gain(rtl, ngain);
+    r = (mode_rc < 0) ? mode_rc : gain_rc;
+    if (r < 0) {
+        log(LOG_ERR, "Failed to set gain to %0.2f for device #%d: error %d\n", (float)ngain / 10.f, dev_data->index, r);
+    } else {
+        log(LOG_INFO, "Device #%d: gain set to %0.2f dB\n", dev_data->index, (float)rtlsdr_get_tuner_gain(rtl) / 10.f);
+    }
+
+    r = rtlsdr_set_agc_mode(rtl, 0);
+    if (r < 0) {
+        log(LOG_ERR, "Failed to disable AGC for device #%d. Error %d.\n", dev_data->index, r);
+    }
+    rtlsdr_reset_buffer(rtl);
+    log(LOG_INFO, "RTLSDR device %d initialized\n", dev_data->index);
+    return 0;
+}
+
+/*
+ * Receive thread body for an RTLSDR input. ctx is the input_t of the
+ * device, which must have been opened already. Marks the input as
+ * running and starts the asynchronous read; if the read reports an
+ * error, the input is marked as failed. Always returns NULL.
+ */
+void* rtlsdr_rx_thread(void* ctx) {
+    input_t* const inp = (input_t*)ctx;
+    rtlsdr_dev_data_t* const priv = (rtlsdr_dev_data_t*)inp->dev_data;
+    assert(priv->dev != NULL);
+
+    inp->state = INPUT_RUNNING;
+    int const rc = rtlsdr_read_async(priv->dev, rtlsdr_callback, ctx, priv->bufcnt, RTLSDR_BUFSIZE);
+    if (rc < 0) {
+        log(LOG_ERR, "RTLSDR device #%d: async read failed, disabling\n", priv->index);
+        inp->state = INPUT_FAILED;
+    }
+    return NULL;
+}
+
+/*
+ * Stops an RTLSDR input: cancels the asynchronous read and closes the
+ * device. Returns -1 when the cancellation fails (the device is then left
+ * open), otherwise the result of rtlsdr_close().
+ */
+int rtlsdr_stop(input_t* const input) {
+    rtlsdr_dev_data_t* const priv = (rtlsdr_dev_data_t*)input->dev_data;
+    assert(priv->dev != NULL);
+
+    int const cancel_rc = rtlsdr_cancel_async(priv->dev);
+    return (cancel_rc < 0) ? -1 : rtlsdr_close(priv->dev);
+}
+
+/*
+ * Retunes an opened RTLSDR device to centerfreq.
+ * Returns 0 on success; on failure logs the error code and returns -1.
+ */
+int rtlsdr_set_centerfreq(input_t* const input, int const centerfreq) {
+    rtlsdr_dev_data_t* const priv = (rtlsdr_dev_data_t*)input->dev_data;
+    assert(priv->dev != NULL);
+
+    int const rc = rtlsdr_set_center_freq(priv->dev, centerfreq);
+    if (rc >= 0) {
+        return 0;
+    }
+    log(LOG_ERR, "Failed to set centerfreq for RTLSDR device #%d: error %d\n", priv->index, rc);
+    return -1;
+}
+
+/*
+ * Reads the RTLSDR-specific settings of a device from its configuration
+ * section into input->dev_data.
+ *
+ *   serial / index - one of them is mandatory; serial wins when both are given
+ *   gain           - mandatory; integer or float, stored in tenths of the
+ *                    configured value
+ *   correction     - optional integer
+ *   buffers        - optional integer, must be greater than 0
+ *
+ * Configuration errors are reported on stderr and followed by error().
+ * Returns 0.
+ */
+int rtlsdr_parse_config(input_t* const input, libconfig::Setting& cfg) {
+    rtlsdr_dev_data_t* dev_data = (rtlsdr_dev_data_t*)input->dev_data;
+    if (cfg.exists("serial")) {
+        dev_data->serial = strdup(cfg["serial"]);
+    } else if (cfg.exists("index")) {
+        dev_data->index = (int)cfg["index"];
+    } else {
+        cerr << "RTLSDR configuration error: no index and no serial number given\n";
+        error();
+    }
+    if (cfg.exists("gain")) {
+        if (cfg["gain"].getType() == libconfig::Setting::TypeInt) {  // backward compatibility
+            dev_data->gain = (int)cfg["gain"] * 10;
+        } else if (cfg["gain"].getType() == libconfig::Setting::TypeFloat) {
+            dev_data->gain = (int)((float)cfg["gain"] * 10.0f);
+        } else {
+            // Any other type used to be ignored silently, which left the
+            // invalid default gain in effect
+            cerr << "RTLSDR configuration error: gain must be an integer or a floating point number\n";
+            error();
+        }
+    } else {
+        cerr << "RTLSDR configuration error: gain is not configured\n";
+        error();
+    }
+    if (cfg.exists("correction")) {
+        dev_data->correction = (int)cfg["correction"];
+    }
+    if (cfg.exists("buffers")) {
+        dev_data->bufcnt = (int)(cfg["buffers"]);
+        if (dev_data->bufcnt < 1) {
+            cerr << "RTLSDR configuration error: buffers must be greater than 0\n";
+            error();
+        }
+    }
+    return 0;
+}
+
+/*
+ * Allocates a new RTLSDR input object together with its driver-private
+ * data and fills both with defaults. The receiver index and gain start
+ * out as -1 (invalid). Returns the newly allocated input_t.
+ */
+MODULE_EXPORT input_t* rtlsdr_input_new() {
+    // Driver-private state
+    rtlsdr_dev_data_t* priv = (rtlsdr_dev_data_t*)XCALLOC(1, sizeof(*priv));
+    priv->bufcnt = RTLSDR_DEFAULT_LIBUSB_BUFFER_COUNT;
+    priv->gain = -1;   // invalid default gain value
+    priv->index = -1;  // invalid default receiver index
+
+    // Generic input descriptor
+    input_t* inp = (input_t*)XCALLOC(1, sizeof(*inp));
+    inp->dev_data = priv;
+    inp->state = INPUT_UNKNOWN;
+
+    // Sample stream properties: unsigned 8-bit samples
+    inp->sfmt = SFMT_U8;
+    inp->bytes_per_sample = sizeof(unsigned char);
+    inp->fullscale = (float)SCHAR_MAX - 0.5f;
+    inp->sample_rate = RTLSDR_DEFAULT_SAMPLE_RATE;
+
+    // Driver entry points
+    inp->parse_config = &rtlsdr_parse_config;
+    inp->init = &rtlsdr_init;
+    inp->run_rx_thread = &rtlsdr_rx_thread;
+    inp->set_centerfreq = &rtlsdr_set_centerfreq;
+    inp->stop = &rtlsdr_stop;
+    return inp;
+}

+ 32 - 0
src/input-rtlsdr.h

@@ -0,0 +1,32 @@
+/*
+ *  input-rtlsdr.h
+ *  RTLSDR-specific declarations
+ *
+ *  Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <rtl-sdr.h>  // rtlsdr_dev_t
+// Buffer length passed to rtlsdr_read_async() by the receive thread
+#define RTLSDR_BUFSIZE 320000
+// Default value of rtlsdr_dev_data_t::bufcnt; the config parser accepts an override via "buffers"
+#define RTLSDR_DEFAULT_LIBUSB_BUFFER_COUNT 10
+// Default value of input_t::sample_rate for RTLSDR inputs
+#define RTLSDR_DEFAULT_SAMPLE_RATE 2560000
+
+// Driver-private state of an RTLSDR input; stored in input_t::dev_data.
+typedef struct {
+    rtlsdr_dev_t* dev;  // pointer to librtlsdr device struct
+    char* serial;       // dongle serial number (when set, it is resolved to index during init)
+    int index;          // dongle index (-1 until configured)
+    int correction;     // PPM correction
+    int gain;           // gain in tenths of dB (-1 until configured)
+    int bufcnt;         // libusb buffer count (must be greater than 0)
+} rtlsdr_dev_data_t;

+ 366 - 0
src/input-soapysdr.cpp

@@ -0,0 +1,366 @@
+/*
+ *  input-soapysdr.cpp
+ *  SoapySDR-specific routines
+ *
+ *  Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "input-soapysdr.h"    // soapysdr_dev_data_t
+#include <SoapySDR/Device.h>   // SoapySDRDevice, SoapySDRDevice_makeStrArgs
+#include <SoapySDR/Formats.h>  // SOAPY_SDR_CS constants
+#include <SoapySDR/Version.h>  // SOAPY_SDR_API_VERSION
+#include <assert.h>
+#include <limits.h>  // SCHAR_MAX, SHRT_MAX
+#include <math.h>    // round
+#include <stdlib.h>  // calloc
+#include <string.h>  // memcpy, strcmp
+#include <syslog.h>  // LOG_* macros
+#include <iostream>
+#include <libconfig.h++>    // Setting
+#include "input-common.h"   // input_t, sample_format_t, input_state_t, MODULE_EXPORT
+#include "input-helpers.h"  // circbuffer_append
+#include "rtl_airband.h"    // do_exit, fft_size, debug_print, XCALLOC, error()
+
+using namespace std;
+
+// Translate a SoapySDR sample format name into the internal sample format.
+// On a match, input->sfmt, input->bytes_per_sample and input->fullscale are
+// set and a copy of the format name is stored in the device data.
+// Formats are tested in the order U8, S8, S16, F32.
+// A fullscale argument greater than 0 is taken as a valid value (it has
+// been read by SoapySDRDevice_getNativeStreamFormat); otherwise a default
+// suitable for the matched format is used.
+// Returns true when fmt is a supported format, false otherwise (in which
+// case nothing is modified).
+static bool soapysdr_match_sfmt(input_t* const input, char const* const fmt, double const fullscale) {
+    bool const have_fullscale = fullscale > 0;
+
+    if (strcmp(fmt, SOAPY_SDR_CU8) == 0) {
+        input->sfmt = SFMT_U8;
+        input->bytes_per_sample = sizeof(unsigned char);
+        input->fullscale = (have_fullscale ? fullscale : (float)SCHAR_MAX - 0.5f);
+    } else if (strcmp(fmt, SOAPY_SDR_CS8) == 0) {
+        input->sfmt = SFMT_S8;
+        input->bytes_per_sample = sizeof(char);
+        input->fullscale = (have_fullscale ? fullscale : (float)SCHAR_MAX - 0.5f);
+    } else if (strcmp(fmt, SOAPY_SDR_CS16) == 0) {
+        input->sfmt = SFMT_S16;
+        input->bytes_per_sample = sizeof(short);
+        input->fullscale = (have_fullscale ? fullscale : (float)SHRT_MAX - 0.5f);
+    } else if (strcmp(fmt, SOAPY_SDR_CF32) == 0) {
+        input->sfmt = SFMT_F32;
+        input->bytes_per_sample = sizeof(float);
+        input->fullscale = (have_fullscale ? fullscale : 1.0f);
+    } else {
+        return false;
+    }
+
+    // Remember which format name to request when the stream is set up
+    soapysdr_dev_data_t* priv = (soapysdr_dev_data_t*)input->dev_data;
+    priv->sample_format = strdup(fmt);
+    return true;
+}
+
+// Choose a suitable sample format.
+// Bail out if no supported sample format is found.
+static bool soapysdr_choose_sample_format(SoapySDRDevice* const sdr, input_t* const input) {
+    bool ret = false;
+    size_t len = 0;
+    char** formats = NULL;
+    soapysdr_dev_data_t* dev_data = (soapysdr_dev_data_t*)input->dev_data;
+    input->sfmt = SFMT_UNDEF;
+    // First try device's native format to avoid extra conversion
+    double fullscale = 0.0;
+    char* fmt = SoapySDRDevice_getNativeStreamFormat(sdr, SOAPY_SDR_RX, dev_data->channel, &fullscale);
+
+    if (soapysdr_match_sfmt(input, fmt, fullscale) == true) {
+        log(LOG_NOTICE, "SoapySDR: device '%s': using native sample format '%s' (fullScale=%.1f)\n", dev_data->device_string, fmt, input->fullscale);
+        ret = true;
+        goto end;
+    }
+    // Native format is not supported by rtl_airband; find out if there is anything else.
+    formats = SoapySDRDevice_getStreamFormats(sdr, SOAPY_SDR_RX, dev_data->channel, &len);
+    if (formats == NULL || len == 0) {
+        log(LOG_ERR, "SoapySDR: device '%s': failed to read supported sample formats\n", dev_data->device_string);
+        ret = false;
+        goto end;
+    }
+    for (size_t i = 0; i < len; i++) {
+        if (soapysdr_match_sfmt(input, formats[i], -1.0) == true) {
+            log(LOG_NOTICE, "SoapySDR: device '%s': using non-native sample format '%s' (assuming fullScale=%.1f)\n", dev_data->device_string, formats[i], input->fullscale);
+            ret = true;
+            goto end;
+        }
+    }
+    // Nothing found; we can't use this device.
+    log(LOG_ERR, "SoapySDR: device '%s': no supported sample format found\n", dev_data->device_string);
+end:
+    return ret;
+}
+
+static int sdrplay_get_nearest_sample_rate(SoapySDRDevice* sdr, int channel, int sample_rate) {
+    size_t len = 0;
+    double sr = (double)sample_rate;
+    SoapySDRRange* range = SoapySDRDevice_getSampleRateRange(sdr, SOAPY_SDR_RX, channel, &len);
+    if (range == NULL) {
+        log(LOG_ERR, "SoapySDR: failed to read supported sampling rate ranges from the device\n");
+        return -1;
+    }
+    debug_print("Got %zu ranges\n", len);
+    double nearest_rate = range[0].minimum;
+    double offset1, offset2;
+    for (size_t i = 0; i < len; i++) {
+        debug_print("sr=%.1f min=%.1f max=%.1f step=%.1f\n", sr, range[i].minimum, range[i].maximum, range[i].step);
+        if (sr >= range[i].minimum && sr <= range[i].maximum) {
+            debug_print("Found suitable range: min=%.0f max=%0.f step=%0.f\n", range[i].minimum, range[i].maximum, range[i].step);
+            if (range[i].step == 0.0 || range[i].step >= (range[i].maximum - range[i].minimum)) {
+                return (int)(range[i].maximum - sr > sr - range[i].minimum ? range[i].minimum : range[i].maximum);
+            }
+            sr = (int)(range[i].minimum + range[i].step * round((sr - range[i].minimum) / range[i].step));
+            if (sr > range[i].maximum) {
+                sr = (int)range[i].maximum;
+            }
+            return (int)sr;
+        } else {
+            offset1 = abs(sr - nearest_rate);
+            offset2 = abs(sr - range[i].minimum);
+            if (offset2 < offset1)
+                nearest_rate = range[i].minimum;
+            offset1 = abs(sr - nearest_rate);
+            offset2 = abs(sr - range[i].maximum);
+            if (offset2 < offset1)
+                nearest_rate = range[i].maximum;
+        }
+    }
+    return (int)nearest_rate;
+}
+
+/*
+ * Reads the SoapySDR-specific settings of a device from its configuration
+ * section into input->dev_data, then briefly opens the device to choose
+ * the sample format and, unless already set, the sample rate.
+ *
+ *   device_string - mandatory
+ *   gain          - optional; number, or a string of gain element settings
+ *                   ('name1=value1,name2=value2,...'); AGC is requested
+ *                   when it is absent
+ *   correction    - optional number
+ *   channel       - optional integer
+ *   antenna       - optional string
+ *
+ * Errors are reported and followed by error(). Returns 0.
+ */
+int soapysdr_parse_config(input_t* const input, libconfig::Setting& cfg) {
+    soapysdr_dev_data_t* const priv = (soapysdr_dev_data_t*)input->dev_data;
+
+    if (!cfg.exists("device_string")) {
+        cerr << "SoapySDR configuration error: mandatory parameter missing: device_string\n";
+        error();
+    } else {
+        priv->device_string = strdup(cfg["device_string"]);
+    }
+
+    if (!cfg.exists("gain")) {
+        priv->agc = true;
+    } else {
+        libconfig::Setting& gain_cfg = cfg["gain"];
+        libconfig::Setting::Type const gain_type = gain_cfg.getType();
+        if (gain_type == libconfig::Setting::TypeInt) {
+            priv->gain = (double)((int)gain_cfg);
+        } else if (gain_type == libconfig::Setting::TypeFloat) {
+            priv->gain = (double)gain_cfg;
+        } else {
+            // Either it's a string or an unsupported type which will cause an exception - this is fine
+            priv->gains = SoapySDRKwargs_fromString((const char*)gain_cfg);
+            if (priv->gains.size < 1) {
+                cerr << "SoapySDR configuration error: device '" << priv->device_string << "': gain: syntax error (must be a sequence of 'name1=value1,name2=value2,...')\n";
+                error();
+            }
+        }
+        priv->agc = false;
+    }
+
+    if (cfg.exists("correction")) {
+        libconfig::Setting& corr_cfg = cfg["correction"];
+        libconfig::Setting::Type const corr_type = corr_cfg.getType();
+        if (corr_type == libconfig::Setting::TypeInt) {
+            priv->correction = (double)((int)corr_cfg);
+        } else if (corr_type == libconfig::Setting::TypeFloat) {
+            priv->correction = (float)corr_cfg;
+        } else {
+            cerr << "SoapySDR configuration error: device '" << priv->device_string << "': correction value must be numeric\n";
+            error();
+        }
+    }
+
+    if (cfg.exists("channel")) {
+        priv->channel = (size_t)(int)cfg["channel"];
+    }
+    if (cfg.exists("antenna")) {
+        priv->antenna = strdup(cfg["antenna"]);
+    }
+
+    // The sample format and sample rate (unless set in the config) depend on
+    // device capabilities, so the device is opened here just to query them.
+    // This cannot wait until soapysdr_init, because parse_devices() needs
+    // sample_rate and bytes_per_sample to be set correctly in order to
+    // calculate the size of the sample buffer, and that happens before
+    // soapysdr_init() is run.
+    SoapySDRDevice* probe = SoapySDRDevice_makeStrArgs(priv->device_string);
+    if (probe == NULL) {
+        log(LOG_ERR, "Failed to open SoapySDR device '%s': %s\n", priv->device_string, SoapySDRDevice_lastError());
+        error();
+    }
+    if (!soapysdr_choose_sample_format(probe, input)) {
+        cerr << "SoapySDR configuration error: device '" << priv->device_string << "': no suitable sample format found\n";
+        error();
+    }
+    if (input->sample_rate < 0) {
+        input->sample_rate = sdrplay_get_nearest_sample_rate(probe, priv->channel, SOAPYSDR_DEFAULT_SAMPLE_RATE);
+        if (input->sample_rate < 0) {
+            log(LOG_ERR, "Failed to find a suitable sample rate for SoapySDR device '%s'\n", priv->device_string);
+            log(LOG_ERR, "Specify a supported value using \"sample_rate\" option in the device configuration\n");
+            error();
+        }
+    }
+    SoapySDRDevice_unmake(probe);
+    return 0;
+}
+
+/*
+ * Opens the SoapySDR device named by dev_data->device_string and applies
+ * the configured settings in this order: sample rate, center frequency,
+ * frequency correction, antenna (when configured), gain mode (AGC on or
+ * off) and, with AGC off, either the individual gain elements or the
+ * overall gain.
+ *
+ * Every failure is logged and followed by error(). Returns 0.
+ */
+int soapysdr_init(input_t* const input) {
+    soapysdr_dev_data_t* dev_data = (soapysdr_dev_data_t*)input->dev_data;
+
+    dev_data->dev = SoapySDRDevice_makeStrArgs(dev_data->device_string);
+    if (dev_data->dev == NULL) {
+        log(LOG_ERR, "Failed to open SoapySDR device '%s': %s\n", dev_data->device_string, SoapySDRDevice_lastError());
+        error();
+    }
+    SoapySDRDevice* sdr = dev_data->dev;
+
+    if (SoapySDRDevice_setSampleRate(sdr, SOAPY_SDR_RX, dev_data->channel, input->sample_rate) != 0) {
+        log(LOG_ERR, "Failed to set sample rate for SoapySDR device '%s': %s\n", dev_data->device_string, SoapySDRDevice_lastError());
+        error();
+    }
+    log(LOG_INFO, "SoapySDR: device '%s': sample rate set to %.0f sps\n", dev_data->device_string, SoapySDRDevice_getSampleRate(sdr, SOAPY_SDR_RX, dev_data->channel));
+    if (SoapySDRDevice_setFrequency(sdr, SOAPY_SDR_RX, dev_data->channel, input->centerfreq, NULL) != 0) {
+        log(LOG_ERR, "Failed to set frequency for SoapySDR device '%s': %s\n", dev_data->device_string, SoapySDRDevice_lastError());
+        error();
+    }
+    if (SoapySDRDevice_setFrequencyCorrection(sdr, SOAPY_SDR_RX, dev_data->channel, dev_data->correction) != 0) {
+        log(LOG_ERR, "Failed to set frequency correction for SoapySDR device '%s': %s\n", dev_data->device_string, SoapySDRDevice_lastError());
+        error();
+    }
+    if (dev_data->antenna != NULL) {
+        if (SoapySDRDevice_setAntenna(sdr, SOAPY_SDR_RX, dev_data->channel, dev_data->antenna) != 0) {
+            // Argument order follows the format string: antenna name first, then device
+            log(LOG_ERR, "Failed to set antenna to '%s' for SoapySDR device '%s': %s\n", dev_data->antenna, dev_data->device_string, SoapySDRDevice_lastError());
+            error();
+        }
+        log(LOG_INFO, "SoapySDR: device '%s': antenna set to '%s'\n", dev_data->device_string, SoapySDRDevice_getAntenna(sdr, SOAPY_SDR_RX, dev_data->channel));
+    }
+    if (SoapySDRDevice_setGainMode(sdr, SOAPY_SDR_RX, dev_data->channel, dev_data->agc) != 0) {
+        log(LOG_ERR, "Failed to %s AGC for SoapySDR device '%s': %s\n", dev_data->agc ? "enable" : "disable", dev_data->device_string, SoapySDRDevice_lastError());
+        error();
+    }
+    // Report both the mode read back from the device and the requested one
+    log(LOG_INFO, "SoapySDR: device '%s': AGC %s (requested: %s)\n", dev_data->device_string, SoapySDRDevice_getGainMode(sdr, SOAPY_SDR_RX, dev_data->channel) ? "on" : "off",
+        dev_data->agc ? "on" : "off");
+    if (!dev_data->agc) {
+        if (dev_data->gains.size > 0) {
+            // Per-element gains from the configuration take precedence over the overall gain
+            for (size_t i = 0; i < dev_data->gains.size; i++) {
+                char* const key = dev_data->gains.keys[i];
+                double val = atof(dev_data->gains.vals[i]);
+                if (SoapySDRDevice_setGainElement(sdr, SOAPY_SDR_RX, dev_data->channel, key, val) != 0) {
+                    log(LOG_ERR, "Failed to set gain element '%s' for SoapySDR device '%s': %s\n", key, dev_data->device_string, SoapySDRDevice_lastError());
+                    error();
+                }
+                log(LOG_INFO, "SoapySDR: device '%s': gain '%s' set to %.1f dB\n", dev_data->device_string, key, SoapySDRDevice_getGainElement(sdr, SOAPY_SDR_RX, dev_data->channel, key));
+            }
+        } else {
+            if (SoapySDRDevice_setGain(sdr, SOAPY_SDR_RX, dev_data->channel, dev_data->gain) != 0) {
+                log(LOG_ERR, "Failed to set gain for SoapySDR device '%s': %s\n", dev_data->device_string, SoapySDRDevice_lastError());
+                error();
+            }
+            log(LOG_INFO, "SoapySDR: device '%s': gain set to %.1f dB\n", dev_data->device_string, SoapySDRDevice_getGain(sdr, SOAPY_SDR_RX, dev_data->channel));
+        }
+    }
+    log(LOG_INFO, "SoapySDR: device '%s' initialized\n", dev_data->device_string);
+    return 0;
+}
+
+/*
+ * Receive thread body for a SoapySDR input. ctx is the input_t of the
+ * device, which must have been opened already.
+ *
+ * Sets up and activates an RX stream in the sample format chosen earlier,
+ * then keeps reading samples into a local buffer and appending them to
+ * the input's circular buffer until do_exit is set. Read errors are
+ * logged and the read is retried. If the stream cannot be set up or
+ * activated, the input is marked as failed.
+ *
+ * On exit the stream (if one was created) is deactivated and closed and
+ * the device is released. Always returns 0.
+ */
+void* soapysdr_rx_thread(void* ctx) {
+    input_t* input = (input_t*)ctx;
+    soapysdr_dev_data_t* dev_data = (soapysdr_dev_data_t*)input->dev_data;
+    SoapySDRDevice* sdr = dev_data->dev;
+    assert(sdr != NULL);
+
+    unsigned char buf[SOAPYSDR_BUFSIZE];
+    // size of the buffer in number of I/Q sample pairs
+    size_t num_elems = SOAPYSDR_BUFSIZE / (2 * input->bytes_per_sample);
+
+    SoapySDRStream* rxStream = NULL;
+    // The stream setup call has a different signature before API version 0.8
+#if SOAPY_SDR_API_VERSION < 0x00080000
+    if (SoapySDRDevice_setupStream(sdr, &rxStream, SOAPY_SDR_RX, dev_data->sample_format, &dev_data->channel, 1, NULL) != 0) {
+#else
+    if ((rxStream = SoapySDRDevice_setupStream(sdr, SOAPY_SDR_RX, dev_data->sample_format, &dev_data->channel, 1, NULL)) == NULL) {
+#endif /* SOAPY_SDR_API_VERSION */
+        log(LOG_ERR, "Failed to set up stream for SoapySDR device '%s': %s\n", dev_data->device_string, SoapySDRDevice_lastError());
+        input->state = INPUT_FAILED;
+        goto cleanup;
+    }
+    if (SoapySDRDevice_activateStream(sdr, rxStream, 0, 0, 0)) {  // start streaming
+        log(LOG_ERR, "Failed to activate stream for SoapySDR device '%s': %s\n", dev_data->device_string, SoapySDRDevice_lastError());
+        input->state = INPUT_FAILED;
+        goto cleanup;
+    }
+    input->state = INPUT_RUNNING;
+    log(LOG_NOTICE, "SoapySDR: device '%s' started\n", dev_data->device_string);
+
+    while (!do_exit) {
+        void* bufs[] = {buf};  // array of buffers
+        int flags;             // flags set by receive operation
+        long long timeNs;      // timestamp for receive buffer
+        int samples_read = SoapySDRDevice_readStream(sdr, rxStream, bufs, num_elems, &flags, &timeNs, SOAPYSDR_READSTREAM_TIMEOUT_US);
+        if (samples_read < 0) {  // when it's negative, it's the error code
+            log(LOG_ERR, "SoapySDR device '%s': readStream failed: %s\n", dev_data->device_string, SoapySDR_errToStr(samples_read));
+            continue;
+        }
+        // Each element read is one I/Q pair, i.e. two samples
+        circbuffer_append(input, buf, (size_t)(samples_read * 2 * input->bytes_per_sample));
+    }
+cleanup:
+    // When stream setup has failed there is no stream to tear down; a NULL
+    // stream must not be handed to the driver
+    if (rxStream != NULL) {
+        SoapySDRDevice_deactivateStream(sdr, rxStream, 0, 0);
+        SoapySDRDevice_closeStream(sdr, rxStream);
+    }
+    SoapySDRDevice_unmake(sdr);
+    return 0;
+}
+
+/*
+ * Retunes an opened SoapySDR device to centerfreq on the configured channel.
+ * Returns 0 on success; on failure logs the SoapySDR error and returns -1.
+ */
+int soapysdr_set_centerfreq(input_t* const input, int const centerfreq) {
+    soapysdr_dev_data_t* const priv = (soapysdr_dev_data_t*)input->dev_data;
+    assert(priv->dev != NULL);
+
+    int const rc = SoapySDRDevice_setFrequency(priv->dev, SOAPY_SDR_RX, priv->channel, centerfreq, NULL);
+    if (rc == 0) {
+        return 0;
+    }
+    log(LOG_ERR, "Failed to set frequency for SoapySDR device '%s': %s\n", priv->device_string, SoapySDRDevice_lastError());
+    return -1;
+}
+
+/*
+ * Allocates a new SoapySDR input object together with its driver-private
+ * data. The sample format, full scale, bytes per sample and sample rate
+ * are deliberately left at invalid values here; soapysdr_parse_config
+ * fills them in. No stop callback is installed for this input type.
+ * Returns the newly allocated input_t.
+ */
+MODULE_EXPORT input_t* soapysdr_input_new() {
+    // Driver-private state
+    soapysdr_dev_data_t* priv = (soapysdr_dev_data_t*)XCALLOC(1, sizeof(*priv));
+    priv->gain = -1.0;  // invalid default gain value
+    priv->agc = false;
+    memset(&priv->gains, 0, sizeof(priv->gains));
+    priv->channel = 0;
+    priv->antenna = NULL;
+
+    // Generic input descriptor
+    input_t* inp = (input_t*)XCALLOC(1, sizeof(*inp));
+    inp->dev_data = priv;
+
+    // invalid values as defaults
+    inp->state = INPUT_UNKNOWN;
+    inp->sfmt = SFMT_UNDEF;
+    inp->fullscale = 0.0f;
+    inp->bytes_per_sample = 0;
+    inp->sample_rate = -1;
+
+    // Driver entry points
+    inp->parse_config = &soapysdr_parse_config;
+    inp->init = &soapysdr_init;
+    inp->run_rx_thread = &soapysdr_rx_thread;
+    inp->set_centerfreq = &soapysdr_set_centerfreq;
+    inp->stop = NULL;
+    return inp;
+}

+ 36 - 0
src/input-soapysdr.h

@@ -0,0 +1,36 @@
+/*
+ *  input-soapysdr.h
+ *  SoapySDR-specific declarations
+ *
+ *  Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <SoapySDR/Device.h>  // SoapySDRDevice
+#include <SoapySDR/Types.h>   // SoapySDRKwargs
+// Sample rate requested from the device when none is set in the configuration
+// (the nearest rate supported by the device is then used)
+#define SOAPYSDR_DEFAULT_SAMPLE_RATE 2560000
+// Size in bytes of the receive thread's local sample buffer
+#define SOAPYSDR_BUFSIZE 320000
+// Timeout passed to SoapySDRDevice_readStream(), in microseconds
+#define SOAPYSDR_READSTREAM_TIMEOUT_US 1000000L
+
+// Driver-private state of a SoapySDR input; stored in input_t::dev_data.
+typedef struct {
+    SoapySDRDevice* dev;        // pointer to device struct
+    char const* device_string;  // SoapySDR device arg string
+    char const* sample_format;  // sample format (SoapySDR format name used when setting up the stream)
+    char const* antenna;        // antenna name (NULL when not configured)
+    SoapySDRKwargs gains;       // gain elements and their values (empty when an overall gain is used)
+    double correction;          // PPM correction
+    double gain;                // gain in dB
+    size_t channel;             // HW channel number
+    bool agc;                   // enable AGC (set when no gain is configured)
+} soapysdr_dev_data_t;

+ 71 - 0
src/logging.cpp

@@ -0,0 +1,71 @@
+/*
+ * logging.cpp
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ * Copyright (c) 2015-2022 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdarg.h>  // va_start() / va_end()
+#include <cstdio>    // fopen()
+#include <cstring>   // strerror()
+#include <iostream>  // cerr()
+
+#include "logging.h"
+
+LogDestination log_destination = SYSLOG;  // destination used by log(); starts out as SYSLOG
+FILE* debugf = NULL;  // debug output stream; assigned by init_debug() in DEBUG builds only
+
+void error() {  // fatal-error exit path: close the debug file, then terminate the process with status 1
+    close_debug();
+    _Exit(1);  // _Exit() terminates immediately, without running atexit() handlers or static destructors
+}
+
+/*
+ * Open the debug log file in append mode (DEBUG builds only).
+ * A NULL path is silently ignored. If the file cannot be opened, a message
+ * is written to std::cerr and the process is terminated via error().
+ * In non-DEBUG builds this function does nothing.
+ */
+void init_debug(const char* file) {
+#ifdef DEBUG
+    if (file == NULL)
+        return;
+    debugf = fopen(file, "a");
+    if (debugf != NULL)
+        return;
+    std::cerr << "Could not open debug file " << file << ": " << strerror(errno) << "\n";
+    error();
+#else
+    UNUSED(file);
+#endif /* DEBUG */
+}
+
+/*
+ * Close the debug log file (DEBUG builds only).
+ * Safe to call when no debug file is open and safe to call repeatedly:
+ * the stream pointer is reset after closing, so a later call (for example
+ * from error()) will not fclose() the same stream twice.
+ */
+void close_debug() {
+#ifdef DEBUG
+    if (!debugf)
+        return;
+    fclose(debugf);
+    debugf = NULL;  // avoid a dangling FILE* and a double fclose() on the next call
+#endif /* DEBUG */
+}
+
+/*
+ * printf-style logging front end.
+ * Depending on log_destination the message goes to syslog (with the given
+ * priority), to stderr (priority is ignored), or is discarded.
+ */
+void log(int priority, const char* format, ...) {
+    va_list ap;
+    va_start(ap, format);
+    if (log_destination == SYSLOG) {
+        vsyslog(priority, format, ap);
+    } else if (log_destination == STDERR) {
+        vfprintf(stderr, format, ap);
+    }
+    /* NONE (and anything else): message is dropped */
+    va_end(ap);
+}

+ 57 - 0
src/logging.h

@@ -0,0 +1,57 @@
+/*
+ * logging.h
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ * Copyright (c) 2015-2022 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LOGGING_H
+#define _LOGGING_H 1
+
+#include <syslog.h>  // LOG_ERR
+#include <cstdio>    // FILE
+
+#define nop() \
+    do {      \
+    } while (0)
+#define UNUSED(x) (void)(x)  // marks a parameter/variable as intentionally unused
+
+#ifdef DEBUG  /* debug macros write to debugf; NOTE(review): they do not check debugf for NULL */
+#define DEBUG_PATH "rtl_airband_debug.log"  // debug log file name
+#define debug_print(fmt, ...)                                 \
+    do {                                                      \
+        fprintf(debugf, "%s(): " fmt, __func__, __VA_ARGS__); \
+        fflush(debugf);                                       \
+    } while (0)
+#define debug_bulk_print(fmt, ...)                            \
+    do {                                                      \
+        fprintf(debugf, "%s(): " fmt, __func__, __VA_ARGS__); \
+    } while (0)
+#else  /* !DEBUG: both debug macros expand to a no-op statement and their arguments are not evaluated */
+#define debug_print(fmt, ...) nop()
+#define debug_bulk_print(fmt, ...) nop()
+#endif /* DEBUG */
+
+enum LogDestination { SYSLOG, STDERR, NONE };  // possible destinations for log()
+extern LogDestination log_destination;  // currently selected destination for log()
+extern FILE* debugf;  // debug output stream used by debug_print() / debug_bulk_print()
+
+void error();  // close the debug file and terminate the process with exit status 1
+void init_debug(const char* file);  // open the debug file for appending (does nothing unless built with DEBUG)
+void close_debug();  // close the debug file if it is open (does nothing unless built with DEBUG)
+void log(int priority, const char* format, ...);  // printf-style message routed according to log_destination
+
+#endif /* _LOGGING_H */

+ 261 - 0
src/mixer.cpp

@@ -0,0 +1,261 @@
+/*
+ * mixer.cpp
+ * Mixer related routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <sys/time.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include "config.h"
+#include "rtl_airband.h"
+
+static char* err;
+
+static inline void mixer_set_error(const char* msg) {
+    err = strdup(msg);
+}
+
+/* Return the message stored by the most recent mixer_set_error() call (NULL if none). */
+const char* mixer_get_error() {
+    const char* last_message = err;
+    return last_message;
+}
+
+/*
+ * Look up a mixer by exact (case-sensitive) name.
+ * Returns a pointer to the first matching entry of `mixers`, or NULL when
+ * none of the `mixer_count` entries matches.
+ */
+mixer_t* getmixerbyname(const char* name) {
+    int idx = 0;
+    while (idx < mixer_count) {
+        if (strcmp(mixers[idx].name, name) == 0) {
+            debug_print("%s found at %d\n", name, idx);
+            return &mixers[idx];
+        }
+        ++idx;
+    }
+    debug_print("%s not found\n", name);
+    return NULL;
+}
+
+void mixer_disable(mixer_t* mixer) {  // turn a mixer off: clear its enabled flag and disable the outputs of its channel
+    mixer->enabled = false;  // mixer_thread skips mixers whose enabled flag is false
+    disable_channel_outputs(&mixer->channel);  // helper defined elsewhere; presumably disables each output of the channel
+}
+
+/*
+ * Append a new input to `mixer`.
+ *
+ *   ampfactor - amplification applied to this input's samples
+ *   balance   - stereo balance; the left gain is min(1, 1 - balance) and the
+ *               right gain is min(1, 1 + balance). Any non-zero balance
+ *               switches the mixer's output channel to stereo.
+ *
+ * Returns the index of the new input, or -1 on failure (the reason is
+ * available through mixer_get_error()). On failure mixer->input_count is
+ * left unchanged and the sample buffer allocated for the new slot is released.
+ */
+int mixer_connect_input(mixer_t* mixer, float ampfactor, float balance) {
+    if (!mixer) {
+        mixer_set_error("mixer is undefined");
+        return (-1);
+    }
+    int i = mixer->input_count;
+
+    // allocate new mixer - this could be more efficient by pre-allocating but this
+    // is only run at startup so not a big deal
+    if (mixer->inputs == NULL) {
+        mixer->inputs = (mixinput_t*)XCALLOC(i + 1, sizeof(struct mixinput_t));
+        mixer->inputs_todo = (bool*)XCALLOC(i + 1, sizeof(bool));
+        mixer->input_mask = (bool*)XCALLOC(i + 1, sizeof(bool));
+    } else {
+        mixer->inputs = (mixinput_t*)XREALLOC(mixer->inputs, (i + 1) * sizeof(struct mixinput_t));
+        mixer->inputs_todo = (bool*)XREALLOC(mixer->inputs_todo, (i + 1) * sizeof(bool));
+        mixer->input_mask = (bool*)XREALLOC(mixer->input_mask, (i + 1) * sizeof(bool));
+    }
+
+    mixer->inputs[i].wavein = (float*)XCALLOC(WAVE_LEN, sizeof(float));
+    if ((pthread_mutex_init(&mixer->inputs[i].mutex, NULL)) != 0) {
+        // input_count is not incremented on this path, so this slot is never
+        // visited again - release its sample buffer here instead of leaking it
+        free(mixer->inputs[i].wavein);
+        mixer->inputs[i].wavein = NULL;
+        mixer_set_error("failed to initialize input mutex");
+        return (-1);
+    }
+    mixer->inputs[i].ampfactor = ampfactor;
+    mixer->inputs[i].ampl = fminf(1.0f, 1.0f - balance);
+    mixer->inputs[i].ampr = fminf(1.0f, 1.0f + balance);
+    if (balance != 0.0f)
+        mixer->channel.mode = MM_STEREO;
+    mixer->inputs[i].ready = false;
+    mixer->inputs[i].has_signal = false;
+    mixer->inputs[i].input_overrun_count = 0;
+    mixer->input_mask[i] = true;
+    mixer->inputs_todo[i] = true;
+    mixer->enabled = true;
+    debug_print("ampfactor=%.1f ampl=%.1f ampr=%.1f\n", mixer->inputs[i].ampfactor, mixer->inputs[i].ampl, mixer->inputs[i].ampr);
+    return (mixer->input_count++);
+}
+
+/*
+ * Mark one input of `mixer` as dead. When this was the last live input,
+ * a notice is logged and the whole mixer is disabled.
+ */
+void mixer_disable_input(mixer_t* mixer, int input_idx) {
+    assert(mixer);
+    assert(input_idx < mixer->input_count);
+
+    mixer->input_mask[input_idx] = false;
+
+    // look for any input that is still alive
+    bool any_alive = false;
+    for (int n = 0; n < mixer->input_count && !any_alive; n++) {
+        any_alive = mixer->input_mask[n];
+    }
+    if (any_alive) {
+        return;
+    }
+
+    // nothing left to mix
+    log(LOG_NOTICE, "Disabling mixer '%s' - all inputs died\n", mixer->name);
+    mixer_disable(mixer);
+}
+
+/*
+ * Hand a batch of `len` samples to input `input_idx` of `mixer`.
+ * The samples are copied only when `has_signal` is true. If the previous
+ * batch has not been consumed yet (the input is still marked ready), the
+ * input's overrun counter is incremented. Runs under the input's mutex.
+ */
+void mixer_put_samples(mixer_t* mixer, int input_idx, const float* samples, bool has_signal, unsigned int len) {
+    assert(mixer);
+    assert(samples);
+    assert(input_idx < mixer->input_count);
+
+    mixinput_t* slot = mixer->inputs + input_idx;
+
+    pthread_mutex_lock(&slot->mutex);
+    slot->has_signal = has_signal;
+    if (has_signal) {
+        memcpy(slot->wavein, samples, len * sizeof(float));
+    }
+    if (!slot->ready) {
+        slot->ready = true;
+    } else {
+        debug_print("input %d overrun\n", input_idx);
+        slot->input_overrun_count++;
+    }
+    pthread_mutex_unlock(&slot->mutex);
+}
+
+/*
+ * Accumulate `size` samples of `in`, scaled by `mult`, into `sum`
+ * (sum[k] += in[k] * mult). A multiplier of exactly zero leaves `sum` untouched.
+ */
+void mix_waveforms(float* sum, const float* in, float mult, int size) {
+    if (mult != 0.0f) {
+        float* dst = sum;
+        const float* src = in;
+        for (int remaining = size; remaining > 0; --remaining, ++dst, ++src) {
+            *dst += *src * mult;
+        }
+    }
+}
+
+/* Samples are delivered to mixer inputs in batches of WAVE_BATCH size (default 1000, ie. 1/8 secs
+ * of audio). mixer_thread emits mixed audio in batches of the same size, but the loop runs
+ * twice as often (MIX_DIVISOR = 2) in order to accommodate any possible input jitter
+ * caused by irregular process scheduling, RTL clock instability, etc. For this purpose
+ * we allow each input batch to become delayed by 1/16 secs (max). This is accomplished by
+ * the mixer->interval counter, which counts from 2 to 0:
+ * - 2 - initial state after mixed audio output. We don't expect inputs to be ready yet,
+ *       but we check their readiness anyway.
+ * - 1 - here we expect most (if not all) inputs to be ready, so we mix them. If there are no
+ *       inputs left to handle in this WAVE_BATCH interval, we emit the mixed audio and reset
+ *       mixer->interval to the initial state (2).
+ * - 0 - here we expect to get output from all delayed inputs, which were not ready in the
+ *       interval. Any input which is still not ready, is skipped (filled with 0s), because
+ *       here we must emit the mixed audio to keep the desired audio bitrate.
+ */
+void* mixer_thread(void* param) {  // thread entry point: periodically mixes ready inputs of every enabled mixer into its output channel; param is a Signal*
+    assert(param != NULL);
+    Signal* signal = (Signal*)param;
+    int interval_usec = 1e+6 * WAVE_BATCH / WAVE_RATE / MIX_DIVISOR;  // sleep time per loop pass: duration of one batch divided by MIX_DIVISOR, in microseconds
+
+    debug_print("Starting mixer thread, signal %p\n", signal);
+
+    if (mixer_count <= 0)  // no mixers configured - nothing to do, the thread exits immediately
+        return 0;
+#ifdef DEBUG
+    struct timeval ts, te;  // ts: time of the previous emit, te: time of the current emit (debug timing only)
+    gettimeofday(&ts, NULL);
+#endif /* DEBUG */
+    while (!do_exit) {
+        usleep(interval_usec);
+        if (do_exit)  // re-check after sleeping so shutdown does not trigger one more mixing pass
+            return 0;
+        for (int i = 0; i < mixer_count; i++) {
+            mixer_t* mixer = mixers + i;
+            if (mixer->enabled == false)
+                continue;
+            channel_t* channel = &mixer->channel;
+
+            if (channel->state == CH_READY) {  // previous output not yet handled by output thread
+                if (--mixer->interval > 0) {  // still some slack left: wait for the consumer until the next pass
+                    continue;
+                } else {  // no slack left: count an output overrun and carry on with this pass
+                    debug_print("mixer[%d]: output channel overrun\n", i);
+                    mixer->output_overrun_count++;
+                }
+            }
+
+            for (int j = 0; j < mixer->input_count; j++) {
+                mixinput_t* input = mixer->inputs + j;
+                pthread_mutex_lock(&input->mutex);  // the same mutex is taken by mixer_put_samples() when it fills this input
+                if (mixer->inputs_todo[j] && mixer->input_mask[j] && input->ready) {  // not yet mixed in this batch, still alive, and has fresh data
+                    if (channel->state == CH_DIRTY) {  // first input mixed into this batch: clear the output buffer(s) first
+                        memset(channel->waveout, 0, WAVE_BATCH * sizeof(float));
+                        if (channel->mode == MM_STEREO)
+                            memset(channel->waveout_r, 0, WAVE_BATCH * sizeof(float));
+                        channel->axcindicate = NO_SIGNAL;
+                        channel->state = CH_WORKING;
+                    }
+                    debug_bulk_print("mixer[%d]: ampleft=%.1f ampright=%.1f\n", i, input->ampfactor * input->ampl, input->ampfactor * input->ampr);
+                    if (input->has_signal) {  // inputs without signal contribute nothing to the mix
+                        /* left channel */
+                        mix_waveforms(channel->waveout, input->wavein, input->ampfactor * input->ampl, WAVE_BATCH);
+                        /* right channel */
+                        if (channel->mode == MM_STEREO) {
+                            mix_waveforms(channel->waveout_r, input->wavein, input->ampfactor * input->ampr, WAVE_BATCH);
+                        }
+                        channel->axcindicate = SIGNAL;
+                    }
+                    input->ready = false;  // batch consumed; the producer may deliver the next one
+                    mixer->inputs_todo[j] = false;  // this input is done for the current output batch
+                }
+                pthread_mutex_unlock(&input->mutex);
+            }
+
+            // check if all "good" inputs have been handled, i.e. there is no enabled input (mixer->input_mask is true)
+            // that is still waiting to be mixed into this batch (mixer->inputs_todo is true)
+            bool all_good_inputs_handled = true;
+            for (int k = 0; k < mixer->input_count && all_good_inputs_handled; k++) {
+                if (mixer->inputs_todo[k] && mixer->input_mask[k]) {
+                    all_good_inputs_handled = false;
+                }
+            }
+
+            if ((all_good_inputs_handled == true) || mixer->interval == 0) {  // all good inputs handled or last interval passed
+
+#ifdef DEBUG
+                gettimeofday(&te, NULL);
+
+                char* inputs_todo_char = (char*)XCALLOC(mixer->input_count + 1, sizeof(char));  // one '+'/'-' per input plus the terminator
+                char* input_mask_char = (char*)XCALLOC(mixer->input_count + 1, sizeof(char));
+                for (int k = 0; k < mixer->input_count; k++) {
+                    inputs_todo_char[k] = mixer->inputs_todo[k] ? '+' : '-';
+                    input_mask_char[k] = mixer->input_mask[k] ? '+' : '-';
+                }
+                inputs_todo_char[mixer->input_count] = '\0';
+                input_mask_char[mixer->input_count] = '\0';
+
+                debug_bulk_print("mixerinput: %lu.%lu %lu int=%d inp_unhandled=%s inp_mask=%s\n", te.tv_sec, (unsigned long)te.tv_usec, (te.tv_sec - ts.tv_sec) * 1000000UL + te.tv_usec - ts.tv_usec,
+                                 mixer->interval, inputs_todo_char, input_mask_char);
+
+                free(inputs_todo_char);
+                free(input_mask_char);
+
+                ts.tv_sec = te.tv_sec;
+                ts.tv_usec = te.tv_usec;
+#endif /* DEBUG */
+
+                channel->state = CH_READY;  // hand the mixed batch over to the consumer
+                signal->send();  // Signal is declared elsewhere; presumably this wakes the output thread - TODO confirm
+                mixer->interval = MIX_DIVISOR;  // restart the per-batch countdown
+                for (int k = 0; k < mixer->input_count; k++) {
+                    mixer->inputs_todo[k] = true;  // every input is pending again for the next batch
+                }
+            } else {
+                mixer->interval--;  // some inputs are still pending: use up one slot of slack
+            }
+        }
+    }
+    return 0;
+}

+ 1005 - 0
src/output.cpp

@@ -0,0 +1,1005 @@
+/*
+ * output.cpp
+ * Output related routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <ogg/ogg.h>
+#include <shout/shout.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vorbis/vorbisenc.h>
+
+// SHOUTERR_RETRY is available since libshout 2.4.0.
+// Set it to an impossible value if it's not there.
+#ifndef SHOUTERR_RETRY
+#define SHOUTERR_RETRY (-255)
+#endif /* SHOUTERR_RETRY */
+
+#include <lame/lame.h>
+
+#ifdef WITH_PULSEAUDIO
+#include <pulse/pulseaudio.h>
+#endif /* WITH_PULSEAUDIO */
+
+#include <syslog.h>
+#include <cassert>
+#include <cerrno>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <sstream>
+#include <string>
+#include "config.h"
+#include "helper_functions.h"
+#include "input-common.h"
+#include "rtl_airband.h"
+
+/*
+ * Create a libshout connection configured from `icecast` and try to connect
+ * to the server in non-blocking mode, polling for up to 30 seconds.
+ *
+ * On success the connected handle is stored in icecast->shout. On any
+ * failure the temporary handle is released and icecast->shout is left
+ * untouched, so the caller can detect the failure by checking that field.
+ *
+ *   icecast - connection parameters (host, port, mountpoint, credentials, metadata)
+ *   mixmode - MM_STEREO announces 2 audio channels to the server, anything else 1
+ */
+void shout_setup(icecast_data* icecast, mix_modes mixmode) {
+    int ret;
+    shout_t* shouttemp = shout_new();
+    if (shouttemp == NULL) {
+        printf("cannot allocate\n");
+        return;  // nothing was allocated; continuing would dereference a NULL handle
+    }
+    if (shout_set_host(shouttemp, icecast->hostname) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+    if (shout_set_protocol(shouttemp, SHOUT_PROTOCOL_HTTP) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+    if (shout_set_port(shouttemp, icecast->port) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+#ifdef LIBSHOUT_HAS_TLS
+    if (shout_set_tls(shouttemp, icecast->tls_mode) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+#endif /* LIBSHOUT_HAS_TLS */
+    // Build "/<mountpoint>" in a std::string: a fixed-size buffer could be
+    // overflowed by a long mountpoint taken from the configuration.
+    const std::string mp = std::string("/") + icecast->mountpoint;
+    if (shout_set_mount(shouttemp, mp.c_str()) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+    if (shout_set_user(shouttemp, icecast->username) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+    if (shout_set_password(shouttemp, icecast->password) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+#ifdef LIBSHOUT_HAS_CONTENT_FORMAT
+    if (shout_set_content_format(shouttemp, SHOUT_FORMAT_MP3, SHOUT_USAGE_AUDIO, NULL) != SHOUTERR_SUCCESS) {
+#else
+    if (shout_set_format(shouttemp, SHOUT_FORMAT_MP3) != SHOUTERR_SUCCESS) {
+#endif /* LIBSHOUT_HAS_CONTENT_FORMAT */
+        shout_free(shouttemp);
+        return;
+    }
+    // name, genre and description are optional: they are only sent when configured
+    if (icecast->name && shout_set_meta(shouttemp, SHOUT_META_NAME, icecast->name) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+    if (icecast->genre && shout_set_meta(shouttemp, SHOUT_META_GENRE, icecast->genre) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+    if (icecast->description && shout_set_meta(shouttemp, SHOUT_META_DESCRIPTION, icecast->description) != SHOUTERR_SUCCESS) {
+        shout_free(shouttemp);
+        return;
+    }
+    char samplerates[20];
+    snprintf(samplerates, sizeof(samplerates), "%d", MP3_RATE);
+    shout_set_audio_info(shouttemp, SHOUT_AI_SAMPLERATE, samplerates);
+    shout_set_audio_info(shouttemp, SHOUT_AI_CHANNELS, (mixmode == MM_STEREO ? "2" : "1"));
+
+    if (shout_set_nonblocking(shouttemp, 1) != SHOUTERR_SUCCESS) {
+        log(LOG_ERR, "Error setting non-blocking mode: %s\n", shout_get_error(shouttemp));
+        shout_free(shouttemp);  // release the handle on this error path as on all the others
+        return;
+    }
+    ret = shout_open(shouttemp);
+    if (ret == SHOUTERR_SUCCESS)
+        ret = SHOUTERR_CONNECTED;
+
+    if (ret == SHOUTERR_BUSY || ret == SHOUTERR_RETRY)
+        log(LOG_NOTICE, "Connecting to %s:%d/%s...\n", icecast->hostname, icecast->port, icecast->mountpoint);
+
+    int shout_timeout = 30 * 5;  // 30 * 5 * 200ms = 30s
+    while ((ret == SHOUTERR_BUSY || ret == SHOUTERR_RETRY) && shout_timeout-- > 0) {
+        SLEEP(200);
+        ret = shout_get_connected(shouttemp);
+    }
+
+    if (ret == SHOUTERR_CONNECTED) {
+        log(LOG_NOTICE, "Connected to %s:%d/%s\n", icecast->hostname, icecast->port, icecast->mountpoint);
+        SLEEP(100);
+        icecast->shout = shouttemp;
+    } else {
+        log(LOG_WARNING, "Could not connect to %s:%d/%s: %s\n", icecast->hostname, icecast->port, icecast->mountpoint, shout_get_error(shouttemp));
+        shout_close(shouttemp);
+        shout_free(shouttemp);
+        return;
+    }
+}
+
+/*
+ * Create and configure a LAME encoder: input at WAVE_RATE, output at
+ * MP3_RATE, VBR (vbr_mtrh), mono or joint stereo depending on `mixmode`.
+ *
+ *   highpass / lowpass - filter frequencies handed to
+ *                        lame_set_highpassfreq() / lame_set_lowpassfreq()
+ *
+ * Returns the encoder handle, or NULL when the encoder could not be
+ * allocated or LAME rejected the parameter set.
+ */
+lame_t airlame_init(mix_modes mixmode, int highpass, int lowpass) {
+    lame_t lame = lame_init();
+    if (!lame) {
+        log(LOG_WARNING, "lame_init failed\n");
+        return NULL;
+    }
+
+    lame_set_in_samplerate(lame, WAVE_RATE);
+    lame_set_VBR(lame, vbr_mtrh);
+    lame_set_brate(lame, 16);
+    lame_set_quality(lame, 7);
+    lame_set_lowpassfreq(lame, lowpass);
+    lame_set_highpassfreq(lame, highpass);
+    lame_set_out_samplerate(lame, MP3_RATE);
+    if (mixmode == MM_STEREO) {
+        lame_set_num_channels(lame, 2);
+        lame_set_mode(lame, JOINT_STEREO);
+    } else {
+        lame_set_num_channels(lame, 1);
+        lame_set_mode(lame, MONO);
+    }
+    debug_print("lame init with mixmode=%s\n", mixmode == MM_STEREO ? "MM_STEREO" : "MM_MONO");
+    // lame_init_params() reports an invalid parameter set with a negative
+    // value; an encoder in that state must not be handed to the caller.
+    if (lame_init_params(lame) < 0) {
+        log(LOG_WARNING, "lame_init_params failed\n");
+        lame_close(lame);
+        return NULL;
+    }
+    return lame;
+}
+
+/*
+ * A short, pre-encoded MP3 fragment: either a sine tone of the given
+ * frequency or silence (hz == 0). The fragment is encoded once in the
+ * constructor and can then be written to a file any number of times.
+ */
+class LameTone {
+    unsigned char* _data;  // encoded MP3 bytes (LAMEBUF_SIZE bytes allocated)
+    int _bytes;            // number of valid bytes in _data; <= 0 when encoding failed
+
+   public:
+    // Encode `msec` milliseconds of a `hz` Hz sine (amplitude 0.9), or silence when hz is 0.
+    LameTone(mix_modes mixmode, int msec, unsigned int hz = 0) : _data(NULL), _bytes(0) {
+        _data = (unsigned char*)XCALLOC(1, LAMEBUF_SIZE);
+
+        int samples = (msec * WAVE_RATE) / 1000;
+        float* pcm = (float*)XCALLOC(samples, sizeof(float));
+
+        debug_print("LameTone with mixmode=%s msec=%d hz=%u\n", mixmode == MM_STEREO ? "MM_STEREO" : "MM_MONO", msec, hz);
+        if (hz == 0) {
+            memset(pcm, 0, samples * sizeof(float));
+        } else {
+            const float period = 1.0 / (float)hz;
+            const float sample_time = 1.0 / (float)WAVE_RATE;
+            float t = 0;
+            int n = 0;
+            while (n < samples) {
+                pcm[n] = 0.9 * sinf(t * 2.0 * M_PI / period);
+                ++n;
+                t += sample_time;
+            }
+        }
+
+        lame_t encoder = airlame_init(mixmode, 0, 0);
+        if (encoder) {
+            // in stereo mode the same buffer feeds both channels
+            float* right = (mixmode == MM_STEREO ? pcm : NULL);
+            _bytes = lame_encode_buffer_ieee_float(encoder, pcm, right, samples, _data, LAMEBUF_SIZE);
+            if (_bytes <= 0) {
+                log(LOG_WARNING, "lame_encode_buffer_ieee_float: %d\n", _bytes);
+            } else {
+                // flush into a 32-byte aligned scratch position behind the
+                // encoded data, then move the flushed bytes down so they
+                // directly follow it
+                int flush_ofs = _bytes;
+                const int misalign = flush_ofs & 0x1f;
+                if (misalign != 0)
+                    flush_ofs += 0x20 - misalign;
+                if (flush_ofs < LAMEBUF_SIZE) {
+                    int flushed = lame_encode_flush(encoder, _data + flush_ofs, LAMEBUF_SIZE - flush_ofs);
+                    if (flushed > 0) {
+                        memmove(_data + _bytes, _data + flush_ofs, flushed);
+                        _bytes += flushed;
+                    }
+                }
+            }
+            lame_close(encoder);
+        }
+        free(pcm);
+    }
+
+    ~LameTone() {
+        free(_data);  // free(NULL) is a no-op
+    }
+
+    // Write the fragment to `f`. Returns 0 on success, 1 when there is
+    // nothing to write, -1 when the write came up short.
+    int write(FILE* f) {
+        if (_data == NULL || _bytes <= 0)
+            return 1;
+
+        const size_t done = fwrite(_data, 1, _bytes, f);
+        if (done == (unsigned int)_bytes)
+            return 0;
+
+        log(LOG_WARNING, "LameTone: failed to write %d bytes\n", _bytes);
+        return -1;
+    }
+};
+
+/*
+ * Rename `oldpath` to `newpath`, treating a missing source file as success.
+ * Returns 0 when the rename succeeded or the source does not exist (ENOENT);
+ * for any other failure the error is logged and rename()'s negative result
+ * is returned.
+ */
+int rename_if_exists(char const* oldpath, char const* newpath) {
+    const int rc = rename(oldpath, newpath);
+    if (rc >= 0) {
+        return rc;
+    }
+    if (errno == ENOENT) {
+        return 0;
+    }
+    log(LOG_ERR, "Could not rename %s to %s: %s\n", oldpath, newpath, strerror(errno));
+    return rc;
+}
+
+/*
+ * Open output file (mp3 or raw IQ) for append or initial write.
+ * If appending to an audio file, insert discontinuity indicator tones
+ * as well as the appropriate amount of silence when in continuous mode.
+ */
+static int open_file(file_data* fdata, mix_modes mixmode, int is_audio) {  // returns 0 on success, -1 if fopen() failed
+    int rename_result = rename_if_exists(fdata->file_path.c_str(), fdata->file_path_tmp.c_str());  // move an existing final file back to its .tmp name so it can be continued
+    fdata->f = fopen(fdata->file_path_tmp.c_str(), fdata->append ? "a+" : "w");
+    if (fdata->f == NULL) {
+        return -1;
+    }
+
+    struct stat st = {};
+    if (!fdata->append || fstat(fileno(fdata->f), &st) != 0 || st.st_size == 0) {  // fresh or empty file (or size unknown): nothing to mark, just start writing
+        if (!fdata->split_on_transmission) {
+            log(LOG_INFO, "Writing to %s\n", fdata->file_path.c_str());
+        } else {
+            debug_print("Writing to %s\n", fdata->file_path_tmp.c_str());
+        }
+        return 0;
+    }
+    if (rename_result < 0) {  // the rename failed for a reason other than a missing source file
+        log(LOG_INFO, "Writing to %s\n", fdata->file_path.c_str());
+        debug_print("Writing to %s\n", fdata->file_path_tmp.c_str());
+    } else {
+        log(LOG_INFO, "Appending from pos %llu to %s\n", (unsigned long long)st.st_size, fdata->file_path.c_str());
+        debug_print("Appending from pos %llu to %s\n", (unsigned long long)st.st_size, fdata->file_path_tmp.c_str());
+    }
+
+    if (is_audio) {
+        // fill missing space with marker tones
+        LameTone lt_a(mixmode, 120, 2222);  // three 120 ms marker tones: 2222 Hz, 1111 Hz and 555 Hz
+        LameTone lt_b(mixmode, 120, 1111);
+        LameTone lt_c(mixmode, 120, 555);
+
+        int r = lt_a.write(fdata->f);  // r stays 0 while every write succeeds; the first non-zero result stops further writes
+        if (r == 0)
+            r = lt_b.write(fdata->f);
+        if (r == 0)
+            r = lt_c.write(fdata->f);
+
+        // fill in time delta with silence if continuous output mode
+        if (fdata->continuous) {
+            time_t now = time(NULL);
+            if (now > st.st_mtime) {
+                time_t delta = now - st.st_mtime;  // seconds elapsed since the existing file was last modified
+                if (delta > 3600) {
+                    log(LOG_WARNING, "Too big time difference: %llu sec, limiting to one hour\n", (unsigned long long)delta);
+                    delta = 3600;
+                }
+                LameTone lt_silence(mixmode, 1000);  // one second of encoded silence, written repeatedly
+                for (; (r == 0 && delta > 1); --delta)
+                    r = lt_silence.write(fdata->f);
+            }
+        }
+
+        if (r == 0)  // closing marker: the same three tones in reverse order
+            r = lt_c.write(fdata->f);
+        if (r == 0)
+            r = lt_b.write(fdata->f);
+        if (r == 0)
+            r = lt_a.write(fdata->f);
+
+        if (r < 0)  // a marker write came up short
+            fseek(fdata->f, st.st_size, SEEK_SET);  // NOTE(review): this path is only reached with mode "a+", where writes always go to the end of file, so this seek likely does not undo the partial marker - confirm
+    }
+    return 0;
+}
+
+/*
+ * Finish the output file described by `fdata`: for MP3 files flush the
+ * encoder's remaining data into the file, then close it, rename the
+ * temporary file to its final name and clear both stored paths.
+ * A NULL `fdata` is ignored.
+ */
+static void close_file(channel_t* channel, file_data* fdata) {
+    if (fdata == NULL) {
+        return;
+    }
+
+    const bool flush_encoder = fdata->type == O_FILE && fdata->f && channel->lame;
+    if (flush_encoder) {
+        int encoded = lame_encode_flush_nogap(channel->lame, channel->lamebuf, LAMEBUF_SIZE);
+        debug_print("closing file %s flushed %d\n", fdata->file_path.c_str(), encoded);
+
+        if (encoded > 0) {
+            const size_t expected = (size_t)encoded;
+            const size_t written = fwrite((void*)channel->lamebuf, 1, expected, fdata->f);
+            // expected is non-zero here, so "short write" also covers "nothing written"
+            if (written < expected)
+                log(LOG_WARNING, "Problem writing %s (%s)\n", fdata->file_path.c_str(), strerror(errno));
+        }
+    }
+
+    if (fdata->f) {
+        fclose(fdata->f);
+        fdata->f = NULL;
+        rename_if_exists(fdata->file_path_tmp.c_str(), fdata->file_path.c_str());
+    }
+    fdata->file_path.clear();
+    fdata->file_path_tmp.clear();
+}
+
+/*
+ * Close the currently open output file when its time is up.
+ *
+ * In "split_on_transmission" mode the file is closed when it has been open
+ * for more than an hour, or when it has been open for more than one second
+ * and nothing was written to it for more than half a second.
+ * Otherwise (append or continuous mode) the file is closed as soon as the
+ * hour of day differs from the hour in which it was opened.
+ * Nothing happens when no file is open.
+ */
+static void close_if_necessary(channel_t* channel, file_data* fdata) {
+    static const double MIN_TRANSMISSION_TIME_SEC = 1.0;
+    static const double MAX_TRANSMISSION_TIME_SEC = 60.0 * 60.0;
+    static const double MAX_TRANSMISSION_IDLE_SEC = 0.5;
+
+    if (fdata == NULL || fdata->f == NULL) {
+        return;
+    }
+
+    timeval now;
+    gettimeofday(&now, NULL);
+
+    if (fdata->split_on_transmission) {
+        const double duration_sec = delta_sec(&fdata->open_time, &now);
+        const double idle_sec = delta_sec(&fdata->last_write_time, &now);
+
+        const bool too_long = duration_sec > MAX_TRANSMISSION_TIME_SEC;
+        const bool went_quiet = duration_sec > MIN_TRANSMISSION_TIME_SEC && idle_sec > MAX_TRANSMISSION_IDLE_SEC;
+        if (too_long || went_quiet) {
+            debug_print("closing file %s, duration %f sec, idle %f sec\n", fdata->file_path.c_str(), duration_sec, idle_sec);
+            close_file(channel, fdata);
+        }
+        return;
+    }
+
+    // Only whether the hour changed matters, not its value. The conversion
+    // still has to honour use_localtime because some timezones have offsets
+    // that are not a whole number of hours. Each tm_hour is read right away,
+    // before the next conversion call is made.
+    struct tm* (*to_broken_down)(const time_t*) = use_localtime ? localtime : gmtime;
+    const int start_hour = to_broken_down(&(fdata->open_time.tv_sec))->tm_hour;
+    const int current_hour = to_broken_down(&now.tv_sec)->tm_hour;
+
+    if (start_hour == current_hour) {
+        return;
+    }
+    debug_print("closing file %s after crossing hour boundary\n", fdata->file_path.c_str());
+    close_file(channel, fdata);
+}
+
+/*
+ * Make sure a usable output file is open for this channel's file output.
+ *
+ * A file that is already open is first given the chance to be closed
+ * (see close_if_necessary). If it is still open afterwards, it is reused.
+ * Otherwise a new file name is derived from the current time (local or UTC,
+ * depending on use_localtime), the output directory is created if needed,
+ * and the file is opened under a ".tmp" name.
+ *
+ * Returns true when a file is open and ready for writing, false otherwise.
+ */
+static bool output_file_ready(channel_t* channel, file_data* fdata, mix_modes mixmode, int is_audio) {
+    if (fdata == NULL) {
+        return false;
+    }
+
+    close_if_necessary(channel, fdata);
+    if (fdata->f != NULL) {
+        return true;  // the current file is still good
+    }
+
+    timeval now;
+    gettimeofday(&now, NULL);
+    struct tm* broken_down = use_localtime ? localtime(&now.tv_sec) : gmtime(&now.tv_sec);
+
+    // per-transmission files get a timestamp down to the second, hourly files only down to the hour
+    const char* stamp_format = fdata->split_on_transmission ? "_%Y%m%d_%H%M%S" : "_%Y%m%d_%H";
+    char timestamp[32];
+    if (strftime(timestamp, sizeof(timestamp), stamp_format, broken_down) == 0) {
+        log(LOG_NOTICE, "strftime returned 0\n");
+        return false;
+    }
+
+    std::string output_dir;
+    if (!fdata->dated_subdirectories) {
+        output_dir = fdata->basedir;
+        make_dir(output_dir);
+    } else {
+        output_dir = make_dated_subdirs(fdata->basedir, broken_down);
+        if (output_dir.empty()) {
+            log(LOG_ERR, "Failed to create dated subdirectory\n");
+            return false;
+        }
+    }
+
+    // <dir>/<basename><timestamp>[_<frequency>]<suffix>
+    std::stringstream path_builder;
+    path_builder << output_dir << '/' << fdata->basename << timestamp;
+    if (fdata->include_freq) {
+        path_builder << '_' << channel->freqlist[channel->freq_idx].frequency;
+    }
+    path_builder << fdata->suffix;
+
+    fdata->file_path = path_builder.str();
+    fdata->file_path_tmp = fdata->file_path + ".tmp";
+
+    fdata->last_write_time = now;
+    fdata->open_time = now;
+
+    if (open_file(fdata, mixmode, is_audio) >= 0) {
+        return true;
+    }
+    log(LOG_WARNING, "Cannot open output file %s (%s)\n", fdata->file_path_tmp.c_str(), strerror(errno));
+    return false;
+}
+
+// Create all the output for a particular channel.
+//
+// If channel->need_mp3 is set, the current batch (WAVE_BATCH samples from
+// channel->waveout, plus channel->waveout_r in stereo mode) is MP3-encoded
+// once into channel->lamebuf; Icecast and O_FILE outputs then share that
+// encoded buffer. Every enabled output of the channel is then serviced
+// according to its type.
+//
+// cur_scan_freq: only tested for ">= 0" here. When it is non-negative and
+// the Icecast output has send_scan_freq_tags set, the stream metadata is
+// refreshed from channel->freqlist[channel->freq_idx].
+void process_outputs(channel_t* channel, int cur_scan_freq) {
+    int mp3_bytes = 0;
+    if (channel->need_mp3) {
+        // debug_bulk_print("channel->mode=%s\n", channel->mode == MM_STEREO ? "MM_STEREO" : "MM_MONO");
+        mp3_bytes = lame_encode_buffer_ieee_float(channel->lame, channel->waveout, (channel->mode == MM_STEREO ? channel->waveout_r : NULL), WAVE_BATCH, channel->lamebuf, LAMEBUF_SIZE);
+        // A negative value is an encoder error; it is only logged, and the
+        // "mp3_bytes <= 0" checks below make MP3 consumers skip this batch.
+        if (mp3_bytes < 0)
+            log(LOG_WARNING, "lame_encode_buffer_ieee_float: %d\n", mp3_bytes);
+    }
+    for (int k = 0; k < channel->output_count; k++) {
+        if (channel->outputs[k].enabled == false)
+            continue;
+        if (channel->outputs[k].type == O_ICECAST) {
+            icecast_data* icecast = (icecast_data*)(channel->outputs[k].data);
+            // Nothing to do while disconnected or when no MP3 data was produced.
+            if (icecast->shout == NULL || mp3_bytes <= 0)
+                continue;
+            int ret = shout_send(icecast->shout, channel->lamebuf, mp3_bytes);
+            // Drop the connection on a send error or when the libshout queue
+            // has grown past MAX_SHOUT_QUEUELEN. shout is left NULL; nothing
+            // in this function reconnects it.
+            if (ret != SHOUTERR_SUCCESS || shout_queuelen(icecast->shout) > MAX_SHOUT_QUEUELEN) {
+                if (shout_queuelen(icecast->shout) > MAX_SHOUT_QUEUELEN)
+                    log(LOG_WARNING, "Exceeded max backlog for %s:%d/%s, disconnecting\n", icecast->hostname, icecast->port, icecast->mountpoint);
+                // reset connection
+                log(LOG_WARNING, "Lost connection to %s:%d/%s\n", icecast->hostname, icecast->port, icecast->mountpoint);
+                shout_close(icecast->shout);
+                shout_free(icecast->shout);
+                icecast->shout = NULL;
+            } else if (icecast->send_scan_freq_tags && cur_scan_freq >= 0) {
+                // Publish the current frequency as the "song" tag: the
+                // configured label if there is one, otherwise "<freq> MHz".
+                shout_metadata_t* meta = shout_metadata_new();
+                char description[32];
+                if (channel->freqlist[channel->freq_idx].label != NULL) {
+                    if (shout_metadata_add(meta, "song", channel->freqlist[channel->freq_idx].label) != SHOUTERR_SUCCESS) {
+                        log(LOG_WARNING, "Failed to add shout metadata\n");
+                    }
+                } else {
+                    snprintf(description, sizeof(description), "%.3f MHz", channel->freqlist[channel->freq_idx].frequency / 1000000.0);
+                    if (shout_metadata_add(meta, "song", description) != SHOUTERR_SUCCESS) {
+                        log(LOG_WARNING, "Failed to add shout metadata\n");
+                    }
+                }
+                if (SHOUT_SET_METADATA(icecast->shout, meta) != SHOUTERR_SUCCESS) {
+                    log(LOG_WARNING, "Failed to add shout metadata\n");
+                }
+                shout_metadata_free(meta);
+            }
+        } else if (channel->outputs[k].type == O_FILE || channel->outputs[k].type == O_RAWFILE) {
+            file_data* fdata = (file_data*)(channel->outputs[k].data);
+
+            // Non-continuous outputs write nothing while there is no signal
+            // and the output was not active on the previous batch.
+            if (fdata->continuous == false && channel->axcindicate == NO_SIGNAL && channel->outputs[k].active == false) {
+                close_if_necessary(channel, fdata);
+                continue;
+            }
+
+            // MP3 file output needs encoded data; raw I/Q output does not.
+            if (channel->outputs[k].type == O_FILE && mp3_bytes <= 0)
+                continue;
+
+            // Last argument is 0 for raw I/Q files and 1 for MP3 files.
+            if (!output_file_ready(channel, fdata, channel->mode, (channel->outputs[k].type == O_RAWFILE ? 0 : 1))) {
+                log(LOG_WARNING, "Output disabled\n");
+                channel->outputs[k].enabled = false;
+                continue;
+            };
+
+            size_t buflen = 0, written = 0;
+            if (channel->outputs[k].type == O_FILE) {
+                buflen = (size_t)mp3_bytes;
+                written = fwrite(channel->lamebuf, 1, buflen, fdata->f);
+            } else if (channel->outputs[k].type == O_RAWFILE) {
+                // NOTE(review): 2 floats per sample, presumably interleaved I and Q - confirm.
+                buflen = 2 * sizeof(float) * WAVE_BATCH;
+                written = fwrite(channel->iq_out, 1, buflen, fdata->f);
+            }
+            // Any short write closes the file and permanently disables this output.
+            if (written < buflen) {
+                if (ferror(fdata->f))
+                    log(LOG_WARNING, "Cannot write to %s (%s), output disabled\n", fdata->file_path.c_str(), strerror(errno));
+                else
+                    log(LOG_WARNING, "Short write on %s, output disabled\n", fdata->file_path.c_str());
+                close_file(channel, fdata);
+                channel->outputs[k].enabled = false;
+            }
+            // Remember whether a signal was present for the check at the top
+            // of this branch on the next batch, and stamp the write time.
+            channel->outputs[k].active = (channel->axcindicate != NO_SIGNAL);
+            gettimeofday(&fdata->last_write_time, NULL);
+        } else if (channel->outputs[k].type == O_MIXER) {
+            // Only channel->waveout is handed to the mixer input.
+            mixer_data* mdata = (mixer_data*)(channel->outputs[k].data);
+            mixer_put_samples(mdata->mixer, mdata->input, channel->waveout, channel->axcindicate != NO_SIGNAL, WAVE_BATCH);
+        } else if (channel->outputs[k].type == O_UDP_STREAM) {
+            udp_stream_data* sdata = (udp_stream_data*)channel->outputs[k].data;
+
+            if (sdata->continuous == false && channel->axcindicate == NO_SIGNAL) {
+                continue;
+            }
+
+            // Mono sends one buffer; otherwise both waveout and waveout_r are passed.
+            if (channel->mode == MM_MONO) {
+                udp_stream_write(sdata, channel->waveout, (size_t)WAVE_BATCH * sizeof(float));
+            } else {
+                udp_stream_write(sdata, channel->waveout, channel->waveout_r, (size_t)WAVE_BATCH * sizeof(float));
+            }
+
+#ifdef WITH_PULSEAUDIO
+        } else if (channel->outputs[k].type == O_PULSE) {
+            pulse_data* pdata = (pulse_data*)(channel->outputs[k].data);
+            if (pdata->continuous == false && channel->axcindicate == NO_SIGNAL)
+                continue;
+
+            pulse_write_stream(pdata, channel->mode, channel->waveout, channel->waveout_r, (size_t)WAVE_BATCH * sizeof(float));
+#endif /* WITH_PULSEAUDIO */
+        }
+    }
+}
+
+// Mark every output of the channel as disabled and release the resource
+// backing it: Icecast connection, output file, mixer input, UDP stream or
+// PulseAudio stream.
+void disable_channel_outputs(channel_t* channel) {
+    for (int idx = 0; idx < channel->output_count; idx++) {
+        output_t* out = &channel->outputs[idx];
+        out->enabled = false;
+        switch (out->type) {
+            case O_ICECAST: {
+                icecast_data* ice = (icecast_data*)(out->data);
+                // Only an open connection needs to be torn down.
+                if (ice->shout != NULL) {
+                    log(LOG_WARNING, "Closing connection to %s:%d/%s\n", ice->hostname, ice->port, ice->mountpoint);
+                    shout_close(ice->shout);
+                    shout_free(ice->shout);
+                    ice->shout = NULL;
+                }
+                break;
+            }
+            case O_FILE:
+            case O_RAWFILE:
+                close_file(channel, (file_data*)(out->data));
+                break;
+            case O_MIXER: {
+                mixer_data* mix = (mixer_data*)(out->data);
+                mixer_disable_input(mix->mixer, mix->input);
+                break;
+            }
+            case O_UDP_STREAM:
+                udp_stream_shutdown((udp_stream_data*)out->data);
+                break;
+#ifdef WITH_PULSEAUDIO
+            case O_PULSE:
+                pulse_shutdown((pulse_data*)(out->data));
+                break;
+#endif /* WITH_PULSEAUDIO */
+            default:
+                break;
+        }
+    }
+}
+
+// Disable the outputs of every channel that belongs to the given device.
+void disable_device_outputs(device_t* dev) {
+    log(LOG_INFO, "Disabling device outputs\n");
+    channel_t* channel = dev->channels;
+    for (int n = 0; n < dev->channel_count; n++, channel++) {
+        disable_channel_outputs(channel);
+    }
+}
+
+// Write a metric name followed by its label set, e.g.
+//   channel_noise_level{freq="118.100",label="Tower"}
+// The sample value is NOT written; the caller appends it.
+//
+// f     - destination stream
+// name  - metric name, written verbatim
+// freq  - frequency in Hz; emitted in MHz with three decimals
+// label - optional free-form label; omitted from the output when NULL
+//
+// The label comes from user configuration, so backslash, double quote and
+// newline are escaped as the Prometheus text exposition format requires.
+// Without this a label such as `Tower "North"` yields an unparsable line.
+static void print_channel_metric(FILE* f, char const* name, float freq, char* label) {
+    fprintf(f, "%s{freq=\"%.3f\"", name, freq / 1000000.0);
+    if (label != NULL) {
+        fputs(",label=\"", f);
+        for (char const* p = label; *p != '\0'; p++) {
+            switch (*p) {
+                case '\\':
+                    fputs("\\\\", f);
+                    break;
+                case '"':
+                    fputs("\\\"", f);
+                    break;
+                case '\n':
+                    fputs("\\n", f);
+                    break;
+                default:
+                    fputc(*p, f);
+                    break;
+            }
+        }
+        fputc('"', f);
+    }
+    fprintf(f, "}");
+}
+
+// Emit the "channel_noise_level" gauge: one sample per configured frequency
+// of every channel on every device, holding squelch.noise_level().
+static void output_channel_noise_levels(FILE* f) {
+    fputs(
+        "# HELP channel_noise_level Raw squelch noise_level.\n"
+        "# TYPE channel_noise_level gauge\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_noise_level", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%.3f\n", channel->freqlist[n].squelch.noise_level());
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_dbfs_noise_level" gauge: squelch.noise_level() of every
+// configured frequency, converted with level_to_dBFS().
+static void output_channel_dbfs_noise_levels(FILE* f) {
+    fputs(
+        "# HELP channel_dbfs_noise_level Squelch noise_level as dBFS.\n"
+        "# TYPE channel_dbfs_noise_level gauge\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_dbfs_noise_level", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%.3f\n", level_to_dBFS(channel->freqlist[n].squelch.noise_level()));
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_signal_level" gauge: one sample per configured frequency
+// of every channel on every device, holding squelch.signal_level().
+static void output_channel_signal_levels(FILE* f) {
+    fputs(
+        "# HELP channel_signal_level Raw squelch signal_level.\n"
+        "# TYPE channel_signal_level gauge\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_signal_level", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%.3f\n", channel->freqlist[n].squelch.signal_level());
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_dbfs_signal_level" gauge: squelch.signal_level() of every
+// configured frequency, converted with level_to_dBFS().
+static void output_channel_dbfs_signal_levels(FILE* f) {
+    fputs(
+        "# HELP channel_dbfs_signal_level Squelch signal_level as dBFS.\n"
+        "# TYPE channel_dbfs_signal_level gauge\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_dbfs_signal_level", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%.3f\n", level_to_dBFS(channel->freqlist[n].squelch.signal_level()));
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_squelch_level" gauge: one sample per configured frequency
+// of every channel on every device, holding squelch.squelch_level().
+static void output_channel_squelch_levels(FILE* f) {
+    fputs(
+        "# HELP channel_squelch_level Squelch squelch_level.\n"
+        "# TYPE channel_squelch_level gauge\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_squelch_level", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%.3f\n", channel->freqlist[n].squelch.squelch_level());
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_squelch_counter" counter: squelch.open_count() for every
+// configured frequency of every channel on every device.
+static void output_channel_squelch_counter(FILE* f) {
+    fputs(
+        "# HELP channel_squelch_counter Squelch open_count.\n"
+        "# TYPE channel_squelch_counter counter\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_squelch_counter", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%zu\n", channel->freqlist[n].squelch.open_count());
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_flappy_counter" counter: squelch.flappy_count() for every
+// configured frequency of every channel on every device.
+static void output_channel_flappy_counter(FILE* f) {
+    fputs(
+        "# HELP channel_flappy_counter Squelch flappy_count.\n"
+        "# TYPE channel_flappy_counter counter\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_flappy_counter", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%zu\n", channel->freqlist[n].squelch.flappy_count());
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_ctcss_counter" counter: squelch.ctcss_count() for every
+// configured frequency of every channel on every device.
+static void output_channel_ctcss_counter(FILE* f) {
+    fputs(
+        "# HELP channel_ctcss_counter count of windows with CTCSS detected.\n"
+        "# TYPE channel_ctcss_counter counter\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_ctcss_counter", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%zu\n", channel->freqlist[n].squelch.ctcss_count());
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_no_ctcss_counter" counter: squelch.no_ctcss_count() for
+// every configured frequency of every channel on every device.
+static void output_channel_no_ctcss_counter(FILE* f) {
+    fputs(
+        "# HELP channel_no_ctcss_counter count of windows without CTCSS detected.\n"
+        "# TYPE channel_no_ctcss_counter counter\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_no_ctcss_counter", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%zu\n", channel->freqlist[n].squelch.no_ctcss_count());
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "channel_activity_counter" counter: the active_counter field of
+// every configured frequency of every channel on every device.
+static void output_channel_activity_counters(FILE* f) {
+    fputs(
+        "# HELP channel_activity_counter Loops of output_thread with frequency active.\n"
+        "# TYPE channel_activity_counter counter\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        device_t* dev = &devices[d];
+        for (int c = 0; c < dev->channel_count; c++) {
+            channel_t* channel = &dev->channels[c];
+            for (int n = 0; n < channel->freq_count; n++) {
+                print_channel_metric(f, "channel_activity_counter", channel->freqlist[n].frequency, channel->freqlist[n].label);
+                fprintf(f, "\t%zu\n", channel->freqlist[n].active_counter);
+            }
+        }
+    }
+    // Blank line separates this metric family from the next one.
+    fputc('\n', f);
+}
+
+// Emit the "buffer_overflow_count" counter: one sample per device, labelled
+// with the device index and holding dev->input->overflow_count.
+static void output_device_buffer_overflows(FILE* f) {
+    fputs(
+        "# HELP buffer_overflow_count Number of times a device's buffer has overflowed.\n"
+        "# TYPE buffer_overflow_count counter\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        fprintf(f, "buffer_overflow_count{device=\"%d\"}\t%zu\n", d, devices[d].input->overflow_count);
+    }
+    fputc('\n', f);
+}
+
+// Emit the "output_overrun_count" counter: first one sample per device, then
+// one per mixer, each labelled with its index.
+static void output_output_overruns(FILE* f) {
+    fputs(
+        "# HELP output_overrun_count Number of times a device or mixer output has overrun.\n"
+        "# TYPE output_overrun_count counter\n",
+        f);
+
+    for (int d = 0; d < device_count; d++) {
+        fprintf(f, "output_overrun_count{device=\"%d\"}\t%zu\n", d, devices[d].output_overrun_count);
+    }
+    for (int m = 0; m < mixer_count; m++) {
+        fprintf(f, "output_overrun_count{mixer=\"%d\"}\t%zu\n", m, mixers[m].output_overrun_count);
+    }
+    fputc('\n', f);
+}
+
+// Emit the "input_overrun_count" counter: one sample per mixer input,
+// labelled with the mixer and input indices. Nothing at all is written,
+// not even the HELP/TYPE header, when no mixers are configured.
+static void output_input_overruns(FILE* f) {
+    if (mixer_count != 0) {
+        fputs(
+            "# HELP input_overrun_count Number of times mixer input has overrun.\n"
+            "# TYPE input_overrun_count counter\n",
+            f);
+
+        for (int m = 0; m < mixer_count; m++) {
+            mixer_t* mixer = &mixers[m];
+            for (int in = 0; in < mixer->input_count; in++) {
+                fprintf(f, "input_overrun_count{mixer=\"%d\",input=\"%d\"}\t%zu\n", m, in, mixer->inputs[in].input_overrun_count);
+            }
+        }
+        fputc('\n', f);
+    }
+}
+
+// Rewrite the statistics file at stats_filepath with the current values of
+// all metrics.
+//
+// Does nothing when no stats file is configured. Otherwise the file is
+// rewritten only if at least STATS_FILE_TIMING seconds have elapsed since
+// *last_stats_write, or unconditionally once do_exit is set.
+// *last_stats_write is updated before the file is opened, so a failed open
+// is not retried until the next interval.
+void write_stats_file(timeval* last_stats_write) {
+    static const double STATS_FILE_TIMING = 15.0;
+    // Metric writers, in the order their sections appear in the file.
+    static void (*const writers[])(FILE*) = {
+        output_channel_activity_counters,
+        output_channel_noise_levels,
+        output_channel_dbfs_noise_levels,
+        output_channel_signal_levels,
+        output_channel_dbfs_signal_levels,
+        output_channel_squelch_counter,
+        output_channel_squelch_levels,
+        output_channel_flappy_counter,
+        output_channel_ctcss_counter,
+        output_channel_no_ctcss_counter,
+        output_device_buffer_overflows,
+        output_output_overruns,
+        output_input_overruns,
+    };
+
+    if (!stats_filepath) {
+        return;
+    }
+
+    timeval now;
+    gettimeofday(&now, NULL);
+
+    const bool due = do_exit || !(delta_sec(last_stats_write, &now) < STATS_FILE_TIMING);
+    if (!due) {
+        return;
+    }
+    *last_stats_write = now;
+
+    FILE* out = fopen(stats_filepath, "w");
+    if (out == NULL) {
+        log(LOG_WARNING, "Cannot open output file %s (%s)\n", stats_filepath, strerror(errno));
+        return;
+    }
+
+    for (size_t n = 0; n < sizeof(writers) / sizeof(writers[0]); n++) {
+        writers[n](out);
+    }
+
+    fclose(out);
+}
+
+// Output worker thread entry point.
+//
+// param must point to an output_params_t naming the half-open ranges of
+// mixers [mixer_start, mixer_end) and devices [device_start, device_end)
+// this thread services, plus the mp3_signal object it waits on.
+// Loops until do_exit becomes true; always returns 0.
+void* output_thread(void* param) {
+    assert(param != NULL);
+    output_params_t* output_param = (output_params_t*)param;
+    struct freq_tag tag;
+    struct timeval tv;
+    // Frequency to announce to process_outputs(); -1 means "nothing new".
+    int new_freq = -1;
+    timeval last_stats_write = {0, 0};
+
+    debug_print("Starting output thread, devices %d:%d, mixers %d:%d, signal %p\n", output_param->device_start, output_param->device_end, output_param->mixer_start, output_param->mixer_end,
+                output_param->mp3_signal);
+
+#ifdef DEBUG
+    timeval ts, te;
+    gettimeofday(&ts, NULL);
+#endif /* DEBUG */
+    while (!do_exit) {
+        // NOTE(review): presumably blocks until a producer signals new data - confirm.
+        output_param->mp3_signal->wait();
+        // Mixers first: a mixer channel in CH_READY state is written out and
+        // then marked CH_DIRTY.
+        for (int i = output_param->mixer_start; i < output_param->mixer_end; i++) {
+            if (mixers[i].enabled == false)
+                continue;
+            channel_t* channel = &mixers[i].channel;
+            if (channel->state == CH_READY) {
+                process_outputs(channel, -1);
+                channel->state = CH_DIRTY;
+            }
+        }
+#ifdef DEBUG
+        gettimeofday(&te, NULL);
+        debug_bulk_print("mixeroutput: %lu.%lu %lu\n", te.tv_sec, (unsigned long)te.tv_usec, (te.tv_sec - ts.tv_sec) * 1000000UL + te.tv_usec - ts.tv_usec);
+        ts.tv_sec = te.tv_sec;
+        ts.tv_usec = te.tv_usec;
+#endif /* DEBUG */
+        for (int i = output_param->device_start; i < output_param->device_end; i++) {
+            device_t* dev = devices + i;
+            if (dev->input->state == INPUT_RUNNING && dev->waveavail) {
+                if (dev->mode == R_SCAN) {
+                    // Take the head frequency tag. It is applied, and removed
+                    // from the queue, only once its timestamp plus
+                    // shout_metadata_delay seconds is no later than now.
+                    tag_queue_get(dev, &tag);
+                    if (tag.freq >= 0) {
+                        tag.tv.tv_sec += shout_metadata_delay;
+                        gettimeofday(&tv, NULL);
+                        if (tag.tv.tv_sec < tv.tv_sec || (tag.tv.tv_sec == tv.tv_sec && tag.tv.tv_usec <= tv.tv_usec)) {
+                            new_freq = tag.freq;
+                            tag_queue_advance(dev);
+                        }
+                    }
+                }
+                for (int j = 0; j < dev->channel_count; j++) {
+                    channel_t* channel = devices[i].channels + j;
+                    process_outputs(channel, new_freq);
+                    // Move AGC_EXTRA * 4 bytes from offset WAVE_BATCH to the
+                    // start of waveout. NOTE(review): presumably carries the
+                    // trailing AGC_EXTRA float samples into the next batch,
+                    // with 4 standing in for sizeof(float) - confirm.
+                    memcpy(channel->waveout, channel->waveout + WAVE_BATCH, AGC_EXTRA * 4);
+                }
+                // Mark this device's batch as consumed.
+                dev->waveavail = 0;
+            }
+            // make sure we don't carry new_freq value to the next receiver which might be working
+            // in multichannel mode
+            new_freq = -1;
+        }
+        // Only the thread whose device range starts at 0 writes the stats file.
+        if (output_param->device_start == 0) {
+            write_stats_file(&last_stats_write);
+        }
+    }
+    return 0;
+}
+
+// Housekeeping thread: after every SLEEP(10000) it walks all device and
+// mixer outputs and reconnects or tears down network/audio outputs.
+//
+// Device outputs:
+//   - Icecast: disconnected when the device input has failed, reconnected
+//     when the input is running and there is no connection.
+//   - UDP stream: shut down when the device input has failed.
+//   - PulseAudio (if built in): same policy as Icecast, keyed on the context.
+// Mixer outputs (enabled mixers and enabled outputs only): Icecast and
+// PulseAudio outputs without a connection are set up again.
+// Loops until do_exit becomes true; always returns 0.
+void* output_check_thread(void*) {
+    while (!do_exit) {
+        SLEEP(10000);
+
+        for (int d = 0; d < device_count; d++) {
+            device_t* dev = &devices[d];
+            for (int c = 0; c < dev->channel_count; c++) {
+                channel_t* channel = &dev->channels[c];
+                for (int o = 0; o < channel->output_count; o++) {
+                    output_t* out = &channel->outputs[o];
+                    if (out->type == O_ICECAST) {
+                        icecast_data* ice = (icecast_data*)(out->data);
+                        if (dev->input->state == INPUT_FAILED) {
+                            if (ice->shout) {
+                                log(LOG_WARNING, "Device #%d failed, disconnecting stream %s:%d/%s\n", d, ice->hostname, ice->port, ice->mountpoint);
+                                shout_close(ice->shout);
+                                shout_free(ice->shout);
+                                ice->shout = NULL;
+                            }
+                        } else if (dev->input->state == INPUT_RUNNING) {
+                            if (ice->shout == NULL) {
+                                log(LOG_NOTICE, "Trying to reconnect to %s:%d/%s...\n", ice->hostname, ice->port, ice->mountpoint);
+                                shout_setup(ice, channel->mode);
+                            }
+                        }
+                    } else if (out->type == O_UDP_STREAM) {
+                        udp_stream_data* udp = (udp_stream_data*)out->data;
+
+                        if (dev->input->state == INPUT_FAILED) {
+                            udp_stream_shutdown(udp);
+                        }
+#ifdef WITH_PULSEAUDIO
+                    } else if (out->type == O_PULSE) {
+                        pulse_data* pulse = (pulse_data*)(out->data);
+                        if (dev->input->state == INPUT_FAILED) {
+                            if (pulse->context) {
+                                pulse_shutdown(pulse);
+                            }
+                        } else if (dev->input->state == INPUT_RUNNING) {
+                            if (pulse->context == NULL) {
+                                pulse_setup(pulse, channel->mode);
+                            }
+                        }
+#endif /* WITH_PULSEAUDIO */
+                    }
+                }
+            }
+        }
+
+        for (int m = 0; m < mixer_count; m++) {
+            if (mixers[m].enabled == false)
+                continue;
+            channel_t* channel = &mixers[m].channel;
+            for (int o = 0; o < channel->output_count; o++) {
+                output_t* out = &channel->outputs[o];
+                if (out->enabled == false)
+                    continue;
+                if (out->type == O_ICECAST) {
+                    icecast_data* ice = (icecast_data*)(out->data);
+                    if (ice->shout == NULL) {
+                        log(LOG_NOTICE, "Trying to reconnect to %s:%d/%s...\n", ice->hostname, ice->port, ice->mountpoint);
+                        shout_setup(ice, channel->mode);
+                    }
+#ifdef WITH_PULSEAUDIO
+                } else if (out->type == O_PULSE) {
+                    pulse_data* pulse = (pulse_data*)(out->data);
+                    if (pulse->context == NULL) {
+                        pulse_setup(pulse, channel->mode);
+                    }
+#endif /* WITH_PULSEAUDIO */
+                }
+            }
+        }
+    }
+    return 0;
+}

+ 249 - 0
src/pulse.cpp

@@ -0,0 +1,249 @@
+/*
+ * pulse.cpp
+ * PulseAudio output routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <pulse/pulseaudio.h>
+#include <syslog.h>
+#include <iostream>
+#include "rtl_airband.h"
+
+#define SERVER_IFNOTNULL(x) ((x) ? (x) : "<default_server>")
+#define PA_LOOP_LOCK(x)                       \
+    if (!pa_threaded_mainloop_in_thread(x)) { \
+        pa_threaded_mainloop_lock(x);         \
+    }
+#define PA_LOOP_UNLOCK(x)                     \
+    if (!pa_threaded_mainloop_in_thread(x)) { \
+        pa_threaded_mainloop_unlock(x);       \
+    }
+
+using namespace std;
+
+// PulseAudio threaded main loop used by every function in this file.
+// NULL until pulse_init() creates it.
+pa_threaded_mainloop* mainloop = NULL;
+
+// Tear down a PulseAudio output: disconnect and release the left and right
+// streams (in that order), then the context, resetting each pointer to NULL.
+// Accepts a NULL pdata and members that are already NULL.
+void pulse_shutdown(pulse_data* pdata) {
+    if (pdata == NULL)
+        return;
+
+    PA_LOOP_LOCK(mainloop);
+    // Streams are released before the context they belong to.
+    pa_stream** slots[2] = {&pdata->left, &pdata->right};
+    for (int n = 0; n < 2; n++) {
+        if (*slots[n] == NULL)
+            continue;
+        pa_stream_disconnect(*slots[n]);
+        pa_stream_unref(*slots[n]);
+        *slots[n] = NULL;
+    }
+    if (pdata->context != NULL) {
+        pa_context_disconnect(pdata->context);
+        pa_context_unref(pdata->context);
+        pdata->context = NULL;
+    }
+    PA_LOOP_UNLOCK(mainloop);
+}
+
+// Stream underflow callback; userdata is the owning pulse_data.
+// Logged only for continuous outputs.
+static void pulse_stream_underflow_cb(pa_stream*, void* userdata) {
+    pulse_data* pdata = static_cast<pulse_data*>(userdata);
+    // do not flood the logs on every squelch closing
+    if (!pdata->continuous)
+        return;
+    log(LOG_INFO, "pulse: %s: stream \"%s\": underflow\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name);
+}
+
+// Stream overflow callback; userdata is the owning pulse_data.
+// Every overflow is logged.
+static void pulse_stream_overflow_cb(pa_stream*, void* userdata) {
+    pulse_data* owner = static_cast<pulse_data*>(userdata);
+    log(LOG_INFO, "pulse: %s: stream \"%s\": overflow\n", SERVER_IFNOTNULL(owner->server), owner->stream_name);
+}
+
+// Stream state callback; userdata is the owning pulse_data.
+//
+// When a stream becomes ready, the left stream is uncorked: immediately in
+// mono mode, or, in stereo mode, once both the left and the right stream
+// report PA_STREAM_READY. Failures and terminations are only logged.
+static void stream_state_cb(pa_stream* stream, void* userdata) {
+    pulse_data* pdata = (pulse_data*)userdata;
+
+    switch (pa_stream_get_state(stream)) {
+        case PA_STREAM_READY:
+            // Never hand a NULL stream to libpulse: pdata->left / pdata->right
+            // are reset to NULL by pulse_shutdown().
+            if (pdata->left == NULL)
+                break;
+            if (pdata->mode == MM_MONO ||
+                (pdata->right != NULL && pa_stream_get_state(pdata->left) == PA_STREAM_READY && pa_stream_get_state(pdata->right) == PA_STREAM_READY))
+                pa_stream_cork(pdata->left, 0, NULL, NULL);
+            break;
+        case PA_STREAM_UNCONNECTED:
+        case PA_STREAM_CREATING:
+            break;
+        case PA_STREAM_FAILED:
+            log(LOG_WARNING, "pulse: %s: stream \"%s\" failed: %s\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name, pa_strerror(pa_context_errno(pdata->context)));
+            break;
+        case PA_STREAM_TERMINATED:
+            log(LOG_WARNING, "pulse: %s: stream \"%s\" terminated\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name);
+            break;
+    }
+}
+
+// Create one playback stream on pdata->context and connect it to pdata->sink.
+//
+// ss          - sample format of the stream
+// cmap        - channel map of the stream
+// sync_stream - stream to synchronize with, or NULL
+//
+// Returns the connected (still corked) stream, or NULL on failure. On
+// failure no stream object is left behind: a stream that was created but
+// could not be connected is released here, since the caller only ever
+// receives NULL and could not free it.
+static pa_stream* pulse_setup_stream(pulse_data* pdata, const pa_sample_spec* ss, pa_channel_map* cmap, pa_stream* sync_stream) {
+    pa_stream* stream = NULL;
+    PA_LOOP_LOCK(mainloop);
+    if (!(stream = pa_stream_new(pdata->context, pdata->stream_name, ss, cmap))) {
+        log(LOG_ERR, "pulse: %s: failed to create stream \"%s\": %s\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name, pa_strerror(pa_context_errno(pdata->context)));
+        goto fail;
+    }
+    pa_stream_set_state_callback(stream, stream_state_cb, pdata);
+    pa_stream_set_underflow_callback(stream, pulse_stream_underflow_cb, pdata);
+    pa_stream_set_overflow_callback(stream, pulse_stream_overflow_cb, pdata);
+    // Initially streams are corked (paused). For mono streams this is irrelevant,
+    // but for stereo mixers it's required to keep left and right channels in sync.
+    // Starting the left channel stream before the other stream from the sync pair is
+    // set up causes the left channel stream to fail.
+    if (pa_stream_connect_playback(stream, pdata->sink, NULL, (pa_stream_flags_t)(PA_STREAM_INTERPOLATE_TIMING | PA_STREAM_ADJUST_LATENCY | PA_STREAM_START_CORKED | PA_STREAM_AUTO_TIMING_UPDATE),
+                                   NULL, sync_stream) < 0) {
+        log(LOG_ERR, "pulse: %s: failed to connect stream \"%s\": %s\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name, pa_strerror(pa_context_errno(pdata->context)));
+        goto fail;
+    }
+    log(LOG_INFO, "pulse: %s: stream \"%s\" connected\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name);
+    PA_LOOP_UNLOCK(mainloop);
+    return stream;
+fail:
+    // Drop our reference to a stream that was created but never connected.
+    if (stream != NULL)
+        pa_stream_unref(stream);
+    PA_LOOP_UNLOCK(mainloop);
+    return NULL;
+}
+
+// Create the playback stream(s) for an output: a single stream in mono mode,
+// or a left stream plus a right stream synchronized to it in stereo mode.
+// Each stream carries one channel of 32-bit little-endian float samples at
+// WAVE_RATE. If any stream cannot be set up, the whole output is shut down.
+static void pulse_setup_streams(pulse_data* pdata) {
+    const pa_sample_spec ss = {
+#if __cplusplus >= 199711L
+        .format = PA_SAMPLE_FLOAT32LE,
+        .rate = WAVE_RATE,
+        .channels = 1
+#else  // for g++ 4.6 (eg. Raspbian Wheezy)
+        PA_SAMPLE_FLOAT32LE,
+        WAVE_RATE,
+        1
+#endif /* __cplusplus */
+    };
+    const bool stereo = (pdata->mode == MM_STEREO);
+
+    pa_channel_map_init_mono(&pdata->lmap);
+    pdata->lmap.map[0] = (stereo ? PA_CHANNEL_POSITION_LEFT : PA_CHANNEL_POSITION_MONO);
+    pdata->left = pulse_setup_stream(pdata, &ss, &pdata->lmap, NULL);
+    if (pdata->left == NULL) {
+        pulse_shutdown(pdata);
+        return;
+    }
+    if (!stereo)
+        return;
+
+    // The right stream is created in sync with the left one.
+    pa_channel_map_init_mono(&pdata->rmap);
+    pdata->rmap.map[0] = PA_CHANNEL_POSITION_RIGHT;
+    pdata->right = pulse_setup_stream(pdata, &ss, &pdata->rmap, pdata->left);
+    if (pdata->right == NULL)
+        pulse_shutdown(pdata);
+}
+
+// Context state callback; userdata is the owning pulse_data.
+// Streams are created once the context is ready; a failed connection is
+// logged and the output is shut down. All other states need no action,
+// apart from a log line while connecting.
+static void pulse_ctx_state_cb(pa_context* c, void* userdata) {
+    pulse_data* pdata = (pulse_data*)userdata;
+    switch (pa_context_get_state(c)) {
+        case PA_CONTEXT_CONNECTING:
+            log(LOG_INFO, "pulse: %s: connecting...\n", SERVER_IFNOTNULL(pdata->server));
+            break;
+        case PA_CONTEXT_READY:
+            pulse_setup_streams(pdata);
+            break;
+        case PA_CONTEXT_FAILED:
+            log(LOG_ERR, "pulse: %s: connection failed: %s\n", SERVER_IFNOTNULL(pdata->server), pa_strerror(pa_context_errno(pdata->context)));
+            pulse_shutdown(pdata);
+            break;
+        case PA_CONTEXT_UNCONNECTED:
+        case PA_CONTEXT_AUTHORIZING:
+        case PA_CONTEXT_SETTING_NAME:
+        case PA_CONTEXT_TERMINATED:
+            break;
+    }
+}
+
+// Create the shared threaded main loop if it does not exist yet.
+// On allocation failure a message is printed to stderr and error() is called.
+void pulse_init() {
+    if (mainloop != NULL)
+        return;
+    mainloop = pa_threaded_mainloop_new();
+    if (mainloop == NULL) {
+        cerr << "Failed to initialize PulseAudio main loop - aborting\n";
+        error();
+    }
+}
+
+// Create a PulseAudio context for the output and start connecting it to
+// pdata->server. Stream creation happens later, from the context state
+// callback.
+//
+// mixmode is stored in pdata->mode.
+// Returns 0 when the connection attempt was started, -1 if the context
+// could not be created or the connect call failed.
+int pulse_setup(pulse_data* pdata, mix_modes mixmode) {
+    pdata->context = pa_context_new(pa_threaded_mainloop_get_api(mainloop), pdata->name);
+    if (pdata->context == NULL) {
+        log(LOG_ERR, "%s", "pulse: failed to create context\n");
+        return -1;
+    }
+    pdata->mode = mixmode;
+
+    PA_LOOP_LOCK(mainloop);
+    pa_context_set_state_callback(pdata->context, &pulse_ctx_state_cb, pdata);
+    const bool started = !(pa_context_connect(pdata->context, pdata->server, PA_CONTEXT_NOFLAGS, NULL) < 0);
+    if (!started) {
+        log(LOG_WARNING, "pulse: %s: failed to connect: %s\n", SERVER_IFNOTNULL(pdata->server), pa_strerror(pa_context_errno(pdata->context)));
+        // Don't clean up things here, context state is now set to PA_CONTEXT_FAILED,
+        // so pulse_ctx_state_cb will take care of that.
+    }
+    PA_LOOP_UNLOCK(mainloop);
+    return started ? 0 : -1;
+}
+
+// Start the shared threaded main loop. Does nothing if pulse_init() has not
+// created it. A failure to start the loop thread is logged rather than
+// silently ignored; without the loop no PulseAudio output can make progress.
+void pulse_start() {
+    if (!mainloop)
+        return;
+    PA_LOOP_LOCK(mainloop);
+    if (pa_threaded_mainloop_start(mainloop) < 0) {
+        log(LOG_ERR, "%s", "pulse: failed to start main loop\n");
+    }
+    PA_LOOP_UNLOCK(mainloop);
+}
+
+// Write len bytes of samples to one stream.
+//
+// stream    - target stream; may be NULL or not yet ready
+// pdata     - owning output, used for log messages only
+// data, len - sample buffer and its size in bytes
+// is_master - when true, the stream latency is checked before writing
+//
+// Returns 0 on success. Returns -1 if the stream is NULL or not ready, if
+// the latency cannot be read or exceeds PULSE_STREAM_LATENCY_LIMIT (master
+// stream only), or if the write itself fails.
+static int pulse_write_single_stream(pa_stream* stream, pulse_data* pdata, const float* data, size_t len, bool is_master) {
+    int ret = -1;
+
+    PA_LOOP_LOCK(mainloop);
+    if (stream != NULL && pa_stream_get_state(stream) == PA_STREAM_READY) {
+        bool may_write = true;
+        if (is_master) { /* latency info is only meaningful for master stream) */
+            pa_usec_t latency = 0;
+            const int lret = pa_stream_get_latency(stream, &latency, NULL);
+            if (lret < 0) {
+                log(LOG_WARNING, "pulse: %s: failed to get latency info for stream \"%s\" (error is: %s), disconnecting\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name, pa_strerror(lret));
+                may_write = false;
+            } else if (latency > PULSE_STREAM_LATENCY_LIMIT) {
+                log(LOG_INFO, "pulse: %s: exceeded max backlog for stream \"%s\", disconnecting\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name);
+                may_write = false;
+            } else {
+                debug_bulk_print("pulse: %s: stream=\"%s\" lret=%d latency=%f ms\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name, lret, (float)latency / 1000.0f);
+            }
+        }
+        if (may_write) {
+            if (pa_stream_write(stream, data, len, NULL, 0LL, PA_SEEK_RELATIVE) < 0) {
+                log(LOG_WARNING, "pulse: %s: could not write to stream \"%s\", disconnecting\n", SERVER_IFNOTNULL(pdata->server), pdata->stream_name);
+            } else {
+                ret = 0;
+            }
+        }
+    }
+    PA_LOOP_UNLOCK(mainloop);
+    return ret;
+}
+
+// Write one batch of samples to a PulseAudio output.
+//
+// Nothing is written unless the context exists and is ready. data_left goes
+// to the left (master) stream; in stereo mode data_right then goes to the
+// right stream. len is the size of each buffer in bytes. If either write
+// fails, the whole output is shut down.
+void pulse_write_stream(pulse_data* pdata, mix_modes mode, const float* data_left, const float* data_right, size_t len) {
+    PA_LOOP_LOCK(mainloop);
+    if (pdata->context && pa_context_get_state(pdata->context) == PA_CONTEXT_READY) {
+        bool ok = !(pulse_write_single_stream(pdata->left, pdata, data_left, len, true) < 0);
+        // The right channel is attempted only after the left one succeeded.
+        if (ok && mode == MM_STEREO) {
+            ok = !(pulse_write_single_stream(pdata->right, pdata, data_right, len, false) < 0);
+        }
+        if (!ok) {
+            pulse_shutdown(pdata);
+        }
+    }
+    PA_LOOP_UNLOCK(mainloop);
+}

+ 1332 - 0
src/rtl_airband.cpp

@@ -0,0 +1,1332 @@
+/*
+ * RTLSDR AM/NFM demodulator, mixer, streamer and recorder
+ *
+ * Copyright (c) 2014 Wong Man Hang <microtony@gmail.com>
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined WITH_BCM_VC && !defined __arm__
+#error Broadcom VideoCore support can only be enabled on ARM builds
+#endif
+
+// From this point we may safely assume that WITH_BCM_VC implies __arm__
+
+#ifdef WITH_BCM_VC
+#include "hello_fft/gpu_fft.h"
+#include "hello_fft/mailbox.h"
+#endif /* WITH_BCM_VC */
+
+#include <fcntl.h>
+#include <lame/lame.h>
+#include <ogg/ogg.h>
+#include <pthread.h>
+#include <shout/shout.h>
+#include <stdint.h>  // uint8_t
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <vorbis/vorbisenc.h>
+#include <algorithm>
+#include <cassert>
+#include <cerrno>
+#include <cmath>
+#include <csignal>
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <libconfig.h++>
+#include "input-common.h"
+#include "logging.h"
+#include "rtl_airband.h"
+#include "squelch.h"
+#include <semaphore.h>
+#ifdef WITH_PROFILING
+#include "gperftools/profiler.h"
+#endif /* WITH_PROFILING */
+#include <gpiod.h>
+#include <ncurses.h>
+
+
+// Parameters of the GPIO lighting logic (see belysning_control_thread and demodulate).
+// The *_TIME values are compared against / added to tv_sec, i.e. they are in seconds.
+// START_ON_TIME: how long the gpio26line output stays on after the first activation.
+#define START_ON_TIME 30
+// OPEN_TIME: delay applied to the squelch/audio start times before activation is allowed.
+#define OPEN_TIME   5
+// PROLONG_TIME: extension of the on-period when new status arrives close to the stop time.
+#define PROLONG_TIME 10
+// WAVE_OUT_LEVEL / WAVE_IN_LEVEL: thresholds on |waveout| and wavein used by the AM branch
+// of demodulate() to decide whether audio is present.
+#define WAVE_OUT_LEVEL 0.1
+#define WAVE_IN_LEVEL 10
+// NOTE(review): BLINK_TIME is not referenced in the visible part of this file - confirm it is used.
+#define BLINK_TIME 5
+
+using namespace std;
+using namespace libconfig;
+// libgpiod handles; opened and requested as outputs in main().
+	struct gpiod_chip *chip;
+	struct gpiod_line *gpio16line;  // mirrors squelch_status (set in demodulate)
+    struct gpiod_line *gpio20line;  // mirrors audio_status (set in demodulate)
+    struct gpiod_line *gpio26line;  // lighting output, driven by belysning_control_thread
+
+
+// NOTE(review): MAX_BUFFERS is not referenced in the visible part of this file - confirm it is used.
+#define MAX_BUFFERS 10
+
+// Hand-off between demodulate() and belysning_control_thread():
+// demodulate posts mutex_sem after updating status_data and then waits on spooler_sem,
+// which the lighting thread posts once it has read the new value.
+sem_t mutex_sem, spooler_sem;
+// Held by demodulate() around each status_data update and hand-off.
+pthread_mutex_t lock_data; 
+device_t* devices;
+mixer_t* mixers;
+int squelch_status = 0;   // 1 while the squelch indicator is '*', else 0
+int audio_status = 0;     // 1 while the AM audio thresholds are exceeded, else 0
+int status_data = 0;      // bit 0 = squelch_status, bit 1 = audio_status
+int status_data_old = 0;  // last status_data value handed to the lighting thread
+int device_count, mixer_count;
+static int devices_running = 0;
+int tui = 0;  // do not display textual user interface
+int shout_metadata_delay = 3;
+// Set to 1 by sighandler() (and by demodulate() when all receivers failed); polled by the worker loops.
+volatile int do_exit = 0;
+bool use_localtime = false;
+bool multiple_demod_threads = false;
+bool multiple_output_threads = false;
+bool log_scan_activity = false;
+char* stats_filepath = NULL;
+// FFT length is a power of two: fft_size == 2^fft_size_log.
+size_t fft_size_log = DEFAULT_FFT_SIZE_LOG;
+size_t fft_size = 1 << fft_size_log;
+
+#ifdef NFM
+float alpha = exp(-1.0f / (WAVE_RATE * 2e-4));
+// Selectable FM discriminator implementations (see polar_disc_fast / fm_quadri_demod).
+enum fm_demod_algo { FM_FAST_ATAN2, FM_QUADRI_DEMOD };
+enum fm_demod_algo fm_demod = FM_FAST_ATAN2;
+#endif /* NFM */
+
+#ifdef DEBUG
+char* debug_path;
+#endif /* DEBUG */
+
+/*
+ * Signal handler: logs the signal number and requests shutdown by setting do_exit,
+ * which the worker loops in this file poll.
+ * NOTE(review): log() is called from signal context; whether it is async-signal-safe
+ * depends on its implementation, which is not visible here - confirm.
+ */
+void sighandler(int sig) {
+    log(LOG_NOTICE, "Got signal %d, exiting\n", sig);
+    do_exit = 1;
+}
+
+/*
+ * Frequency-hopping controller for one device (thread entry point).
+ *
+ * params is the device_t* to control. Only channel 0 is considered; if it has
+ * fewer than two frequencies the thread returns immediately.
+ *
+ * Every 200 ms the channel's signal indicator is polled. After ten consecutive
+ * polls without signal, each further quiet poll retunes the input to the next
+ * frequency of the list. When a signal shows up right after such a quiet period,
+ * the activity is optionally logged and, if the frequency differs from the last
+ * reported one, a tag is queued so that outputs can update their metadata.
+ *
+ * The loop ends when do_exit is set or when retuning fails. Always returns 0.
+ */
+void* controller_thread(void* params) {
+    device_t* const dev = (device_t*)params;
+    int freq_index = 0;
+    int quiet_polls = 0;
+
+    if (dev->channels[0].freq_count < 2)
+        return 0;
+
+    while (!do_exit) {
+        SLEEP(200);
+
+        if (dev->channels[0].axcindicate != NO_SIGNAL) {
+            if (quiet_polls == 10) {
+                if (log_scan_activity)
+                    log(LOG_INFO, "Activity on %7.3f MHz\n", dev->channels[0].freqlist[freq_index].frequency / 1000000.0);
+                if (freq_index != dev->last_frequency) {
+                    // squelch has just opened on a new frequency - we might need to update outputs' metadata
+                    struct timeval now;
+                    gettimeofday(&now, NULL);
+                    tag_queue_put(dev, freq_index, now);
+                    dev->last_frequency = freq_index;
+                }
+            }
+            quiet_polls = 0;
+            continue;
+        }
+
+        // No signal: keep counting until the quiet period is long enough to move on.
+        if (quiet_polls < 10) {
+            quiet_polls++;
+            continue;
+        }
+
+        // Advance to the next list entry (wrapping around) and retune the input.
+        freq_index = (freq_index + 1) % dev->channels[0].freq_count;
+        dev->channels[0].freq_idx = freq_index;
+        const int new_centerfreq = dev->channels[0].freqlist[freq_index].frequency + 20 * (double)(dev->input->sample_rate / fft_size);
+        if (input_set_centerfreq(dev->input, new_centerfreq) < 0)
+            break;
+    }
+    return 0;
+}
+
+/*
+ * Complex multiplication: (ar + j*aj) * (br + j*bj).
+ * The real part of the product is stored in *cr, the imaginary part in *cj.
+ */
+void multiply(float ar, float aj, float br, float bj, float* cr, float* cj) {
+    const float product_re = ar * br - aj * bj;
+    const float product_im = aj * br + ar * bj;
+    *cr = product_re;
+    *cj = product_im;
+}
+/*
+ * Lighting ("belysning") control thread: drives the gpio26line output.
+ *
+ * Each loop iteration waits up to one second on mutex_sem for a new status word
+ * from demodulate() (status_data: bit 0 = squelch, bit 1 = audio), acknowledges
+ * it through spooler_sem and then updates a small timer state machine:
+ *   - activation: squelch is open and both the squelch start and the last audio
+ *     timestamp are more than OPEN_TIME seconds old -> output on for START_ON_TIME;
+ *   - prolong: a status update arrives while active and within 10 seconds of the
+ *     stop time -> the stop time is moved to now + PROLONG_TIME;
+ *   - stop: the stop time has passed -> output off.
+ *
+ * The unnamed argument is unused. Runs until do_exit is set; always returns 0.
+ *
+ * NOTE(review): the timestamps recorded while handling a status update come from
+ * the gettimeofday() call of the previous iteration, so they can lag by up to the
+ * one second wait. This ordering is kept as it was.
+ */
+void* belysning_control_thread(void*) {
+    struct timespec ts;
+    struct timeval tv;
+    long int stop_time;
+    long int sq_start_time = 0;
+    long int audio_start_time = 0;
+    bool sq_status = false;
+    bool active = false;
+
+    gettimeofday(&tv, 0);
+    stop_time = tv.tv_sec + 1;
+    while (!do_exit) {
+        bool activate = false;
+        bool prolong = false;
+
+        // sem_timedwait() takes an absolute CLOCK_REALTIME deadline.
+        if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
+            perror("clock_gettime");
+            // Never pass an uninitialised deadline: fall back to gettimeofday().
+            struct timeval now;
+            gettimeofday(&now, 0);
+            ts.tv_sec = now.tv_sec;
+            ts.tv_nsec = now.tv_usec * 1000;
+        }
+        ts.tv_sec += 1;
+
+        if (sem_timedwait(&mutex_sem, &ts) == 0) {
+            // A new status word is available; the producer is blocked until we post spooler_sem.
+            const int squelch_open = status_data & 1;
+            const int audio_present = status_data >> 1;
+
+            if (!sq_status && squelch_open) {
+                sq_start_time = tv.tv_sec;
+                sq_status = true;
+            }
+            if (sq_status && !squelch_open) {
+                sq_status = false;
+            }
+            printf("ST %d, %d ", squelch_open, audio_present);
+            if (audio_present) {
+                audio_start_time = tv.tv_sec;
+            }
+            prolong = true;
+            sem_post(&spooler_sem);
+        }
+        printf("DATA %d, %ld, %ld, %ld %d\n", (int)sq_status, sq_start_time, audio_start_time, (long)tv.tv_sec, (int)prolong);
+        if (sq_status && sq_start_time + OPEN_TIME < tv.tv_sec && audio_start_time + OPEN_TIME < tv.tv_sec) {
+            activate = true;
+        }
+        gettimeofday(&tv, 0);
+
+        // First time activation
+        if (!active && activate) {
+            stop_time = tv.tv_sec + START_ON_TIME;
+            active = true;
+            printf("start\n");
+        }
+        // Prolong the on-period when close to the stop time
+        if (active && prolong) {
+            if (tv.tv_sec + 10 >= stop_time) {
+                stop_time = tv.tv_sec + PROLONG_TIME;
+                printf("prolong\n");
+            }
+        }
+        if (stop_time <= tv.tv_sec && active) {
+            active = false;
+            printf("Stop\n");
+        }
+        gpiod_line_set_value(gpio26line, active ? 1 : 0);
+    }
+    return 0;
+}
+#ifdef NFM
+/*
+ * Fast approximation of atan2(y, x) using a first-order rational expression per
+ * half-plane. Returns 0 for the origin; the result carries the sign of y.
+ */
+float fast_atan2(float y, float x) {
+    const float pi4 = M_PI_4;
+    const float pi34 = 3 * M_PI_4;
+
+    if (x == 0.0f && y == 0.0f) {
+        return 0;
+    }
+
+    const float yabs = (y < 0.0f) ? -y : y;
+    // Angle for the upper half-plane; mirrored below for negative y.
+    const float angle = (x >= 0.0f) ? pi4 - pi4 * (x - yabs) / (x + yabs)
+                                    : pi34 - pi4 * (x + yabs) / (yabs - x);
+    return (y < 0.0f) ? -angle : angle;
+}
+
+/*
+ * Phase discriminator: multiplies (ar + j*aj) by the complex conjugate of
+ * (br + j*bj) and returns the fast_atan2() angle of the product scaled by 1/pi.
+ */
+float polar_disc_fast(float ar, float aj, float br, float bj) {
+    float prod_re, prod_im;
+    multiply(ar, aj, br, -bj, &prod_re, &prod_im);
+    const float phase = fast_atan2(prod_im, prod_re);
+    return (float)(phase * M_1_PI);
+}
+
+/*
+ * Quadri-correlator FM discriminator: cross term of the two samples divided by
+ * the power of (ar, aj) plus one, scaled by 1/pi.
+ */
+float fm_quadri_demod(float ar, float aj, float br, float bj) {
+    const float cross = br * aj - ar * bj;
+    // The +1.0f keeps the divisor away from zero.
+    const float power = ar * ar + aj * aj + 1.0f;
+    return (float)(cross / power * M_1_PI);
+}
+
+#endif /* NFM */
+
+/*
+ * Automatic frequency control helper for one channel of one device.
+ *
+ * An instance is created before a batch of samples is processed (capturing the
+ * channel's signal indicator at that moment) and finalize() is called afterwards.
+ * When the indicator changed from "no signal" to "signal", finalize() looks for a
+ * neighbouring FFT bin with more power than the channel's base bin and switches
+ * dev->bins[index] to it; when the signal disappears the bin is reset to the base bin.
+ */
+class AFC {
+    // Value of channel->axcindicate at construction time.
+    const status _prev_axcindicate;
+
+    // Squared magnitude of FFT output bin `index` (one overload per FFT backend).
+#ifdef WITH_BCM_VC
+    float square(const GPU_FFT_COMPLEX* fft_results, size_t index) {
+        return fft_results[index].re * fft_results[index].re + fft_results[index].im * fft_results[index].im;
+    }
+#else
+    float square(const fftwf_complex* fft_results, size_t index) {
+        return fft_results[index][0] * fft_results[index][0] + fft_results[index][1] * fft_results[index][1];
+    }
+#endif /* WITH_BCM_VC */
+
+    /*
+     * Walk from bin `base` in direction STEP (-1 or +1) for as long as the next
+     * bin is stronger than base_value by a sufficient margin, and return the last
+     * bin reached (== base if no step was taken).
+     *
+     * The margin is set on the first step to (excess over base_value) / afc and
+     * grows by 10% with every further accepted step. The walk also stops at the
+     * edges of the FFT output ([0, fft_size)).
+     */
+    template <class FFT_RESULTS, int STEP>
+    size_t check(const FFT_RESULTS* fft_results, const size_t base, const float base_value, unsigned char afc) {
+        float threshold = 0;
+        size_t bin;
+        for (bin = base;; bin += STEP) {
+            if (STEP < 0) {
+                // stepping down: stop before the index would go below zero
+                if (bin < -STEP)
+                    break;
+
+            } else if ((size_t)(bin + STEP) >= fft_size)
+                break;
+
+            const float value = square(fft_results, (size_t)(bin + STEP));
+            if (value <= base_value)
+                break;
+
+            if (base == (size_t)bin) {
+                // first step: derive the initial margin from the observed excess
+                threshold = (value - base_value) / (float)afc;
+            } else {
+                if ((value - base_value) < threshold)
+                    break;
+
+                threshold += threshold / 10.0;
+            }
+        }
+        return bin;
+    }
+
+   public:
+    // Remember the signal indicator of channel `index` before processing starts.
+    AFC(device_t* dev, int index) : _prev_axcindicate(dev->channels[index].axcindicate) {}
+
+    /*
+     * Update dev->bins[index] after a batch has been processed.
+     * Does nothing when the channel's afc setting is 0.
+     * On a no-signal -> signal transition the lower bins are searched first and
+     * the upper bins only if the downward search did not move; if the chosen bin
+     * differs from the current one, it is stored and axcindicate is set to AFC_UP
+     * or AFC_DOWN accordingly. On a signal -> no-signal transition the bin is
+     * reset to dev->base_bins[index].
+     */
+    template <class FFT_RESULTS>
+    void finalize(device_t* dev, int index, const FFT_RESULTS* fft_results) {
+        channel_t* channel = &dev->channels[index];
+        if (channel->afc == 0)
+            return;
+
+        const char axcindicate = channel->axcindicate;
+        if (axcindicate != NO_SIGNAL && _prev_axcindicate == NO_SIGNAL) {
+            const size_t base = dev->base_bins[index];
+            const float base_value = square(fft_results, base);
+            size_t bin = check<FFT_RESULTS, -1>(fft_results, base, base_value, channel->afc);
+            if (bin == base)
+                bin = check<FFT_RESULTS, 1>(fft_results, base, base_value, channel->afc);
+
+            if (dev->bins[index] != bin) {
+#ifdef AFC_LOGGING
+                log(LOG_INFO, "AFC device=%d channel=%d: base=%zu prev=%zu now=%zu\n", dev->device, index, base, dev->bins[index], bin);
+#endif /* AFC_LOGGING */
+                dev->bins[index] = bin;
+                if (bin > base)
+                    channel->axcindicate = AFC_UP;
+                else if (bin < base)
+                    channel->axcindicate = AFC_DOWN;
+            }
+        } else if (axcindicate == NO_SIGNAL && _prev_axcindicate != NO_SIGNAL)
+            dev->bins[index] = dev->base_bins[index];
+    }
+};
+
+/*
+ * Fill in the parameters of one demodulator thread: the device range
+ * [device_start, device_end) it serves and the Signal it raises when output is
+ * ready. Unless built WITH_BCM_VC, the FFTW input/output buffers and the forward
+ * plan of length fft_size are allocated here as well.
+ */
+void init_demod(demod_params_t* params, Signal* signal, int device_start, int device_end) {
+    assert(params != NULL);
+    assert(signal != NULL);
+
+    params->device_start = device_start;
+    params->device_end = device_end;
+    params->mp3_signal = signal;
+
+#ifndef WITH_BCM_VC
+    fftwf_complex* const fft_input = fftwf_alloc_complex(fft_size);
+    fftwf_complex* const fft_output = fftwf_alloc_complex(fft_size);
+    params->fftin = fft_input;
+    params->fftout = fft_output;
+    params->fft = fftwf_plan_dft_1d(fft_size, fft_input, fft_output, FFTW_FORWARD, FFTW_MEASURE);
+#endif /* WITH_BCM_VC */
+}
+
+/*
+ * Fill in the parameters of one output thread: a freshly allocated Signal plus
+ * the device and mixer index ranges the thread is responsible for.
+ */
+void init_output(output_params_t* params, int device_start, int device_end, int mixer_start, int mixer_end) {
+    assert(params != NULL);
+
+    Signal* const wakeup_signal = new Signal;
+    params->mp3_signal = wakeup_signal;
+
+    params->mixer_start = mixer_start;
+    params->mixer_end = mixer_end;
+    params->device_start = device_start;
+    params->device_end = device_end;
+}
+
+/*
+ * Round-robin successor of device index `current` within the thread's range:
+ * current + 1 while that is below device_end, otherwise device_start.
+ */
+int next_device(demod_params_t* params, int current) {
+    const int candidate = current + 1;
+    return (candidate < params->device_end) ? candidate : params->device_start;
+}
+
+/*
+ * Hand the combined squelch/audio status word to the lighting thread if it has
+ * changed since the last hand-off.
+ *
+ * The word (bit 0 = squelch_status, bit 1 = audio_status) is stored in
+ * status_data under lock_data, mutex_sem is posted to wake the consumer, and the
+ * caller then blocks on spooler_sem until the consumer has acknowledged it.
+ * A sem_wait() interrupted by a signal is retried, unless shutdown has been
+ * requested in the meantime (the consumer may already have left its loop).
+ * Any other semaphore failure terminates the process.
+ */
+static void publish_status_change() {
+    const int new_status = squelch_status + (audio_status << 1);
+    if (status_data_old == new_status)
+        return;
+
+    pthread_mutex_lock(&lock_data);
+    status_data = new_status;
+    if (sem_post(&mutex_sem) == -1) {
+        perror("sem_post: mutex_sem");
+        exit(1);
+    }
+    while (sem_wait(&spooler_sem) == -1) {
+        if (errno != EINTR) {
+            perror("sem_wait: spooler_sem");
+            exit(1);
+        }
+        if (do_exit)
+            break;
+    }
+    status_data_old = status_data;
+    pthread_mutex_unlock(&lock_data);
+}
+
+/*
+ * Demodulator thread entry point.
+ *
+ * params is a demod_params_t* prepared by init_demod(). The thread cycles over
+ * the devices in [device_start, device_end). For a device with enough buffered
+ * input it windows the samples, runs the FFT, extracts each channel's bin
+ * (magnitude and, if needed, raw I/Q) and, once WAVE_BATCH + AGC_EXTRA samples
+ * have accumulated, runs squelch, AM/NFM demodulation, filtering and AFC per
+ * channel and signals the output thread through mp3_signal.
+ *
+ * In addition the AM branch and the tui branch drive the gpio20line/gpio16line
+ * outputs and publish the audio/squelch status to the lighting thread.
+ *
+ * Returns NULL once do_exit is set (also set here when all receivers failed).
+ */
+void* demodulate(void* params) {
+    assert(params != NULL);
+    demod_params_t* demod_params = (demod_params_t*)params;
+
+    debug_print("Starting demod thread, devices %d:%d, signal %p\n", demod_params->device_start, demod_params->device_end, demod_params->mp3_signal);
+
+    // initialize fft engine
+#ifdef WITH_BCM_VC
+    int mb = mbox_open();
+    struct GPU_FFT* fft;
+    int ret = gpu_fft_prepare(mb, fft_size_log, GPU_FFT_FWD, FFT_BATCH, &fft);
+    switch (ret) {
+        case -1:
+            log(LOG_CRIT, "Unable to enable V3D. Please check your firmware is up to date.\n");
+            error();
+            break;
+        case -2:
+            log(LOG_CRIT, "log2_N=%d not supported. Try between 8 and 17.\n", fft_size_log);
+            error();
+            break;
+        case -3:
+            log(LOG_CRIT, "Out of memory. Try a smaller batch or increase GPU memory.\n");
+            error();
+            break;
+    }
+#else
+    fftwf_complex* fftin = demod_params->fftin;
+    fftwf_complex* fftout = demod_params->fftout;
+#endif /* WITH_BCM_VC */
+
+    // Lookup tables converting raw 8-bit samples (indexed by the raw byte) to floats.
+    float ALIGNED32 levels_u8[256], levels_s8[256];
+    float* levels_ptr = NULL;
+
+    for (int i = 0; i < 256; i++) {
+        levels_u8[i] = (i - 127.5f) / 127.5f;
+    }
+    // Cover the full signed range, including -128 (raw byte 128), so that no
+    // table entry is left uninitialised.
+    for (int16_t i = -128; i < 128; i++) {
+        levels_s8[(uint8_t)i] = i / 128.0f;
+    }
+
+    // initialize fft window
+    // blackman 7
+    // the whole matrix is computed
+#ifdef WITH_BCM_VC
+    float ALIGNED32 window[fft_size * 2];
+#else
+    float ALIGNED32 window[fft_size];
+#endif /* WITH_BCM_VC */
+
+    const double a0 = 0.27105140069342f;
+    const double a1 = 0.43329793923448f;
+    const double a2 = 0.21812299954311f;
+    const double a3 = 0.06592544638803f;
+    const double a4 = 0.01081174209837f;
+    const double a5 = 0.00077658482522f;
+    const double a6 = 0.00001388721735f;
+
+    for (size_t i = 0; i < fft_size; i++) {
+        double x = a0 - (a1 * cos((2.0 * M_PI * i) / (fft_size - 1))) + (a2 * cos((4.0 * M_PI * i) / (fft_size - 1))) - (a3 * cos((6.0 * M_PI * i) / (fft_size - 1))) +
+                   (a4 * cos((8.0 * M_PI * i) / (fft_size - 1))) - (a5 * cos((10.0 * M_PI * i) / (fft_size - 1))) + (a6 * cos((12.0 * M_PI * i) / (fft_size - 1)));
+#ifdef WITH_BCM_VC
+        window[i * 2] = window[i * 2 + 1] = (float)x;
+#else
+        window[i] = (float)x;
+#endif /* WITH_BCM_VC */
+    }
+
+#ifdef DEBUG
+    struct timeval ts, te;
+    gettimeofday(&ts, NULL);
+#endif /* DEBUG */
+    size_t available;
+    int device_num = demod_params->device_start;
+    while (true) {
+        if (do_exit) {
+#ifdef WITH_BCM_VC
+            log(LOG_INFO, "Freeing GPU memory\n");
+            gpu_fft_release(fft);
+#endif /* WITH_BCM_VC */
+            return NULL;
+        }
+
+        device_t* dev = devices + device_num;
+
+        // Number of unread bytes in the device's circular input buffer.
+        pthread_mutex_lock(&dev->input->buffer_lock);
+        if (dev->input->bufe >= dev->input->bufs)
+            available = dev->input->bufe - dev->input->bufs;
+        else
+            available = dev->input->buf_size - dev->input->bufs + dev->input->bufe;
+        pthread_mutex_unlock(&dev->input->buffer_lock);
+
+        if (devices_running == 0) {
+            log(LOG_ERR, "All receivers failed, exiting\n");
+            do_exit = 1;
+            continue;
+        }
+
+        if (dev->input->state != INPUT_RUNNING) {
+            if (dev->input->state == INPUT_FAILED) {
+                dev->input->state = INPUT_DISABLED;
+                disable_device_outputs(dev);
+                devices_running--;
+            }
+            device_num = next_device(demod_params, device_num);
+            continue;
+        }
+
+        // number of input bytes per output wave sample (x 2 for I and Q)
+        size_t bps = 2 * dev->input->bytes_per_sample * (size_t)round((double)dev->input->sample_rate / (double)WAVE_RATE);
+        if (available < bps * FFT_BATCH + fft_size * dev->input->bytes_per_sample * 2) {
+            // move to next device
+            device_num = next_device(demod_params, device_num);
+            SLEEP(10);
+            continue;
+        }
+
+        // Convert the raw samples to windowed floats in the FFT input buffer.
+        if (dev->input->sfmt == SFMT_S16) {
+            float const scale = 1.0f / dev->input->fullscale;
+#ifdef WITH_BCM_VC
+            struct GPU_FFT_COMPLEX* ptr = fft->in;
+            for (size_t b = 0; b < FFT_BATCH; b++, ptr += fft->step) {
+                short* buf2 = (short*)(dev->input->buffer + dev->input->bufs + b * bps);
+                for (size_t i = 0; i < fft_size; i++, buf2 += 2) {
+                    ptr[i].re = scale * (float)buf2[0] * window[i * 2];
+                    ptr[i].im = scale * (float)buf2[1] * window[i * 2];
+                }
+            }
+#else
+            short* buf2 = (short*)(dev->input->buffer + dev->input->bufs);
+            for (size_t i = 0; i < fft_size; i++, buf2 += 2) {
+                fftin[i][0] = scale * (float)buf2[0] * window[i];
+                fftin[i][1] = scale * (float)buf2[1] * window[i];
+            }
+#endif /* WITH_BCM_VC */
+        } else if (dev->input->sfmt == SFMT_F32) {
+            float const scale = 1.0f / dev->input->fullscale;
+#ifdef WITH_BCM_VC
+            struct GPU_FFT_COMPLEX* ptr = fft->in;
+            for (size_t b = 0; b < FFT_BATCH; b++, ptr += fft->step) {
+                float* buf2 = (float*)(dev->input->buffer + dev->input->bufs + b * bps);
+                for (size_t i = 0; i < fft_size; i++, buf2 += 2) {
+                    ptr[i].re = scale * buf2[0] * window[i * 2];
+                    ptr[i].im = scale * buf2[1] * window[i * 2];
+                }
+            }
+#else  // WITH_BCM_VC
+            float* buf2 = (float*)(dev->input->buffer + dev->input->bufs);
+            for (size_t i = 0; i < fft_size; i++, buf2 += 2) {
+                fftin[i][0] = scale * buf2[0] * window[i];
+                fftin[i][1] = scale * buf2[1] * window[i];
+            }
+#endif /* WITH_BCM_VC */
+
+        } else {  // S8 or U8
+            levels_ptr = (dev->input->sfmt == SFMT_U8 ? levels_u8 : levels_s8);
+
+#ifdef WITH_BCM_VC
+            sample_fft_arg sfa = {fft_size / 4, fft->in};
+            for (size_t i = 0; i < FFT_BATCH; i++) {
+                samplefft(&sfa, dev->input->buffer + dev->input->bufs + i * bps, window, levels_ptr);
+                sfa.dest += fft->step;
+            }
+#else
+            unsigned char* buf2 = dev->input->buffer + dev->input->bufs;
+            for (size_t i = 0; i < fft_size; i++, buf2 += 2) {
+                fftin[i][0] = levels_ptr[buf2[0]] * window[i];
+                fftin[i][1] = levels_ptr[buf2[1]] * window[i];
+            }
+#endif /* WITH_BCM_VC */
+        }
+
+#ifdef WITH_BCM_VC
+        gpu_fft_execute(fft);
+#else
+        fftwf_execute(demod_params->fft);
+#endif /* WITH_BCM_VC */
+
+        // Pick each channel's bin out of the FFT result: magnitude into wavein,
+        // raw I/Q into iq_in for channels that need it.
+#ifdef WITH_BCM_VC
+        for (int i = 0; i < dev->channel_count; i++) {
+            float* wavein = dev->channels[i].wavein + dev->waveend;
+            __builtin_prefetch(wavein, 1);
+            const int bin = dev->bins[i];
+            const GPU_FFT_COMPLEX* fftout = fft->out + bin;
+            for (int j = 0; j < FFT_BATCH; j++, ++wavein, fftout += fft->step)
+                *wavein = sqrtf(fftout->im * fftout->im + fftout->re * fftout->re);
+        }
+        for (int j = 0; j < dev->channel_count; j++) {
+            if (dev->channels[j].needs_raw_iq) {
+                struct GPU_FFT_COMPLEX* ptr = fft->out;
+                for (int job = 0; job < FFT_BATCH; job++) {
+                    dev->channels[j].iq_in[2 * (dev->waveend + job)] = ptr[dev->bins[j]].re;
+                    dev->channels[j].iq_in[2 * (dev->waveend + job) + 1] = ptr[dev->bins[j]].im;
+                    ptr += fft->step;
+                }
+            }
+        }
+#else
+        for (int j = 0; j < dev->channel_count; j++) {
+            dev->channels[j].wavein[dev->waveend] = sqrtf(fftout[dev->bins[j]][0] * fftout[dev->bins[j]][0] + fftout[dev->bins[j]][1] * fftout[dev->bins[j]][1]);
+            if (dev->channels[j].needs_raw_iq) {
+                dev->channels[j].iq_in[2 * dev->waveend] = fftout[dev->bins[j]][0];
+                dev->channels[j].iq_in[2 * dev->waveend + 1] = fftout[dev->bins[j]][1];
+            }
+        }
+#endif /* WITH_BCM_VC */
+
+        dev->waveend += FFT_BATCH;
+
+        if (dev->waveend >= WAVE_BATCH + AGC_EXTRA) {
+            for (int i = 0; i < dev->channel_count; i++) {
+                AFC afc(dev, i);
+                channel_t* channel = dev->channels + i;
+                freq_t* fparms = channel->freqlist + channel->freq_idx;
+
+                // set to NO_SIGNAL, will be updated to SIGNAL based on squelch below
+                channel->axcindicate = NO_SIGNAL;
+
+                for (int j = AGC_EXTRA; j < WAVE_BATCH + AGC_EXTRA; j++) {
+                    float& real = channel->iq_in[2 * (j - AGC_EXTRA)];
+                    float& imag = channel->iq_in[2 * (j - AGC_EXTRA) + 1];
+
+                    fparms->squelch.process_raw_sample(channel->wavein[j]);
+
+                    // If squelch is open / opening and using I/Q, then cleanup the signal and possibly update squelch.
+                    if (fparms->squelch.should_filter_sample() && channel->needs_raw_iq) {
+                        // remove phase rotation introduced by FFT sliding window
+                        float swf, cwf, re_tmp, im_tmp;
+                        sincosf_lut(channel->dm_phi, &swf, &cwf);
+                        multiply(real, imag, cwf, -swf, &re_tmp, &im_tmp);
+                        channel->dm_phi += channel->dm_dphi;
+                        channel->dm_phi &= 0xffffff;
+
+                        // apply lowpass filter, will be a no-op if not configured
+                        fparms->lowpass_filter.apply(re_tmp, im_tmp);
+
+                        // update I/Q and wave
+                        real = re_tmp;
+                        imag = im_tmp;
+                        channel->wavein[j] = sqrt(real * real + imag * imag);
+
+                        // update squelch post-cleanup
+                        if (fparms->lowpass_filter.enabled()) {
+                            fparms->squelch.process_filtered_sample(channel->wavein[j]);
+                        }
+                    }
+
+                    if (fparms->modulation == MOD_AM) {
+                        // if squelch is just opening then bootstrap agcavgfast with prior values of wavein
+                        if (fparms->squelch.first_open_sample()) {
+                            for (int k = j - AGC_EXTRA; k < j; k++) {
+                                if (channel->wavein[k] >= fparms->squelch.squelch_level()) {
+                                    fparms->agcavgfast = fparms->agcavgfast * 0.9f + channel->wavein[k] * 0.1f;
+                                }
+                            }
+                        }
+                        // if squelch is just closing then fade out the prior samples of waveout
+                        else if (fparms->squelch.last_open_sample()) {
+                            for (int k = j - AGC_EXTRA + 1; k < j; k++) {
+                                channel->waveout[k] = channel->waveout[k - 1] * 0.94f;
+                            }
+                        }
+                    }
+
+                    float& waveout = channel->waveout[j];
+
+                    // If squelch sees power then do modulation-specific processing
+                    if (fparms->squelch.should_process_audio()) {
+                        if (fparms->modulation == MOD_AM) {
+                            if (channel->wavein[j] > fparms->squelch.squelch_level()) {
+                                fparms->agcavgfast = fparms->agcavgfast * 0.995f + channel->wavein[j] * 0.005f;
+                            }
+
+                            waveout = (channel->wavein[j - AGC_EXTRA] - fparms->agcavgfast) / (fparms->agcavgfast * 1.5f);
+                            if (abs(waveout) > 0.8f) {
+                                waveout *= 0.85f;
+                                fparms->agcavgfast *= 1.15f;
+                            }
+
+                            // Audio indicator: both the demodulated and the input level must exceed their thresholds.
+                            if ((abs(waveout) > WAVE_OUT_LEVEL && channel->wavein[j] > WAVE_IN_LEVEL)) {
+                                gpiod_line_set_value(gpio20line, 1);
+                                audio_status = 1;
+                            } else {
+                                gpiod_line_set_value(gpio20line, 0);
+                                audio_status = 0;
+                            }
+                            publish_status_change();
+                        }
+#ifdef NFM
+                        else if (fparms->modulation == MOD_NFM) {
+                            // FM demod
+                            if (fm_demod == FM_FAST_ATAN2) {
+                                waveout = polar_disc_fast(real, imag, channel->pr, channel->pj);
+                            } else if (fm_demod == FM_QUADRI_DEMOD) {
+                                waveout = fm_quadri_demod(real, imag, channel->pr, channel->pj);
+                            }
+                            channel->pr = real;
+                            channel->pj = imag;
+
+                            // de-emphasis IIR + DC blocking
+                            fparms->agcavgfast = fparms->agcavgfast * 0.995f + waveout * 0.005f;
+                            waveout -= fparms->agcavgfast;
+                            waveout = waveout * (1.0f - channel->alpha) + channel->prev_waveout * channel->alpha;
+
+                            // save off waveout before notch and ampfactor
+                            channel->prev_waveout = waveout;
+                        }
+#endif /* NFM */
+
+                        // process audio sample for CTCSS, will be no-op if not configured
+                        fparms->squelch.process_audio_sample(waveout);
+                    }
+
+                    // If squelch is still open then save samples to output
+                    if (fparms->squelch.is_open()) {
+                        // apply the notch filter, will be a no-op if not configured
+                        fparms->notch_filter.apply(waveout);
+
+                        // apply the ampfactor
+                        waveout *= fparms->ampfactor;
+
+                        // make sure the value is between +/- 1 (requirement for libmp3lame)
+                        if (isnan(waveout)) {
+                            waveout = 0.0;
+                        } else if (waveout > 1.0) {
+                            waveout = 1.0;
+                        } else if (waveout < -1.0) {
+                            waveout = -1.0;
+                        }
+
+                        channel->axcindicate = SIGNAL;
+                        if (channel->has_iq_outputs) {
+                            channel->iq_out[2 * (j - AGC_EXTRA)] = real;
+                            channel->iq_out[2 * (j - AGC_EXTRA) + 1] = imag;
+                        }
+
+                        // Squelch is closed
+                    } else {
+                        waveout = 0;
+                        if (channel->has_iq_outputs) {
+                            channel->iq_out[2 * (j - AGC_EXTRA)] = 0;
+                            channel->iq_out[2 * (j - AGC_EXTRA) + 1] = 0;
+                        }
+                    }
+                }
+                // Shift the unprocessed tail of the input to the front of the buffers.
+                memmove(channel->wavein, channel->wavein + WAVE_BATCH, (dev->waveend - WAVE_BATCH) * sizeof(float));
+                if (channel->needs_raw_iq) {
+                    memmove(channel->iq_in, channel->iq_in + 2 * WAVE_BATCH, (dev->waveend - WAVE_BATCH) * sizeof(float) * 2);
+                }
+
+#ifdef WITH_BCM_VC
+                afc.finalize(dev, i, fft->out);
+#else
+                afc.finalize(dev, i, demod_params->fftout);
+#endif /* WITH_BCM_VC */
+
+                if (tui) {
+                    char symbol = fparms->squelch.signal_outside_filter() ? '~' : (char)channel->axcindicate;
+                    if (dev->mode == R_SCAN) {
+                        //GOTOXY(0, device_num * 17 + dev->row + 3);
+                        //printf("%4.0f/%3.0f%c %7.3f ", level_to_dBFS(fparms->squelch.signal_level()), level_to_dBFS(fparms->squelch.noise_level()), symbol,
+                        //       (dev->channels[0].freqlist[channel->freq_idx].frequency / 1000000.0));
+                    } else {
+                        //GOTOXY(i * 10, device_num * 17 + dev->row + 3);
+                        //printf("%4.0f/%3.0f%c ", level_to_dBFS(fparms->squelch.signal_level()), level_to_dBFS(fparms->squelch.noise_level()), symbol);
+                    }
+                    // Squelch indicator: '*' is the symbol shown while the squelch is open.
+                    if (symbol == '*') {
+                        gpiod_line_set_value(gpio16line, 1);
+                        squelch_status = 1;
+                    } else {
+                        gpiod_line_set_value(gpio16line, 0);
+                        squelch_status = 0;
+                    }
+                    publish_status_change();
+                    fflush(stdout);
+                }
+
+                if (channel->axcindicate != NO_SIGNAL) {
+                    channel->freqlist[channel->freq_idx].active_counter++;
+                }
+            }
+            if (dev->waveavail == 1) {
+                debug_print("devices[%d]: output channel overrun\n", device_num);
+                dev->output_overrun_count++;
+            } else {
+                dev->waveavail = 1;
+            }
+            dev->waveend -= WAVE_BATCH;
+#ifdef DEBUG
+            gettimeofday(&te, NULL);
+            debug_bulk_print("waveavail %lu.%lu %lu\n", te.tv_sec, (unsigned long)te.tv_usec, (te.tv_sec - ts.tv_sec) * 1000000UL + te.tv_usec - ts.tv_usec);
+            ts.tv_sec = te.tv_sec;
+            ts.tv_usec = te.tv_usec;
+#endif /* DEBUG */
+            demod_params->mp3_signal->send();
+            dev->row++;
+            if (dev->row == 12) {
+                dev->row = 0;
+            }
+        }
+
+        // Consume the processed input and move on to the next device.
+        dev->input->bufs = (dev->input->bufs + bps * FFT_BATCH) % dev->input->buf_size;
+        device_num = next_device(demod_params, device_num);
+    }
+}
+
+/*
+ * Print the command line help to standard output and terminate the process
+ * with EXIT_SUCCESS. The -Q and -d options are only listed when the program is
+ * built with NFM and DEBUG respectively.
+ */
+void usage() {
+    cout << "Usage: rtl_airband [options] [-c <config_file_path>]\n"
+            "\t-h\t\t\tDisplay this help text\n"
+            "\t-f\t\t\tRun in foreground, display textual waterfalls\n"
+            "\t-F\t\t\tRun in foreground, do not display waterfalls (for running as a systemd service)\n";
+#ifdef NFM
+    cout << "\t-Q\t\t\tUse quadri correlator for FM demodulation (default is atan2)\n";
+#endif /* NFM */
+#ifdef DEBUG
+    cout << "\t-d <file>\t\tLog debugging information to <file> (default is " << DEBUG_PATH << ")\n";
+#endif /* DEBUG */
+    cout << "\t-e\t\t\tPrint messages to standard error (disables syslog logging)\n";
+    cout << "\t-c <config_file_path>\tUse non-default configuration file\n"
+            "\t\t\t\t(default: "
+         << CFGFILE
+         << ")\n"
+            "\t-v\t\t\tDisplay version and exit\n";
+    exit(EXIT_SUCCESS);
+}
+
+// Returns how many of the configured devices currently report the
+// INPUT_RUNNING state on their input.
+static int count_devices_running() {
+    int running = 0;
+    int idx = device_count;
+    // Visit every configured device once; the visiting order does not matter
+    // because each device is only inspected, never modified.
+    while (idx-- > 0) {
+        const bool is_running = (devices[idx].input->state == INPUT_RUNNING);
+        running += is_running ? 1 : 0;
+    }
+    return running;
+}
+
+// Claims line <offset> of <gpio_chip> as an output on behalf of <consumer>
+// and drives it low.
+// Returns the line handle, or NULL after printing the failing call with
+// perror(). The caller is responsible for closing the chip on failure.
+static struct gpiod_line* request_gpio_output_low(struct gpiod_chip* gpio_chip, unsigned int offset, const char* consumer) {
+    struct gpiod_line* line = gpiod_chip_get_line(gpio_chip, offset);
+    if (line == NULL) {
+        perror("gpiod_chip_get_line");
+        return NULL;
+    }
+    if (gpiod_line_request_output(line, consumer, 0) != 0) {
+        perror("gpiod_line_request_output");
+        return NULL;
+    }
+    if (gpiod_line_set_value(line, 0) != 0) {
+        perror("gpiod_line_set_value");
+        return NULL;
+    }
+    return line;
+}
+
+// Starts a thread running entry(arg). If the thread cannot be created the
+// failure is logged (naming the thread as <what>) and the program is aborted
+// through error(), so callers never end up joining an uninitialized handle.
+static void create_thread_or_abort(THREAD* thread, void* (*entry)(void*), void* arg, const char* what) {
+    int rc = pthread_create(thread, NULL, entry, arg);
+    if (rc != 0) {
+        // pthread_create() reports the error through its return value, not errno
+        log(LOG_CRIT, "Cannot create %s thread: %s\n", what, strerror(rc));
+        error();
+    }
+}
+
+// Program entry point.
+//
+// Sequence: set up semaphores, the status mutex and the GPIO outputs; parse
+// command line options; read the configuration file; optionally daemonize;
+// initialize all mixer and device outputs; start the inputs; start the
+// helper, output, mixer and demodulator threads; wait for the demodulator
+// threads to finish and then shut everything down.
+//
+// Returns 0 on a clean shutdown and 1 when early initialization (semaphores,
+// mutex, GPIO) fails. Most other failures terminate through error().
+int main(int argc, char* argv[]) {
+#ifdef WITH_PROFILING
+    ProfilerStart("rtl_airband.prof");
+#endif /* WITH_PROFILING */
+
+    // initialization
+
+    // mutex_sem / spooler_sem implement the hand-over of status_data done in
+    // the demodulator; lock_data protects status_data itself.
+    if (sem_init(&mutex_sem, 1, 0) == -1 || sem_init(&spooler_sem, 1, 0) == -1) {
+        perror("sem_init");
+        return 1;
+    }
+    if (pthread_mutex_init(&lock_data, NULL) != 0) {
+        printf("\n mutex init has failed\n");
+        return 1;
+    }
+    printf("SEM and locks OPEN\n");
+
+    // Claim lines 16, 20 and 26 of gpiochip0 as outputs, all initially low.
+    chip = gpiod_chip_open_by_name("gpiochip0");
+    if (!chip) {
+        perror("gpiod_chip_open_by_name");
+        return 1;
+    }
+    gpio16line = request_gpio_output_low(chip, 16, "Consumer1");
+    gpio20line = request_gpio_output_low(chip, 20, "Consumer2");
+    gpio26line = request_gpio_output_low(chip, 26, "Consumer3");
+    if (!gpio16line || !gpio20line || !gpio26line) {
+        // The reason was already printed by request_gpio_output_low()
+        gpiod_chip_close(chip);
+        return 1;
+    }
+#pragma GCC diagnostic ignored "-Wwrite-strings"
+    char* cfgfile = CFGFILE;
+    char* pidfile = PIDFILE;
+#pragma GCC diagnostic warning "-Wwrite-strings"
+
+    int opt;
+    // Base option set; build-dependent options are appended below.
+    char optstring[16] = "efFhvc:";
+
+#ifdef NFM
+    strcat(optstring, "Q");
+#endif /* NFM */
+
+#ifdef DEBUG
+    strcat(optstring, "d:");
+#endif /* DEBUG */
+
+    int foreground = 0;  // daemonize
+    int do_syslog = 1;
+
+    while ((opt = getopt(argc, argv, optstring)) != -1) {
+        switch (opt) {
+#ifdef NFM
+            case 'Q':
+                fm_demod = FM_QUADRI_DEMOD;
+                break;
+#endif /* NFM */
+
+#ifdef DEBUG
+            case 'd':
+                debug_path = strdup(optarg);
+                break;
+#endif /* DEBUG */
+
+            case 'e':
+                do_syslog = 0;
+                break;
+            case 'f':
+                foreground = 1;
+                tui = 1;
+                break;
+            case 'F':
+                foreground = 1;
+                tui = 0;
+                break;
+            case 'c':
+                cfgfile = optarg;
+                break;
+            case 'v':
+                cout << "RTLSDR-Airband version " << RTL_AIRBAND_VERSION << "\n";
+                exit(EXIT_SUCCESS);
+            case 'h':
+            default:
+                usage();
+                break;
+        }
+    }
+#ifdef DEBUG
+    if (!debug_path)
+        debug_path = strdup(DEBUG_PATH);
+    init_debug(debug_path);
+#endif /* DEBUG */
+
+    // If executing other than as root, GPU memory gets alloc'd and the
+    // 'permission denied' message on /dev/mem kills rtl_airband without
+    // releasing GPU memory.
+#ifdef WITH_BCM_VC
+    //  should probably do this check in other circumstances also.
+    if (0 != getuid()) {
+        cerr << "FFT library requires that rtl_airband be executed as root\n";
+        exit(1);
+    }
+#endif /* WITH_BCM_VC */
+
+    // read config
+    try {
+        Config config;
+        config.readFile(cfgfile);
+        Setting& root = config.getRoot();
+        if (root.exists("pidfile"))
+            pidfile = strdup(root["pidfile"]);
+        if (root.exists("fft_size")) {
+            int fsize = (int)(root["fft_size"]);
+            // fft_size_log stays 0 unless fsize is an allowed power of two
+            fft_size_log = 0;
+            for (size_t i = MIN_FFT_SIZE_LOG; i <= MAX_FFT_SIZE_LOG; i++) {
+                if (fsize == 1 << i) {
+                    fft_size = (size_t)fsize;
+                    fft_size_log = i;
+                    break;
+                }
+            }
+            if (fft_size_log == 0) {
+                cerr << "Configuration error: invalid fft_size value (must be a power of two in range " << (1 << MIN_FFT_SIZE_LOG) << "-" << (1 << MAX_FFT_SIZE_LOG) << ")\n";
+                error();
+            }
+        }
+        if (root.exists("shout_metadata_delay"))
+            shout_metadata_delay = (int)(root["shout_metadata_delay"]);
+        if (shout_metadata_delay < 0 || shout_metadata_delay > 2 * TAG_QUEUE_LEN) {
+            cerr << "Configuration error: shout_metadata_delay is out of allowed range (0-" << 2 * TAG_QUEUE_LEN << ")\n";
+            error();
+        }
+        if (root.exists("localtime") && (bool)root["localtime"] == true)
+            use_localtime = true;
+        if (root.exists("multiple_demod_threads") && (bool)root["multiple_demod_threads"] == true) {
+#ifdef WITH_BCM_VC
+            cerr << "Using multiple_demod_threads not supported with BCM VideoCore for FFT\n";
+            exit(1);
+#endif /* WITH_BCM_VC */
+
+            multiple_demod_threads = true;
+        }
+        if (root.exists("multiple_output_threads") && (bool)root["multiple_output_threads"] == true) {
+            multiple_output_threads = true;
+        }
+        if (root.exists("log_scan_activity") && (bool)root["log_scan_activity"] == true)
+            log_scan_activity = true;
+        if (root.exists("stats_filepath"))
+            stats_filepath = strdup(root["stats_filepath"]);
+#ifdef NFM
+        if (root.exists("tau"))
+            alpha = ((int)root["tau"] == 0 ? 0.0f : exp(-1.0f / (WAVE_RATE * 1e-6 * (int)root["tau"])));
+#endif /* NFM */
+
+        Setting& devs = config.lookup("devices");
+        device_count = devs.getLength();
+        if (device_count < 1) {
+            cerr << "Configuration error: no devices defined\n";
+            error();
+        }
+
+        struct sigaction sigact, pipeact;
+
+        // SIGPIPE is ignored; the termination signals all go to sighandler
+        memset(&sigact, 0, sizeof(sigact));
+        memset(&pipeact, 0, sizeof(pipeact));
+        pipeact.sa_handler = SIG_IGN;
+        sigact.sa_handler = &sighandler;
+        sigaction(SIGPIPE, &pipeact, NULL);
+        sigaction(SIGHUP, &sigact, NULL);
+        sigaction(SIGINT, &sigact, NULL);
+        sigaction(SIGQUIT, &sigact, NULL);
+        sigaction(SIGTERM, &sigact, NULL);
+
+        devices = (device_t*)XCALLOC(device_count, sizeof(device_t));
+        shout_init();
+
+        if (do_syslog) {
+            openlog("rtl_airband", LOG_PID, LOG_DAEMON);
+            log_destination = SYSLOG;
+        } else if (foreground) {
+            log_destination = STDERR;
+        } else {
+            log_destination = NONE;
+        }
+
+        if (root.exists("mixers")) {
+            Setting& mx = config.lookup("mixers");
+            mixers = (mixer_t*)XCALLOC(mx.getLength(), sizeof(struct mixer_t));
+            if ((mixer_count = parse_mixers(mx)) > 0) {
+                // shrink the array to the number of mixers actually parsed
+                mixers = (mixer_t*)XREALLOC(mixers, mixer_count * sizeof(struct mixer_t));
+            } else {
+                free(mixers);
+                mixers = NULL;  // do not leave a dangling pointer behind
+            }
+        } else {
+            mixer_count = 0;
+        }
+
+        uint32_t devs_enabled = parse_devices(devs);
+        if (devs_enabled < 1) {
+            cerr << "Configuration error: no devices defined\n";
+            error();
+        }
+        device_count = devs_enabled;
+        debug_print("mixer_count=%d\n", mixer_count);
+#ifdef DEBUG
+        for (int z = 0; z < mixer_count; z++) {
+            mixer_t* m = &mixers[z];
+            debug_print("mixer[%d]: name=%s, input_count=%d, output_count=%d\n", z, m->name, m->input_count, m->channel.output_count);
+        }
+#endif /* DEBUG */
+    } catch (const FileIOException& e) {
+        cerr << "Cannot read configuration file " << cfgfile << "\n";
+        error();
+    } catch (const ParseException& e) {
+        cerr << "Error while parsing configuration file " << cfgfile << " line " << e.getLine() << ": " << e.getError() << "\n";
+        error();
+    } catch (const SettingNotFoundException& e) {
+        cerr << "Configuration error: mandatory parameter missing: " << e.getPath() << "\n";
+        error();
+    } catch (const SettingTypeException& e) {
+        cerr << "Configuration error: invalid parameter type: " << e.getPath() << "\n";
+        error();
+    } catch (const ConfigException& e) {
+        cerr << "Unhandled config exception\n";
+        error();
+    }
+
+    log(LOG_INFO, "RTLSDR-Airband version %s starting\n", RTL_AIRBAND_VERSION);
+
+    if (!foreground) {
+        // Daemonize with a double fork: the original process waits for the
+        // first child, the first child exits right after the second fork and
+        // the grandchild carries on as the daemon.
+        int pid1, pid2;
+        if ((pid1 = fork()) == -1) {
+            cerr << "Cannot fork child process: " << strerror(errno) << "\n";
+            error();
+        }
+        if (pid1) {
+            waitpid(-1, NULL, 0);
+            return (0);
+        } else {
+            if ((pid2 = fork()) == -1) {
+                cerr << "Cannot fork child process: " << strerror(errno) << "\n";
+                error();
+            }
+            if (pid2) {
+                return (0);
+            } else {
+                // Redirect stdin, stdout and stderr to /dev/null
+                int nullfd, dupfd;
+                if ((nullfd = open("/dev/null", O_RDWR)) == -1) {
+                    log(LOG_CRIT, "Cannot open /dev/null: %s\n", strerror(errno));
+                    error();
+                }
+                for (dupfd = 0; dupfd <= 2; dupfd++) {
+                    if (dup2(nullfd, dupfd) == -1) {
+                        log(LOG_CRIT, "dup2(): %s\n", strerror(errno));
+                        error();
+                    }
+                }
+                if (nullfd > 2)
+                    close(nullfd);
+                FILE* f = fopen(pidfile, "w");
+                if (f == NULL) {
+                    log(LOG_WARNING, "Cannot write pidfile: %s\n", strerror(errno));
+                } else {
+                    fprintf(f, "%ld\n", (long)getpid());
+                    fclose(f);
+                }
+            }
+        }
+    }
+
+    // Initialize the outputs attached to mixers
+    for (int i = 0; i < mixer_count; i++) {
+        if (mixers[i].enabled == false) {
+            continue;  // no inputs connected = no need to initialize output
+        }
+        channel_t* channel = &mixers[i].channel;
+        if (channel->need_mp3) {
+            channel->lame = airlame_init(mixers[i].channel.mode, mixers[i].channel.highpass, mixers[i].channel.lowpass);
+            channel->lamebuf = (unsigned char*)malloc(sizeof(unsigned char) * LAMEBUF_SIZE);
+            if (channel->lamebuf == NULL) {
+                cerr << "Cannot allocate MP3 buffer for mixer " << i << " - aborting\n";
+                error();
+            }
+        }
+        for (int k = 0; k < channel->output_count; k++) {
+            output_t* output = channel->outputs + k;
+            if (output->type == O_ICECAST) {
+                shout_setup((icecast_data*)(output->data), channel->mode);
+            } else if (output->type == O_UDP_STREAM) {
+                udp_stream_data* sdata = (udp_stream_data*)(output->data);
+                if (!udp_stream_init(sdata, channel->mode, (size_t)WAVE_BATCH * sizeof(float))) {
+                    cerr << "Failed to initialize mixer " << i << " output " << k << " - aborting\n";
+                    error();
+                }
+#ifdef WITH_PULSEAUDIO
+            } else if (output->type == O_PULSE) {
+                pulse_init();
+                pulse_setup((pulse_data*)(output->data), channel->mode);
+#endif /* WITH_PULSEAUDIO */
+            }
+        }
+    }
+    // Initialize the outputs attached to device channels, then start each input
+    for (int i = 0; i < device_count; i++) {
+        device_t* dev = devices + i;
+        for (int j = 0; j < dev->channel_count; j++) {
+            channel_t* channel = dev->channels + j;
+
+            // If the channel has icecast or MP3 file output, we will attempt to
+            // initialize a separate LAME context for MP3 encoding.
+            if (channel->need_mp3) {
+                channel->lame = airlame_init(channel->mode, channel->highpass, channel->lowpass);
+                channel->lamebuf = (unsigned char*)malloc(sizeof(unsigned char) * LAMEBUF_SIZE);
+                if (channel->lamebuf == NULL) {
+                    cerr << "Cannot allocate MP3 buffer for device " << i << " channel " << j << " - aborting\n";
+                    error();
+                }
+            }
+            for (int k = 0; k < channel->output_count; k++) {
+                output_t* output = channel->outputs + k;
+                if (output->type == O_ICECAST) {
+                    shout_setup((icecast_data*)(output->data), channel->mode);
+                } else if (output->type == O_UDP_STREAM) {
+                    udp_stream_data* sdata = (udp_stream_data*)(output->data);
+                    if (!udp_stream_init(sdata, channel->mode, (size_t)WAVE_BATCH * sizeof(float))) {
+                        cerr << "Failed to initialize device " << i << " channel " << j << " output " << k << " - aborting\n";
+                        error();
+                    }
+#ifdef WITH_PULSEAUDIO
+                } else if (output->type == O_PULSE) {
+                    pulse_init();
+                    pulse_setup((pulse_data*)(output->data), channel->mode);
+#endif /* WITH_PULSEAUDIO */
+                }
+            }
+        }
+        if (input_init(dev->input) != 0 || dev->input->state != INPUT_INITIALIZED) {
+            if (errno != 0) {
+                cerr << "Failed to initialize input device " << i << ": " << strerror(errno) << " - aborting\n";
+            } else {
+                cerr << "Failed to initialize input device " << i << " - aborting\n";
+            }
+            error();
+        }
+        if (input_start(dev->input) != 0) {
+            cerr << "Failed to start input on device " << i << ": " << strerror(errno) << " - aborting\n";
+            error();
+        }
+        if (dev->mode == R_SCAN) {
+            // FIXME: set errno
+            if (pthread_mutex_init(&dev->tag_queue_lock, NULL) != 0) {
+                cerr << "Failed to initialize mutex - aborting\n";
+                error();
+            }
+            // FIXME: not needed when freq_count == 1?
+            create_thread_or_abort(&dev->controller_thread, &controller_thread, dev, "scan controller");
+        }
+    }
+
+    // Give the inputs some time to reach the running state
+    int timeout = 50;  // 5 seconds
+    while ((devices_running = count_devices_running()) != device_count && timeout > 0) {
+        SLEEP(100);
+        timeout--;
+    }
+    if ((devices_running = count_devices_running()) != device_count) {
+        log(LOG_ERR, "%d device(s) failed to initialize - aborting\n", device_count - devices_running);
+        error();
+    }
+    if (tui) {
+        // Clear the screen and draw the static part of the textual waterfall:
+        // one 17-row band per device with the channel frequencies on top.
+        printf("\e[1;1H\e[2J");
+
+        GOTOXY(0, 0);
+        printf("                                                                               ");
+        for (int i = 0; i < device_count; i++) {
+            GOTOXY(0, i * 17 + 1);
+            for (int j = 0; j < devices[i].channel_count; j++) {
+                printf(" %7.3f  ", devices[i].channels[j].freqlist[devices[i].channels[j].freq_idx].frequency / 1000000.0);
+            }
+            if (i != device_count - 1) {
+                GOTOXY(0, i * 17 + 16);
+                printf("-------------------------------------------------------------------------------");
+            }
+        }
+    }
+    // Helper threads; they are never joined and end with the process
+    THREAD belysning_control_check;
+    create_thread_or_abort(&belysning_control_check, &belysning_control_thread, NULL, "belysning control");
+    THREAD output_check;
+    create_thread_or_abort(&output_check, &output_check_thread, NULL, "output check");
+
+    int demod_thread_count = multiple_demod_threads ? device_count : 1;
+    demod_params_t* demod_params = (demod_params_t*)XCALLOC(demod_thread_count, sizeof(demod_params_t));
+    THREAD* demod_threads = (THREAD*)XCALLOC(demod_thread_count, sizeof(THREAD));
+
+    // With multiple output threads there is one per demod thread plus, when
+    // mixers are configured, one extra thread dedicated to the mixers.
+    int output_thread_count = 1;
+    if (multiple_output_threads) {
+        output_thread_count = demod_thread_count;
+        if (mixer_count > 0) {
+            output_thread_count++;
+        }
+    }
+    output_params_t* output_params = (output_params_t*)XCALLOC(output_thread_count, sizeof(output_params_t));
+    THREAD* output_threads = (THREAD*)XCALLOC(output_thread_count, sizeof(THREAD));
+
+    // Setup the output and demod threads
+    if (multiple_output_threads == false) {
+        init_output(&output_params[0], 0, device_count, 0, mixer_count);
+
+        if (multiple_demod_threads == false) {
+            init_demod(&demod_params[0], output_params[0].mp3_signal, 0, device_count);
+        } else {
+            for (int i = 0; i < demod_thread_count; i++) {
+                init_demod(&demod_params[i], output_params[0].mp3_signal, i, i + 1);
+            }
+        }
+    } else {
+        if (multiple_demod_threads == false) {
+            init_output(&output_params[0], 0, device_count, 0, 0);
+            init_demod(&demod_params[0], output_params[0].mp3_signal, 0, device_count);
+        } else {
+            for (int i = 0; i < device_count; i++) {
+                init_output(&output_params[i], i, i + 1, 0, 0);
+                init_demod(&demod_params[i], output_params[i].mp3_signal, i, i + 1);
+            }
+        }
+        if (mixer_count > 0) {
+            init_output(&output_params[output_thread_count - 1], 0, 0, 0, mixer_count);
+        }
+    }
+
+    // Startup the output threads
+    for (int i = 0; i < output_thread_count; i++) {
+        create_thread_or_abort(&output_threads[i], &output_thread, &output_params[i], "output");
+    }
+
+    // Startup the mixer thread (if there is one) using the signal for the last output thread
+    THREAD mixer;
+    if (mixer_count > 0) {
+        create_thread_or_abort(&mixer, &mixer_thread, output_params[output_thread_count - 1].mp3_signal, "mixer");
+    }
+
+#ifdef WITH_PULSEAUDIO
+    pulse_start();
+#endif /* WITH_PULSEAUDIO */
+
+    sincosf_lut_init();
+
+    // Startup the demod threads
+    for (int i = 0; i < demod_thread_count; i++) {
+        create_thread_or_abort(&demod_threads[i], &demodulate, &demod_params[i], "demodulator");
+    }
+    // Wait for demod threads to exit
+    for (int i = 0; i < demod_thread_count; i++) {
+        pthread_join(demod_threads[i], NULL);
+    }
+
+    log(LOG_INFO, "Cleaning up\n");
+    for (int i = 0; i < device_count; i++) {
+        if (devices[i].mode == R_SCAN)
+            pthread_join(devices[i].controller_thread, NULL);
+        if (input_stop(devices[i].input) != 0 || devices[i].input->state != INPUT_STOPPED) {
+            if (errno != 0) {
+                log(LOG_ERR, "Failed do stop device #%d: %s\n", i, strerror(errno));
+            } else {
+                log(LOG_ERR, "Failed do stop device #%d\n", i);
+            }
+        }
+    }
+    log(LOG_INFO, "Input threads closed\n");
+
+    for (int i = 0; i < device_count; i++) {
+        device_t* dev = devices + i;
+        disable_device_outputs(dev);
+    }
+
+    if (mixer_count > 0) {
+        log(LOG_INFO, "Closing mixer thread\n");
+        pthread_join(mixer, NULL);
+    }
+
+    log(LOG_INFO, "Closing output thread(s)\n");
+    for (int i = 0; i < output_thread_count; i++) {
+        // wake the output thread so that it can notice the shutdown
+        output_params[i].mp3_signal->send();
+        pthread_join(output_threads[i], NULL);
+    }
+
+    for (int i = 0; i < device_count; i++) {
+        device_t* dev = devices + i;
+        for (int j = 0; j < dev->channel_count; j++) {
+            channel_t* channel = dev->channels + j;
+            if (channel->need_mp3 && channel->lame) {
+                lame_close(channel->lame);
+            }
+        }
+    }
+
+    close_debug();
+#ifdef WITH_PROFILING
+    ProfilerStop();
+#endif /* WITH_PROFILING */
+    return 0;
+}

+ 401 - 0
src/rtl_airband.h

@@ -0,0 +1,401 @@
+/*
+ * rtl_airband.h
+ * Global declarations
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _RTL_AIRBAND_H
+#define _RTL_AIRBAND_H 1
+#include <lame/lame.h>
+#include <netinet/in.h>  // sockaddr_in
+#include <pthread.h>
+#include <shout/shout.h>
+#include <stdint.h>  // uint32_t
+#include <sys/time.h>
+#include <complex>
+#include <cstdio>
+#include <libconfig.h++>
+#include <string>
+
+#include "config.h"
+
+#ifdef WITH_BCM_VC
+#include "hello_fft/gpu_fft.h"
+#else
+#include <fftw3.h>
+#endif /* WITH_BCM_VC */
+
+#ifdef WITH_PULSEAUDIO
+#include <pulse/context.h>
+#include <pulse/stream.h>
+#endif /* WITH_PULSEAUDIO */
+
+#include "filters.h"
+#include "input-common.h"  // input_t
+#include "logging.h"
+#include "squelch.h"
+
+// Generic helpers. Macro arguments and compound macro bodies are fully
+// parenthesized so that they expand correctly inside larger expressions.
+#define ALIGNED32 __attribute__((aligned(32)))
+// Sleeps for x milliseconds
+#define SLEEP(x) usleep((x) * 1000)
+#define THREAD pthread_t
+// Moves the terminal cursor to column x, row y using an ANSI escape sequence
+#define GOTOXY(x, y) printf("%c[%d;%df", 0x1B, (y), (x))
+
+#ifndef SYSCONFDIR
+#define SYSCONFDIR "/usr/local/etc"
+#endif /* SYSCONFDIR */
+
+// Default locations of the configuration file and the pidfile
+#define CFGFILE SYSCONFDIR "/rtl_airband.conf"
+#define PIDFILE "/run/rtl_airband.pid"
+
+#define MIN_BUF_SIZE 2560000
+#define DEFAULT_SAMPLE_RATE 2560000
+
+#ifdef NFM
+#define WAVE_RATE 16000
+#else
+#define WAVE_RATE 8000
+#endif /* NFM */
+
+// Number of samples in one batch (1/8 of WAVE_RATE)
+#define WAVE_BATCH (WAVE_RATE / 8)
+#define AGC_EXTRA 100
+// Length of the per-channel waveform buffers. Parenthesized so that
+// expressions such as 2 * WAVE_LEN cover the whole sum, not just its first
+// term.
+#define WAVE_LEN (2 * WAVE_BATCH + AGC_EXTRA)
+#define MP3_RATE 8000
+#define MAX_SHOUT_QUEUELEN 32768
+#define TAG_QUEUE_LEN 16
+
+// Allowed range (as log2) and default for the fft_size setting
+#define MIN_FFT_SIZE_LOG 8
+#define DEFAULT_FFT_SIZE_LOG 9
+#define MAX_FFT_SIZE_LOG 13
+
+#define LAMEBUF_SIZE 22000  // todo: calculate
+#define MIX_DIVISOR 2
+
+#ifdef WITH_BCM_VC
+// Argument block handed to the assembly routine samplefft().
+// NOTE(review): the assembly in rtl_airband_neon.s documents fft_size_by4 at
+// byte offset 0 and dest at byte offset 4, so the order and size of these
+// fields must stay in sync with that file.
+struct sample_fft_arg {
+    size_t fft_size_by4;
+    GPU_FFT_COMPLEX* dest;
+};
+extern "C" void samplefft(sample_fft_arg* a, unsigned char* buffer, float* window, float* levels);
+
+// FFT_BATCH is the multiplier the demodulator applies when it advances the
+// input buffer position after each pass.
+#define FFT_BATCH 250
+#else
+#define FFT_BATCH 1
+#endif /* WITH_BCM_VC */
+
+//#define AFC_LOGGING
+
+// Per-channel signal indicator; every enumerator is a printable character.
+// NOTE(review): presumably these characters are what the textual waterfall
+// shows next to a channel - confirm against the display code.
+enum status { NO_SIGNAL = ' ', SIGNAL = '*', AFC_UP = '<', AFC_DOWN = '>' };
+// State flag of a mixer channel (see channel_t::state)
+enum ch_states { CH_DIRTY, CH_WORKING, CH_READY };
+// Mono or stereo mixing (see channel_t::mode)
+enum mix_modes { MM_MONO, MM_STEREO };
+// Kind of output attached to a channel. It selects the concrete type that
+// output_t::data points to, e.g. icecast_data for O_ICECAST, udp_stream_data
+// for O_UDP_STREAM and pulse_data for O_PULSE.
+enum output_type {
+    O_ICECAST,
+    O_FILE,
+    O_RAWFILE,
+    O_MIXER,
+    O_UDP_STREAM
+#ifdef WITH_PULSEAUDIO
+    ,
+    O_PULSE
+#endif /* WITH_PULSEAUDIO */
+};
+
+// Settings and connection handle of one Icecast output (output type
+// O_ICECAST). Instances are passed to shout_setup().
+struct icecast_data {
+    const char* hostname;
+    int port;
+#ifdef LIBSHOUT_HAS_TLS
+    int tls_mode;
+#endif /* LIBSHOUT_HAS_TLS */
+    const char* username;
+    const char* password;
+    const char* mountpoint;
+    const char* name;
+    const char* genre;
+    const char* description;
+    bool send_scan_freq_tags;
+    shout_t* shout;  // libshout handle
+};
+
+// Settings and runtime state of one file output.
+// NOTE(review): presumably used for both O_FILE and O_RAWFILE outputs, with
+// the "type" member telling them apart - confirm against output.cpp.
+struct file_data {
+    std::string basedir;
+    std::string basename;
+    std::string suffix;
+    std::string file_path;
+    std::string file_path_tmp;
+    bool dated_subdirectories;
+    bool continuous;
+    bool append;
+    bool split_on_transmission;
+    bool include_freq;
+    timeval open_time;
+    timeval last_write_time;
+    FILE* f;
+    enum output_type type;
+};
+
+// Settings and runtime state of one UDP stream output (output type
+// O_UDP_STREAM). Instances are set up by udp_stream_init().
+struct udp_stream_data {
+    float* stereo_buffer;
+    size_t stereo_buffer_len;
+
+    bool continuous;
+    const char* dest_address;
+    const char* dest_port;
+
+    int send_socket;
+    struct sockaddr dest_sockaddr;
+    socklen_t dest_sockaddr_len;
+};
+
+#ifdef WITH_PULSEAUDIO
+// Settings and runtime state of one PulseAudio output (output type O_PULSE).
+// Instances are passed to pulse_setup().
+struct pulse_data {
+    const char* server;
+    const char* name;
+    const char* sink;
+    const char* stream_name;
+    pa_context* context;
+    pa_stream *left, *right;
+    pa_channel_map lmap, rmap;
+    mix_modes mode;
+    bool continuous;
+};
+#endif /* WITH_PULSEAUDIO */
+
+// Links an output to one input of a mixer.
+// NOTE(review): presumably the payload of O_MIXER outputs, with "input"
+// being the index returned by mixer_connect_input() - confirm in config.cpp.
+struct mixer_data {
+    struct mixer_t* mixer;
+    int input;
+};
+
+// One output attached to a channel.
+struct output_t {
+    enum output_type type;
+    bool enabled;
+    bool active;
+    void* data;  // type-specific settings; the concrete struct is selected by "type"
+};
+
+// Entry of a device's tag queue (device_t::tag_queue): a frequency together
+// with a timestamp.
+struct freq_tag {
+    int freq;
+    struct timeval tv;
+};
+
+// Modulation of a frequency (see freq_t::modulation). MOD_NFM exists only
+// when the program is built with NFM defined.
+enum modulations {
+    MOD_AM
+#ifdef NFM
+    ,
+    MOD_NFM
+#endif /* NFM */
+};
+
+// Simple wake-up primitive between threads: one thread calls send(), another
+// one blocks in wait() until a notification is available.
+//
+// A notification is remembered in pending_ until it is consumed, so a send()
+// issued while nobody is waiting is not lost and a spurious wakeup of
+// pthread_cond_wait() does not make wait() return early. Several send()
+// calls that arrive before a wait() collapse into a single notification.
+class Signal {
+   public:
+    Signal(void) : pending_(false) {
+        pthread_cond_init(&cond_, NULL);
+        pthread_mutex_init(&mutex_, NULL);
+    }
+    // Posts a notification and wakes one waiting thread, if there is any.
+    void send(void) {
+        pthread_mutex_lock(&mutex_);
+        pending_ = true;
+        pthread_cond_signal(&cond_);
+        pthread_mutex_unlock(&mutex_);
+    }
+    // Blocks until a notification is available, then consumes it.
+    void wait(void) {
+        pthread_mutex_lock(&mutex_);
+        // Re-check the predicate after every wakeup, as required for
+        // condition variables.
+        while (!pending_) {
+            pthread_cond_wait(&cond_, &mutex_);
+        }
+        pending_ = false;
+        pthread_mutex_unlock(&mutex_);
+    }
+
+   private:
+    pthread_cond_t cond_;
+    pthread_mutex_t mutex_;
+    bool pending_;  // true while a notification has been sent but not yet consumed
+};
+
+// Per-frequency settings and state. A channel owns an array of these
+// (channel_t::freqlist) and selects the current one with channel_t::freq_idx.
+struct freq_t {
+    int frequency;     // scan frequency
+    char* label;       // frequency label
+    float agcavgfast;  // average power, for AGC
+    float ampfactor;   // multiplier to increase / decrease volume
+    Squelch squelch;
+    size_t active_counter;         // count of loops where channel has signal
+    NotchFilter notch_filter;      // notch filter - good to remove CTCSS tones
+    LowpassFilter lowpass_filter;  // lowpass filter, applied to I/Q after derotation, set at bandwidth/2 to remove out of band noise
+    enum modulations modulation;
+};
+// State of one channel: sample buffers, demodulator state, the list of
+// frequencies it can be tuned to and the outputs fed by it. The same struct
+// describes device channels (device_t::channels) and the output side of a
+// mixer (mixer_t::channel).
+struct channel_t {
+    float wavein[WAVE_LEN];      // FFT output waveform
+    float waveout[WAVE_LEN];     // waveform after squelch + AGC (left/center channel mixer output)
+    float waveout_r[WAVE_LEN];   // right channel mixer output
+    float iq_in[2 * WAVE_LEN];   // raw input samples for I/Q outputs and NFM demod
+    float iq_out[2 * WAVE_LEN];  // raw output samples for I/Q outputs (FIXME: allocate only if required)
+#ifdef NFM
+    float pr;            // previous sample - real part
+    float pj;            // previous sample - imaginary part
+    float prev_waveout;  // previous sample - waveout before notch / ampfactor
+    float alpha;
+#endif                         /* NFM */
+    uint32_t dm_dphi, dm_phi;  // derotation frequency and current phase value
+    enum mix_modes mode;       // mono or stereo
+    status axcindicate;        // current signal indicator of the channel
+    unsigned char afc;  // 0 - AFC disabled; 1 - minimal AFC; 2 - more aggressive AFC and so on to 255
+    struct freq_t* freqlist;  // array of freq_count entries
+    int freq_count;
+    int freq_idx;  // index of the current entry in freqlist
+    int need_mp3;  // nonzero when a LAME context and lamebuf must be set up for this channel
+    int needs_raw_iq;
+    int has_iq_outputs;
+    enum ch_states state;  // mixer channel state flag
+    int output_count;
+    output_t* outputs;       // array of output_count entries
+    int highpass;            // highpass filter cutoff
+    int lowpass;             // lowpass filter cutoff
+    lame_t lame;             // Context for LAME MP3 encoding if needed
+    unsigned char* lamebuf;  // Buffer used by each lame encode
+};
+
+// Operating mode of a device: fixed multichannel reception or scanning.
+enum rec_modes { R_MULTICHANNEL, R_SCAN };
+// One receiver device together with its channels and runtime state.
+struct device_t {
+    input_t* input;
+#ifdef NFM
+    float alpha;
+#endif /* NFM */
+    int channel_count;
+    size_t *base_bins, *bins;
+    channel_t* channels;  // array of channel_count entries
+    // FIXME: size_t
+    int waveend;
+    // Set to 1 by the demodulator when a batch is ready; if it is still 1
+    // when the next batch is ready, output_overrun_count is incremented.
+    int waveavail;
+    THREAD controller_thread;  // started only for R_SCAN devices
+    struct freq_tag tag_queue[TAG_QUEUE_LEN];
+    int tq_head, tq_tail;
+    int last_frequency;
+    pthread_mutex_t tag_queue_lock;  // initialized only for R_SCAN devices
+    int row;                         // cycles through 0..11, advanced by the demodulator after each batch
+    int failed;
+    enum rec_modes mode;
+    size_t output_overrun_count;
+};
+
+// State of one input of a mixer (see mixer_t::inputs).
+struct mixinput_t {
+    float* wavein;
+    float ampfactor;
+    float ampl, ampr;
+    bool ready;
+    bool has_signal;
+    pthread_mutex_t mutex;
+    size_t input_overrun_count;
+};
+
+// A mixer: a set of inputs combined into one output channel.
+struct mixer_t {
+    const char* name;
+    bool enabled;  // mixers that are not enabled get no output initialization at startup
+    int interval;
+    size_t output_overrun_count;
+    int input_count;
+    mixinput_t* inputs;
+    bool* inputs_todo;
+    bool* input_mask;
+    channel_t channel;  // output side of the mixer (outputs, MP3 encoder state)
+};
+
+// Parameters of one demodulator thread.
+// NOTE(review): device_start / device_end look like a half-open range of
+// device indexes (the callers pass pairs such as "i, i + 1" and
+// "0, device_count" to init_demod) - confirm in init_demod().
+struct demod_params_t {
+    Signal* mp3_signal;  // signalled by the demodulator after each processed batch
+    int device_start;
+    int device_end;
+
+#ifndef WITH_BCM_VC
+    fftwf_plan fft;
+    fftwf_complex* fftin;
+    fftwf_complex* fftout;
+#endif /* WITH_BCM_VC */
+};
+
+// Parameters of one output thread.
+// NOTE(review): the start / end pairs look like half-open index ranges of the
+// devices and mixers served by the thread (the callers pass values such as
+// "0, device_count, 0, mixer_count" to init_output) - confirm in init_output().
+struct output_params_t {
+    Signal* mp3_signal;
+    int device_start;
+    int device_end;
+    int mixer_start;
+    int mixer_end;
+};
+
+// Declarations of functions and variables shared between the source files.
+// Each group is headed by the name of the file that provides the definitions.
+
+// version.cpp
+extern char const* RTL_AIRBAND_VERSION;
+
+// output.cpp
+lame_t airlame_init(mix_modes mixmode, int highpass, int lowpass);
+void shout_setup(icecast_data* icecast, mix_modes mixmode);
+void disable_device_outputs(device_t* dev);
+void disable_channel_outputs(channel_t* channel);
+void* output_check_thread(void* params);
+void* output_thread(void* params);
+
+// rtl_airband.cpp
+extern bool use_localtime;
+extern bool multiple_demod_threads;
+extern bool multiple_output_threads;
+extern char* stats_filepath;
+extern size_t fft_size, fft_size_log;
+extern int device_count, mixer_count;
+extern int shout_metadata_delay;
+extern volatile int do_exit, device_opened;
+extern float alpha;
+extern device_t* devices;
+extern mixer_t* mixers;
+
+// util.cpp
+int atomic_inc(volatile int* pv);
+int atomic_dec(volatile int* pv);
+int atomic_get(volatile int* pv);
+double atofs(char* s);
+double delta_sec(const timeval* start, const timeval* stop);
+void log(int priority, const char* format, ...);
+void tag_queue_put(device_t* dev, int freq, struct timeval tv);
+void tag_queue_get(device_t* dev, struct freq_tag* tag);
+void tag_queue_advance(device_t* dev);
+void sincosf_lut_init();
+void sincosf_lut(uint32_t phi, float* sine, float* cosine);
+void* xcalloc(size_t nmemb, size_t size, const char* file, const int line, const char* func);
+void* xrealloc(void* ptr, size_t size, const char* file, const int line, const char* func);
+// Allocation wrappers which pass the file, line and function of the call site
+// to xcalloc() / xrealloc().
+#define XCALLOC(nmemb, size) xcalloc((nmemb), (size), __FILE__, __LINE__, __func__)
+#define XREALLOC(ptr, size) xrealloc((ptr), (size), __FILE__, __LINE__, __func__)
+float dBFS_to_level(const float& dBFS);
+float level_to_dBFS(const float& level);
+
+// mixer.cpp
+mixer_t* getmixerbyname(const char* name);
+int mixer_connect_input(mixer_t* mixer, float ampfactor, float balance);
+void mixer_disable_input(mixer_t* mixer, int input_idx);
+void mixer_put_samples(mixer_t* mixer, int input_idx, const float* samples, bool has_signal, unsigned int len);
+void* mixer_thread(void* params);
+const char* mixer_get_error();
+
+// config.cpp
+int parse_devices(libconfig::Setting& devs);
+int parse_mixers(libconfig::Setting& mx);
+
+// udp_stream.cpp
+bool udp_stream_init(udp_stream_data* sdata, mix_modes mode, size_t len);
+// Two overloads: a single sample buffer, or separate left and right buffers
+void udp_stream_write(udp_stream_data* sdata, const float* data, size_t len);
+void udp_stream_write(udp_stream_data* sdata, const float* data_left, const float* data_right, size_t len);
+void udp_stream_shutdown(udp_stream_data* sdata);
+
+#ifdef WITH_PULSEAUDIO
+#define PULSE_STREAM_LATENCY_LIMIT 10000000UL
+// pulse.cpp
+void pulse_init();
+int pulse_setup(pulse_data* pdata, mix_modes mixmode);
+void pulse_start();
+void pulse_shutdown(pulse_data* pdata);
+void pulse_write_stream(pulse_data* pdata, mix_modes mode, const float* data_left, const float* data_right, size_t len);
+#endif /* WITH_PULSEAUDIO */
+
+#endif /* _RTL_AIRBAND_H */

+ 83 - 0
src/rtl_airband_neon.s

@@ -0,0 +1,83 @@
+#
+# RTLSDR AM demodulator and streaming
+#
+# Copyright (c) 2014 Wong Man Hang <microtony@gmail.com>
+#
+# Updates for NEON coprocessor by Tomasz Lemiech <szpajder@gmail.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+.text
+.align  2
+.global samplefft
+.type samplefft, %function
+.fpu    neon
+
+# samplefft: for each pass, translate eight input bytes through a table of
+# 32-bit words, multiply the eight looked-up values by eight window values
+# and store the eight products to the destination.
+#
+# Register use as read from the code below:
+#   r0 - on entry, pointer to sample_fft_arg; reloaded with the dest pointer
+#   r1 - input byte pointer, advanced by eight bytes per pass
+#   r2 - window pointer, advanced by vldmia with write-back
+#   r3 - base of the lookup table, indexed by byte value shifted left by two
+#   r4 - pass counter, loaded from fft_size_by4
+samplefft:
+
+# save the core and NEON registers used as scratch below
+push {r4-r12, lr}
+vpush {d4-d15}
+
+#r0 is sample_fft_arg
+#[r0, #0] is fft_size_by4
+#[r0, #4] is dest
+ldr r4, [r0]
+ldr r0, [r0, #4]
+
+# fetch the first eight input bytes before entering the loop
+ldrb r5, [r1]
+ldrb r6, [r1, #1]
+ldrb r7, [r1, #2]
+ldrb r8, [r1, #3]
+ldrb r9, [r1, #4]
+ldrb r10, [r1, #5]
+ldrb r11, [r1, #6]
+ldrb r12, [r1, #7]
+
+.a:
+
+# replace each byte with the 32-bit table word it indexes
+ldr r5, [r3, r5, LSL #2]
+ldr r6, [r3, r6, LSL #2]
+ldr r7, [r3, r7, LSL #2]
+ldr r8, [r3, r8, LSL #2]
+ldr r9, [r3, r9, LSL #2]
+ldr r10, [r3, r10, LSL #2]
+ldr r11, [r3, r11, LSL #2]
+ldr r12, [r3, r12, LSL #2]
+# load window to NEON registers
+vldmia r2!,{d8-d11}
+add r1, r1, #8
+# move level from ARM to NEON registers
+vmov d4, r5, r6
+vmov d5, r7, r8
+vmov d6, r9, r10
+vmov d7, r11, r12
+pld [r1, #16]
+# products: q6 = q2 * q4 and q7 = q3 * q5 (level times window)
+vmul.f32 q6, q2, q4
+vmul.f32 q7, q3, q5
+pld [r2, #8]
+# fetch the bytes for the next pass; on the final pass these loads read the
+# eight bytes that follow the last consumed input and the values are unused
+ldrb r5, [r1]
+ldrb r6, [r1, #1]
+ldrb r7, [r1, #2]
+ldrb r8, [r1, #3]
+ldrb r9, [r1, #4]
+ldrb r10, [r1, #5]
+ldrb r11, [r1, #6]
+ldrb r12, [r1, #7]
+# store the eight products and advance dest
+vstmia r0!,{q6-q7}
+subs r4, r4, #1
+bne .a
+
+# restore saved registers and return by popping the saved lr into pc
+vpop {d4-d15}
+pop {r4-r12, pc}

+ 635 - 0
src/squelch.cpp

@@ -0,0 +1,635 @@
+/*
+ * squelch.cpp
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "squelch.h"
+
+#ifdef DEBUG_SQUELCH
+#include <errno.h>   // errno
+#include <string.h>  // strerror()
+#endif               /* DEBUG_SQUELCH _*/
+
+#include <stdlib.h>   // calloc()
+#include <algorithm>  // min()
+#include <cassert>    // assert()
+#include <cmath>      // pow()
+
+#include "logging.h"  // debug_print()
+
+using namespace std;
+
+/*
+ * Build a squelch in the CLOSED state with automatic (SNR based) thresholding.
+ *
+ * Defaults set here: noise floor 5.0, SNR threshold 9.54 dB, open / close delay of 197
+ * samples, low signal abort after 88 samples and a 102 entry buffer of delayed
+ * pre-filter levels.
+ */
+Squelch::Squelch(void) {
+    noise_floor_ = 5.0f;
+    set_squelch_snr_threshold(9.54f);  // depends on noise_floor_, sets using_manual_level_, normal_signal_ratio_, flappy_signal_ratio_, and moving_avg_cap_
+    manual_signal_level_ = -1.0;
+
+    pre_filter_ = {0.001f, 0.001f};
+    post_filter_ = {0.001f, 0.001f};
+
+    squelch_level_ = 0.0f;
+
+    using_post_filter_ = false;
+    pre_vs_post_factor_ = 0.9f;
+
+    open_delay_ = 197;
+    close_delay_ = 197;
+    low_signal_abort_ = 88;
+
+    next_state_ = CLOSED;
+    current_state_ = CLOSED;
+
+    delay_ = 0;
+    open_count_ = 0;
+    sample_count_ = -1;  // wraps to the largest size_t so the first processed sample is counted as 0
+    flappy_count_ = 0;
+    low_signal_count_ = 0;
+
+    recent_sample_size_ = 1000;
+    flap_opens_threshold_ = 3;
+    recent_open_count_ = 0;
+    closed_sample_count_ = 0;
+
+    buffer_size_ = 102;  // NOTE: this is specific to the 2nd order lowpass Bessel filter
+    buffer_head_ = 0;
+    buffer_tail_ = 1;
+    buffer_ = (float*)calloc(buffer_size_, sizeof(float));
+    if (buffer_ == NULL) {
+        // process_raw_sample() writes into buffer_ on every call, so stop here instead of
+        // dereferencing a NULL pointer later
+        abort();
+    }
+
+#ifdef DEBUG_SQUELCH
+    debug_file_ = NULL;
+    raw_input_ = 0.0;
+    filtered_input_ = 0.0;
+    // debug_state() writes audio_input_ before any audio sample has been seen, so it
+    // must start from a defined value like the other two inputs
+    audio_input_ = 0.0;
+#endif /* DEBUG_SQUELCH */
+
+    assert(open_delay_ > buffer_size_);
+
+    debug_print("Created Squelch, open_delay_: %d, close_delay_: %d, low_signal_abort: %d, using_manual_level_: %s\n", open_delay_, close_delay_, low_signal_abort_,
+                using_manual_level_ ? "true" : "false");
+}
+
+// Switch to a manually configured squelch level when `level` is positive.  Any other
+// value turns manual mode off and leaves the stored manual level untouched.
+void Squelch::set_squelch_level_threshold(const float& level) {
+    using_manual_level_ = (level > 0);
+    if (using_manual_level_) {
+        manual_signal_level_ = level;
+    }
+
+    // moving_avg_cap_ is derived from using_manual_level_ and manual_signal_level_
+    calculate_moving_avg_cap();
+
+    debug_print("Set level threshold, using_manual_level_: %s, manual_signal_level_: %f, moving_avg_cap_: %f\n", using_manual_level_ ? "true" : "false", manual_signal_level_, moving_avg_cap_);
+}
+
+// Use automatic squelch with a threshold of `db` decibels over the noise floor.  The dB
+// value is converted to an amplitude ratio; the "flappy" ratio is 90% of the normal one.
+void Squelch::set_squelch_snr_threshold(const float& db) {
+    const double amplitude_ratio = pow(10.0, db / 20.0);
+
+    using_manual_level_ = false;
+    normal_signal_ratio_ = amplitude_ratio;
+    flappy_signal_ratio_ = normal_signal_ratio_ * 0.9f;
+
+    // moving_avg_cap_ is derived from using_manual_level_ and normal_signal_ratio_
+    calculate_moving_avg_cap();
+
+    debug_print("SNR threshold updated, using_manual_level_: %s, normal_signal_ratio_: %f, flappy_signal_ratio_: %f, moving_avg_cap_: %f\n", using_manual_level_ ? "true" : "false",
+                normal_signal_ratio_, flappy_signal_ratio_, moving_avg_cap_);
+}
+
+// Set up both CTCSS detectors for `ctcss_freq`.  They differ only in window size: 0.4 sec
+// is required to tell between all the "standard" tones, while 0.05 sec is enough to tell
+// between tones ~20 Hz apart.  ctcss_fast_ is used until ctcss_slow_ has enough samples.
+void Squelch::set_ctcss_freq(const float& ctcss_freq, const float& sample_rate) {
+    const double fast_window = sample_rate * 0.05;
+    const double slow_window = sample_rate * 0.4;
+
+    ctcss_fast_ = CTCSS(ctcss_freq, sample_rate, fast_window);
+    ctcss_slow_ = CTCSS(ctcss_freq, sample_rate, slow_window);
+}
+
+// True when squelch is passing audio: the state must be OPEN or CLOSING and, if CTCSS is
+// enabled, the tone must be present.
+bool Squelch::is_open(void) const {
+    const bool audio_state = (current_state_ == OPEN || current_state_ == CLOSING);
+    if (!audio_state) {
+        return false;
+    }
+
+    // Without CTCSS the state alone decides
+    if (!ctcss_slow_.is_enabled()) {
+        return true;
+    }
+
+    // Prefer the slow (more accurate) detector once it has enough samples, otherwise fall
+    // back to the fast one (which reports no tone if it also lacks samples)
+    return ctcss_slow_.enough_samples() ? ctcss_slow_.has_tone() : ctcss_fast_.has_tone();
+}
+
+// Whether the caller should run its filter for this sample: there is pre-filter signal or
+// the state is anything but CLOSED, and the state is not LOW_SIGNAL_ABORT.
+bool Squelch::should_filter_sample(void) {
+    const bool signal_or_active = has_pre_filter_signal() || current_state_ != CLOSED;
+    return signal_or_active && current_state_ != LOW_SIGNAL_ABORT;
+}
+
+// Audio is processed only while the state is OPEN or CLOSING.
+bool Squelch::should_process_audio(void) {
+    if (current_state_ == OPEN) {
+        return true;
+    }
+    return current_state_ == CLOSING;
+}
+
+// True on the sample where the pending state is OPEN but the current state is not yet OPEN.
+bool Squelch::first_open_sample(void) const {
+    const bool about_to_open = (next_state_ == OPEN);
+    const bool already_open = (current_state_ == OPEN);
+    return !already_open && about_to_open;
+}
+
+// True on the sample where squelch is about to stop passing audio: either CLOSING is about
+// to become CLOSED, or any other state is about to become LOW_SIGNAL_ABORT.
+bool Squelch::last_open_sample(void) const {
+    const bool closing_finishes = (current_state_ == CLOSING && next_state_ == CLOSED);
+    const bool abort_begins = (current_state_ != LOW_SIGNAL_ABORT && next_state_ == LOW_SIGNAL_ABORT);
+    return closing_finishes || abort_begins;
+}
+
+// True when filtered samples are in use and the pre-filter level shows signal while the
+// post-filter level does not.
+bool Squelch::signal_outside_filter(void) {
+    if (!using_post_filter_) {
+        return false;
+    }
+    if (!has_pre_filter_signal()) {
+        return false;
+    }
+    return !has_post_filter_signal();
+}
+
+// Current noise floor estimate (noise_floor_), returned by reference to the member.
+const float& Squelch::noise_level(void) const {
+    return noise_floor_;
+}
+
+// Current signal level: the uncapped pre-filter moving average (pre_filter_.full_).
+const float& Squelch::signal_level(void) const {
+    return pre_filter_.full_;
+}
+
+// Threshold a level must reach to count as signal.  In manual mode this is the configured
+// level; otherwise it is a ratio times the noise floor, cached in squelch_level_ until
+// something resets the cache to 0.
+const float& Squelch::squelch_level(void) {
+    if (using_manual_level_) {
+        return manual_signal_level_;
+    }
+
+    // A cached value of exactly 0 means "recalculate"
+    if (squelch_level_ == 0.0f) {
+        // The flappy ratio only applies while flapping and only if it really is the lower one
+        const bool use_flappy_ratio = currently_flapping() && flappy_signal_ratio_ < normal_signal_ratio_;
+        squelch_level_ = (use_flappy_ratio ? flappy_signal_ratio_ : normal_signal_ratio_) * noise_floor_;
+    }
+    return squelch_level_;
+}
+
+// Number of transitions into OPEN counted so far (open_count_).
+const size_t& Squelch::open_count(void) const {
+    return open_count_;
+}
+
+// Number of opens that happened while flapping was being detected (flappy_count_).
+const size_t& Squelch::flappy_count(void) const {
+    return flappy_count_;
+}
+
+// Forwards found_count() of the slow CTCSS detector.
+const size_t& Squelch::ctcss_count(void) const {
+    return ctcss_slow_.found_count();
+}
+
+// Forwards not_found_count() of the slow CTCSS detector.
+const size_t& Squelch::no_ctcss_count(void) const {
+    return ctcss_slow_.not_found_count();
+}
+
+// Feed one unfiltered level sample.  This first applies the state change requested during
+// the previous sample (which also advances the delay buffer indexes), then updates the
+// noise floor and pre-filter average, and finally requests the next state change.
+void Squelch::process_raw_sample(const float& sample) {
+    // Update current state based on previous state from last iteration
+    update_current_state();
+
+#ifdef DEBUG_SQUELCH
+    raw_input_ = sample;
+#endif /* DEBUG_SQUELCH */
+
+    sample_count_++;
+
+    // Auto noise floor
+    //  - Doing this every 16 samples instead of every sample allows a gradual signal increase
+    //    to cross the squelch threshold (that is a function of the noise floor) sooner.
+    //  - Updating even when squelch is open and / or signal is outside filter means the noise
+    //    floor (and squelch threshold) will slowly increase during a long signal.  This can lead
+    //    to flapping, but this keeps a sudden and sustained increase of noise from locking squelch
+    //    OPEN.
+    if (sample_count_ % 16 == 0) {
+        calculate_noise_floor();
+    }
+
+    update_moving_avg(pre_filter_, sample);
+
+    // Apply the comparison factor before adding to the buffer, will later be used as the threshold
+    // for the post_filter_
+    buffer_[buffer_head_] = pre_filter_.capped_ * pre_vs_post_factor_;
+
+    // Check signal against thresholds
+    if (current_state_ == OPEN && !has_signal()) {
+        debug_print("Closing at %zu: no signal after timeout (%f, %f, %f)\n", sample_count_, pre_filter_.capped_, post_filter_.capped_, squelch_level());
+        set_state(CLOSING);
+    }
+
+    if (current_state_ == CLOSED && has_signal()) {
+        debug_print("Opening at %zu: signal (%f, %f, %f)\n", sample_count_, pre_filter_.capped_, post_filter_.capped_, squelch_level());
+        set_state(OPENING);
+    }
+
+    // Override squelch and close if there are repeated samples under the squelch level
+    // NOTE: this can cause squelch to close, but it may immediately be re-opened if the signal level still hasn't fallen after the delays
+    if (current_state_ != CLOSED && current_state_ != LOW_SIGNAL_ABORT) {
+        // This compares the raw sample (not the moving average) so a sharp drop is seen at once
+        if (sample >= squelch_level()) {
+            low_signal_count_ = 0;
+        } else {
+            low_signal_count_++;
+            if (low_signal_count_ >= low_signal_abort_) {
+                debug_print("Low signal abort at %zu: low signal count %d\n", sample_count_, low_signal_count_);
+                set_state(LOW_SIGNAL_ABORT);
+            }
+        }
+    }
+}
+
+// Feed one post-filter level sample.  Nothing happens unless should_filter_sample() is
+// true.  While OPENING the sample is ignored until the delay buffer has been filled; after
+// that the post-filter average is updated and compared against the delayed pre-filter
+// value, and a value below it requests CLOSED.
+void Squelch::process_filtered_sample(const float& sample) {
+#ifdef DEBUG_SQUELCH
+    filtered_input_ = sample;
+#endif /* DEBUG_SQUELCH */
+
+    if (!should_filter_sample()) {
+        return;
+    }
+
+    if (current_state_ == OPENING) {
+        // While OPENING, need to wait until the pre-filter value gets through the buffer
+        if (delay_ < buffer_size_) {
+            return;
+        }
+        // Buffer has been filled, initialize post-filter with the pre-filter value
+        if (delay_ == buffer_size_) {
+            post_filter_ = {buffer_[buffer_tail_], buffer_[buffer_tail_]};
+        }
+    }
+
+    using_post_filter_ = true;
+    update_moving_avg(post_filter_, sample);
+
+    // Always comparing the post-filter average to the buffered pre-filtered value
+    if (post_filter_.capped_ < buffer_[buffer_tail_]) {
+        // Log the two values that were actually compared (the threshold here is the buffered
+        // pre-filter value, not squelch_level())
+        debug_print("Closing at %zu: signal level post filter (%f < %f)\n", sample_count_, post_filter_.capped_, buffer_[buffer_tail_]);
+        set_state(CLOSED);
+    }
+}
+
+// Feed one audio sample to the CTCSS detectors.  Does nothing when CTCSS is disabled or
+// while the state is CLOSED (the detectors are reset on the transition to CLOSED).
+void Squelch::process_audio_sample(const float& sample) {
+#ifdef DEBUG_SQUELCH
+    audio_input_ = sample;
+#endif /* DEBUG_SQUELCH */
+
+    if (!ctcss_slow_.is_enabled() || current_state_ == CLOSED) {
+        return;
+    }
+
+    // The slow (more accurate) detector sees every sample
+    ctcss_slow_.process_audio_sample(sample);
+
+    // The fast detector is only needed until the slow one has enough samples
+    if (!ctcss_slow_.enough_samples()) {
+        ctcss_fast_.process_audio_sample(sample);
+    }
+}
+
+// Request a state change.  The request is stored in next_state_ and takes effect in
+// update_current_state(); requests that are not valid from current_state_ are replaced:
+//
+//   current_state_     requested           stored instead
+//   CLOSED             CLOSING             CLOSED   (already CLOSED, cant go backwards)
+//   CLOSED             LOW_SIGNAL_ABORT    CLOSED   (already CLOSED, cant go backwards)
+//   CLOSED             OPEN                OPENING  (must go through OPENING)
+//   OPENING            LOW_SIGNAL_ABORT    CLOSED   (just go to CLOSED instead)
+//   LOW_SIGNAL_ABORT   OPENING/OPEN/CLOSING  CLOSED (can only go to CLOSED)
+//   OPEN               CLOSED              CLOSING  (must go through CLOSING)
+//   OPEN               OPENING             OPEN     (already OPEN, cant go backwards)
+//
+// Every other request, including all requests made from CLOSING, is stored unchanged.
+void Squelch::set_state(State update) {
+    switch (current_state_) {
+        case CLOSED:
+            if (update == CLOSING || update == LOW_SIGNAL_ABORT) {
+                update = CLOSED;
+            } else if (update == OPEN) {
+                update = OPENING;
+            }
+            break;
+
+        case OPENING:
+            if (update == LOW_SIGNAL_ABORT) {
+                update = CLOSED;
+            }
+            break;
+
+        case LOW_SIGNAL_ABORT:
+            if (update != LOW_SIGNAL_ABORT && update != CLOSED) {
+                update = CLOSED;
+            }
+            break;
+
+        case OPEN:
+            if (update == CLOSED) {
+                update = CLOSING;
+            } else if (update == OPENING) {
+                update = OPEN;
+            }
+            break;
+
+        case CLOSING:
+            // every request is accepted as-is
+            break;
+    }
+
+    next_state_ = update;
+}
+
+// Apply the state requested in next_state_ to current_state_, run the OPENING / CLOSING /
+// LOW_SIGNAL_ABORT delay counters, maintain the flap detection counters, and advance the
+// delay buffer head and tail by one slot.  Called once per raw sample, at the start of
+// process_raw_sample().
+void Squelch::update_current_state(void) {
+    if (next_state_ == OPENING) {
+        if (current_state_ != OPENING) {
+            debug_print("%zu: transitioning to OPENING\n", sample_count_);
+            delay_ = 0;
+            low_signal_count_ = 0;
+            using_post_filter_ = false;
+            current_state_ = next_state_;
+        } else {
+            // in OPENING delay
+            delay_++;
+            if (delay_ >= open_delay_) {
+                // After getting through OPENING delay, count this as an "open" for flap
+                // detection even if signal has gone.  NOTE - process_filtered_sample() would
+                // have already sent state to CLOSED before the delay if post_filter_.capped_ was
+                // too low, so that wont count towards flapping
+                if (closed_sample_count_ < recent_sample_size_) {
+                    recent_open_count_++;
+                    if (currently_flapping()) {
+                        flappy_count_++;
+                    }
+
+                    // Force squelch_level_ recalculation at next call to squelch_level()
+                    squelch_level_ = 0.0f;
+                }
+
+                // Check signal level after delay to either go to OPEN or CLOSED
+                if (has_signal()) {
+                    next_state_ = OPEN;
+                } else {
+                    debug_print("%zu: no signal after OPENING delay, going to CLOSED\n", sample_count_);
+                    next_state_ = CLOSED;
+                }
+            }
+        }
+    } else if (next_state_ == CLOSING) {
+        if (current_state_ != CLOSING) {
+            debug_print("%zu: transitioning to CLOSING\n", sample_count_);
+            delay_ = 0;
+            current_state_ = next_state_;
+        } else {
+            // in CLOSING delay
+            delay_++;
+            if (delay_ >= close_delay_) {
+                if (!has_signal()) {
+                    next_state_ = CLOSED;
+                } else {
+                    debug_print("%zu: signal after CLOSING delay, reverting to OPEN\n", sample_count_);
+                    current_state_ = OPEN;  // set current_state_ to avoid incrementing open_count_
+                    next_state_ = OPEN;
+                }
+            }
+        }
+    } else if (next_state_ == LOW_SIGNAL_ABORT) {
+        if (current_state_ != LOW_SIGNAL_ABORT) {
+            debug_print("%zu: transitioning to LOW_SIGNAL_ABORT\n", sample_count_);
+            // If coming from CLOSING then keep the delay counter that has already started
+            if (current_state_ != CLOSING) {
+                delay_ = 0;
+            }
+            current_state_ = next_state_;
+        } else {
+            // in LOW_SIGNAL_ABORT delay
+            delay_++;
+            if (delay_ >= close_delay_) {
+                next_state_ = CLOSED;
+            }
+        }
+    } else if (next_state_ == OPEN && current_state_ != OPEN) {
+        debug_print("%zu: transitioning to OPEN\n", sample_count_);
+        open_count_++;
+        current_state_ = next_state_;
+    } else if (next_state_ == CLOSED && current_state_ != CLOSED) {
+        debug_print("%zu: transitioning to CLOSED\n", sample_count_);
+        using_post_filter_ = false;
+        closed_sample_count_ = 0;
+        current_state_ = next_state_;
+        // Both detectors start over for the next open
+        ctcss_fast_.reset();
+        ctcss_slow_.reset();
+    } else if (next_state_ == CLOSED && current_state_ == CLOSED) {
+        // Count this as a closed sample towards flap detection (can stop counting at recent_sample_size_)
+        if (closed_sample_count_ < recent_sample_size_) {
+            closed_sample_count_++;
+        } else if (closed_sample_count_ == recent_sample_size_) {
+            // Closed for the whole "recent" window: forget earlier opens.  The counter stays at
+            // recent_sample_size_, so this branch repeats on every further closed sample.
+            recent_open_count_ = 0;
+            squelch_level_ = 0.0f;  // Force squelch_level_ recalculation
+        }
+    } else {
+        current_state_ = next_state_;
+    }
+
+    // Advance the circular delay buffer by one slot
+    buffer_tail_ = (buffer_tail_ + 1) % buffer_size_;
+    buffer_head_ = (buffer_head_ + 1) % buffer_size_;
+
+#ifdef DEBUG_SQUELCH
+    debug_state();
+#endif /* DEBUG_SQUELCH */
+}
+
+// True when the capped pre-filter average has reached the squelch level.  Not const
+// because squelch_level() may refresh its cached value.
+bool Squelch::has_pre_filter_signal(void) {
+    return pre_filter_.capped_ >= squelch_level();
+}
+
+// True when filtered samples are in use and the capped post-filter average has reached
+// the delayed pre-filter value at the buffer tail.
+bool Squelch::has_post_filter_signal(void) {
+    return using_post_filter_ && post_filter_.capped_ >= buffer_[buffer_tail_];
+}
+
+// Overall signal decision: the pre-filter check must always pass, and the post-filter
+// check must pass as well once filtered samples are in use.
+bool Squelch::has_signal(void) {
+    if (!has_pre_filter_signal()) {
+        return false;
+    }
+    return !using_post_filter_ || has_post_filter_signal();
+}
+
+// Update the noise floor with a decaying average.  The value mixed in is the lower of the
+// capped pre-filter level and the current floor; a small constant is added on every update.
+void Squelch::calculate_noise_floor(void) {
+    static const float decay_factor = 0.97f;
+    static const float new_factor = 1.0 - decay_factor;
+
+    const float lower_level = std::min(pre_filter_.capped_, noise_floor_);
+    noise_floor_ = noise_floor_ * decay_factor + lower_level * new_factor + 1e-6f;
+
+    debug_print("%zu: noise floor is now %f\n", sample_count_, noise_floor_);
+
+    // Both values below depend on noise_floor_: refresh the cap right away and clear the
+    // cached squelch level so that squelch_level() recalculates it on its next call
+    calculate_moving_avg_cap();
+    squelch_level_ = 0.0f;
+}
+
+// Recompute the ceiling for MovingAverage::capped_: 1.5 x the manual squelch level in
+// manual mode, otherwise 1.5 x the normal ratio x the noise floor.
+void Squelch::calculate_moving_avg_cap(void) {
+    moving_avg_cap_ = using_manual_level_ ? 1.5f * manual_signal_level_ : 1.5f * normal_signal_ratio_ * noise_floor_;
+}
+
+// Blend `sample` into both averages held by `avg`.  full_ is a plain decaying average;
+// capped_ is the same average limited to moving_avg_cap_, which lets it drop more quickly
+// once the signal goes away.
+void Squelch::update_moving_avg(MovingAverage& avg, const float& sample) {
+    static const float decay_factor = 0.99f;
+    static const float new_factor = 1.0 - decay_factor;
+
+    avg.full_ = avg.full_ * decay_factor + sample * new_factor;
+
+    // When the current value and the update are both at/above the cap the result is the cap,
+    // so the float multiplications can be skipped
+    const bool pinned_at_cap = avg.capped_ >= moving_avg_cap_ && sample >= moving_avg_cap_;
+    if (!pinned_at_cap) {
+        avg.capped_ = min(moving_avg_cap_, avg.capped_ * decay_factor + sample * new_factor);
+    } else {
+        avg.capped_ = moving_avg_cap_;
+    }
+}
+
+// Flapping means the number of recent opens has reached flap_opens_threshold_.
+bool Squelch::currently_flapping(void) const {
+    return recent_open_count_ >= flap_opens_threshold_;
+}
+
+#ifdef DEBUG_SQUELCH
+/*
+ Debug file methods
+ ==================
+
+ Values written to file for every raw sample, in this order, are:
+         - (float) process_raw_sample input
+         - (float) process_filtered_sample input
+         - (float) process_audio_sample input
+         - (float) noise_floor_
+         - (float) pre_filter_.capped_
+         - (float) post_filter_.capped_
+         - (int) current_state_
+         - (int) delay_
+         - (int) low_signal_count_
+         - (int) ctcss_fast_.has_tone()
+         - (int) ctcss_slow_.has_tone()
+
+  The output file can be read / plotted in python as follows:
+
+        import matplotlib.pyplot as plt
+        import numpy as np
+
+        def plot_squelch_debug(filepath):
+
+                dt = np.dtype([('raw_input', np.single),
+                                           ('filtered_input', np.single),
+                                           ('audio_input', np.single),
+                                           ('noise_floor', np.single),
+                                           ('pre_filter_capped', np.single),
+                                           ('post_filter_capped', np.single),
+                                           ('current_state', np.intc),
+                                           ('delay', np.intc),
+                                           ('low_signalcount', np.intc),
+                                           ('ctcss_fast_has_tone', np.intc),
+                                           ('ctcss_slow_has_tone', np.intc)
+                                          ])
+
+                dat = np.fromfile(filepath, dtype=dt)
+
+                plt.figure()
+                plt.plot(dat['raw_input'], 'b')
+                plt.plot(dat['pre_filter_capped'], 'g')
+                plt.plot(dat['noise_floor'], 'r')
+                plt.show(block=False)
+
+                plt.figure()
+                plt.plot(dat['post_filter_capped'], 'k')
+                plt.show(block=False)
+
+                plt.figure()
+                axis = plt.subplot2grid((3, 1), (0, 0))
+                axis.plot(dat['current_state'], 'c')
+                axis = plt.subplot2grid((3, 1), (1, 0))
+                axis.plot(dat['delay'], 'm')
+                axis = plt.subplot2grid((3, 1), (2, 0))
+                axis.plot(dat['low_signalcount'], 'y')
+                plt.show(block=False)
+
+                return
+
+  */
+
+// Debug builds only: close the debug output file if one was opened.
+Squelch::~Squelch(void) {
+    if (debug_file_ == NULL) {
+        return;
+    }
+    fclose(debug_file_);
+}
+
+// Debug builds only: start writing squelch state to `filepath` (opened in binary mode,
+// replacing existing content).  A debug file that is already open is closed first.  When
+// `filepath` is NULL or cannot be opened, debug output stays off.
+void Squelch::set_debug_file(const char* filepath) {
+    // Close the previous file so calling this twice does not leak the FILE handle
+    if (debug_file_) {
+        fclose(debug_file_);
+        debug_file_ = NULL;
+    }
+
+    if (filepath == NULL) {
+        return;
+    }
+
+    debug_file_ = fopen(filepath, "wb");
+    if (!debug_file_) {
+        debug_print("Error opening squelch debug file %s: %s\n", filepath, strerror(errno));
+    }
+}
+
+// Append the raw bytes of one float to the debug file; does nothing when no file is open.
+void Squelch::debug_value(const float& value) {
+    if (debug_file_ && fwrite(&value, sizeof(value), 1, debug_file_) != 1) {
+        debug_print("Error writing to squelch debug file: %s\n", strerror(errno));
+    }
+}
+
+// Append the raw bytes of one int to the debug file; does nothing when no file is open.
+void Squelch::debug_value(const int& value) {
+    if (debug_file_ && fwrite(&value, sizeof(value), 1, debug_file_) != 1) {
+        debug_print("Error writing to squelch debug file: %s\n", strerror(errno));
+    }
+}
+
+// Write one record to the debug file: the three most recent inputs (each cleared once it
+// has been written), then the levels as floats, then the state and counters as ints.
+void Squelch::debug_state(void) {
+    if (!debug_file_) {
+        return;
+    }
+
+    // Inputs seen since the previous record
+    debug_value(raw_input_);
+    raw_input_ = 0.0;
+    debug_value(filtered_input_);
+    filtered_input_ = 0.0;
+    debug_value(audio_input_);
+    audio_input_ = 0.0;
+
+    // Levels
+    debug_value(noise_floor_);
+    debug_value(pre_filter_.capped_);
+    debug_value(post_filter_.capped_);
+
+    // State and counters
+    debug_value(static_cast<int>(current_state_));
+    debug_value(delay_);
+    debug_value(low_signal_count_);
+    debug_value(static_cast<int>(ctcss_fast_.has_tone()));
+    debug_value(static_cast<int>(ctcss_slow_.has_tone()));
+}
+
+#endif /* DEBUG_SQUELCH */

+ 181 - 0
src/squelch.h

@@ -0,0 +1,181 @@
+/*
+ * squelch.h
+ *
+ * Copyright (C) 2022-2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SQUELCH_H
+#define _SQUELCH_H
+
+#include <cstddef>  // size_t
+
+#ifdef DEBUG_SQUELCH
+#include <stdio.h>  // needed for debug file output
+#endif              /* DEBUG_SQUELCH */
+
+#include "ctcss.h"
+
+/*
+ Theory of operation:
+
+ Squelch has 5 states, OPEN (has audio), CLOSED (no audio), OPENING (transitioning from CLOSED to OPEN),
+ CLOSING (transitioning from OPEN to CLOSED), and LOW_SIGNAL_ABORT (same as CLOSING but because of a constant
+ signal drop).
+
+ Squelch is considered "open" when the state is OPEN or CLOSING and squelch is considered "closed" when the
+ state is OPENING, LOW_SIGNAL_ABORT, or CLOSED.
+
+ Noise floor is computed using a low pass filter and updated with the capped pre-filter signal level or the
+ prior value, whichever is lower.  Noise floor is updated every 16 samples, including while squelch is open.
+
+ Low pass filters are also used to track the current signal levels.  One level is for the sample before
+ filtering, the second for post signal filtering (if any).  The pre-filter signal level is updated for every
+ sample.  The post-filter level is optional.  When used, the post-filter signal level is compared to a
+ delayed pre-filter value.  The post-filter is set to a fraction of the pre-filtered value each time state
+ transitions to OPENING, and is not updated while state is CLOSED.
+
+ Squelch level can be set manually or is computed as a function of the noise floor.
+
+ When the signal level exceeds the squelch level, the state transitions to OPENING and a delay counter starts,
+ then once the counter is over the state moves to OPEN if there is signal, otherwise back to CLOSED. The same
+ (but opposite) happens when the signal level drops below the squelch level.
+
+ While the squelch is OPEN, a count of continuous samples that are below the squelch level is maintained.  If
+ this count exceeds a threshold then the state moves to LOW_SIGNAL_ABORT.  This allows the squelch to close
+ after a sharp drop off in signal before the signal level has caught up.
+
+ A count of "recent opens" is maintained as a way to detect squelch flapping (ie rapidly opening and closing).
+ When flapping is detected the squelch level is decreased in an attempt to keep squelch open longer.
+
+ CTCSS tone detection can be enabled.  If used, two tone detectors are created at different window lengths.
+ The “fast” detector has less resolution but needs fewer samples while the “slow” detector is more accurate.
+ When CTCSS is enabled, squelch remains CLOSED for an additional 0.05 sec until a tone is detected by the “fast”
+ detector.
+ */
+
+// Squelch state machine.  Feed it one raw sample (and optionally one filtered and one
+// audio sample) at a time and query is_open() and the should_*() methods to decide what
+// to do with that sample.
+//
+// NOTE(review): buffer_ is allocated with calloc() in the constructor and nothing in this
+// class releases it; the implicit copy operations copy the pointer, not the buffer.
+class Squelch {
+   public:
+    Squelch();
+
+    // Threshold configuration
+    void set_squelch_level_threshold(const float& level);
+    void set_squelch_snr_threshold(const float& db);
+    void set_ctcss_freq(const float& ctcss_freq, const float& sample_rate);
+
+    // Per-sample inputs; process_raw_sample() also applies the pending state change
+    void process_raw_sample(const float& sample);
+    void process_filtered_sample(const float& sample);
+    void process_audio_sample(const float& sample);
+
+    bool is_open(void) const;
+    bool should_filter_sample(void);
+    bool should_process_audio(void);
+
+    bool first_open_sample(void) const;
+    bool last_open_sample(void) const;
+    bool signal_outside_filter(void);
+
+    const float& noise_level(void) const;
+    const float& signal_level(void) const;
+    const float& squelch_level(void);
+
+    const size_t& open_count(void) const;
+    const size_t& flappy_count(void) const;
+    const size_t& ctcss_count(void) const;
+    const size_t& no_ctcss_count(void) const;
+
+#ifdef DEBUG_SQUELCH
+    ~Squelch(void);
+    void set_debug_file(const char* filepath);
+#endif /* DEBUG_SQUELCH */
+
+   private:
+    enum State {
+        CLOSED,            // Audio is suppressed
+        OPENING,           // Transitioning closed -> open
+        CLOSING,           // Transitioning open -> closed
+        LOW_SIGNAL_ABORT,  // Like CLOSING but is_open() is false
+        OPEN               // Audio not suppressed
+    };
+
+    struct MovingAverage {
+        float full_;    // decaying average without any limit
+        float capped_;  // same average, limited to moving_avg_cap_
+    };
+
+    float noise_floor_;          // noise level
+    bool using_manual_level_;    // if using a manually set signal level threshold
+    float manual_signal_level_;  // manually configured squelch level, < 0 for disabled
+    float normal_signal_ratio_;  // signal-to-noise ratio for normal squelch - ratio, not in dB
+    float flappy_signal_ratio_;  // signal-to-noise ratio for flappy squelch - ratio, not in dB
+
+    float moving_avg_cap_;       // the max value for capped moving average
+    MovingAverage pre_filter_;   // average signal level for reference sample
+    MovingAverage post_filter_;  // average signal level for post-filter sample
+
+    float squelch_level_;  // cached calculation of the squelch_level() value
+
+    bool using_post_filter_;    // if the caller is providing filtered samples
+    float pre_vs_post_factor_;  // multiplier when doing pre vs post filter comparison
+
+    int open_delay_;        // how long to wait after signal level crosses squelch to open
+    int close_delay_;       // how long to wait after signal level crosses squelch to close
+    int low_signal_abort_;  // number of repeated samples below squelch to cause a close
+
+    State next_state_;     // state requested through set_state(), applied on the next raw sample
+    State current_state_;  // state in effect for the current sample
+
+    int delay_;             // samples to wait before making next squelch decision
+    size_t open_count_;     // number of times squelch is opened
+    size_t sample_count_;   // number of samples processed (for logging)
+    size_t flappy_count_;   // number of times squelch was detected as flapping OPEN/CLOSED
+    int low_signal_count_;  // number of repeated samples below squelch
+
+    // Flap detection parameters
+    size_t recent_sample_size_;    // number of samples defined as "recent"
+    size_t flap_opens_threshold_;  // number of opens to count as flapping
+    size_t recent_open_count_;     // number of times squelch recently opened
+    size_t closed_sample_count_;   // number of continuous samples where squelch has been CLOSED
+
+    // Buffered pre-filtered values
+    int buffer_size_;  // size of buffer
+    int buffer_head_;  // index to add new values
+    int buffer_tail_;  // index to read buffered values
+    float* buffer_;    // buffer
+
+    CTCSS ctcss_fast_;  // ctcss tone detection
+    CTCSS ctcss_slow_;  // ctcss tone detection
+
+    void set_state(State update);
+    void update_current_state(void);
+    bool has_pre_filter_signal(void);
+    bool has_post_filter_signal(void);
+    bool has_signal(void);
+    void calculate_noise_floor(void);
+    void calculate_moving_avg_cap(void);
+    void update_moving_avg(MovingAverage& avg, const float& sample);
+    bool currently_flapping(void) const;
+
+#ifdef DEBUG_SQUELCH
+    FILE* debug_file_;      // debug output, NULL when not in use
+    float raw_input_;       // last process_raw_sample() input, cleared after each record
+    float filtered_input_;  // last process_filtered_sample() input, cleared after each record
+    float audio_input_;     // last process_audio_sample() input, cleared after each record
+    void debug_value(const float& value);
+    void debug_value(const int& value);
+    void debug_state(void);
+#endif /* DEBUG_SQUELCH */
+};
+
+#endif /* _SQUELCH_H */

+ 138 - 0
src/test_base_class.cpp

@@ -0,0 +1,138 @@
+/*
+ * test_base_class.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <dirent.h>
+
+#include "logging.h"
+
+#include "test_base_class.h"
+
+using namespace std;
+
+// Recursively delete the directory at `root` together with everything inside it.
+//
+// Entries are inspected with lstat() rather than stat() so that a symbolic link
+// is removed as a link: a link pointing at a directory is never followed (which
+// would wipe the link target's contents outside of `root`), and a dangling link
+// is still unlinked instead of being left behind and blocking the final rmdir().
+//
+// Failures are reported on stderr and the walk continues (best effort).
+void delete_directory(const string& root) {
+    DIR* dp = opendir(root.c_str());
+    if (dp == NULL) {
+        cerr << "Error opening directory " << root << endl;
+        return;
+    }
+
+    const string current_dir = ".";
+    const string parent_dir = "..";
+
+    struct dirent* entry = NULL;
+    while ((entry = readdir(dp))) {
+        // never descend into the directory itself or its parent
+        if (current_dir.compare(entry->d_name) == 0 || parent_dir.compare(entry->d_name) == 0) {
+            continue;
+        }
+
+        struct stat info;
+        string filepath = root + "/" + string(entry->d_name);
+
+        // lstat() describes the entry itself, not whatever a symlink points to
+        if (lstat(filepath.c_str(), &info) != 0) {
+            cerr << "Error getting info on " << filepath.c_str() << ": " << strerror(errno) << endl;
+            continue;
+        }
+
+        if (S_ISDIR(info.st_mode)) {
+            delete_directory(filepath);
+        } else if (unlink(filepath.c_str()) != 0) {
+            cerr << "Error deleting " << filepath << ": " << strerror(errno) << endl;
+        }
+    }
+
+    closedir(dp);
+    if (rmdir(root.c_str()) != 0) {
+        cerr << "Error removing directory " << root << ": " << strerror(errno) << endl;
+    }
+}
+
+// Create a uniquely named directory under /tmp for test artifacts.
+// Returns the path of the new directory, or an empty string (after logging
+// the reason to stderr) when the directory could not be created.
+string make_temp_dir(void) {
+    // mkdtemp() replaces the trailing XXXXXX in place, so the buffer must be writable
+    char path_buffer[] = "/tmp/temp_unittest_dir_XXXXXX";
+    char* created_path = mkdtemp(path_buffer);
+    if (created_path != NULL) {
+        return string(created_path);
+    }
+
+    cerr << "Error making temp dir for test files: " << strerror(errno) << endl;
+    return "";
+}
+
+// Common per-test setup: creates a fresh temp directory (stored in temp_dir),
+// hands a debug log path inside that directory to init_debug() and sets
+// log_destination to STDERR.
+void TestBaseClass::SetUp(void) {
+    ::testing::Test::SetUp();
+
+    // setup debug log file for each test
+    temp_dir = make_temp_dir();
+    // make_temp_dir() returns "" on failure; the fatal assertion aborts the rest of SetUp
+    ASSERT_FALSE(temp_dir.empty());
+    string debug_filepath = temp_dir + "/debug_file.log";
+    init_debug(debug_filepath.c_str());
+
+    // point logging to stderr
+    log_destination = STDERR;
+}
+
+// Common per-test teardown: closes the debug log and removes the temp
+// directory created by SetUp().
+void TestBaseClass::TearDown(void) {
+    ::testing::Test::TearDown();
+    close_debug();
+
+    // TearDown() still runs when SetUp() stopped on a fatal failure, in which
+    // case no temp directory was created and there is nothing to delete
+    if (!temp_dir.empty()) {
+        delete_directory(temp_dir);
+    }
+}
+
+// make_temp_dir() must hand back a non-empty path that names a real directory.
+TEST(TestHelpers, make_temp_dir) {
+    const string created_dir = make_temp_dir();
+
+    // an empty string is the failure indicator
+    ASSERT_FALSE(created_dir.empty());
+
+    // the returned path must exist on disk ...
+    struct stat dir_info;
+    ASSERT_EQ(stat(created_dir.c_str(), &dir_info), 0);
+    // ... and must be a directory
+    EXPECT_TRUE(S_ISDIR(dir_info.st_mode));
+
+    // clean up after ourselves
+    delete_directory(created_dir);
+}
+
+// delete_directory() must remove a tree of nested sub-directories and files,
+// including the root directory itself.
+TEST(TestHelpers, delete_directory) {
+    // make a temp dir
+    string temp_dir = make_temp_dir();
+    ASSERT_FALSE(temp_dir.empty());
+
+    // build a bunch of nested sub-dirs and files
+    string path = temp_dir;
+    for (int i = 0; i < 5; ++i) {
+        path = path + "/sub_dir";
+        ASSERT_EQ(mkdir(path.c_str(), 0777), 0) << "Unable to create " << path;
+
+        // fclose(NULL) is undefined behaviour, so check the handle before closing it
+        string filename = path + "/some_file";
+        FILE* fp = fopen(filename.c_str(), "w");
+        ASSERT_TRUE(fp != NULL) << "Unable to create " << filename;
+        fclose(fp);
+    }
+
+    // last sub-dir should exist and be a directory
+    struct stat info;
+    ASSERT_EQ(stat(path.c_str(), &info), 0);
+    EXPECT_TRUE(S_ISDIR(info.st_mode));
+
+    // last sub-dir should have a file in it
+    string filename = path + "/some_file";
+    ASSERT_EQ(stat(filename.c_str(), &info), 0);
+    EXPECT_TRUE(S_ISREG(info.st_mode));
+
+    // delete the root temp dir
+    delete_directory(temp_dir);
+
+    // root temp dir should no longer exist
+    ASSERT_NE(stat(temp_dir.c_str(), &info), 0);
+}

+ 35 - 0
src/test_base_class.h

@@ -0,0 +1,35 @@
+/*
+ * test_base_class.h
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _TEST_BASE_CLASS_H
+#define _TEST_BASE_CLASS_H
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+// Common base fixture for the unit tests. SetUp() and TearDown() are defined
+// in test_base_class.cpp; temp_dir holds the per-test temporary directory path.
+class TestBaseClass : public ::testing::Test {
+   protected:
+    // `override` makes the compiler verify these really replace the
+    // ::testing::Test hooks instead of silently declaring unrelated methods
+    void SetUp(void) override;
+    void TearDown(void) override;
+
+    std::string temp_dir;  // path of the temp directory created for the running test
+};
+
+#endif /* _TEST_BASE_CLASS_H */

+ 155 - 0
src/test_ctcss.cpp

@@ -0,0 +1,155 @@
+/*
+ * test_ctcss.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "generate_signal.h"
+#include "test_base_class.h"
+
+#include "ctcss.h"
+
+using namespace std;
+
+// Fixture for the CTCSS tone detector tests. Provides a common sample rate and
+// window sizes plus helpers to feed generated signals into a CTCSS object and
+// to dump the samples of a failing case to a file for later analysis.
+class CTCSSTest : public TestBaseClass {
+   protected:
+    int sample_rate;
+    int fast_window_size;
+    int slow_window_size;
+
+    void SetUp(void) {
+        TestBaseClass::SetUp();
+        sample_rate = 8000;
+        fast_window_size = sample_rate * 0.05;
+        slow_window_size = sample_rate * 0.4;
+    }
+
+    // Write the samples to filepath as raw floats. A file that cannot be
+    // opened is reported on stderr and skipped instead of crashing the test.
+    void write_file(const vector<float>& samples, const string& filepath) {
+        cerr << "writing file out to " << filepath << endl;
+
+        FILE* fp = fopen(filepath.c_str(), "wb");
+        if (fp == NULL) {
+            cerr << "unable to open " << filepath << " for writing" << endl;
+            return;
+        }
+
+        for (auto sample : samples) {
+            fwrite(&sample, sizeof(float), 1, fp);
+        }
+        fclose(fp);
+    }
+
+    // Feed raw float samples from filepath into ctcss until it reports
+    // enough_samples() or the file runs out; asserts that enough were read.
+    void load_from_file(CTCSS& ctcss, const string& filepath) {
+        FILE* fp = fopen(filepath.c_str(), "rb");
+        ASSERT_TRUE(fp != NULL) << "Unable to open " << filepath;
+
+        while (!ctcss.enough_samples()) {
+            float sample;
+            if (fread(&sample, sizeof(float), 1, fp) != 1) {
+                break;
+            }
+            ctcss.process_audio_sample(sample);
+        }
+        fclose(fp);
+
+        ASSERT_TRUE(ctcss.enough_samples());
+    }
+
+    // Double the number of captured samples (pulling more from signal) and
+    // write them all to filepath so a failing case can be replayed later.
+    void write_debug_file(GenerateSignal& signal, vector<float>& samples, const string& filepath) {
+        size_t initial_count = samples.size();
+        while (samples.size() < initial_count * 2) {
+            samples.push_back(signal.get_sample());
+        }
+        write_file(samples, filepath);
+    }
+
+    // Run signal through a detector for every standard tone and expect none of
+    // them to be found, except `tone` itself (when non-zero) which must be found.
+    void test_all_tones(GenerateSignal& signal, const float& tone = 0) {
+        for (auto standard_tone : CTCSS::standard_tones) {
+            // skipping tones within +/- 5Hz
+            if (abs(standard_tone - tone) < 5) {
+                continue;
+            }
+
+            CTCSS ctcss(standard_tone, sample_rate, slow_window_size);
+            vector<float> samples;
+            run_signal(ctcss, signal, samples);
+
+            EXPECT_FALSE(ctcss.has_tone()) << "Tone of " << standard_tone << " found, expected " << tone;
+
+            // on failure write out a file for debugging
+            if (ctcss.has_tone()) {
+                write_debug_file(signal, samples, "/tmp/found_" + to_string(standard_tone) + "_expected_" + to_string(tone));
+            }
+        }
+        if (tone != 0) {
+            CTCSS ctcss(tone, sample_rate, slow_window_size);
+            vector<float> samples;
+            run_signal(ctcss, signal, samples);
+
+            EXPECT_TRUE(ctcss.has_tone()) << "Expected tone of " << tone << " not found";
+
+            // on failure write out a file for debugging
+            if (!ctcss.has_tone()) {
+                write_debug_file(signal, samples, "/tmp/didnt_find_" + to_string(tone));
+            }
+        }
+    }
+
+    // Feed samples from signal into ctcss until it has enough to decide,
+    // recording every sample that was fed in `samples`.
+    void run_signal(CTCSS& ctcss, GenerateSignal& signal, vector<float>& samples) {
+        EXPECT_TRUE(ctcss.is_enabled()) << "CTCSS not enabled";
+        while (!ctcss.enough_samples()) {
+            float sample = signal.get_sample();
+            samples.push_back(sample);
+            ctcss.process_audio_sample(sample);
+        }
+    }
+};
+
+// A default-constructed CTCSS object must report itself as not enabled.
+TEST_F(CTCSSTest, creation) {
+    CTCSS ctcss;
+    EXPECT_FALSE(ctcss.is_enabled());
+}
+
+// With nothing added to the generated signal, no standard tone may be detected.
+TEST_F(CTCSSTest, no_signal) {
+    GenerateSignal signal(sample_rate);
+    // tone defaults to 0, so test_all_tones() only checks that nothing is found
+    test_all_tones(signal);
+}
+
+// A signal carrying the first standard tone plus noise must be detected as
+// that tone and as none of the other standard tones.
+TEST_F(CTCSSTest, has_tone) {
+    const float target_tone = CTCSS::standard_tones[0];
+
+    GenerateSignal generator(sample_rate);
+    generator.add_tone(target_tone, Tone::NORMAL);
+    generator.add_noise(Noise::NORMAL);
+
+    test_all_tones(generator, target_tone);
+}
+
+// A tone that is not in the standard list must be detected when it is the
+// configured tone, and must not be reported as any of the standard tones.
+TEST_F(CTCSSTest, has_non_standard_tone) {
+    // midpoint between the first two standard tones, i.e. a frequency that is
+    // not itself a standard tone (averaging an entry with itself would just
+    // repeat the has_tone test)
+    float tone = (CTCSS::standard_tones[0] + CTCSS::standard_tones[1]) / 2;
+    GenerateSignal signal(sample_rate);
+    signal.add_tone(tone, Tone::NORMAL);
+    signal.add_noise(Noise::NORMAL);
+    test_all_tones(signal, tone);
+}
+
+// Repeat the single-tone detection check once for every standard tone.
+TEST_F(CTCSSTest, has_each_standard_tone) {
+    for (auto current_tone : CTCSS::standard_tones) {
+        // fresh generator per tone so earlier tones do not linger in the signal
+        GenerateSignal generator(sample_rate);
+        generator.add_tone(current_tone, Tone::NORMAL);
+        generator.add_noise(Noise::NORMAL);
+
+        test_all_tones(generator, current_tone);
+    }
+}

+ 41 - 0
src/test_filters.cpp

@@ -0,0 +1,41 @@
+/*
+ * test_filters.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "test_base_class.h"
+
+#include "filters.h"
+
+using namespace std;
+
+// Fixture for the filter tests; adds nothing on top of TestBaseClass.
+class FiltersTest : public TestBaseClass {
+   protected:
+    // both hooks simply forward to the base fixture
+    void SetUp(void) { TestBaseClass::SetUp(); }
+
+    void TearDown(void) { TestBaseClass::TearDown(); }
+};
+
+// A default-constructed NotchFilter must report itself as not enabled.
+TEST_F(FiltersTest, default_notch) {
+    NotchFilter notch;
+    EXPECT_FALSE(notch.enabled());
+}
+
+// A default-constructed LowpassFilter must report itself as not enabled.
+TEST_F(FiltersTest, default_lowpass) {
+    LowpassFilter lowpass;
+    EXPECT_FALSE(lowpass.enabled());
+}

+ 280 - 0
src/test_generate_signal.cpp

@@ -0,0 +1,280 @@
+/*
+ * test_generate_signal.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <algorithm>
+
+#include "test_base_class.h"
+
+#include "generate_signal.h"
+
+using namespace std;
+
+// Fixture for the Tone tests; uses TestBaseClass unchanged.
+class ToneTest : public TestBaseClass {};
+
+// Walks ten full cycles of a generated tone sample by sample and checks the
+// sign and direction of every sample, plus the values at the quarter points
+// (peak, zero crossing, trough, zero crossing).
+TEST_F(ToneTest, simple_object) {
+    // simple case the sample rate is a multiple of the frequency so specific points can be measured
+    float tone_freq = 100;  // tone at 100 Hz
+    // set sample rate to 1000 times the tone so there will be 250 samples per quarter
+    float sample_rate = 1000 * tone_freq;
+    float amplitude = Tone::STRONG;
+
+    Tone tone(sample_rate, tone_freq, amplitude);
+
+    float last_sample = 0.0;
+    float this_sample = 0.0;
+
+    // loop through some number of cycles
+    for (int j = 0; j < 10; ++j) {
+        // first 249 samples will be positive and increasing
+        for (int i = 0; i < 249; ++i) {
+            this_sample = tone.get_sample();
+            ASSERT_GT(this_sample, 0.0);
+            ASSERT_GT(this_sample, last_sample);
+            last_sample = this_sample;
+        }
+
+        // sample 250 will be the amp
+        this_sample = tone.get_sample();
+        ASSERT_EQ(this_sample, amplitude);
+        ASSERT_GT(this_sample, last_sample);
+        last_sample = this_sample;
+
+        // next 249 samples will be positive and decreasing
+        for (int i = 0; i < 249; ++i) {
+            this_sample = tone.get_sample();
+            ASSERT_GT(this_sample, 0.0);
+            ASSERT_LT(this_sample, last_sample);
+            last_sample = this_sample;
+        }
+
+        // sample 500 will be zero-ish
+        // NOTE(review): only an upper bound is checked here, so any negative value
+        // would pass; ASSERT_NEAR would bound the sample from below as well
+        this_sample = tone.get_sample();
+        ASSERT_LT(this_sample, 0.000001);
+        ASSERT_LT(this_sample, last_sample);
+        last_sample = this_sample;
+
+        // next 249 samples will be negative and decreasing
+        for (int i = 0; i < 249; ++i) {
+            this_sample = tone.get_sample();
+            ASSERT_LT(this_sample, 0.0);
+            ASSERT_LT(this_sample, last_sample);
+            last_sample = this_sample;
+        }
+
+        // sample 750 will be negative amp
+        this_sample = tone.get_sample();
+        ASSERT_EQ(this_sample, -1.0 * amplitude);
+        ASSERT_LT(this_sample, last_sample);
+        last_sample = this_sample;
+
+        // next 249 samples will be negative and increasing
+        for (int i = 0; i < 249; ++i) {
+            this_sample = tone.get_sample();
+            ASSERT_LT(this_sample, 0.0);
+            ASSERT_GT(this_sample, last_sample);
+            last_sample = this_sample;
+        }
+
+        // sample 1000 will be zero-ish
+        // (bounded below by the previous, negative, sample and above by the constant)
+        this_sample = tone.get_sample();
+        ASSERT_LT(this_sample, 0.000001);
+        ASSERT_GT(this_sample, last_sample);
+        last_sample = this_sample;
+    }
+}
+
+// For three tones that differ only in amplitude, every sample must be ordered
+// WEAK < NORMAL < STRONG in magnitude (and all three zero at the same time).
+TEST_F(ToneTest, strengths) {
+    float freq_hz = 100;
+    float rate_hz = 8000;
+
+    Tone weak_tone(rate_hz, freq_hz, Tone::WEAK);
+    Tone normal_tone(rate_hz, freq_hz, Tone::NORMAL);
+    Tone strong_tone(rate_hz, freq_hz, Tone::STRONG);
+
+    for (int n = 0; n < 100 * rate_hz; ++n) {
+        float weak = weak_tone.get_sample();
+        float normal = normal_tone.get_sample();
+        float strong = strong_tone.get_sample();
+
+        // at a zero crossing all three tones must be exactly zero
+        if (weak == 0.0) {
+            ASSERT_EQ(weak, 0.0);
+            ASSERT_EQ(normal, 0.0);
+            ASSERT_EQ(strong, 0.0);
+            continue;
+        }
+
+        if (weak > 0.0) {
+            // positive half of the cycle: stronger tones are larger
+            ASSERT_LT(weak, normal);
+            ASSERT_LT(normal, strong);
+        } else {
+            // negative half of the cycle: stronger tones are more negative
+            ASSERT_GT(weak, normal);
+            ASSERT_GT(normal, strong);
+        }
+    }
+}
+
+// Fixture for the Noise tests; uses TestBaseClass unchanged.
+class NoiseTest : public TestBaseClass {};
+
+// Strong noise should average out near zero while still reaching at least
+// 30% of the configured strength in both directions.
+TEST_F(NoiseTest, simple_object) {
+    Noise generator(Noise::STRONG);
+
+    int total_samples = 10000;
+    float highest = 0.0;
+    float lowest = 0.0;
+    float running_sum = 0.0;
+    for (int n = 0; n < total_samples; ++n) {
+        float value = generator.get_sample();
+        highest = max(value, highest);
+        lowest = min(value, lowest);
+        running_sum += value;
+    }
+    float mean = running_sum / total_samples;
+
+    // the mean stays close to zero
+    EXPECT_LE(abs(mean), 0.01);
+    // the extremes are clearly away from zero
+    EXPECT_LE(lowest, Noise::STRONG * -0.3);
+    EXPECT_GT(highest, Noise::STRONG * 0.3);
+}
+
+// The largest magnitude seen over many samples must grow with the configured
+// noise strength: WEAK < NORMAL < STRONG, and WEAK must not be silent.
+TEST_F(NoiseTest, strengths) {
+    Noise weak_noise(Noise::WEAK);
+    Noise normal_noise(Noise::NORMAL);
+    Noise strong_noise(Noise::STRONG);
+
+    float weak_peak = 0.0;
+    float normal_peak = 0.0;
+    float strong_peak = 0.0;
+    for (int n = 0; n < 10000; ++n) {
+        weak_peak = max(weak_peak, abs(weak_noise.get_sample()));
+        normal_peak = max(normal_peak, abs(normal_noise.get_sample()));
+        strong_peak = max(strong_peak, abs(strong_noise.get_sample()));
+    }
+
+    EXPECT_NE(weak_peak, 0.0);
+    EXPECT_GT(normal_peak, weak_peak);
+    EXPECT_GT(strong_peak, normal_peak);
+}
+
+// Fixture for the GenerateSignal tests; supplies a common sample rate.
+class GenerateSignalTest : public TestBaseClass {
+   protected:
+    int sample_rate;  // sample rate handed to GenerateSignal / Tone in the tests
+    void SetUp(void) {
+        TestBaseClass::SetUp();
+        sample_rate = 8000;
+    }
+};
+
+// A generator with no tones or noise added produces a zero sample.
+TEST_F(GenerateSignalTest, default_object) {
+    // the fixture's sample_rate is 8000
+    GenerateSignal generator(sample_rate);
+    EXPECT_EQ(generator.get_sample(), 0.0);
+}
+
+// write_file() must create a regular file holding one float per sample for
+// the requested duration.
+TEST_F(GenerateSignalTest, generate_file) {
+    float duration_seconds = 10.5;
+    string output_path = temp_dir + "/10_sec_file.dat";
+
+    GenerateSignal generator(sample_rate);
+    generator.write_file(output_path, duration_seconds);
+
+    // the file must exist ...
+    struct stat file_info;
+    ASSERT_EQ(stat(output_path.c_str(), &file_info), 0);
+    // ... be a regular file ...
+    EXPECT_TRUE(S_ISREG(file_info.st_mode));
+    // ... and have exactly the expected size in bytes
+    EXPECT_EQ(file_info.st_size, sample_rate * duration_seconds * sizeof(float));
+}
+
+// With nothing added, every sample over 60 seconds' worth of data is zero.
+TEST_F(GenerateSignalTest, get_sample_no_signals) {
+    GenerateSignal generator(sample_rate);
+
+    const int total_samples = 60 * sample_rate;
+    for (int n = 0; n < total_samples; ++n) {
+        ASSERT_EQ(generator.get_sample(), 0.0);
+    }
+}
+
+// A generator holding a single tone must produce the same samples as a
+// stand-alone Tone built with the same parameters.
+TEST_F(GenerateSignalTest, get_sample_single_tone_only) {
+    float freq = 123.34;
+    float ampl = 0.32;
+
+    GenerateSignal generator(sample_rate);
+    generator.add_tone(freq, ampl);
+
+    // reference tone to compare against
+    Tone reference(sample_rate, freq, ampl);
+
+    const int total_samples = 60 * sample_rate;
+    for (int n = 0; n < total_samples; ++n) {
+        ASSERT_FLOAT_EQ(generator.get_sample(), reference.get_sample());
+    }
+}
+
+// A generator holding two tones must produce (within tolerance) the sum of
+// two stand-alone Tones built with the same parameters.
+TEST_F(GenerateSignalTest, get_sample_two_tones) {
+    float first_freq = 123.34;
+    float second_freq = 231.43;
+    float first_ampl = Tone::NORMAL;
+    float second_ampl = Tone::STRONG;
+
+    GenerateSignal generator(sample_rate);
+    generator.add_tone(first_freq, first_ampl);
+    generator.add_tone(second_freq, second_ampl);
+
+    // reference tones to compare against
+    Tone first_reference(sample_rate, first_freq, first_ampl);
+    Tone second_reference(sample_rate, second_freq, second_ampl);
+
+    const int total_samples = 60 * sample_rate;
+    for (int n = 0; n < total_samples; ++n) {
+        ASSERT_NEAR(generator.get_sample(), first_reference.get_sample() + second_reference.get_sample(), 0.000001);
+    }
+}
+
+// A noise-only signal must swing both above and below zero while staying
+// strictly inside +/- Noise::NORMAL.
+TEST_F(GenerateSignalTest, get_sample_only_noise) {
+    GenerateSignal generator(sample_rate);
+    generator.add_noise(Noise::NORMAL);
+
+    float highest = 0;
+    float lowest = 0;
+    const int total_samples = 600 * sample_rate;
+    for (int n = 0; n < total_samples; ++n) {
+        float value = generator.get_sample();
+        lowest = min(value, lowest);
+        highest = max(value, highest);
+    }
+
+    // upper extreme: positive but below the noise amplitude
+    EXPECT_GT(highest, 0);
+    EXPECT_LT(highest, Noise::NORMAL);
+
+    // lower extreme: negative but above the negated noise amplitude
+    EXPECT_LT(lowest, 0);
+    EXPECT_GT(lowest, -1.0 * Noise::NORMAL);
+}
+
+// With two tones plus noise, subtracting the two reference tones from the
+// generated signal must leave a residual that behaves like the noise alone:
+// it swings both ways and stays strictly inside +/- Noise::NORMAL.
+TEST_F(GenerateSignalTest, get_sample_two_tones_and_noise) {
+    float first_freq = 123.34;
+    float second_freq = 231.43;
+    float first_ampl = Tone::NORMAL;
+    float second_ampl = Tone::WEAK;
+
+    GenerateSignal generator(sample_rate);
+    generator.add_tone(first_freq, first_ampl);
+    generator.add_tone(second_freq, second_ampl);
+    generator.add_noise(Noise::NORMAL);
+
+    // reference tones used to strip the tonal part out of the signal
+    Tone first_reference(sample_rate, first_freq, first_ampl);
+    Tone second_reference(sample_rate, second_freq, second_ampl);
+
+    float highest = 0;
+    float lowest = 0;
+    const int total_samples = 60 * sample_rate;
+    for (int n = 0; n < total_samples; ++n) {
+        float residual = generator.get_sample() - first_reference.get_sample() - second_reference.get_sample();
+        lowest = min(residual, lowest);
+        highest = max(residual, highest);
+    }
+
+    EXPECT_GT(highest, 0);
+    EXPECT_LT(highest, Noise::NORMAL);
+
+    EXPECT_LT(lowest, 0);
+    EXPECT_GT(lowest, -1.0 * Noise::NORMAL);
+}

+ 167 - 0
src/test_helper_functions.cpp

@@ -0,0 +1,167 @@
+/*
+ * test_helper_functions.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "test_base_class.h"
+
+#include "helper_functions.h"
+
+using namespace std;
+
+// Fixture for the helper_functions tests; adds a helper to create empty files.
+class HelperFunctionsTest : public TestBaseClass {
+   protected:
+    void SetUp(void) { TestBaseClass::SetUp(); }
+
+    // Create (or truncate) an empty file at filepath and verify that
+    // file_exists() sees it.
+    void create_file(const string& filepath) {
+        // fclose(NULL) is undefined behaviour, so check the handle before closing it
+        FILE* fp = fopen(filepath.c_str(), "wb");
+        ASSERT_TRUE(fp != NULL) << "Unable to create " << filepath;
+        fclose(fp);
+        EXPECT_TRUE(file_exists(filepath));
+    }
+};
+
+// dir_exists() is expected to be true for the fixture's temp directory.
+TEST_F(HelperFunctionsTest, dir_exists_true) {
+    EXPECT_TRUE(dir_exists(temp_dir));
+}
+
+// dir_exists() is expected to be false for a path that does not exist.
+TEST_F(HelperFunctionsTest, dir_exists_false) {
+    EXPECT_FALSE(dir_exists("/not/a/real/dir"));
+}
+
+// dir_exists() is expected to be false when the path names a regular file.
+TEST_F(HelperFunctionsTest, dir_exists_not_dir) {
+    const string regular_file = temp_dir + "/some_file";
+    create_file(regular_file);
+
+    EXPECT_FALSE(dir_exists(regular_file));
+}
+
+// file_exists() is expected to be true for a file that was just created.
+TEST_F(HelperFunctionsTest, file_exists_true) {
+    const string regular_file = temp_dir + "/some_file";
+    create_file(regular_file);
+
+    EXPECT_TRUE(file_exists(regular_file));
+}
+
+// file_exists() is expected to be false for a path that was never created.
+TEST_F(HelperFunctionsTest, file_exists_false) {
+    EXPECT_FALSE(file_exists(temp_dir + "/nothing"));
+}
+
+// A directory must be reported by dir_exists() but not by file_exists().
+TEST_F(HelperFunctionsTest, file_exists_not_file) {
+    EXPECT_FALSE(file_exists(temp_dir));
+    EXPECT_TRUE(dir_exists(temp_dir));
+}
+
+// make_dir() is expected to create a directory that did not exist before.
+TEST_F(HelperFunctionsTest, make_dir_normal) {
+    const string new_dir = temp_dir + "/a";
+
+    // not there beforehand
+    EXPECT_FALSE(dir_exists(new_dir));
+    // creation reports success ...
+    EXPECT_TRUE(make_dir(new_dir));
+    // ... and the directory is there afterwards
+    EXPECT_TRUE(dir_exists(new_dir));
+}
+
+// make_dir() on an already existing directory is expected to report success
+// and leave the directory in place.
+TEST_F(HelperFunctionsTest, make_dir_exists) {
+    EXPECT_TRUE(dir_exists(temp_dir));
+    EXPECT_TRUE(make_dir(temp_dir));
+    EXPECT_TRUE(dir_exists(temp_dir));
+}
+
+// make_dir() is expected to fail for an empty path.
+TEST_F(HelperFunctionsTest, make_dir_empty) {
+    EXPECT_FALSE(make_dir(""));
+}
+
+// make_dir() is expected to fail when the parent directories do not exist.
+TEST_F(HelperFunctionsTest, make_dir_fail) {
+    EXPECT_FALSE(make_dir("/this/path/does/not/exist"));
+}
+
+// make_dir() is expected to fail when a regular file already has that name.
+TEST_F(HelperFunctionsTest, make_dir_file_in_the_way) {
+    const string blocking_file = temp_dir + "/some_file";
+    create_file(blocking_file);
+
+    EXPECT_FALSE(make_dir(blocking_file));
+}
+
+// make_subdirs() with an existing base and an empty sub-path is expected to
+// report success and leave the base directory in place.
+TEST_F(HelperFunctionsTest, make_subdirs_exists) {
+    EXPECT_TRUE(dir_exists(temp_dir));
+    EXPECT_TRUE(make_subdirs(temp_dir, ""));
+    EXPECT_TRUE(dir_exists(temp_dir));
+}
+
+// make_subdirs() is expected to create a single sub-directory below the base.
+TEST_F(HelperFunctionsTest, make_subdirs_one_subdir) {
+    const string sub_path = "bob";
+    const string full_path = temp_dir + "/" + sub_path;
+
+    EXPECT_FALSE(dir_exists(full_path));
+    EXPECT_TRUE(make_subdirs(temp_dir, sub_path));
+    EXPECT_TRUE(dir_exists(full_path));
+}
+
+// make_subdirs() is expected to create every level of a nested sub-path.
+TEST_F(HelperFunctionsTest, make_subdirs_multiple_subdir) {
+    const string sub_path = "bob/joe/sam";
+    const string full_path = temp_dir + "/" + sub_path;
+
+    EXPECT_FALSE(dir_exists(full_path));
+    EXPECT_TRUE(make_subdirs(temp_dir, sub_path));
+    EXPECT_TRUE(dir_exists(full_path));
+}
+
+// make_subdirs() is expected to fail when one path component is a regular
+// file, and the file must be left untouched.
+TEST_F(HelperFunctionsTest, make_subdirs_file_in_the_way) {
+    const string blocking_file = temp_dir + "/some_file";
+    create_file(blocking_file);
+    EXPECT_TRUE(file_exists(blocking_file));
+
+    // "some_file" is a file, so nothing can be created beneath it
+    EXPECT_FALSE(make_subdirs(temp_dir, "some_file/some_dir"));
+
+    // still a file, not a directory
+    EXPECT_FALSE(dir_exists(blocking_file));
+    EXPECT_TRUE(file_exists(blocking_file));
+}
+
+// make_subdirs() is expected to succeed when the last component of the base
+// directory does not exist yet, creating both it and the sub-directory.
+TEST_F(HelperFunctionsTest, make_subdirs_create_base) {
+    EXPECT_FALSE(dir_exists(temp_dir + "/base_dir/a"));
+    EXPECT_TRUE(make_subdirs(temp_dir + "/base_dir", "a"));
+    EXPECT_TRUE(dir_exists(temp_dir + "/base_dir/a"));
+}
+
+// Leading and repeated slashes in the sub-path are expected to be tolerated:
+// "///a/b////c///d" must end up as <temp_dir>/a/b/c/d.
+TEST_F(HelperFunctionsTest, make_subdirs_extra_slashes) {
+    EXPECT_FALSE(dir_exists(temp_dir + "/a/b/c/d"));
+    EXPECT_TRUE(make_subdirs(temp_dir, "///a/b////c///d"));
+    EXPECT_TRUE(dir_exists(temp_dir + "/a/b/c/d"));
+}
+
+// make_dated_subdirs() is expected to create <base>/YYYY/MM/DD for the given
+// date and return that path.
+TEST_F(HelperFunctionsTest, make_dated_subdirs_normal) {
+    // strptime() only stores the fields it parses, so start from a zeroed
+    // struct instead of handing indeterminate values to the code under test
+    struct tm time_struct = {};
+
+    ASSERT_TRUE(strptime("2010-3-7", "%Y-%m-%d", &time_struct) != NULL);
+
+    const string dir_path = temp_dir + "/2010/03/07";
+
+    EXPECT_FALSE(dir_exists(dir_path));
+    EXPECT_EQ(make_dated_subdirs(temp_dir, &time_struct), dir_path);
+    EXPECT_TRUE(dir_exists(dir_path));
+}
+
+// make_dated_subdirs() is expected to return an empty string when the base
+// directory cannot be created.
+TEST_F(HelperFunctionsTest, make_dated_subdirs_fail) {
+    // strptime() only stores the fields it parses, so start from a zeroed
+    // struct instead of handing indeterminate values to the code under test
+    struct tm time_struct = {};
+
+    ASSERT_TRUE(strptime("2010-3-7", "%Y-%m-%d", &time_struct) != NULL);
+
+    EXPECT_EQ(make_dated_subdirs("/invalid/base/dir", &time_struct), "");
+}
+
+// make_dated_subdirs() is expected to work when the year/month directories
+// already exist from an earlier call and only the day directory is new.
+TEST_F(HelperFunctionsTest, make_dated_subdirs_some_exist) {
+    // strptime() only stores the fields it parses, so start from a zeroed
+    // struct instead of handing indeterminate values to the code under test
+    struct tm time_struct = {};
+
+    const string dir_through_month = temp_dir + "/2010/03/";
+
+    // first date creates year, month and day directories
+    ASSERT_TRUE(strptime("2010-3-7", "%Y-%m-%d", &time_struct) != NULL);
+    EXPECT_EQ(make_dated_subdirs(temp_dir, &time_struct), dir_through_month + "07");
+
+    EXPECT_TRUE(dir_exists(dir_through_month));
+    EXPECT_FALSE(dir_exists(dir_through_month + "08"));
+
+    // second date shares year and month, only the day directory is missing
+    ASSERT_TRUE(strptime("2010-3-8", "%Y-%m-%d", &time_struct) != NULL);
+    EXPECT_EQ(make_dated_subdirs(temp_dir, &time_struct), dir_through_month + "08");
+    EXPECT_TRUE(dir_exists(dir_through_month + "08"));
+}

+ 281 - 0
src/test_squelch.cpp

@@ -0,0 +1,281 @@
+/*
+ * test_squelch.cpp
+ *
+ * Copyright (C) 2023 charlie-foxtrot
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "generate_signal.h"
+#include "test_base_class.h"
+
+#include "squelch.h"
+
+using namespace std;
+
+// Fixture for the Squelch tests. Defines the raw sample values the tests use
+// for "no signal" and "signal", and a helper to bring the noise floor down.
+class SquelchTest : public TestBaseClass {
+   protected:
+    void SetUp(void) {
+        TestBaseClass::SetUp();
+
+        raw_no_signal_sample = 0.05;
+        raw_signal_sample = 0.75;
+    }
+
+    void TearDown(void) { TestBaseClass::TearDown(); }
+
+    // send through "no signal" samples to get noise floor down
+    // NOTE: a failing ASSERT_* here only returns from this helper, not from the calling test
+    void send_samples_for_noise_floor(Squelch& squelch) {
+        // keep feeding until the noise level is within 1% of the "no signal" value
+        while (squelch.noise_level() > 1.01 * raw_no_signal_sample) {
+            squelch.process_raw_sample(raw_no_signal_sample);
+        }
+        ASSERT_LE(squelch.noise_level(), 1.01 * raw_no_signal_sample);
+        ASSERT_GT(raw_signal_sample, squelch.squelch_level());
+    }
+
+    float raw_no_signal_sample;  // raw sample value fed in to represent "no signal"
+    float raw_signal_sample;     // raw sample value fed in to represent "signal"
+};
+
+// A default-constructed Squelch must report an open count of zero.
+TEST_F(SquelchTest, default_object) {
+    Squelch squelch;
+    EXPECT_EQ(squelch.open_count(), 0);
+}
+
+// Feeds constant "no signal" samples and checks that the reported noise level
+// starts high, never increases, and settles within 1% of the input level.
+TEST_F(SquelchTest, noise_floor) {
+    Squelch squelch;
+
+    // noise floor starts high
+    EXPECT_GT(squelch.noise_level(), 10.0 * raw_no_signal_sample);
+
+    // noise floor drifts down towards (but never at) the incoming raw sample level
+    float last_noise_level, this_noise_level;
+    this_noise_level = squelch.noise_level();
+    do {
+        last_noise_level = this_noise_level;
+
+        // not all samples update noise floor
+        for (int j = 0; j < 25; ++j) {
+            squelch.process_raw_sample(raw_no_signal_sample);
+        }
+
+        this_noise_level = squelch.noise_level();
+        ASSERT_LE(this_noise_level, last_noise_level);
+        // stop once a batch of 25 samples no longer changes the noise level
+    } while (this_noise_level != last_noise_level);
+
+    // noise floor ends up close to the incoming level
+    EXPECT_LT(squelch.noise_level(), 1.01 * raw_no_signal_sample);
+}
+
+// Basic open/close cycle: the squelch must open within 500 "signal" samples,
+// stay open for 1000 more, and close within 100 "no signal" samples.
+TEST_F(SquelchTest, normal_operation) {
+    Squelch squelch;
+
+    // send through "no signal" samples to get noise floor down
+    send_samples_for_noise_floor(squelch);
+    // repeated here because a failed ASSERT_* inside the helper does not abort this test
+    ASSERT_LE(squelch.noise_level(), 1.01 * raw_no_signal_sample);
+    ASSERT_GT(raw_signal_sample, squelch.squelch_level());
+
+    // send through "signal" samples and squelch should open shortly
+    for (int i = 0; i < 500 && !squelch.is_open(); ++i) {
+        squelch.process_raw_sample(raw_signal_sample);
+    }
+    ASSERT_TRUE(squelch.is_open());
+    ASSERT_TRUE(squelch.should_process_audio());
+
+    // send through a bunch more "signal" values and squelch stays open
+    for (int i = 0; i < 1000; ++i) {
+        squelch.process_raw_sample(raw_signal_sample);
+    }
+    ASSERT_TRUE(squelch.is_open());
+    ASSERT_TRUE(squelch.should_process_audio());
+
+    // send through "no signal" samples and squelch should close quickly
+    for (int i = 0; i < 100 && squelch.is_open(); ++i) {
+        squelch.process_raw_sample(raw_no_signal_sample);
+    }
+    ASSERT_FALSE(squelch.is_open());
+    ASSERT_FALSE(squelch.should_process_audio());
+}
+
+// A short run (50 samples) of "no signal" in the middle of a transmission
+// must not close the squelch.
+TEST_F(SquelchTest, dead_spot) {
+    Squelch squelch;
+
+    send_samples_for_noise_floor(squelch);
+
+    // send through "signal" samples and squelch should open shortly
+    for (int i = 0; i < 500 && !squelch.is_open(); ++i) {
+        squelch.process_raw_sample(raw_signal_sample);
+    }
+    ASSERT_TRUE(squelch.is_open());
+    ASSERT_TRUE(squelch.should_process_audio());
+
+    // send through a bunch more "signal" values and squelch stays open
+    for (int i = 0; i < 1000; ++i) {
+        squelch.process_raw_sample(raw_signal_sample);
+    }
+    ASSERT_TRUE(squelch.is_open());
+    ASSERT_TRUE(squelch.should_process_audio());
+
+    // send through a dead spot of "no signal" and squelch should stay open
+    for (int i = 0; i < 50; ++i) {
+        squelch.process_raw_sample(raw_no_signal_sample);
+        ASSERT_TRUE(squelch.is_open());
+        ASSERT_TRUE(squelch.should_process_audio());
+    }
+
+    // go back to "signal" samples and squelch stays open
+    for (int i = 0; i < 1000; ++i) {
+        squelch.process_raw_sample(raw_signal_sample);
+        ASSERT_TRUE(squelch.is_open());
+        ASSERT_TRUE(squelch.should_process_audio());
+    }
+}
+
+// Without CTCSS configured, should_process_audio() must track is_open():
+// false on every iteration before the squelch opens, true on every iteration
+// while it is open.
+TEST_F(SquelchTest, should_process_audio) {
+    Squelch squelch;
+
+    send_samples_for_noise_floor(squelch);
+
+    // should_process_audio is true as soon as squelch opens
+    for (int i = 0; i < 500 && !squelch.is_open(); ++i) {
+        ASSERT_FALSE(squelch.should_process_audio());
+        squelch.process_raw_sample(raw_signal_sample);
+    }
+    ASSERT_TRUE(squelch.is_open());
+    ASSERT_TRUE(squelch.should_process_audio());
+
+    // and stays true until fully closed
+    for (int i = 0; i < 100 && squelch.is_open(); ++i) {
+        ASSERT_TRUE(squelch.should_process_audio());
+        squelch.process_raw_sample(raw_no_signal_sample);
+    }
+    ASSERT_FALSE(squelch.is_open());
+    ASSERT_FALSE(squelch.should_process_audio());
+}
+
+// With a CTCSS frequency configured, the squelch must first ask for audio
+// while still closed, then open once audio carrying the matching tone has
+// been processed, and stay open while that tone keeps arriving.
+TEST_F(SquelchTest, good_ctcss) {
+    float tone = CTCSS::standard_tones[5];
+    float sample_rate = 8000;
+
+    Squelch squelch;
+    squelch.set_ctcss_freq(tone, sample_rate);
+    send_samples_for_noise_floor(squelch);
+
+    // audio signal that carries the configured tone
+    GenerateSignal signal_with_tone(sample_rate);
+    signal_with_tone.add_tone(tone, Tone::NORMAL);
+
+    // send through "signal" samples until it's time to process audio
+    for (int i = 0; i < 500 && !squelch.should_process_audio(); ++i) {
+        squelch.process_raw_sample(raw_signal_sample);
+    }
+    // audio is wanted, but the squelch is not open yet
+    ASSERT_FALSE(squelch.is_open());
+    ASSERT_TRUE(squelch.should_process_audio());
+
+    // process audio samples and "signal" samples until squelch is open
+    for (int i = 0; i < 500 && !squelch.is_open(); ++i) {
+        squelch.process_audio_sample(signal_with_tone.get_sample());
+        squelch.process_raw_sample(raw_signal_sample);
+    }
+    ASSERT_TRUE(squelch.is_open());
+    ASSERT_TRUE(squelch.should_process_audio());
+
+    // run through a lot more to ensure squelch stays open
+    for (int i = 0; i < 100000; ++i) {
+        squelch.process_audio_sample(signal_with_tone.get_sample());
+        squelch.process_raw_sample(raw_signal_sample);
+        ASSERT_TRUE(squelch.is_open());
+        ASSERT_TRUE(squelch.should_process_audio());
+    }
+
+    // the counters must show tone matches and no misses
+    EXPECT_GT(squelch.ctcss_count(), 0);
+    EXPECT_EQ(squelch.no_ctcss_count(), 0);
+}
+
+TEST_F(SquelchTest, wrong_ctcss) {
+    // The audio carries a different standard tone than the one the squelch is
+    // configured for, so the squelch is expected never to open.
+    float sample_rate = 8000;
+    float expected_tone = CTCSS::standard_tones[7];
+    float actual_tone = CTCSS::standard_tones[0];
+
+    Squelch squelch;
+    squelch.set_ctcss_freq(expected_tone, sample_rate);
+    send_samples_for_noise_floor(squelch);
+
+    GenerateSignal signal_with_tone(sample_rate);
+    signal_with_tone.add_tone(actual_tone, Tone::NORMAL);
+
+    // feed "signal" samples (500 at most) until audio processing is requested
+    int fed = 0;
+    while (fed < 500 && !squelch.should_process_audio()) {
+        squelch.process_raw_sample(raw_signal_sample);
+        ++fed;
+    }
+    ASSERT_TRUE(squelch.should_process_audio());
+    ASSERT_FALSE(squelch.is_open());
+
+    // interleave a large number of audio and "signal" samples; audio keeps
+    // being processed but the squelch must stay closed after every pair
+    for (int remaining = 100000; remaining > 0; --remaining) {
+        squelch.process_audio_sample(signal_with_tone.get_sample());
+        squelch.process_raw_sample(raw_signal_sample);
+        ASSERT_TRUE(squelch.should_process_audio());
+        ASSERT_FALSE(squelch.is_open());
+    }
+
+    // the configured tone was never detected and was reported missing at least once
+    EXPECT_EQ(squelch.ctcss_count(), 0);
+    EXPECT_GT(squelch.no_ctcss_count(), 0);
+}
+
+TEST_F(SquelchTest, close_ctcss) {
+    // The audio carries standard tone #5 while the squelch is configured for
+    // standard tone #7. The squelch is expected to open at first, then close
+    // again and stay closed.
+    float sample_rate = 8000;
+    float expected_tone = CTCSS::standard_tones[7];
+    float actual_tone = CTCSS::standard_tones[5];
+
+    Squelch squelch;
+    squelch.set_ctcss_freq(expected_tone, sample_rate);
+    send_samples_for_noise_floor(squelch);
+
+    GenerateSignal signal_with_tone(sample_rate);
+    signal_with_tone.add_tone(actual_tone, Tone::NORMAL);
+
+    // feed "signal" samples (500 at most) until audio processing is requested
+    int fed = 0;
+    while (fed < 500 && !squelch.should_process_audio()) {
+        squelch.process_raw_sample(raw_signal_sample);
+        ++fed;
+    }
+    ASSERT_TRUE(squelch.should_process_audio());
+    ASSERT_FALSE(squelch.is_open());
+
+    // interleave audio and "signal" samples (500 pairs at most) until the
+    // squelch opens
+    fed = 0;
+    while (fed < 500 && !squelch.is_open()) {
+        squelch.process_audio_sample(signal_with_tone.get_sample());
+        squelch.process_raw_sample(raw_signal_sample);
+        ASSERT_TRUE(squelch.should_process_audio());
+        ++fed;
+    }
+    ASSERT_TRUE(squelch.is_open());
+
+    // keep feeding pairs (3000 at most) until the squelch closes again
+    fed = 0;
+    while (fed < 3000 && squelch.is_open()) {
+        squelch.process_audio_sample(signal_with_tone.get_sample());
+        squelch.process_raw_sample(raw_signal_sample);
+        ASSERT_TRUE(squelch.should_process_audio());
+        ++fed;
+    }
+    ASSERT_FALSE(squelch.is_open());
+
+    // from here on the squelch must stay closed while audio is still processed
+    for (int remaining = 100000; remaining > 0; --remaining) {
+        squelch.process_audio_sample(signal_with_tone.get_sample());
+        squelch.process_raw_sample(raw_signal_sample);
+        ASSERT_TRUE(squelch.should_process_audio());
+        ASSERT_FALSE(squelch.is_open());
+    }
+
+    // the configured tone was never counted as detected
+    EXPECT_EQ(squelch.ctcss_count(), 0);
+    EXPECT_GT(squelch.no_ctcss_count(), 0);
+}

+ 90 - 0
src/udp_stream.cpp

@@ -0,0 +1,90 @@
+#include <string.h>  // strerror()
+#include <syslog.h>  // LOG_INFO / LOG_ERR
+#include <unistd.h>  // close()
+#include <cassert>   // assert()
+
+#include <arpa/inet.h>  // inet_aton()
+#include <netdb.h>      // getaddrinfo()
+
+#include "rtl_airband.h"
+
+// Resolve sdata->dest_address / sdata->dest_port, open a UDP socket connected
+// to the first address that works and, for stereo output, allocate the
+// interleaving buffer.
+//
+//   sdata - stream state; dest_address and dest_port are read, all other
+//           fields touched here are (re)initialized
+//   mode  - MM_STEREO requests the interleaving buffer, anything else does not
+//   len   - per-channel length; the stereo buffer holds len * 2 floats
+//
+// Returns true when a connected socket is available, false otherwise (the
+// reason is logged). On failure no stereo buffer is left allocated.
+bool udp_stream_init(udp_stream_data* sdata, mix_modes mode, size_t len) {
+    // start from a well-defined "no socket, no buffer" state so that every
+    // early return below leaves nothing behind that would need freeing
+    sdata->stereo_buffer_len = 0;
+    sdata->stereo_buffer = NULL;
+    sdata->send_socket = -1;
+    sdata->dest_sockaddr_len = 0;
+
+    // lookup address / port
+    struct addrinfo hints, *result, *rptr;
+    memset(&hints, 0, sizeof(struct addrinfo));
+    hints.ai_family = AF_UNSPEC;
+    hints.ai_socktype = SOCK_DGRAM;
+    hints.ai_flags = 0;
+    hints.ai_protocol = 0;
+    int error = getaddrinfo(sdata->dest_address, sdata->dest_port, &hints, &result);
+    if (error) {
+        log(LOG_ERR, "udp_stream: could not resolve %s:%s - %s\n", sdata->dest_address, sdata->dest_port, gai_strerror(error));
+        return false;
+    }
+
+    // check each result and try to create a connection
+    for (rptr = result; rptr != NULL; rptr = rptr->ai_next) {
+        sdata->send_socket = socket(rptr->ai_family, rptr->ai_socktype, rptr->ai_protocol);
+        if (sdata->send_socket == -1) {
+            log(LOG_ERR, "udp_stream: socket failed: %s\n", strerror(errno));
+            continue;
+        }
+
+        if (connect(sdata->send_socket, rptr->ai_addr, rptr->ai_addrlen) == -1) {
+            log(LOG_INFO, "udp_stream: connect to %s:%s failed: %s\n", sdata->dest_address, sdata->dest_port, strerror(errno));
+            close(sdata->send_socket);
+            sdata->send_socket = -1;
+            continue;
+        }
+
+        // remember the address that worked
+        // NOTE(review): this copies only sizeof(sdata->dest_sockaddr) bytes while
+        // ai_addrlen may be larger (e.g. for IPv6) - confirm the field's type
+        sdata->dest_sockaddr = *rptr->ai_addr;
+        sdata->dest_sockaddr_len = rptr->ai_addrlen;
+        break;
+    }
+    freeaddrinfo(result);
+
+    // error if no valid socket
+    if (sdata->send_socket == -1) {
+        log(LOG_ERR, "udp_stream: could not set up UDP socket to %s:%s - all addresses failed\n", sdata->dest_address, sdata->dest_port);
+        return false;
+    }
+
+    // pre-allocate the stereo buffer only now that the stream is usable, so the
+    // failure paths above cannot leak it
+    if (mode == MM_STEREO) {
+        sdata->stereo_buffer_len = len * 2;
+        sdata->stereo_buffer = (float*)XCALLOC(sdata->stereo_buffer_len, sizeof(float));
+    }
+
+    log(LOG_INFO, "udp_stream: sending %s 32-bit float at %d Hz to %s:%s\n", mode == MM_MONO ? "Mono" : "Stereo", WAVE_RATE, sdata->dest_address, sdata->dest_port);
+    return true;
+}
+
+// Send one buffer over the stream's UDP socket, best effort.
+//
+//   sdata - stream state; nothing is sent when send_socket is -1
+//   data  - start of the payload
+//   len   - handed to sendto() unchanged as the payload length
+//
+// The call never blocks (MSG_DONTWAIT), never raises SIGPIPE (MSG_NOSIGNAL)
+// and its result is deliberately ignored.
+void udp_stream_write(udp_stream_data* sdata, const float* data, size_t len) {
+    if (sdata->send_socket == -1) {
+        return;
+    }
+    // The socket was connect()ed during init, so no destination is passed here:
+    // naming one on a connected socket may fail with EISCONN on some systems,
+    // and the stored sockaddr copy may be shorter than dest_sockaddr_len.
+    sendto(sdata->send_socket, data, len, MSG_DONTWAIT | MSG_NOSIGNAL, NULL, 0);
+}
+
+// Interleave two channels into sdata->stereo_buffer (left, right, left, ...)
+// and pass the combined buffer to the single-buffer udp_stream_write() with a
+// length of len * 2. Does nothing when the stream has no socket.
+void udp_stream_write(udp_stream_data* sdata, const float* data_left, const float* data_right, size_t len) {
+    if (sdata->send_socket == -1) {
+        return;
+    }
+
+    // the interleaved result must fit into the pre-allocated buffer
+    assert(len * 2 <= sdata->stereo_buffer_len);
+
+    float* out = sdata->stereo_buffer;
+    for (size_t n = 0; n < len; ++n) {
+        *out++ = data_left[n];
+        *out++ = data_right[n];
+    }
+    udp_stream_write(sdata, sdata->stereo_buffer, len * 2);
+}
+
+// Close the stream's socket, if it has one, and mark the stream as having no
+// socket so that later writes or a repeated shutdown become no-ops instead of
+// acting on a stale (possibly reused) descriptor.
+void udp_stream_shutdown(udp_stream_data* sdata) {
+    if (sdata->send_socket != -1) {
+        close(sdata->send_socket);
+        sdata->send_socket = -1;
+    }
+}

+ 180 - 0
src/util.cpp

@@ -0,0 +1,180 @@
+/*
+ * util.cpp
+ * Miscellaneous routines
+ *
+ * Copyright (c) 2015-2021 Tomasz Lemiech <szpajder@gmail.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <lame/lame.h>
+#include <shout/shout.h>
+#include <stdint.h>  // uint32_t
+#include <unistd.h>
+#include <cerrno>
+#include <cmath>
+#include <cstdarg>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "config.h"
+#include "logging.h"
+#include "rtl_airband.h"
+
+// Atomically add 1 to *pv and return the value it held before the increment.
+int atomic_inc(volatile int* pv) {
+    const int previous = __sync_fetch_and_add(pv, 1);
+    return previous;
+}
+
+// Atomically subtract 1 from *pv and return the value it held before the decrement.
+int atomic_dec(volatile int* pv) {
+    const int previous = __sync_fetch_and_sub(pv, 1);
+    return previous;
+}
+
+// Atomically read *pv; implemented as an atomic "add zero" so the stored value
+// is left unchanged.
+int atomic_get(volatile int* pv) {
+    const int current = __sync_fetch_and_add(pv, 0);
+    return current;
+}
+
+// Append a (freq, tv) tag to the device's circular tag queue.
+//
+// The head index is advanced first and the tag is stored at the new head. If
+// the head catches up with the tail the queue has overrun: a warning is logged
+// and the tail is moved forward by one slot. The whole operation runs under
+// dev->tag_queue_lock.
+void tag_queue_put(device_t* dev, int freq, struct timeval tv) {
+    pthread_mutex_lock(&dev->tag_queue_lock);
+    dev->tq_head++;
+    dev->tq_head %= TAG_QUEUE_LEN;
+    if (dev->tq_head == dev->tq_tail) {
+        log(LOG_WARNING, "tag_queue_put: queue overrun\n");
+        // wrap the tail like every other index update; leaving it at
+        // TAG_QUEUE_LEN would break the head == tail comparisons
+        dev->tq_tail++;
+        dev->tq_tail %= TAG_QUEUE_LEN;
+    }
+    dev->tag_queue[dev->tq_head].freq = freq;
+    memcpy(&dev->tag_queue[dev->tq_head].tv, &tv, sizeof(struct timeval));
+    pthread_mutex_unlock(&dev->tag_queue_lock);
+}
+
+// Copy the oldest queued tag into *tag without removing it from the queue.
+// When the queue is empty, tag->freq is set to -1 and tag->tv is left as is.
+// A NULL tag makes the call a no-op. Queue access runs under
+// dev->tag_queue_lock.
+void tag_queue_get(device_t* dev, struct freq_tag* tag) {
+    if (tag == NULL) {
+        return;
+    }
+
+    pthread_mutex_lock(&dev->tag_queue_lock);
+    if (dev->tq_head != dev->tq_tail) {
+        // the oldest entry sits one slot past the tail; peek at it only
+        int slot = dev->tq_tail + 1;
+        slot %= TAG_QUEUE_LEN;
+        tag->freq = dev->tag_queue[slot].freq;
+        memcpy(&tag->tv, &dev->tag_queue[slot].tv, sizeof(struct timeval));
+    } else {
+        // head == tail means there is nothing queued
+        tag->freq = -1;
+    }
+    pthread_mutex_unlock(&dev->tag_queue_lock);
+}
+
+// Drop the oldest tag by moving the tail index forward one slot, wrapping at
+// TAG_QUEUE_LEN. Runs under dev->tag_queue_lock.
+void tag_queue_advance(device_t* dev) {
+    pthread_mutex_lock(&dev->tag_queue_lock);
+    dev->tq_tail = (dev->tq_tail + 1) % TAG_QUEUE_LEN;
+    pthread_mutex_unlock(&dev->tag_queue_lock);
+}
+
+// calloc() wrapper that reports failure instead of leaving it to the caller.
+//
+//   nmemb, size      - forwarded to calloc()
+//   file, line, func - call-site description used in the error message
+//
+// Returns the calloc() result. When calloc() returns NULL the failure is
+// logged with the call site and error() is invoked.
+void* xcalloc(size_t nmemb, size_t size, const char* file, const int line, const char* func) {
+    void* const block = calloc(nmemb, size);
+    if (block != NULL) {
+        return block;
+    }
+    log(LOG_ERR, "%s:%d: %s(): calloc(%zu, %zu) failed: %s\n", file, line, func, nmemb, size, strerror(errno));
+    error();
+    return block;
+}
+
+// realloc() wrapper that reports failure instead of leaving it to the caller.
+//
+//   ptr, size        - forwarded to realloc()
+//   file, line, func - call-site description used in the error message
+//
+// Returns the realloc() result. When realloc() returns NULL the failure is
+// logged with the call site and error() is invoked.
+void* xrealloc(void* ptr, size_t size, const char* file, const int line, const char* func) {
+    void* const resized = realloc(ptr, size);
+    if (resized != NULL) {
+        return resized;
+    }
+    log(LOG_ERR, "%s:%d: %s(): realloc(%zu) failed: %s\n", file, line, func, size, strerror(errno));
+    error();
+    return resized;
+}
+
+// Sine / cosine lookup tables: 256 samples covering one full turn, plus one
+// extra entry that repeats sample 0.
+static float sin_lut[257];
+static float cos_lut[257];
+
+// Fill both lookup tables. Entry k holds the sine / cosine of 2*pi*k/256.
+void sincosf_lut_init() {
+    for (uint32_t step = 0; step < 256; step++) {
+        SINCOSF(2.0F * M_PI * (float)step / 256.0f, &sin_lut[step], &cos_lut[step]);
+    }
+    // repeat the first sample at the end so reading entry [index + 1] is valid
+    // for index 255 as well
+    sin_lut[256] = sin_lut[0];
+    cos_lut[256] = cos_lut[0];
+}
+
+// Table-based sine / cosine with linear interpolation.
+//
+//   phi    - phase as a fraction of a full turn (0..1) rescaled to
+//            0x0-0xFFFFFF: bits 16-23 select the table entry, bits 0-15 are
+//            the interpolation fraction. Bits above 23 are ignored, so larger
+//            values wrap around instead of indexing past the tables.
+//   sine   - receives the interpolated sine
+//   cosine - receives the interpolated cosine
+void sincosf_lut(uint32_t phi, float* sine, float* cosine) {
+    // table index, confined to 0..255 so that index + 1 never exceeds 256
+    const uint32_t idx = (phi >> 16) & 0xff;
+    // fixed point fraction converted to float
+    const float fract = (float)(phi & 0xffff) / 65536.0f;
+
+    // interpolate between the two adjacent table entries
+    const float sin_lo = sin_lut[idx];
+    const float sin_hi = sin_lut[idx + 1];
+    *sine = sin_lo + (sin_hi - sin_lo) * fract;
+
+    const float cos_lo = cos_lut[idx];
+    const float cos_hi = cos_lut[idx + 1];
+    *cosine = cos_lo + (cos_hi - cos_lo) * fract;
+}
+
+/* librtlsdr-keenerd, (c) Kyle Keen */
+// Parse a floating point number with an optional one-letter magnitude suffix:
+// k/K multiplies by 1e3, m/M by 1e6 and g/G by 1e9.
+//
+//   s - NUL-terminated, writable string; the suffix character is temporarily
+//       overwritten while parsing and restored before returning
+//
+// Returns the scaled value. An empty string yields 0.0, as does any text that
+// atof() cannot parse.
+double atofs(char* s) {
+    const size_t len = strlen(s);
+    if (len == 0) {
+        // no last character to inspect; avoids touching s[-1]
+        return 0.0;
+    }
+
+    const char last = s[len - 1];
+    double suff = 1.0;
+    switch (last) {
+        case 'g':
+        case 'G':
+            suff *= 1e3;
+            [[fallthrough]];
+        case 'm':
+        case 'M':
+            suff *= 1e3;
+            [[fallthrough]];
+        case 'k':
+        case 'K':
+            suff *= 1e3;
+            break;
+        default:
+            // no recognized suffix: parse the string as it is
+            return atof(s);
+    }
+
+    // cut the suffix off for atof(), then put it back
+    s[len - 1] = '\0';
+    suff *= atof(s);
+    s[len - 1] = last;
+    return suff;
+}
+
+// Time from *start to *stop in seconds, including the fractional part.
+double delta_sec(const timeval* start, const timeval* stop) {
+    timeval elapsed;
+    timersub(stop, start, &elapsed);
+    const double fraction = elapsed.tv_usec / 1000000.0;
+    return elapsed.tv_sec + fraction;
+}
+
+// level to/from dBFS conversion assumes level is normalized to 1 and is based on:
+//    https://kluedo.ub.uni-kl.de/frontdoor/deliver/index/docId/4293/file/exact_fft_measurements.pdf
+//
+// expanded form:
+//    20.0f * log10f(level / fft_size) + 7.54f + 10.0f * log10f(fft_size/2) - 2.38f
+
+// Constant part of the level <-> dBFS conversion. It depends only on fft_size,
+// is evaluated on the first call and the same stored value is returned by
+// reference afterwards.
+const float& dBFS_offset(void) {
+    static const float cached_offset = 7.54f + 10.0f * log10f(fft_size / 2) - 2.38f;
+    return cached_offset;
+}
+
+// Convert a dBFS value to a level; the inverse of the formula used by
+// level_to_dBFS(), without that function's clamp at 0.
+float dBFS_to_level(const float& dBFS) {
+    const float exponent = (dBFS - dBFS_offset()) / 20.0f;
+    return pow(10.0, exponent) * fft_size;
+}
+
+// Convert a level to dBFS. The result is capped at 0 dBFS.
+float level_to_dBFS(const float& level) {
+    const float dbfs = 20.0f * log10f(level / fft_size) + dBFS_offset();
+    // same selection as std::min(0.0f, dbfs)
+    return dbfs < 0.0f ? dbfs : 0.0f;
+}