arm_cfft_radix4_q15.c 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_cfft_radix4_q15.c
  4. * Description: This file contains the function definitions for the Radix-4 FFT & IFFT and
  5. * for in-place bit reversal using a bit reversal table
  6. *
  7. * $Date: 27. January 2017
  8. * $Revision: V.1.5.1
  9. *
  10. * Target Processor: Cortex-M cores
  11. * -------------------------------------------------------------------- */
  12. /*
  13. * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  14. *
  15. * SPDX-License-Identifier: Apache-2.0
  16. *
  17. * Licensed under the Apache License, Version 2.0 (the License); you may
  18. * not use this file except in compliance with the License.
  19. * You may obtain a copy of the License at
  20. *
  21. * www.apache.org/licenses/LICENSE-2.0
  22. *
  23. * Unless required by applicable law or agreed to in writing, software
  24. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  25. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  26. * See the License for the specific language governing permissions and
  27. * limitations under the License.
  28. */
  29. #include "arm_math.h"
  30. void arm_radix4_butterfly_q15(
  31. q15_t * pSrc16,
  32. uint32_t fftLen,
  33. q15_t * pCoef16,
  34. uint32_t twidCoefModifier);
  35. void arm_radix4_butterfly_inverse_q15(
  36. q15_t * pSrc16,
  37. uint32_t fftLen,
  38. q15_t * pCoef16,
  39. uint32_t twidCoefModifier);
  40. void arm_bitreversal_q15(
  41. q15_t * pSrc,
  42. uint32_t fftLen,
  43. uint16_t bitRevFactor,
  44. uint16_t * pBitRevTab);
  45. /**
  46. * @ingroup groupTransforms
  47. */
  48. /**
  49. * @addtogroup ComplexFFT
  50. * @{
  51. */
  52. /**
  53. * @details
  54. * @brief Processing function for the Q15 CFFT/CIFFT.
  55. * @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
  56. * @param[in] *S points to an instance of the Q15 CFFT/CIFFT structure.
  57. * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
  58. * @return none.
  59. *
  60. * \par Input and output formats:
  61. * \par
  62. * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
  63. * Hence the output format is different for different FFT sizes.
  64. * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
  65. * \par
  66. * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
  67. * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
  68. */
  69. void arm_cfft_radix4_q15(
  70. const arm_cfft_radix4_instance_q15 * S,
  71. q15_t * pSrc)
  72. {
  73. if (S->ifftFlag == 1U)
  74. {
  75. /* Complex IFFT radix-4 */
  76. arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  77. }
  78. else
  79. {
  80. /* Complex FFT radix-4 */
  81. arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  82. }
  83. if (S->bitReverseFlag == 1U)
  84. {
  85. /* Bit Reversal */
  86. arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  87. }
  88. }
  89. /**
  90. * @} end of ComplexFFT group
  91. */
  92. /*
  93. * Radix-4 FFT algorithm used is :
  94. *
  95. * Input real and imaginary data:
  96. * x(n) = xa + j * ya
  97. * x(n+N/4 ) = xb + j * yb
  98. * x(n+N/2 ) = xc + j * yc
  99. * x(n+3N/4) = xd + j * yd
  100. *
  101. *
  102. * Output real and imaginary data:
  103. * x(4r) = xa'+ j * ya'
  104. * x(4r+1) = xb'+ j * yb'
  105. * x(4r+2) = xc'+ j * yc'
  106. * x(4r+3) = xd'+ j * yd'
  107. *
  108. *
  109. * Twiddle factors for radix-4 FFT:
  110. * Wn = co1 + j * (- si1)
  111. * W2n = co2 + j * (- si2)
  112. * W3n = co3 + j * (- si3)
  113. * The real and imaginary output values for the radix-4 butterfly are
  114. * xa' = xa + xb + xc + xd
  115. * ya' = ya + yb + yc + yd
  116. * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
  117. * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
  118. * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
  119. * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
  120. * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
  121. * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
  122. *
  123. */
  124. /**
  125. * @brief Core function for the Q15 CFFT butterfly process.
  126. * @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
  127. * @param[in] fftLen length of the FFT.
  128. * @param[in] *pCoef16 points to twiddle coefficient buffer.
  129. * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
  130. * @return none.
  131. */
  132. void arm_radix4_butterfly_q15(
  133. q15_t * pSrc16,
  134. uint32_t fftLen,
  135. q15_t * pCoef16,
  136. uint32_t twidCoefModifier)
  137. {
  138. #if defined (ARM_MATH_DSP)
  139. /* Run the below code for Cortex-M4 and Cortex-M3 */
  140. q31_t R, S, T, U;
  141. q31_t C1, C2, C3, out1, out2;
  142. uint32_t n1, n2, ic, i0, j, k;
  143. q15_t *ptr1;
  144. q15_t *pSi0;
  145. q15_t *pSi1;
  146. q15_t *pSi2;
  147. q15_t *pSi3;
  148. q31_t xaya, xbyb, xcyc, xdyd;
  149. /* Total process is divided into three stages */
  150. /* process first stage, middle stages, & last stage */
  151. /* Initializations for the first stage */
  152. n2 = fftLen;
  153. n1 = n2;
  154. /* n2 = fftLen/4 */
  155. n2 >>= 2U;
  156. /* Index for twiddle coefficient */
  157. ic = 0U;
  158. /* Index for input read and output write */
  159. j = n2;
  160. pSi0 = pSrc16;
  161. pSi1 = pSi0 + 2 * n2;
  162. pSi2 = pSi1 + 2 * n2;
  163. pSi3 = pSi2 + 2 * n2;
  164. /* Input is in 1.15(q15) format */
  165. /* start of first stage process */
  166. do
  167. {
  168. /* Butterfly implementation */
  169. /* Reading i0, i0+fftLen/2 inputs */
  170. /* Read ya (real), xa(imag) input */
  171. T = _SIMD32_OFFSET(pSi0);
  172. T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
  173. T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
  174. //in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
  175. //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
  176. /* Read yc (real), xc(imag) input */
  177. S = _SIMD32_OFFSET(pSi2);
  178. S = __SHADD16(S, 0);
  179. S = __SHADD16(S, 0);
  180. /* R = packed((ya + yc), (xa + xc) ) */
  181. R = __QADD16(T, S);
  182. /* S = packed((ya - yc), (xa - xc) ) */
  183. S = __QSUB16(T, S);
  184. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  185. /* Read yb (real), xb(imag) input */
  186. T = _SIMD32_OFFSET(pSi1);
  187. T = __SHADD16(T, 0);
  188. T = __SHADD16(T, 0);
  189. /* Read yd (real), xd(imag) input */
  190. U = _SIMD32_OFFSET(pSi3);
  191. U = __SHADD16(U, 0);
  192. U = __SHADD16(U, 0);
  193. /* T = packed((yb + yd), (xb + xd) ) */
  194. T = __QADD16(T, U);
  195. /* writing the butterfly processed i0 sample */
  196. /* xa' = xa + xb + xc + xd */
  197. /* ya' = ya + yb + yc + yd */
  198. _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
  199. pSi0 += 2;
  200. /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
  201. R = __QSUB16(R, T);
  202. /* co2 & si2 are read from SIMD Coefficient pointer */
  203. C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  204. #ifndef ARM_MATH_BIG_ENDIAN
  205. /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  206. out1 = __SMUAD(C2, R) >> 16U;
  207. /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  208. out2 = __SMUSDX(C2, R);
  209. #else
  210. /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  211. out1 = __SMUSDX(R, C2) >> 16U;
  212. /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  213. out2 = __SMUAD(C2, R);
  214. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  215. /* Reading i0+fftLen/4 */
  216. /* T = packed(yb, xb) */
  217. T = _SIMD32_OFFSET(pSi1);
  218. T = __SHADD16(T, 0);
  219. T = __SHADD16(T, 0);
  220. /* writing the butterfly processed i0 + fftLen/4 sample */
  221. /* writing output(xc', yc') in little endian format */
  222. _SIMD32_OFFSET(pSi1) =
  223. (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  224. pSi1 += 2;
  225. /* Butterfly calculations */
  226. /* U = packed(yd, xd) */
  227. U = _SIMD32_OFFSET(pSi3);
  228. U = __SHADD16(U, 0);
  229. U = __SHADD16(U, 0);
  230. /* T = packed(yb-yd, xb-xd) */
  231. T = __QSUB16(T, U);
  232. #ifndef ARM_MATH_BIG_ENDIAN
  233. /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  234. R = __QASX(S, T);
  235. /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
  236. S = __QSAX(S, T);
  237. #else
  238. /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  239. R = __QSAX(S, T);
  240. /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
  241. S = __QASX(S, T);
  242. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  243. /* co1 & si1 are read from SIMD Coefficient pointer */
  244. C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  245. /* Butterfly process for the i0+fftLen/2 sample */
  246. #ifndef ARM_MATH_BIG_ENDIAN
  247. /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  248. out1 = __SMUAD(C1, S) >> 16U;
  249. /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  250. out2 = __SMUSDX(C1, S);
  251. #else
  252. /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  253. out1 = __SMUSDX(S, C1) >> 16U;
  254. /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  255. out2 = __SMUAD(C1, S);
  256. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  257. /* writing output(xb', yb') in little endian format */
  258. _SIMD32_OFFSET(pSi2) =
  259. ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
  260. pSi2 += 2;
  261. /* co3 & si3 are read from SIMD Coefficient pointer */
  262. C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  263. /* Butterfly process for the i0+3fftLen/4 sample */
  264. #ifndef ARM_MATH_BIG_ENDIAN
  265. /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  266. out1 = __SMUAD(C3, R) >> 16U;
  267. /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  268. out2 = __SMUSDX(C3, R);
  269. #else
  270. /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  271. out1 = __SMUSDX(R, C3) >> 16U;
  272. /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  273. out2 = __SMUAD(C3, R);
  274. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  275. /* writing output(xd', yd') in little endian format */
  276. _SIMD32_OFFSET(pSi3) =
  277. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  278. pSi3 += 2;
  279. /* Twiddle coefficients index modifier */
  280. ic = ic + twidCoefModifier;
  281. } while (--j);
  282. /* data is in 4.11(q11) format */
  283. /* end of first stage process */
  284. /* start of middle stage process */
  285. /* Twiddle coefficients index modifier */
  286. twidCoefModifier <<= 2U;
  287. /* Calculation of Middle stage */
  288. for (k = fftLen / 4U; k > 4U; k >>= 2U)
  289. {
  290. /* Initializations for the middle stage */
  291. n1 = n2;
  292. n2 >>= 2U;
  293. ic = 0U;
  294. for (j = 0U; j <= (n2 - 1U); j++)
  295. {
  296. /* index calculation for the coefficients */
  297. C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  298. C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  299. C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  300. /* Twiddle coefficients index modifier */
  301. ic = ic + twidCoefModifier;
  302. pSi0 = pSrc16 + 2 * j;
  303. pSi1 = pSi0 + 2 * n2;
  304. pSi2 = pSi1 + 2 * n2;
  305. pSi3 = pSi2 + 2 * n2;
  306. /* Butterfly implementation */
  307. for (i0 = j; i0 < fftLen; i0 += n1)
  308. {
  309. /* Reading i0, i0+fftLen/2 inputs */
  310. /* Read ya (real), xa(imag) input */
  311. T = _SIMD32_OFFSET(pSi0);
  312. /* Read yc (real), xc(imag) input */
  313. S = _SIMD32_OFFSET(pSi2);
  314. /* R = packed( (ya + yc), (xa + xc)) */
  315. R = __QADD16(T, S);
  316. /* S = packed((ya - yc), (xa - xc)) */
  317. S = __QSUB16(T, S);
  318. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  319. /* Read yb (real), xb(imag) input */
  320. T = _SIMD32_OFFSET(pSi1);
  321. /* Read yd (real), xd(imag) input */
  322. U = _SIMD32_OFFSET(pSi3);
  323. /* T = packed( (yb + yd), (xb + xd)) */
  324. T = __QADD16(T, U);
  325. /* writing the butterfly processed i0 sample */
  326. /* xa' = xa + xb + xc + xd */
  327. /* ya' = ya + yb + yc + yd */
  328. out1 = __SHADD16(R, T);
  329. out1 = __SHADD16(out1, 0);
  330. _SIMD32_OFFSET(pSi0) = out1;
  331. pSi0 += 2 * n1;
  332. /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
  333. R = __SHSUB16(R, T);
  334. #ifndef ARM_MATH_BIG_ENDIAN
  335. /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  336. out1 = __SMUAD(C2, R) >> 16U;
  337. /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  338. out2 = __SMUSDX(C2, R);
  339. #else
  340. /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  341. out1 = __SMUSDX(R, C2) >> 16U;
  342. /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  343. out2 = __SMUAD(C2, R);
  344. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  345. /* Reading i0+3fftLen/4 */
  346. /* Read yb (real), xb(imag) input */
  347. T = _SIMD32_OFFSET(pSi1);
  348. /* writing the butterfly processed i0 + fftLen/4 sample */
  349. /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  350. /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  351. _SIMD32_OFFSET(pSi1) =
  352. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  353. pSi1 += 2 * n1;
  354. /* Butterfly calculations */
  355. /* Read yd (real), xd(imag) input */
  356. U = _SIMD32_OFFSET(pSi3);
  357. /* T = packed(yb-yd, xb-xd) */
  358. T = __QSUB16(T, U);
  359. #ifndef ARM_MATH_BIG_ENDIAN
  360. /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  361. R = __SHASX(S, T);
  362. /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
  363. S = __SHSAX(S, T);
  364. /* Butterfly process for the i0+fftLen/2 sample */
  365. out1 = __SMUAD(C1, S) >> 16U;
  366. out2 = __SMUSDX(C1, S);
  367. #else
  368. /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
  369. R = __SHSAX(S, T);
  370. /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
  371. S = __SHASX(S, T);
  372. /* Butterfly process for the i0+fftLen/2 sample */
  373. out1 = __SMUSDX(S, C1) >> 16U;
  374. out2 = __SMUAD(C1, S);
  375. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  376. /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  377. /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  378. _SIMD32_OFFSET(pSi2) =
  379. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  380. pSi2 += 2 * n1;
  381. /* Butterfly process for the i0+3fftLen/4 sample */
  382. #ifndef ARM_MATH_BIG_ENDIAN
  383. out1 = __SMUAD(C3, R) >> 16U;
  384. out2 = __SMUSDX(C3, R);
  385. #else
  386. out1 = __SMUSDX(R, C3) >> 16U;
  387. out2 = __SMUAD(C3, R);
  388. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  389. /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
  390. /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
  391. _SIMD32_OFFSET(pSi3) =
  392. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  393. pSi3 += 2 * n1;
  394. }
  395. }
  396. /* Twiddle coefficients index modifier */
  397. twidCoefModifier <<= 2U;
  398. }
  399. /* end of middle stage process */
  400. /* data is in 10.6(q6) format for the 1024 point */
  401. /* data is in 8.8(q8) format for the 256 point */
  402. /* data is in 6.10(q10) format for the 64 point */
  403. /* data is in 4.12(q12) format for the 16 point */
  404. /* Initializations for the last stage */
  405. j = fftLen >> 2;
  406. ptr1 = &pSrc16[0];
  407. /* start of last stage process */
  408. /* Butterfly implementation */
  409. do
  410. {
  411. /* Read xa (real), ya(imag) input */
  412. xaya = *__SIMD32(ptr1)++;
  413. /* Read xb (real), yb(imag) input */
  414. xbyb = *__SIMD32(ptr1)++;
  415. /* Read xc (real), yc(imag) input */
  416. xcyc = *__SIMD32(ptr1)++;
  417. /* Read xd (real), yd(imag) input */
  418. xdyd = *__SIMD32(ptr1)++;
  419. /* R = packed((ya + yc), (xa + xc)) */
  420. R = __QADD16(xaya, xcyc);
  421. /* T = packed((yb + yd), (xb + xd)) */
  422. T = __QADD16(xbyb, xdyd);
  423. /* pointer updation for writing */
  424. ptr1 = ptr1 - 8U;
  425. /* xa' = xa + xb + xc + xd */
  426. /* ya' = ya + yb + yc + yd */
  427. *__SIMD32(ptr1)++ = __SHADD16(R, T);
  428. /* T = packed((yb + yd), (xb + xd)) */
  429. T = __QADD16(xbyb, xdyd);
  430. /* xc' = (xa-xb+xc-xd) */
  431. /* yc' = (ya-yb+yc-yd) */
  432. *__SIMD32(ptr1)++ = __SHSUB16(R, T);
  433. /* S = packed((ya - yc), (xa - xc)) */
  434. S = __QSUB16(xaya, xcyc);
  435. /* Read yd (real), xd(imag) input */
  436. /* T = packed( (yb - yd), (xb - xd)) */
  437. U = __QSUB16(xbyb, xdyd);
  438. #ifndef ARM_MATH_BIG_ENDIAN
  439. /* xb' = (xa+yb-xc-yd) */
  440. /* yb' = (ya-xb-yc+xd) */
  441. *__SIMD32(ptr1)++ = __SHSAX(S, U);
  442. /* xd' = (xa-yb-xc+yd) */
  443. /* yd' = (ya+xb-yc-xd) */
  444. *__SIMD32(ptr1)++ = __SHASX(S, U);
  445. #else
  446. /* xb' = (xa+yb-xc-yd) */
  447. /* yb' = (ya-xb-yc+xd) */
  448. *__SIMD32(ptr1)++ = __SHASX(S, U);
  449. /* xd' = (xa-yb-xc+yd) */
  450. /* yd' = (ya+xb-yc-xd) */
  451. *__SIMD32(ptr1)++ = __SHSAX(S, U);
  452. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  453. } while (--j);
  454. /* end of last stage process */
  455. /* output is in 11.5(q5) format for the 1024 point */
  456. /* output is in 9.7(q7) format for the 256 point */
  457. /* output is in 7.9(q9) format for the 64 point */
  458. /* output is in 5.11(q11) format for the 16 point */
  459. #else
  460. /* Run the below code for Cortex-M0 */
  461. q15_t R0, R1, S0, S1, T0, T1, U0, U1;
  462. q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
  463. uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
  464. /* Total process is divided into three stages */
  465. /* process first stage, middle stages, & last stage */
  466. /* Initializations for the first stage */
  467. n2 = fftLen;
  468. n1 = n2;
  469. /* n2 = fftLen/4 */
  470. n2 >>= 2U;
  471. /* Index for twiddle coefficient */
  472. ic = 0U;
  473. /* Index for input read and output write */
  474. i0 = 0U;
  475. j = n2;
  476. /* Input is in 1.15(q15) format */
  477. /* start of first stage process */
  478. do
  479. {
  480. /* Butterfly implementation */
  481. /* index calculation for the input as, */
  482. /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  483. i1 = i0 + n2;
  484. i2 = i1 + n2;
  485. i3 = i2 + n2;
  486. /* Reading i0, i0+fftLen/2 inputs */
  487. /* input is down scale by 4 to avoid overflow */
  488. /* Read ya (real), xa(imag) input */
  489. T0 = pSrc16[i0 * 2U] >> 2U;
  490. T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
  491. /* input is down scale by 4 to avoid overflow */
  492. /* Read yc (real), xc(imag) input */
  493. S0 = pSrc16[i2 * 2U] >> 2U;
  494. S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
  495. /* R0 = (ya + yc) */
  496. R0 = __SSAT(T0 + S0, 16U);
  497. /* R1 = (xa + xc) */
  498. R1 = __SSAT(T1 + S1, 16U);
  499. /* S0 = (ya - yc) */
  500. S0 = __SSAT(T0 - S0, 16);
  501. /* S1 = (xa - xc) */
  502. S1 = __SSAT(T1 - S1, 16);
  503. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  504. /* input is down scale by 4 to avoid overflow */
  505. /* Read yb (real), xb(imag) input */
  506. T0 = pSrc16[i1 * 2U] >> 2U;
  507. T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
  508. /* input is down scale by 4 to avoid overflow */
  509. /* Read yd (real), xd(imag) input */
  510. U0 = pSrc16[i3 * 2U] >> 2U;
  511. U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
  512. /* T0 = (yb + yd) */
  513. T0 = __SSAT(T0 + U0, 16U);
  514. /* T1 = (xb + xd) */
  515. T1 = __SSAT(T1 + U1, 16U);
  516. /* writing the butterfly processed i0 sample */
  517. /* ya' = ya + yb + yc + yd */
  518. /* xa' = xa + xb + xc + xd */
  519. pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  520. pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  521. /* R0 = (ya + yc) - (yb + yd) */
  522. /* R1 = (xa + xc) - (xb + xd) */
  523. R0 = __SSAT(R0 - T0, 16U);
  524. R1 = __SSAT(R1 - T1, 16U);
  525. /* co2 & si2 are read from Coefficient pointer */
  526. Co2 = pCoef16[2U * ic * 2U];
  527. Si2 = pCoef16[(2U * ic * 2U) + 1];
  528. /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  529. out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
  530. /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  531. out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
  532. /* Reading i0+fftLen/4 */
  533. /* input is down scale by 4 to avoid overflow */
  534. /* T0 = yb, T1 = xb */
  535. T0 = pSrc16[i1 * 2U] >> 2;
  536. T1 = pSrc16[(i1 * 2U) + 1] >> 2;
  537. /* writing the butterfly processed i0 + fftLen/4 sample */
  538. /* writing output(xc', yc') in little endian format */
  539. pSrc16[i1 * 2U] = out1;
  540. pSrc16[(i1 * 2U) + 1] = out2;
  541. /* Butterfly calculations */
  542. /* input is down scale by 4 to avoid overflow */
  543. /* U0 = yd, U1 = xd */
  544. U0 = pSrc16[i3 * 2U] >> 2;
  545. U1 = pSrc16[(i3 * 2U) + 1] >> 2;
  546. /* T0 = yb-yd */
  547. T0 = __SSAT(T0 - U0, 16);
  548. /* T1 = xb-xd */
  549. T1 = __SSAT(T1 - U1, 16);
  550. /* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */
  551. R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
  552. R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
  553. /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
  554. S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
  555. S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
  556. /* co1 & si1 are read from Coefficient pointer */
  557. Co1 = pCoef16[ic * 2U];
  558. Si1 = pCoef16[(ic * 2U) + 1];
  559. /* Butterfly process for the i0+fftLen/2 sample */
  560. /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  561. out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
  562. /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  563. out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
  564. /* writing output(xb', yb') in little endian format */
  565. pSrc16[i2 * 2U] = out1;
  566. pSrc16[(i2 * 2U) + 1] = out2;
  567. /* Co3 & si3 are read from Coefficient pointer */
  568. Co3 = pCoef16[3U * (ic * 2U)];
  569. Si3 = pCoef16[(3U * (ic * 2U)) + 1];
  570. /* Butterfly process for the i0+3fftLen/4 sample */
  571. /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
  572. out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
  573. /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
  574. out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
  575. /* writing output(xd', yd') in little endian format */
  576. pSrc16[i3 * 2U] = out1;
  577. pSrc16[(i3 * 2U) + 1] = out2;
  578. /* Twiddle coefficients index modifier */
  579. ic = ic + twidCoefModifier;
  580. /* Updating input index */
  581. i0 = i0 + 1U;
  582. } while (--j);
  583. /* data is in 4.11(q11) format */
  584. /* end of first stage process */
  585. /* start of middle stage process */
  586. /* Twiddle coefficients index modifier */
  587. twidCoefModifier <<= 2U;
  588. /* Calculation of Middle stage */
  589. for (k = fftLen / 4U; k > 4U; k >>= 2U)
  590. {
  591. /* Initializations for the middle stage */
  592. n1 = n2;
  593. n2 >>= 2U;
  594. ic = 0U;
  595. for (j = 0U; j <= (n2 - 1U); j++)
  596. {
  597. /* index calculation for the coefficients */
  598. Co1 = pCoef16[ic * 2U];
  599. Si1 = pCoef16[(ic * 2U) + 1U];
  600. Co2 = pCoef16[2U * (ic * 2U)];
  601. Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
  602. Co3 = pCoef16[3U * (ic * 2U)];
  603. Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
  604. /* Twiddle coefficients index modifier */
  605. ic = ic + twidCoefModifier;
  606. /* Butterfly implementation */
  607. for (i0 = j; i0 < fftLen; i0 += n1)
  608. {
  609. /* index calculation for the input as, */
  610. /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  611. i1 = i0 + n2;
  612. i2 = i1 + n2;
  613. i3 = i2 + n2;
  614. /* Reading i0, i0+fftLen/2 inputs */
  615. /* Read ya (real), xa(imag) input */
  616. T0 = pSrc16[i0 * 2U];
  617. T1 = pSrc16[(i0 * 2U) + 1U];
  618. /* Read yc (real), xc(imag) input */
  619. S0 = pSrc16[i2 * 2U];
  620. S1 = pSrc16[(i2 * 2U) + 1U];
  621. /* R0 = (ya + yc), R1 = (xa + xc) */
  622. R0 = __SSAT(T0 + S0, 16);
  623. R1 = __SSAT(T1 + S1, 16);
  624. /* S0 = (ya - yc), S1 =(xa - xc) */
  625. S0 = __SSAT(T0 - S0, 16);
  626. S1 = __SSAT(T1 - S1, 16);
  627. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  628. /* Read yb (real), xb(imag) input */
  629. T0 = pSrc16[i1 * 2U];
  630. T1 = pSrc16[(i1 * 2U) + 1U];
  631. /* Read yd (real), xd(imag) input */
  632. U0 = pSrc16[i3 * 2U];
  633. U1 = pSrc16[(i3 * 2U) + 1U];
  634. /* T0 = (yb + yd), T1 = (xb + xd) */
  635. T0 = __SSAT(T0 + U0, 16);
  636. T1 = __SSAT(T1 + U1, 16);
  637. /* writing the butterfly processed i0 sample */
  638. /* xa' = xa + xb + xc + xd */
  639. /* ya' = ya + yb + yc + yd */
  640. out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
  641. out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
  642. pSrc16[i0 * 2U] = out1;
  643. pSrc16[(2U * i0) + 1U] = out2;
  644. /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  645. R0 = (R0 >> 1U) - (T0 >> 1U);
  646. R1 = (R1 >> 1U) - (T1 >> 1U);
  647. /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
  648. out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
  649. /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  650. out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
  651. /* Reading i0+3fftLen/4 */
  652. /* Read yb (real), xb(imag) input */
  653. T0 = pSrc16[i1 * 2U];
  654. T1 = pSrc16[(i1 * 2U) + 1U];
  655. /* writing the butterfly processed i0 + fftLen/4 sample */
  656. /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
  657. /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
  658. pSrc16[i1 * 2U] = out1;
  659. pSrc16[(i1 * 2U) + 1U] = out2;
  660. /* Butterfly calculations */
  661. /* Read yd (real), xd(imag) input */
  662. U0 = pSrc16[i3 * 2U];
  663. U1 = pSrc16[(i3 * 2U) + 1U];
  664. /* T0 = yb-yd, T1 = xb-xd */
  665. T0 = __SSAT(T0 - U0, 16);
  666. T1 = __SSAT(T1 - U1, 16);
  667. /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
  668. R0 = (S0 >> 1U) - (T1 >> 1U);
  669. R1 = (S1 >> 1U) + (T0 >> 1U);
  670. /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
  671. S0 = (S0 >> 1U) + (T1 >> 1U);
  672. S1 = (S1 >> 1U) - (T0 >> 1U);
  673. /* Butterfly process for the i0+fftLen/2 sample */
  674. out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
  675. out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
  676. /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
  677. /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
  678. pSrc16[i2 * 2U] = out1;
  679. pSrc16[(i2 * 2U) + 1U] = out2;
  680. /* Butterfly process for the i0+3fftLen/4 sample */
  681. out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
  682. out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
  683. /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
  684. /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
  685. pSrc16[i3 * 2U] = out1;
  686. pSrc16[(i3 * 2U) + 1U] = out2;
  687. }
  688. }
  689. /* Twiddle coefficients index modifier */
  690. twidCoefModifier <<= 2U;
  691. }
  692. /* end of middle stage process */
  693. /* data is in 10.6(q6) format for the 1024 point */
  694. /* data is in 8.8(q8) format for the 256 point */
  695. /* data is in 6.10(q10) format for the 64 point */
  696. /* data is in 4.12(q12) format for the 16 point */
  697. /* Initializations for the last stage */
  698. n1 = n2;
  699. n2 >>= 2U;
  700. /* start of last stage process */
  701. /* Butterfly implementation */
  702. for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  703. {
  704. /* index calculation for the input as, */
  705. /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  706. i1 = i0 + n2;
  707. i2 = i1 + n2;
  708. i3 = i2 + n2;
  709. /* Reading i0, i0+fftLen/2 inputs */
  710. /* Read ya (real), xa(imag) input */
  711. T0 = pSrc16[i0 * 2U];
  712. T1 = pSrc16[(i0 * 2U) + 1U];
  713. /* Read yc (real), xc(imag) input */
  714. S0 = pSrc16[i2 * 2U];
  715. S1 = pSrc16[(i2 * 2U) + 1U];
  716. /* R0 = (ya + yc), R1 = (xa + xc) */
  717. R0 = __SSAT(T0 + S0, 16U);
  718. R1 = __SSAT(T1 + S1, 16U);
  719. /* S0 = (ya - yc), S1 = (xa - xc) */
  720. S0 = __SSAT(T0 - S0, 16U);
  721. S1 = __SSAT(T1 - S1, 16U);
  722. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  723. /* Read yb (real), xb(imag) input */
  724. T0 = pSrc16[i1 * 2U];
  725. T1 = pSrc16[(i1 * 2U) + 1U];
  726. /* Read yd (real), xd(imag) input */
  727. U0 = pSrc16[i3 * 2U];
  728. U1 = pSrc16[(i3 * 2U) + 1U];
  729. /* T0 = (yb + yd), T1 = (xb + xd)) */
  730. T0 = __SSAT(T0 + U0, 16U);
  731. T1 = __SSAT(T1 + U1, 16U);
  732. /* writing the butterfly processed i0 sample */
  733. /* xa' = xa + xb + xc + xd */
  734. /* ya' = ya + yb + yc + yd */
  735. pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  736. pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  737. /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
  738. R0 = (R0 >> 1U) - (T0 >> 1U);
  739. R1 = (R1 >> 1U) - (T1 >> 1U);
  740. /* Read yb (real), xb(imag) input */
  741. T0 = pSrc16[i1 * 2U];
  742. T1 = pSrc16[(i1 * 2U) + 1U];
  743. /* writing the butterfly processed i0 + fftLen/4 sample */
  744. /* xc' = (xa-xb+xc-xd) */
  745. /* yc' = (ya-yb+yc-yd) */
  746. pSrc16[i1 * 2U] = R0;
  747. pSrc16[(i1 * 2U) + 1U] = R1;
  748. /* Read yd (real), xd(imag) input */
  749. U0 = pSrc16[i3 * 2U];
  750. U1 = pSrc16[(i3 * 2U) + 1U];
  751. /* T0 = (yb - yd), T1 = (xb - xd) */
  752. T0 = __SSAT(T0 - U0, 16U);
  753. T1 = __SSAT(T1 - U1, 16U);
  754. /* writing the butterfly processed i0 + fftLen/2 sample */
  755. /* xb' = (xa+yb-xc-yd) */
  756. /* yb' = (ya-xb-yc+xd) */
  757. pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
  758. pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
  759. /* writing the butterfly processed i0 + 3fftLen/4 sample */
  760. /* xd' = (xa-yb-xc+yd) */
  761. /* yd' = (ya+xb-yc-xd) */
  762. pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
  763. pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
  764. }
  765. /* end of last stage process */
  766. /* output is in 11.5(q5) format for the 1024 point */
  767. /* output is in 9.7(q7) format for the 256 point */
  768. /* output is in 7.9(q9) format for the 64 point */
  769. /* output is in 5.11(q11) format for the 16 point */
  770. #endif /* #if defined (ARM_MATH_DSP) */
  771. }
  772. /**
  773. * @brief Core function for the Q15 CIFFT butterfly process.
  774. * @param[in, out] *pSrc16 points to the in-place buffer of Q15 data type.
  775. * @param[in] fftLen length of the FFT.
  776. * @param[in] *pCoef16 points to twiddle coefficient buffer.
  777. * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
  778. * @return none.
  779. */
  780. /*
  781. * Radix-4 IFFT algorithm used is :
  782. *
  783. * CIFFT uses same twiddle coefficients as CFFT function
  784. * x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
  785. *
  786. *
  787. * IFFT is implemented with following changes in equations from FFT
  788. *
  789. * Input real and imaginary data:
  790. * x(n) = xa + j * ya
  791. * x(n+N/4 ) = xb + j * yb
  792. * x(n+N/2 ) = xc + j * yc
  793. * x(n+3N/4) = xd + j * yd
  794. *
  795. *
  796. * Output real and imaginary data:
  797. * x(4r) = xa'+ j * ya'
  798. * x(4r+1) = xb'+ j * yb'
  799. * x(4r+2) = xc'+ j * yc'
  800. * x(4r+3) = xd'+ j * yd'
  801. *
  802. *
  803. * Twiddle factors for radix-4 IFFT:
  804. * Wn = co1 + j * (si1)
  805. * W2n = co2 + j * (si2)
  806. * W3n = co3 + j * (si3)
  807. * The real and imaginary output values for the radix-4 butterfly are
  808. * xa' = xa + xb + xc + xd
  809. * ya' = ya + yb + yc + yd
  810. * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
  811. * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
  812. * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
  813. * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
  814. * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
  815. * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
  816. *
  817. */
  818. void arm_radix4_butterfly_inverse_q15(
  819. q15_t * pSrc16,
  820. uint32_t fftLen,
  821. q15_t * pCoef16,
  822. uint32_t twidCoefModifier)
  823. {
  824. #if defined (ARM_MATH_DSP)
  825. /* In-place radix-4 inverse butterflies over pSrc16. ARM_MATH_DSP build (original note: Cortex-M4 and Cortex-M3): each complex sample is accessed as one 32-bit word and processed with packed 16-bit intrinsics */
  826. q31_t R, S, T, U;
  827. q31_t C1, C2, C3, out1, out2;
  828. uint32_t n1, n2, ic, i0, j, k;
  829. q15_t *ptr1;
  830. q15_t *pSi0;
  831. q15_t *pSi1;
  832. q15_t *pSi2;
  833. q15_t *pSi3;
  834. q31_t xaya, xbyb, xcyc, xdyd;
  835. /* Total process is divided into three stages */
  836. /* process first stage, middle stages, & last stage */
  837. /* Initializations for the first stage */
  838. n2 = fftLen;
  839. n1 = n2;
  840. /* n2 = fftLen/4 */
  841. n2 >>= 2U;
  842. /* Index for twiddle coefficient */
  843. ic = 0U;
  844. /* First-stage butterfly count (fftLen/4). NOTE(review): for fftLen < 4 this is 0 and the do/while (--j) below would wrap the unsigned counter; callers must not pass such lengths */
  845. j = n2;
  846. pSi0 = pSrc16;
  847. pSi1 = pSi0 + 2 * n2;
  848. pSi2 = pSi1 + 2 * n2;
  849. pSi3 = pSi2 + 2 * n2;
  850. /* Input is in 1.15(q15) format */
  851. /* start of first stage process */
  852. do
  853. {
  854. /* Butterfly implementation */
  855. /* Reading i0, i0+fftLen/2 inputs */
  856. /* Read xa (real), ya (imag) input; the two __SHADD16(T, 0) halve both 16-bit lanes twice (input scaled by 1/4) */
  857. T = _SIMD32_OFFSET(pSi0);
  858. T = __SHADD16(T, 0);
  859. T = __SHADD16(T, 0);
  860. /* Read xc (real), yc (imag) input, scaled by 1/4 the same way */
  861. S = _SIMD32_OFFSET(pSi2);
  862. S = __SHADD16(S, 0);
  863. S = __SHADD16(S, 0);
  864. /* R = packed((ya + yc), (xa + xc) ) */
  865. R = __QADD16(T, S);
  866. /* S = packed((ya - yc), (xa - xc) ) */
  867. S = __QSUB16(T, S);
  868. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  869. /* Read xb (real), yb (imag) input, scaled by 1/4 */
  870. T = _SIMD32_OFFSET(pSi1);
  871. T = __SHADD16(T, 0);
  872. T = __SHADD16(T, 0);
  873. /* Read xd (real), yd (imag) input, scaled by 1/4 */
  874. U = _SIMD32_OFFSET(pSi3);
  875. U = __SHADD16(U, 0);
  876. U = __SHADD16(U, 0);
  877. /* T = packed((yb + yd), (xb + xd) ) */
  878. T = __QADD16(T, U);
  879. /* writing the butterfly processed i0 sample */
  880. /* xa' = xa + xb + xc + xd */
  881. /* ya' = ya + yb + yc + yd */
  882. _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
  883. pSi0 += 2;
  884. /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
  885. R = __QSUB16(R, T);
  886. /* co2 & si2 are read from SIMD Coefficient pointer */
  887. C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  888. #ifndef ARM_MATH_BIG_ENDIAN
  889. /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  890. out1 = __SMUSD(C2, R) >> 16U;
  891. /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  892. out2 = __SMUADX(C2, R);
  893. #else
  894. /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  895. out1 = __SMUADX(C2, R) >> 16U;
  896. /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  897. out2 = __SMUSD(__QSUB16(0, C2), R);
  898. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  899. /* Reading i0+fftLen/4 */
  900. /* T = packed(yb, xb), re-read (and re-scaled) before this slot is overwritten just below */
  901. T = _SIMD32_OFFSET(pSi1);
  902. T = __SHADD16(T, 0);
  903. T = __SHADD16(T, 0);
  904. /* writing the butterfly processed i0 + fftLen/4 sample */
  905. /* writing output(xc', yc'): out1 supplies the low halfword, out2 the high halfword */
  906. _SIMD32_OFFSET(pSi1) =
  907. (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  908. pSi1 += 2;
  909. /* Butterfly calculations */
  910. /* U = packed(yd, xd) */
  911. U = _SIMD32_OFFSET(pSi3);
  912. U = __SHADD16(U, 0);
  913. U = __SHADD16(U, 0);
  914. /* T = packed(yb-yd, xb-xd) */
  915. T = __QSUB16(T, U);
  916. #ifndef ARM_MATH_BIG_ENDIAN
  917. /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
  918. R = __QSAX(S, T);
  919. /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
  920. S = __QASX(S, T);
  921. #else
  922. /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
  923. R = __QASX(S, T);
  924. /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
  925. S = __QSAX(S, T);
  926. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  927. /* co1 & si1 are read from SIMD Coefficient pointer */
  928. C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  929. /* Butterfly process for the i0+fftLen/2 sample */
  930. #ifndef ARM_MATH_BIG_ENDIAN
  931. /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  932. out1 = __SMUSD(C1, S) >> 16U;
  933. /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  934. out2 = __SMUADX(C1, S);
  935. #else
  936. /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  937. out1 = __SMUADX(C1, S) >> 16U;
  938. /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  939. out2 = __SMUSD(__QSUB16(0, C1), S);
  940. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  941. /* writing output(xb', yb') in little endian format */
  942. _SIMD32_OFFSET(pSi2) =
  943. ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
  944. pSi2 += 2;
  945. /* co3 & si3 are read from SIMD Coefficient pointer */
  946. C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  947. /* Butterfly process for the i0+3fftLen/4 sample */
  948. #ifndef ARM_MATH_BIG_ENDIAN
  949. /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
  950. out1 = __SMUSD(C3, R) >> 16U;
  951. /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
  952. out2 = __SMUADX(C3, R);
  953. #else
  954. /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
  955. out1 = __SMUADX(C3, R) >> 16U;
  956. /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
  957. out2 = __SMUSD(__QSUB16(0, C3), R);
  958. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  959. /* writing output(xd', yd') in little endian format */
  960. _SIMD32_OFFSET(pSi3) =
  961. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  962. pSi3 += 2;
  963. /* Twiddle coefficients index modifier */
  964. ic = ic + twidCoefModifier;
  965. } while (--j);
  966. /* data is in 4.11(q11) format */
  967. /* end of first stage process */
  968. /* start of middle stage process */
  969. /* Twiddle coefficients index modifier */
  970. twidCoefModifier <<= 2U;
  971. /* Calculation of Middle stage */
  972. for (k = fftLen / 4U; k > 4U; k >>= 2U)
  973. {
  974. /* Initializations for the middle stage */
  975. n1 = n2;
  976. n2 >>= 2U;
  977. ic = 0U;
  978. for (j = 0U; j <= (n2 - 1U); j++)
  979. {
  980. /* index calculation for the coefficients */
  981. C1 = _SIMD32_OFFSET(pCoef16 + (2U * ic));
  982. C2 = _SIMD32_OFFSET(pCoef16 + (4U * ic));
  983. C3 = _SIMD32_OFFSET(pCoef16 + (6U * ic));
  984. /* Twiddle coefficients index modifier */
  985. ic = ic + twidCoefModifier;
  986. pSi0 = pSrc16 + 2 * j;
  987. pSi1 = pSi0 + 2 * n2;
  988. pSi2 = pSi1 + 2 * n2;
  989. pSi3 = pSi2 + 2 * n2;
  990. /* Butterfly implementation */
  991. for (i0 = j; i0 < fftLen; i0 += n1)
  992. {
  993. /* Reading i0, i0+fftLen/2 inputs */
  994. /* Read xa (real), ya (imag) input */
  995. T = _SIMD32_OFFSET(pSi0);
  996. /* Read xc (real), yc (imag) input */
  997. S = _SIMD32_OFFSET(pSi2);
  998. /* R = packed( (ya + yc), (xa + xc)) */
  999. R = __QADD16(T, S);
  1000. /* S = packed((ya - yc), (xa - xc)) */
  1001. S = __QSUB16(T, S);
  1002. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1003. /* Read xb (real), yb (imag) input */
  1004. T = _SIMD32_OFFSET(pSi1);
  1005. /* Read xd (real), yd (imag) input */
  1006. U = _SIMD32_OFFSET(pSi3);
  1007. /* T = packed( (yb + yd), (xb + xd)) */
  1008. T = __QADD16(T, U);
  1009. /* writing the butterfly processed i0 sample */
  1010. /* xa' = xa + xb + xc + xd */
  1011. /* ya' = ya + yb + yc + yd */
  1012. out1 = __SHADD16(R, T);
  1013. out1 = __SHADD16(out1, 0);
  1014. _SIMD32_OFFSET(pSi0) = out1;
  1015. pSi0 += 2 * n1;
  1016. /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
  1017. R = __SHSUB16(R, T);
  1018. #ifndef ARM_MATH_BIG_ENDIAN
  1019. /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1020. out1 = __SMUSD(C2, R) >> 16U;
  1021. /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1022. out2 = __SMUADX(C2, R);
  1023. #else
  1024. /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1025. out1 = __SMUADX(R, C2) >> 16U;
  1026. /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1027. out2 = __SMUSD(__QSUB16(0, C2), R);
  1028. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  1029. /* Reading i0+fftLen/4 */
  1030. /* Read xb (real), yb (imag) input again, before its slot is overwritten just below */
  1031. T = _SIMD32_OFFSET(pSi1);
  1032. /* writing the butterfly processed i0 + fftLen/4 sample */
  1033. /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1034. /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1035. _SIMD32_OFFSET(pSi1) =
  1036. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1037. pSi1 += 2 * n1;
  1038. /* Butterfly calculations */
  1039. /* Read xd (real), yd (imag) input */
  1040. U = _SIMD32_OFFSET(pSi3);
  1041. /* T = packed(yb-yd, xb-xd) */
  1042. T = __QSUB16(T, U);
  1043. #ifndef ARM_MATH_BIG_ENDIAN
  1044. /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)), halved */
  1045. R = __SHSAX(S, T);
  1046. /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)), halved */
  1047. S = __SHASX(S, T);
  1048. /* Butterfly process for the i0+fftLen/2 sample */
  1049. out1 = __SMUSD(C1, S) >> 16U;
  1050. out2 = __SMUADX(C1, S);
  1051. #else
  1052. /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)), halved */
  1053. R = __SHASX(S, T);
  1054. /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)), halved */
  1055. S = __SHSAX(S, T);
  1056. /* Butterfly process for the i0+fftLen/2 sample */
  1057. out1 = __SMUADX(S, C1) >> 16U;
  1058. out2 = __SMUSD(__QSUB16(0, C1), S);
  1059. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  1060. /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  1061. /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  1062. _SIMD32_OFFSET(pSi2) =
  1063. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1064. pSi2 += 2 * n1;
  1065. /* Butterfly process for the i0+3fftLen/4 sample */
  1066. #ifndef ARM_MATH_BIG_ENDIAN
  1067. out1 = __SMUSD(C3, R) >> 16U;
  1068. out2 = __SMUADX(C3, R);
  1069. #else
  1070. out1 = __SMUADX(C3, R) >> 16U;
  1071. out2 = __SMUSD(__QSUB16(0, C3), R);
  1072. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  1073. /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
  1074. /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
  1075. _SIMD32_OFFSET(pSi3) =
  1076. ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
  1077. pSi3 += 2 * n1;
  1078. }
  1079. }
  1080. /* Twiddle coefficients index modifier */
  1081. twidCoefModifier <<= 2U;
  1082. }
  1083. /* end of middle stage process */
  1084. /* data is in 10.6(q6) format for the 1024 point */
  1085. /* data is in 8.8(q8) format for the 256 point */
  1086. /* data is in 6.10(q10) format for the 64 point */
  1087. /* data is in 4.12(q12) format for the 16 point */
  1088. /* Initializations for the last stage */
  1089. j = fftLen >> 2;
  1090. ptr1 = &pSrc16[0];
  1091. /* start of last stage process */
  1092. /* Butterfly implementation */
  1093. do
  1094. {
  1095. /* Read xa (real), ya(imag) input */
  1096. xaya = *__SIMD32(ptr1)++;
  1097. /* Read xb (real), yb(imag) input */
  1098. xbyb = *__SIMD32(ptr1)++;
  1099. /* Read xc (real), yc(imag) input */
  1100. xcyc = *__SIMD32(ptr1)++;
  1101. /* Read xd (real), yd(imag) input */
  1102. xdyd = *__SIMD32(ptr1)++;
  1103. /* R = packed((ya + yc), (xa + xc)) */
  1104. R = __QADD16(xaya, xcyc);
  1105. /* T = packed((yb + yd), (xb + xd)) */
  1106. T = __QADD16(xbyb, xdyd);
  1107. /* rewind by 8 q15 elements so the results are written over the samples just read (assumes each __SIMD32 read above advanced ptr1 by one 32-bit word; macro is defined elsewhere) */
  1108. ptr1 = ptr1 - 8U;
  1109. /* xa' = xa + xb + xc + xd */
  1110. /* ya' = ya + yb + yc + yd */
  1111. *__SIMD32(ptr1)++ = __SHADD16(R, T);
  1112. /* T = packed((yb + yd), (xb + xd)); NOTE(review): same operands as the earlier computation of T, so this recomputation looks redundant */
  1113. T = __QADD16(xbyb, xdyd);
  1114. /* xc' = (xa-xb+xc-xd) */
  1115. /* yc' = (ya-yb+yc-yd) */
  1116. *__SIMD32(ptr1)++ = __SHSUB16(R, T);
  1117. /* S = packed((ya - yc), (xa - xc)) */
  1118. S = __QSUB16(xaya, xcyc);
  1119. /* No load here: xbyb and xdyd were read at the top of the loop */
  1120. /* U = packed( (yb - yd), (xb - xd)) */
  1121. U = __QSUB16(xbyb, xdyd);
  1122. #ifndef ARM_MATH_BIG_ENDIAN
  1123. /* xb' = (xa-yb-xc+yd) */
  1124. /* yb' = (ya+xb-yc-xd) */
  1125. *__SIMD32(ptr1)++ = __SHASX(S, U);
  1126. /* xd' = (xa+yb-xc-yd) */
  1127. /* yd' = (ya-xb-yc+xd) */
  1128. *__SIMD32(ptr1)++ = __SHSAX(S, U);
  1129. #else
  1130. /* xb' = (xa-yb-xc+yd) */
  1131. /* yb' = (ya+xb-yc-xd) */
  1132. *__SIMD32(ptr1)++ = __SHSAX(S, U);
  1133. /* xd' = (xa+yb-xc-yd) */
  1134. /* yd' = (ya-xb-yc+xd) */
  1135. *__SIMD32(ptr1)++ = __SHASX(S, U);
  1136. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  1137. } while (--j);
  1138. /* end of last stage process */
  1139. /* output is in 11.5(q5) format for the 1024 point */
  1140. /* output is in 9.7(q7) format for the 256 point */
  1141. /* output is in 7.9(q9) format for the 64 point */
  1142. /* output is in 5.11(q11) format for the 16 point */
  1143. #else
  1144. /* Build without ARM_MATH_DSP (original note: Cortex-M0): scalar q15 version of the same first/middle/last stages; even index = real (x), odd index = imaginary (y) */
  1145. q15_t R0, R1, S0, S1, T0, T1, U0, U1;
  1146. q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
  1147. uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
  1148. /* Total process is divided into three stages */
  1149. /* process first stage, middle stages, & last stage */
  1150. /* Initializations for the first stage */
  1151. n2 = fftLen;
  1152. n1 = n2;
  1153. /* n2 = fftLen/4 */
  1154. n2 >>= 2U;
  1155. /* Index for twiddle coefficient */
  1156. ic = 0U;
  1157. /* Index for input read and output write; j counts the fftLen/4 first-stage butterflies. NOTE(review): j is 0 for fftLen < 4, which would wrap in the do/while (--j) below */
  1158. i0 = 0U;
  1159. j = n2;
  1160. /* Input is in 1.15(q15) format */
  1161. /* Start of first stage process */
  1162. do
  1163. {
  1164. /* Butterfly implementation */
  1165. /* index calculation for the input as, */
  1166. /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1167. i1 = i0 + n2;
  1168. i2 = i1 + n2;
  1169. i3 = i2 + n2;
  1170. /* Reading i0, i0+fftLen/2 inputs */
  1171. /* input is down scale by 4 to avoid overflow */
  1172. /* Read xa (real), ya (imag) input */
  1173. T0 = pSrc16[i0 * 2U] >> 2U;
  1174. T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
  1175. /* input is down scale by 4 to avoid overflow */
  1176. /* Read xc (real), yc (imag) input */
  1177. S0 = pSrc16[i2 * 2U] >> 2U;
  1178. S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
  1179. /* R0 = (xa + xc), R1 = (ya + yc) */
  1180. R0 = __SSAT(T0 + S0, 16U);
  1181. R1 = __SSAT(T1 + S1, 16U);
  1182. /* S0 = (xa - xc), S1 = (ya - yc) */
  1183. S0 = __SSAT(T0 - S0, 16U);
  1184. S1 = __SSAT(T1 - S1, 16U);
  1185. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1186. /* input is down scale by 4 to avoid overflow */
  1187. /* Read xb (real), yb (imag) input */
  1188. T0 = pSrc16[i1 * 2U] >> 2U;
  1189. T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
  1190. /* Read xd (real), yd (imag) input */
  1191. /* input is down scale by 4 to avoid overflow */
  1192. U0 = pSrc16[i3 * 2U] >> 2U;
  1193. U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
  1194. /* T0 = (xb + xd), T1 = (yb + yd) */
  1195. T0 = __SSAT(T0 + U0, 16U);
  1196. T1 = __SSAT(T1 + U1, 16U);
  1197. /* writing the butterfly processed i0 sample */
  1198. /* xa' = xa + xb + xc + xd */
  1199. /* ya' = ya + yb + yc + yd */
  1200. pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  1201. pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  1202. /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
  1203. R0 = __SSAT(R0 - T0, 16U);
  1204. R1 = __SSAT(R1 - T1, 16U);
  1205. /* co2 & si2 are read from Coefficient pointer */
  1206. Co2 = pCoef16[2U * ic * 2U];
  1207. Si2 = pCoef16[(2U * ic * 2U) + 1U];
  1208. /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1209. out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
  1210. /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1211. out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
  1212. /* Reading i0+fftLen/4 */
  1213. /* input is down scale by 4 to avoid overflow */
  1214. /* T0 = xb, T1 = yb (re-read before this slot is overwritten just below) */
  1215. T0 = pSrc16[i1 * 2U] >> 2U;
  1216. T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
  1217. /* writing the butterfly processed i0 + fftLen/4 sample */
  1218. /* writing output(xc', yc') in little endian format */
  1219. pSrc16[i1 * 2U] = out1;
  1220. pSrc16[(i1 * 2U) + 1U] = out2;
  1221. /* Butterfly calculations */
  1222. /* input is down scale by 4 to avoid overflow */
  1223. /* U0 = xd, U1 = yd */
  1224. U0 = pSrc16[i3 * 2U] >> 2U;
  1225. U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
  1226. /* T0 = xb-xd, T1 = yb-yd */
  1227. T0 = __SSAT(T0 - U0, 16U);
  1228. T1 = __SSAT(T1 - U1, 16U);
  1229. /* R0 = (xa-xc) + (yb-yd), R1 = (ya-yc) - (xb-xd) */
  1230. R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
  1231. R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
  1232. /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd) */
  1233. S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
  1234. S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
  1235. /* co1 & si1 are read from Coefficient pointer */
  1236. Co1 = pCoef16[ic * 2U];
  1237. Si1 = pCoef16[(ic * 2U) + 1U];
  1238. /* Butterfly process for the i0+fftLen/2 sample */
  1239. /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  1240. out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
  1241. /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  1242. out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
  1243. /* writing output(xb', yb') in little endian format */
  1244. pSrc16[i2 * 2U] = out1;
  1245. pSrc16[(i2 * 2U) + 1U] = out2;
  1246. /* Co3 & si3 are read from Coefficient pointer */
  1247. Co3 = pCoef16[3U * ic * 2U];
  1248. Si3 = pCoef16[(3U * ic * 2U) + 1U];
  1249. /* Butterfly process for the i0+3fftLen/4 sample */
  1250. /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
  1251. out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
  1252. /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
  1253. out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
  1254. /* writing output(xd', yd') in little endian format */
  1255. pSrc16[i3 * 2U] = out1;
  1256. pSrc16[(i3 * 2U) + 1U] = out2;
  1257. /* Twiddle coefficients index modifier */
  1258. ic = ic + twidCoefModifier;
  1259. /* Updating input index */
  1260. i0 = i0 + 1U;
  1261. } while (--j);
  1262. /* End of first stage process */
  1263. /* data is in 4.11(q11) format */
  1264. /* Start of Middle stage process */
  1265. /* Twiddle coefficients index modifier */
  1266. twidCoefModifier <<= 2U;
  1267. /* Calculation of Middle stage */
  1268. for (k = fftLen / 4U; k > 4U; k >>= 2U)
  1269. {
  1270. /* Initializations for the middle stage */
  1271. n1 = n2;
  1272. n2 >>= 2U;
  1273. ic = 0U;
  1274. for (j = 0U; j <= (n2 - 1U); j++)
  1275. {
  1276. /* index calculation for the coefficients */
  1277. Co1 = pCoef16[ic * 2U];
  1278. Si1 = pCoef16[(ic * 2U) + 1U];
  1279. Co2 = pCoef16[2U * ic * 2U];
  1280. Si2 = pCoef16[2U * ic * 2U + 1U];
  1281. Co3 = pCoef16[3U * ic * 2U];
  1282. Si3 = pCoef16[(3U * ic * 2U) + 1U];
  1283. /* Twiddle coefficients index modifier */
  1284. ic = ic + twidCoefModifier;
  1285. /* Butterfly implementation */
  1286. for (i0 = j; i0 < fftLen; i0 += n1)
  1287. {
  1288. /* index calculation for the input as, */
  1289. /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1290. i1 = i0 + n2;
  1291. i2 = i1 + n2;
  1292. i3 = i2 + n2;
  1293. /* Reading i0, i0+fftLen/2 inputs */
  1294. /* Read xa (real), ya (imag) input */
  1295. T0 = pSrc16[i0 * 2U];
  1296. T1 = pSrc16[(i0 * 2U) + 1U];
  1297. /* Read xc (real), yc (imag) input */
  1298. S0 = pSrc16[i2 * 2U];
  1299. S1 = pSrc16[(i2 * 2U) + 1U];
  1300. /* R0 = (xa + xc), R1 = (ya + yc) */
  1301. R0 = __SSAT(T0 + S0, 16U);
  1302. R1 = __SSAT(T1 + S1, 16U);
  1303. /* S0 = (xa - xc), S1 = (ya - yc) */
  1304. S0 = __SSAT(T0 - S0, 16U);
  1305. S1 = __SSAT(T1 - S1, 16U);
  1306. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1307. /* Read xb (real), yb (imag) input */
  1308. T0 = pSrc16[i1 * 2U];
  1309. T1 = pSrc16[(i1 * 2U) + 1U];
  1310. /* Read xd (real), yd (imag) input */
  1311. U0 = pSrc16[i3 * 2U];
  1312. U1 = pSrc16[(i3 * 2U) + 1U];
  1313. /* T0 = (xb + xd), T1 = (yb + yd) */
  1314. T0 = __SSAT(T0 + U0, 16U);
  1315. T1 = __SSAT(T1 + U1, 16U);
  1316. /* writing the butterfly processed i0 sample */
  1317. /* xa' = xa + xb + xc + xd */
  1318. /* ya' = ya + yb + yc + yd */
  1319. pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
  1320. pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
  1321. /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
  1322. R0 = (R0 >> 1U) - (T0 >> 1U);
  1323. R1 = (R1 >> 1U) - (T1 >> 1U);
  1324. /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1325. out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
  1326. /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1327. out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
  1328. /* Reading i0+fftLen/4 */
  1329. /* Read xb (real), yb (imag) input again, before its slot is overwritten just below */
  1330. T0 = pSrc16[i1 * 2U];
  1331. T1 = pSrc16[(i1 * 2U) + 1U];
  1332. /* writing the butterfly processed i0 + fftLen/4 sample */
  1333. /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
  1334. /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
  1335. pSrc16[i1 * 2U] = out1;
  1336. pSrc16[(i1 * 2U) + 1U] = out2;
  1337. /* Butterfly calculations */
  1338. /* Read xd (real), yd (imag) input */
  1339. U0 = pSrc16[i3 * 2U];
  1340. U1 = pSrc16[(i3 * 2U) + 1U];
  1341. /* T0 = xb-xd, T1 = yb-yd */
  1342. T0 = __SSAT(T0 - U0, 16U);
  1343. T1 = __SSAT(T1 - U1, 16U);
  1344. /* R0 = (xa-xc) + (yb-yd), R1 = (ya-yc) - (xb-xd), each operand halved first */
  1345. R0 = (S0 >> 1U) + (T1 >> 1U);
  1346. R1 = (S1 >> 1U) - (T0 >> 1U);
  1347. /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd), each operand halved first */
  1348. S0 = (S0 >> 1U) - (T1 >> 1U);
  1349. S1 = (S1 >> 1U) + (T0 >> 1U);
  1350. /* Butterfly process for the i0+fftLen/2 sample */
  1351. out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
  1352. out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
  1353. /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
  1354. /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
  1355. pSrc16[i2 * 2U] = out1;
  1356. pSrc16[(i2 * 2U) + 1U] = out2;
  1357. /* Butterfly process for the i0+3fftLen/4 sample */
  1358. out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
  1359. out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
  1360. /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
  1361. /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
  1362. pSrc16[i3 * 2U] = out1;
  1363. pSrc16[(i3 * 2U) + 1U] = out2;
  1364. }
  1365. }
  1366. /* Twiddle coefficients index modifier */
  1367. twidCoefModifier <<= 2U;
  1368. }
  1369. /* End of Middle stages process */
  1370. /* data is in 10.6(q6) format for the 1024 point */
  1371. /* data is in 8.8(q8) format for the 256 point */
  1372. /* data is in 6.10(q10) format for the 64 point */
  1373. /* data is in 4.12(q12) format for the 16 point */
  1374. /* start of last stage process */
  1375. /* Initializations for the last stage */
  1376. n1 = n2;
  1377. n2 >>= 2U;
  1378. /* Butterfly implementation */
  1379. for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  1380. {
  1381. /* index calculation for the input as, */
  1382. /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
  1383. i1 = i0 + n2;
  1384. i2 = i1 + n2;
  1385. i3 = i2 + n2;
  1386. /* Reading i0, i0+fftLen/2 inputs */
  1387. /* Read xa (real), ya (imag) input */
  1388. T0 = pSrc16[i0 * 2U];
  1389. T1 = pSrc16[(i0 * 2U) + 1U];
  1390. /* Read xc (real), yc (imag) input */
  1391. S0 = pSrc16[i2 * 2U];
  1392. S1 = pSrc16[(i2 * 2U) + 1U];
  1393. /* R0 = (xa + xc), R1 = (ya + yc) */
  1394. R0 = __SSAT(T0 + S0, 16U);
  1395. R1 = __SSAT(T1 + S1, 16U);
  1396. /* S0 = (xa - xc), S1 = (ya - yc) */
  1397. S0 = __SSAT(T0 - S0, 16U);
  1398. S1 = __SSAT(T1 - S1, 16U);
  1399. /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
  1400. /* Read xb (real), yb (imag) input */
  1401. T0 = pSrc16[i1 * 2U];
  1402. T1 = pSrc16[(i1 * 2U) + 1U];
  1403. /* Read xd (real), yd (imag) input */
  1404. U0 = pSrc16[i3 * 2U];
  1405. U1 = pSrc16[(i3 * 2U) + 1U];
  1406. /* T0 = (xb + xd), T1 = (yb + yd) */
  1407. T0 = __SSAT(T0 + U0, 16U);
  1408. T1 = __SSAT(T1 + U1, 16U);
  1409. /* writing the butterfly processed i0 sample */
  1410. /* xa' = xa + xb + xc + xd */
  1411. /* ya' = ya + yb + yc + yd */
  1412. pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
  1413. pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
  1414. /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
  1415. R0 = (R0 >> 1U) - (T0 >> 1U);
  1416. R1 = (R1 >> 1U) - (T1 >> 1U);
  1417. /* Read xb (real), yb (imag) input again, before its slot is overwritten just below */
  1418. T0 = pSrc16[i1 * 2U];
  1419. T1 = pSrc16[(i1 * 2U) + 1U];
  1420. /* writing the butterfly processed i0 + fftLen/4 sample */
  1421. /* xc' = (xa-xb+xc-xd) */
  1422. /* yc' = (ya-yb+yc-yd) */
  1423. pSrc16[i1 * 2U] = R0;
  1424. pSrc16[(i1 * 2U) + 1U] = R1;
  1425. /* Read xd (real), yd (imag) input */
  1426. U0 = pSrc16[i3 * 2U];
  1427. U1 = pSrc16[(i3 * 2U) + 1U];
  1428. /* T0 = (xb - xd), T1 = (yb - yd) */
  1429. T0 = __SSAT(T0 - U0, 16U);
  1430. T1 = __SSAT(T1 - U1, 16U);
  1431. /* writing the butterfly processed i0 + fftLen/2 sample */
  1432. /* xb' = (xa-yb-xc+yd) */
  1433. /* yb' = (ya+xb-yc-xd) */
  1434. pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
  1435. pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
  1436. /* writing the butterfly processed i0 + 3fftLen/4 sample */
  1437. /* xd' = (xa+yb-xc-yd) */
  1438. /* yd' = (ya-xb-yc+xd) */
  1439. pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
  1440. pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
  1441. }
  1442. /* end of last stage process */
  1443. /* output is in 11.5(q5) format for the 1024 point */
  1444. /* output is in 9.7(q7) format for the 256 point */
  1445. /* output is in 7.9(q9) format for the 64 point */
  1446. /* output is in 5.11(q11) format for the 16 point */
  1447. #endif /* #if defined (ARM_MATH_DSP) */
  1448. }