arm_mat_cmplx_mult_q15.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_cmplx_mat_mult_q15.c
  4. * Description: Q15 complex matrix multiplication
  5. *
  6. * $Date: 27. January 2017
  7. * $Revision: V.1.5.1
  8. *
  9. * Target Processor: Cortex-M cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "arm_math.h"
  29. /**
  30. * @ingroup groupMatrix
  31. */
  32. /**
  33. * @addtogroup CmplxMatrixMult
  34. * @{
  35. */
  36. /**
  37. * @brief Q15 Complex matrix multiplication
  38. * @param[in] *pSrcA points to the first input complex matrix structure
  39. * @param[in] *pSrcB points to the second input complex matrix structure
  40. * @param[out] *pDst points to output complex matrix structure
  41. * @param[in] *pScratch points to the array for storing intermediate results
  42. * @return The function returns either
  43. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  44. *
  45. * \par Conditions for optimum performance
  46. * Input, output and state buffers should be aligned by 32-bit
  47. *
  48. * \par Restrictions
  49. * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
  50. * In this case input, output, scratch buffers should be aligned by 32-bit
  51. *
  52. * @details
  53. * <b>Scaling and Overflow Behavior:</b>
  54. *
  55. * \par
  56. * The function is implemented using a 64-bit internal accumulator. The inputs to the
  57. * multiplications are in 1.15 format and multiplications yield a 2.30 result.
  58. * The 2.30 intermediate
  59. * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
  60. * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
  61. * truncated to 34.15 format by discarding the low 15 bits and then saturated to
  62. * 1.15 format.
  63. *
  64. * \par
  65. * Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
  66. *
  67. */
  68. arm_status arm_mat_cmplx_mult_q15(
  69. const arm_matrix_instance_q15 * pSrcA,
  70. const arm_matrix_instance_q15 * pSrcB,
  71. arm_matrix_instance_q15 * pDst,
  72. q15_t * pScratch)
  73. {
  74. /* accumulator */
  75. q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
  76. q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
  77. q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
  78. q15_t *px; /* Temporary output data matrix pointer */
  79. uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
  80. uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
  81. uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
  82. uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
  83. uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
  84. arm_status status; /* status of matrix multiplication */
  85. q63_t sumReal, sumImag;
  86. #ifdef UNALIGNED_SUPPORT_DISABLE
  87. q15_t in; /* Temporary variable to hold the input value */
  88. q15_t a, b, c, d;
  89. #else
  90. q31_t in; /* Temporary variable to hold the input value */
  91. q31_t prod1, prod2;
  92. q31_t pSourceA, pSourceB;
  93. #endif
  94. #ifdef ARM_MATH_MATRIX_CHECK
  95. /* Check for matrix mismatch condition */
  96. if ((pSrcA->numCols != pSrcB->numRows) ||
  97. (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  98. {
  99. /* Set status as ARM_MATH_SIZE_MISMATCH */
  100. status = ARM_MATH_SIZE_MISMATCH;
  101. }
  102. else
  103. #endif
  104. {
  105. /* Matrix transpose */
  106. do
  107. {
  108. /* Apply loop unrolling and exchange the columns with row elements */
  109. col = numColsB >> 2;
  110. /* The pointer px is set to starting address of the column being processed */
  111. px = pSrcBT + i;
  112. /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
  113. ** a second loop below computes the remaining 1 to 3 samples. */
  114. while (col > 0U)
  115. {
  116. #ifdef UNALIGNED_SUPPORT_DISABLE
  117. /* Read two elements from the row */
  118. in = *pInB++;
  119. *px = in;
  120. in = *pInB++;
  121. px[1] = in;
  122. /* Update the pointer px to point to the next row of the transposed matrix */
  123. px += numRowsB * 2;
  124. /* Read two elements from the row */
  125. in = *pInB++;
  126. *px = in;
  127. in = *pInB++;
  128. px[1] = in;
  129. /* Update the pointer px to point to the next row of the transposed matrix */
  130. px += numRowsB * 2;
  131. /* Read two elements from the row */
  132. in = *pInB++;
  133. *px = in;
  134. in = *pInB++;
  135. px[1] = in;
  136. /* Update the pointer px to point to the next row of the transposed matrix */
  137. px += numRowsB * 2;
  138. /* Read two elements from the row */
  139. in = *pInB++;
  140. *px = in;
  141. in = *pInB++;
  142. px[1] = in;
  143. /* Update the pointer px to point to the next row of the transposed matrix */
  144. px += numRowsB * 2;
  145. /* Decrement the column loop counter */
  146. col--;
  147. }
  148. /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
  149. ** No loop unrolling is used. */
  150. col = numColsB % 0x4U;
  151. while (col > 0U)
  152. {
  153. /* Read two elements from the row */
  154. in = *pInB++;
  155. *px = in;
  156. in = *pInB++;
  157. px[1] = in;
  158. #else
  159. /* Read two elements from the row */
  160. in = *__SIMD32(pInB)++;
  161. *__SIMD32(px) = in;
  162. /* Update the pointer px to point to the next row of the transposed matrix */
  163. px += numRowsB * 2;
  164. /* Read two elements from the row */
  165. in = *__SIMD32(pInB)++;
  166. *__SIMD32(px) = in;
  167. /* Update the pointer px to point to the next row of the transposed matrix */
  168. px += numRowsB * 2;
  169. /* Read two elements from the row */
  170. in = *__SIMD32(pInB)++;
  171. *__SIMD32(px) = in;
  172. /* Update the pointer px to point to the next row of the transposed matrix */
  173. px += numRowsB * 2;
  174. /* Read two elements from the row */
  175. in = *__SIMD32(pInB)++;
  176. *__SIMD32(px) = in;
  177. /* Update the pointer px to point to the next row of the transposed matrix */
  178. px += numRowsB * 2;
  179. /* Decrement the column loop counter */
  180. col--;
  181. }
  182. /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
  183. ** No loop unrolling is used. */
  184. col = numColsB % 0x4U;
  185. while (col > 0U)
  186. {
  187. /* Read two elements from the row */
  188. in = *__SIMD32(pInB)++;
  189. *__SIMD32(px) = in;
  190. #endif
  191. /* Update the pointer px to point to the next row of the transposed matrix */
  192. px += numRowsB * 2;
  193. /* Decrement the column loop counter */
  194. col--;
  195. }
  196. i = i + 2U;
  197. /* Decrement the row loop counter */
  198. row--;
  199. } while (row > 0U);
  200. /* Reset the variables for the usage in the following multiplication process */
  201. row = numRowsA;
  202. i = 0U;
  203. px = pDst->pData;
  204. /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  205. /* row loop */
  206. do
  207. {
  208. /* For every row wise process, the column loop counter is to be initiated */
  209. col = numColsB;
  210. /* For every row wise process, the pIn2 pointer is set
  211. ** to the starting address of the transposed pSrcB data */
  212. pInB = pSrcBT;
  213. /* column loop */
  214. do
  215. {
  216. /* Set the variable sum, that acts as accumulator, to zero */
  217. sumReal = 0;
  218. sumImag = 0;
  219. /* Apply loop unrolling and compute 2 MACs simultaneously. */
  220. colCnt = numColsA >> 1;
  221. /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
  222. pInA = pSrcA->pData + i * 2;
  223. /* matrix multiplication */
  224. while (colCnt > 0U)
  225. {
  226. /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  227. #ifdef UNALIGNED_SUPPORT_DISABLE
  228. /* read real and imag values from pSrcA buffer */
  229. a = *pInA;
  230. b = *(pInA + 1U);
  231. /* read real and imag values from pSrcB buffer */
  232. c = *pInB;
  233. d = *(pInB + 1U);
  234. /* Multiply and Accumlates */
  235. sumReal += (q31_t) a *c;
  236. sumImag += (q31_t) a *d;
  237. sumReal -= (q31_t) b *d;
  238. sumImag += (q31_t) b *c;
  239. /* read next real and imag values from pSrcA buffer */
  240. a = *(pInA + 2U);
  241. b = *(pInA + 3U);
  242. /* read next real and imag values from pSrcB buffer */
  243. c = *(pInB + 2U);
  244. d = *(pInB + 3U);
  245. /* update pointer */
  246. pInA += 4U;
  247. /* Multiply and Accumlates */
  248. sumReal += (q31_t) a *c;
  249. sumImag += (q31_t) a *d;
  250. sumReal -= (q31_t) b *d;
  251. sumImag += (q31_t) b *c;
  252. /* update pointer */
  253. pInB += 4U;
  254. #else
  255. /* read real and imag values from pSrcA and pSrcB buffer */
  256. pSourceA = *__SIMD32(pInA)++;
  257. pSourceB = *__SIMD32(pInB)++;
  258. /* Multiply and Accumlates */
  259. #ifdef ARM_MATH_BIG_ENDIAN
  260. prod1 = -__SMUSD(pSourceA, pSourceB);
  261. #else
  262. prod1 = __SMUSD(pSourceA, pSourceB);
  263. #endif
  264. prod2 = __SMUADX(pSourceA, pSourceB);
  265. sumReal += (q63_t) prod1;
  266. sumImag += (q63_t) prod2;
  267. /* read real and imag values from pSrcA and pSrcB buffer */
  268. pSourceA = *__SIMD32(pInA)++;
  269. pSourceB = *__SIMD32(pInB)++;
  270. /* Multiply and Accumlates */
  271. #ifdef ARM_MATH_BIG_ENDIAN
  272. prod1 = -__SMUSD(pSourceA, pSourceB);
  273. #else
  274. prod1 = __SMUSD(pSourceA, pSourceB);
  275. #endif
  276. prod2 = __SMUADX(pSourceA, pSourceB);
  277. sumReal += (q63_t) prod1;
  278. sumImag += (q63_t) prod2;
  279. #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
  280. /* Decrement the loop counter */
  281. colCnt--;
  282. }
  283. /* process odd column samples */
  284. if ((numColsA & 0x1U) > 0U)
  285. {
  286. /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  287. #ifdef UNALIGNED_SUPPORT_DISABLE
  288. /* read real and imag values from pSrcA and pSrcB buffer */
  289. a = *pInA++;
  290. b = *pInA++;
  291. c = *pInB++;
  292. d = *pInB++;
  293. /* Multiply and Accumlates */
  294. sumReal += (q31_t) a *c;
  295. sumImag += (q31_t) a *d;
  296. sumReal -= (q31_t) b *d;
  297. sumImag += (q31_t) b *c;
  298. #else
  299. /* read real and imag values from pSrcA and pSrcB buffer */
  300. pSourceA = *__SIMD32(pInA)++;
  301. pSourceB = *__SIMD32(pInB)++;
  302. /* Multiply and Accumlates */
  303. #ifdef ARM_MATH_BIG_ENDIAN
  304. prod1 = -__SMUSD(pSourceA, pSourceB);
  305. #else
  306. prod1 = __SMUSD(pSourceA, pSourceB);
  307. #endif
  308. prod2 = __SMUADX(pSourceA, pSourceB);
  309. sumReal += (q63_t) prod1;
  310. sumImag += (q63_t) prod2;
  311. #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
  312. }
  313. /* Saturate and store the result in the destination buffer */
  314. *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
  315. *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
  316. /* Decrement the column loop counter */
  317. col--;
  318. } while (col > 0U);
  319. i = i + numColsA;
  320. /* Decrement the row loop counter */
  321. row--;
  322. } while (row > 0U);
  323. /* set status as ARM_MATH_SUCCESS */
  324. status = ARM_MATH_SUCCESS;
  325. }
  326. /* Return to application */
  327. return (status);
  328. }
  329. /**
  330. * @} end of MatrixMult group
  331. */