/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nnfunctions.h
 * Description:  Public header file for CMSIS NN Library
 *
 * $Date:        13. July 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
/**
 * \mainpage CMSIS NN Software Library
 *
 * Introduction
 * ------------
 *
 * This user manual describes the CMSIS NN software library,
 * a collection of efficient neural network kernels developed to maximize the
 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
 *
 * The library is divided into a number of functions each covering a specific category:
 * - Neural Network Convolution Functions
 * - Neural Network Activation Functions
 * - Fully-connected Layer Functions
 * - Neural Network Pooling Functions
 * - Softmax Functions
 * - Neural Network Support Functions
 *
 * The library has separate functions for operating on different weight and activation data
 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
 * kernels is included in the function description. The implementation details are also
 * described in this paper [1].
 *
 * Block Diagram
 * --------
 * \image html CMSIS-NN-OVERVIEW.PNG
 *
 * Examples
 * --------
 *
 * The library ships with a number of examples which demonstrate how to use the library functions.
 *
 * Pre-processor Macros
 * ------------
 *
 * Each library project has different pre-processor macros.
 *
 * - ARM_MATH_DSP:
 *
 * Define macro ARM_MATH_DSP if the silicon supports DSP instructions.
 *
 * - ARM_MATH_BIG_ENDIAN:
 *
 * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default the library builds for little endian targets.
 *
 * - ARM_NN_TRUNCATE:
 *
 * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
 *
 * Copyright Notice
 * ------------
 *
 * Copyright (C) 2010-2018 Arm Limited. All rights reserved.
 *
 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
 */
  84. /**
  85. * @defgroup groupNN Neural Network Functions
  86. * These functions perform basic operations for neural network layers.
  87. */
  88. #ifndef _ARM_NNFUNCTIONS_H
  89. #define _ARM_NNFUNCTIONS_H
  90. #include "arm_nnsupportfunctions.h"
  91. #include "arm_nn_tables.h"
  92. #define USE_INTRINSIC
  93. //#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */
  94. #ifdef __cplusplus
  95. extern "C"
  96. {
  97. #endif
/**
 * @defgroup NNConv Neural Network Convolution Functions
 *
 * Perform convolution layer
 *
 * The convolution is implemented in 2 steps: im2col and GEMM
 *
 * im2col is a process of converting each patch of image data into
 * a column. After im2col, the convolution is computed as matrix-matrix
 * multiplication.
 *
 * To reduce the memory footprint, the im2col is performed partially.
 * Each iteration, only a few columns (i.e., patches) are generated and
 * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
 *
 */
  114. /**
  115. * @brief Basic Q7 convolution function
  116. * @param[in] Im_in pointer to input tensor
  117. * @param[in] dim_im_in input tensor dimention
  118. * @param[in] ch_im_in number of input tensor channels
  119. * @param[in] wt pointer to kernel weights
  120. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  121. * @param[in] dim_kernel filter kernel size
  122. * @param[in] padding padding sizes
  123. * @param[in] stride convolution stride
  124. * @param[in] bias pointer to bias
  125. * @param[in] bias_shift amount of left-shift for bias
  126. * @param[in] out_shift amount of right-shift for output
  127. * @param[in,out] Im_out pointer to output tensor
  128. * @param[in] dim_im_out output tensor dimension
  129. * @param[in,out] bufferA pointer to buffer space for input
  130. * @param[in,out] bufferB pointer to buffer space for output
  131. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  132. *
  133. */
  134. arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
  135. const uint16_t dim_im_in,
  136. const uint16_t ch_im_in,
  137. const q7_t * wt,
  138. const uint16_t ch_im_out,
  139. const uint16_t dim_kernel,
  140. const uint16_t padding,
  141. const uint16_t stride,
  142. const q7_t * bias,
  143. const uint16_t bias_shift,
  144. const uint16_t out_shift,
  145. q7_t * Im_out,
  146. const uint16_t dim_im_out,
  147. q15_t * bufferA,
  148. q7_t * bufferB);
  149. /**
  150. * @brief Basic Q7 convolution function (non-sqaure shape)
  151. * @param[in] Im_in pointer to input tensor
  152. * @param[in] dim_im_in_x input tensor dimention x
  153. * @param[in] dim_im_in_y input tensor dimention y
  154. * @param[in] ch_im_in number of input tensor channels
  155. * @param[in] wt pointer to kernel weights
  156. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  157. * @param[in] dim_kernel_x filter kernel size x
  158. * @param[in] dim_kernel_y filter kernel size y
  159. * @param[in] padding_x padding size x
  160. * @param[in] padding_y padding size y
  161. * @param[in] stride_x convolution stride x
  162. * @param[in] stride_y convolution stride y
  163. * @param[in] bias pointer to bias
  164. * @param[in] bias_shift amount of left-shift for bias
  165. * @param[in] out_shift amount of right-shift for output
  166. * @param[in,out] Im_out pointer to output tensor
  167. * @param[in] dim_im_out_x output tensor dimension x
  168. * @param[in] dim_im_out_y output tensor dimension y
  169. * @param[in,out] bufferA pointer to buffer space for input
  170. * @param[in,out] bufferB pointer to buffer space for output
  171. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  172. */
  173. arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
  174. const uint16_t dim_im_in_x,
  175. const uint16_t dim_im_in_y,
  176. const uint16_t ch_im_in,
  177. const q7_t * wt,
  178. const uint16_t ch_im_out,
  179. const uint16_t dim_kernel_x,
  180. const uint16_t dim_kernel_y,
  181. const uint16_t padding_x,
  182. const uint16_t padding_y,
  183. const uint16_t stride_x,
  184. const uint16_t stride_y,
  185. const q7_t * bias,
  186. const uint16_t bias_shift,
  187. const uint16_t out_shift,
  188. q7_t * Im_out,
  189. const uint16_t dim_im_out_x,
  190. const uint16_t dim_im_out_y,
  191. q15_t * bufferA,
  192. q7_t * bufferB);
  193. /**
  194. * @brief Basic Q15 convolution function
  195. * @param[in] Im_in pointer to input tensor
  196. * @param[in] dim_im_in input tensor dimention
  197. * @param[in] ch_im_in number of input tensor channels
  198. * @param[in] wt pointer to kernel weights
  199. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  200. * @param[in] dim_kernel filter kernel size
  201. * @param[in] padding padding sizes
  202. * @param[in] stride convolution stride
  203. * @param[in] bias pointer to bias
  204. * @param[in] bias_shift amount of left-shift for bias
  205. * @param[in] out_shift amount of right-shift for output
  206. * @param[in,out] Im_out pointer to output tensor
  207. * @param[in] dim_im_out output tensor dimension
  208. * @param[in,out] bufferA pointer to buffer space for input
  209. * @param[in,out] bufferB pointer to buffer space for output
  210. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  211. *
  212. */
  213. arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
  214. const uint16_t dim_im_in,
  215. const uint16_t ch_im_in,
  216. const q15_t * wt,
  217. const uint16_t ch_im_out,
  218. const uint16_t dim_kernel,
  219. const uint16_t padding,
  220. const uint16_t stride,
  221. const q15_t * bias,
  222. const uint16_t bias_shift,
  223. const uint16_t out_shift,
  224. q15_t * Im_out,
  225. const uint16_t dim_im_out,
  226. q15_t * bufferA,
  227. q7_t * bufferB);
  228. /**
  229. * @brief Fast Q7 convolution function
  230. * @param[in] Im_in pointer to input tensor
  231. * @param[in] dim_im_in input tensor dimention
  232. * @param[in] ch_im_in number of input tensor channels
  233. * @param[in] wt pointer to kernel weights
  234. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  235. * @param[in] dim_kernel filter kernel size
  236. * @param[in] padding padding sizes
  237. * @param[in] stride convolution stride
  238. * @param[in] bias pointer to bias
  239. * @param[in] bias_shift amount of left-shift for bias
  240. * @param[in] out_shift amount of right-shift for output
  241. * @param[in,out] Im_out pointer to output tensor
  242. * @param[in] dim_im_out output tensor dimension
  243. * @param[in,out] bufferA pointer to buffer space for input
  244. * @param[in,out] bufferB pointer to buffer space for output
  245. * @return The function returns either
  246. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  247. *
  248. * This function is the version with full list of optimization tricks, but with
  249. * some contraints:
  250. * ch_im_in is multiple of 4
  251. * ch_im_out is multiple of 2
  252. */
  253. arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
  254. const uint16_t dim_im_in,
  255. const uint16_t ch_im_in,
  256. const q7_t * wt,
  257. const uint16_t ch_im_out,
  258. const uint16_t dim_kernel,
  259. const uint16_t padding,
  260. const uint16_t stride,
  261. const q7_t * bias,
  262. const uint16_t bias_shift,
  263. const uint16_t out_shift,
  264. q7_t * Im_out,
  265. const uint16_t dim_im_out,
  266. q15_t * bufferA,
  267. q7_t * bufferB);
  268. /**
  269. * @brief Fast Q7 convolution function (non-sqaure shape)
  270. * @param[in] Im_in pointer to input tensor
  271. * @param[in] dim_im_in_x input tensor dimention x
  272. * @param[in] dim_im_in_y input tensor dimention y
  273. * @param[in] ch_im_in number of input tensor channels
  274. * @param[in] wt pointer to kernel weights
  275. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  276. * @param[in] dim_kernel_x filter kernel size x
  277. * @param[in] dim_kernel_y filter kernel size y
  278. * @param[in] padding_x padding size x
  279. * @param[in] padding_y padding size y
  280. * @param[in] stride_x convolution stride x
  281. * @param[in] stride_y convolution stride y
  282. * @param[in] bias pointer to bias
  283. * @param[in] bias_shift amount of left-shift for bias
  284. * @param[in] out_shift amount of right-shift for output
  285. * @param[in,out] Im_out pointer to output tensor
  286. * @param[in] dim_im_out_x output tensor dimension x
  287. * @param[in] dim_im_out_y output tensor dimension y
  288. * @param[in,out] bufferA pointer to buffer space for input
  289. * @param[in,out] bufferB pointer to buffer space for output
  290. * @return The function returns either
  291. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  292. *
  293. * This function is the version with full list of optimization tricks, but with
  294. * some contraints:
  295. * ch_im_in is multiple of 4
  296. * ch_im_out is multiple of 2
  297. */
  298. arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
  299. const uint16_t dim_im_in_x,
  300. const uint16_t dim_im_in_y,
  301. const uint16_t ch_im_in,
  302. const q7_t * wt,
  303. const uint16_t ch_im_out,
  304. const uint16_t dim_kernel_x,
  305. const uint16_t dim_kernel_y,
  306. const uint16_t padding_x,
  307. const uint16_t padding_y,
  308. const uint16_t stride_x,
  309. const uint16_t stride_y,
  310. const q7_t * bias,
  311. const uint16_t bias_shift,
  312. const uint16_t out_shift,
  313. q7_t * Im_out,
  314. const uint16_t dim_im_out_x,
  315. const uint16_t dim_im_out_y,
  316. q15_t * bufferA,
  317. q7_t * bufferB);
  318. /**
  319. * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
  320. * @param[in] Im_in pointer to input tensor
  321. * @param[in] dim_im_in_x input tensor dimention x
  322. * @param[in] dim_im_in_y input tensor dimention y
  323. * @param[in] ch_im_in number of input tensor channels
  324. * @param[in] wt pointer to kernel weights
  325. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  326. * @param[in] dim_kernel_x filter kernel size x
  327. * @param[in] dim_kernel_y filter kernel size y
  328. * @param[in] padding_x padding size x
  329. * @param[in] padding_y padding size y
  330. * @param[in] stride_x convolution stride x
  331. * @param[in] stride_y convolution stride y
  332. * @param[in] bias pointer to bias
  333. * @param[in] bias_shift amount of left-shift for bias
  334. * @param[in] out_shift amount of right-shift for output
  335. * @param[in,out] Im_out pointer to output tensor
  336. * @param[in] dim_im_out_x output tensor dimension x
  337. * @param[in] dim_im_out_y output tensor dimension y
  338. * @param[in,out] bufferA pointer to buffer space for input
  339. * @param[in,out] bufferB pointer to buffer space for output
  340. * @return The function returns either
  341. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  342. *
  343. * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1
  344. * and dim_kernel_y=1). It can be used for
  345. * second half of MobileNets after depthwise separable convolution.
  346. *
  347. * This function is the version with full list of optimization tricks, but with
  348. * some contraints:
  349. * ch_im_in is multiple of 4
  350. * ch_im_out is multiple of 2
  351. */
  352. arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
  353. const uint16_t dim_im_in_x,
  354. const uint16_t dim_im_in_y,
  355. const uint16_t ch_im_in,
  356. const q7_t * wt,
  357. const uint16_t ch_im_out,
  358. const uint16_t dim_kernel_x,
  359. const uint16_t dim_kernel_y,
  360. const uint16_t padding_x,
  361. const uint16_t padding_y,
  362. const uint16_t stride_x,
  363. const uint16_t stride_y,
  364. const q7_t * bias,
  365. const uint16_t bias_shift,
  366. const uint16_t out_shift,
  367. q7_t * Im_out,
  368. const uint16_t dim_im_out_x,
  369. const uint16_t dim_im_out_y,
  370. q15_t * bufferA,
  371. q7_t * bufferB);
  372. /**
  373. * @brief Q7 version of convolution for RGB image
  374. * @param[in] Im_in pointer to input tensor
  375. * @param[in] dim_im_in input tensor dimention
  376. * @param[in] ch_im_in number of input tensor channels
  377. * @param[in] wt pointer to kernel weights
  378. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  379. * @param[in] dim_kernel filter kernel size
  380. * @param[in] padding padding sizes
  381. * @param[in] stride convolution stride
  382. * @param[in] bias pointer to bias
  383. * @param[in] bias_shift amount of left-shift for bias
  384. * @param[in] out_shift amount of right-shift for output
  385. * @param[in,out] Im_out pointer to output tensor
  386. * @param[in] dim_im_out output tensor dimension
  387. * @param[in,out] bufferA pointer to buffer space for input
  388. * @param[in,out] bufferB pointer to buffer space for output
  389. * @return The function returns either
  390. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  391. *
  392. * This kernel is written exclusively for convolution with ch_im_in
  393. * equals 3. This applies on the first layer of CNNs which has input
  394. * image with RGB format.
  395. */
  396. arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
  397. const uint16_t dim_im_in,
  398. const uint16_t ch_im_in,
  399. const q7_t * wt,
  400. const uint16_t ch_im_out,
  401. const uint16_t dim_kernel,
  402. const uint16_t padding,
  403. const uint16_t stride,
  404. const q7_t * bias,
  405. const uint16_t bias_shift,
  406. const uint16_t out_shift,
  407. q7_t * Im_out,
  408. const uint16_t dim_im_out,
  409. q15_t * bufferA,
  410. q7_t * bufferB);
  411. /**
  412. * @brief Fast Q15 convolution function
  413. * @param[in] Im_in pointer to input tensor
  414. * @param[in] dim_im_in input tensor dimention
  415. * @param[in] ch_im_in number of input tensor channels
  416. * @param[in] wt pointer to kernel weights
  417. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  418. * @param[in] dim_kernel filter kernel size
  419. * @param[in] padding padding sizes
  420. * @param[in] stride convolution stride
  421. * @param[in] bias pointer to bias
  422. * @param[in] bias_shift amount of left-shift for bias
  423. * @param[in] out_shift amount of right-shift for output
  424. * @param[in,out] Im_out pointer to output tensor
  425. * @param[in] dim_im_out output tensor dimension
  426. * @param[in,out] bufferA pointer to buffer space for input
  427. * @param[in,out] bufferB pointer to buffer space for output
  428. * @return The function returns either
  429. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  430. *
  431. * This function is the version with full list of optimization tricks, but with
  432. * some contraints:
  433. * ch_im_in is multiple of 2
  434. * ch_im_out is multiple of 2
  435. */
  436. arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
  437. const uint16_t dim_im_in,
  438. const uint16_t ch_im_in,
  439. const q15_t * wt,
  440. const uint16_t ch_im_out,
  441. const uint16_t dim_kernel,
  442. const uint16_t padding,
  443. const uint16_t stride,
  444. const q15_t * bias,
  445. const uint16_t bias_shift,
  446. const uint16_t out_shift,
  447. q15_t * Im_out,
  448. const uint16_t dim_im_out,
  449. q15_t * bufferA,
  450. q7_t * bufferB);
  451. /**
  452. * @brief Fast Q15 convolution function (non-sqaure shape)
  453. * @param[in] Im_in pointer to input tensor
  454. * @param[in] dim_im_in_x input tensor dimention x
  455. * @param[in] dim_im_in_y input tensor dimention y
  456. * @param[in] ch_im_in number of input tensor channels
  457. * @param[in] wt pointer to kernel weights
  458. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  459. * @param[in] dim_kernel_x filter kernel size x
  460. * @param[in] dim_kernel_y filter kernel size y
  461. * @param[in] padding_x padding size x
  462. * @param[in] padding_y padding size y
  463. * @param[in] stride_x convolution stride x
  464. * @param[in] stride_y convolution stride y
  465. * @param[in] bias pointer to bias
  466. * @param[in] bias_shift amount of left-shift for bias
  467. * @param[in] out_shift amount of right-shift for output
  468. * @param[in,out] Im_out pointer to output tensor
  469. * @param[in] dim_im_out_x output tensor dimension x
  470. * @param[in] dim_im_out_y output tensor dimension y
  471. * @param[in,out] bufferA pointer to buffer space for input
  472. * @param[in,out] bufferB pointer to buffer space for output
  473. * @return The function returns either
  474. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  475. *
  476. * @details
  477. *
  478. * <b>Buffer size:</b>
  479. *
  480. * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
  481. *
  482. * bufferB size: 0
  483. *
  484. * <b>Input dimension constraints:</b>
  485. *
  486. * ch_im_in is multiple of 2
  487. *
  488. * ch_im_out is multipe of 2
  489. *
  490. */
  491. arm_status
  492. arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
  493. const uint16_t dim_im_in_x,
  494. const uint16_t dim_im_in_y,
  495. const uint16_t ch_im_in,
  496. const q15_t * wt,
  497. const uint16_t ch_im_out,
  498. const uint16_t dim_kernel_x,
  499. const uint16_t dim_kernel_y,
  500. const uint16_t padding_x,
  501. const uint16_t padding_y,
  502. const uint16_t stride_x,
  503. const uint16_t stride_y,
  504. const q15_t * bias,
  505. const uint16_t bias_shift,
  506. const uint16_t out_shift,
  507. q15_t * Im_out,
  508. const uint16_t dim_im_out_x,
  509. const uint16_t dim_im_out_y,
  510. q15_t * bufferA,
  511. q7_t * bufferB);
  512. /**
  513. * @brief Q7 depthwise separable convolution function
  514. * @param[in] Im_in pointer to input tensor
  515. * @param[in] dim_im_in input tensor dimention
  516. * @param[in] ch_im_in number of input tensor channels
  517. * @param[in] wt pointer to kernel weights
  518. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  519. * @param[in] dim_kernel filter kernel size
  520. * @param[in] padding padding sizes
  521. * @param[in] stride convolution stride
  522. * @param[in] bias pointer to bias
  523. * @param[in] bias_shift amount of left-shift for bias
  524. * @param[in] out_shift amount of right-shift for output
  525. * @param[in,out] Im_out pointer to output tensor
  526. * @param[in] dim_im_out output tensor dimension
  527. * @param[in,out] bufferA pointer to buffer space for input
  528. * @param[in,out] bufferB pointer to buffer space for output
  529. * @return The function returns either
  530. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  531. *
  532. * This function is the version with full list of optimization tricks, but with
  533. * some contraints:
  534. * ch_im_in is multiple of 2
  535. * ch_im_out is multiple of 2
  536. */
  537. arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
  538. const uint16_t dim_im_in,
  539. const uint16_t ch_im_in,
  540. const q7_t * wt,
  541. const uint16_t ch_im_out,
  542. const uint16_t dim_kernel,
  543. const uint16_t padding,
  544. const uint16_t stride,
  545. const q7_t * bias,
  546. const uint16_t bias_shift,
  547. const uint16_t out_shift,
  548. q7_t * Im_out,
  549. const uint16_t dim_im_out,
  550. q15_t * bufferA,
  551. q7_t * bufferB);
  552. /**
  553. * @brief Q7 depthwise separable convolution function (non-square shape)
  554. * @param[in] Im_in pointer to input tensor
  555. * @param[in] dim_im_in_x input tensor dimention x
  556. * @param[in] dim_im_in_y input tensor dimention y
  557. * @param[in] ch_im_in number of input tensor channels
  558. * @param[in] wt pointer to kernel weights
  559. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  560. * @param[in] dim_kernel_x filter kernel size x
  561. * @param[in] dim_kernel_y filter kernel size y
  562. * @param[in] padding_x padding sizes x
  563. * @param[in] padding_y padding sizes y
  564. * @param[in] stride_x convolution stride x
  565. * @param[in] stride_y convolution stride y
  566. * @param[in] bias pointer to bias
  567. * @param[in] bias_shift amount of left-shift for bias
  568. * @param[in] out_shift amount of right-shift for output
  569. * @param[in,out] Im_out pointer to output tensor
  570. * @param[in] dim_im_out_x output tensor dimension x
  571. * @param[in] dim_im_out_y output tensor dimension y
  572. * @param[in,out] bufferA pointer to buffer space for input
  573. * @param[in,out] bufferB pointer to buffer space for output
  574. * @return The function returns either
  575. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  576. *
  577. * This function is the version with full list of optimization tricks, but with
  578. * some contraints:
  579. * ch_im_in is multiple of 2
  580. * ch_im_out is multiple of 2
  581. */
  582. arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
  583. const uint16_t dim_im_in_x,
  584. const uint16_t dim_im_in_y,
  585. const uint16_t ch_im_in,
  586. const q7_t * wt,
  587. const uint16_t ch_im_out,
  588. const uint16_t dim_kernel_x,
  589. const uint16_t dim_kernel_y,
  590. const uint16_t padding_x,
  591. const uint16_t padding_y,
  592. const uint16_t stride_x,
  593. const uint16_t stride_y,
  594. const q7_t * bias,
  595. const uint16_t bias_shift,
  596. const uint16_t out_shift,
  597. q7_t * Im_out,
  598. const uint16_t dim_im_out_x,
  599. const uint16_t dim_im_out_y,
  600. q15_t * bufferA,
  601. q7_t * bufferB);
/**
 * @defgroup FC Fully-connected Layer Functions
 *
 * Perform fully-connected layer
 *
 * Fully-connected layer is basically a matrix-vector multiplication
 * with bias. The matrix is the weights and the input/output vectors
 * are the activation values. Supported {weight, activation} precisions
 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
 *
 * Here we have two types of kernel functions. The basic function
 * implements the function using regular GEMV approach. The opt functions
 * operate with weights in interleaved formats.
 *
 */
  617. /**
  618. * @brief Q7 basic fully-connected layer function
  619. * @param[in] pV pointer to input vector
  620. * @param[in] pM pointer to matrix weights
  621. * @param[in] dim_vec length of the vector
  622. * @param[in] num_of_rows number of rows in weight matrix
  623. * @param[in] bias_shift amount of left-shift for bias
  624. * @param[in] out_shift amount of right-shift for output
  625. * @param[in] bias pointer to bias
  626. * @param[in,out] pOut pointer to output vector
  627. * @param[in,out] vec_buffer pointer to buffer space for input
  628. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  629. *
  630. */
  631. arm_status arm_fully_connected_q7(const q7_t * pV,
  632. const q7_t * pM,
  633. const uint16_t dim_vec,
  634. const uint16_t num_of_rows,
  635. const uint16_t bias_shift,
  636. const uint16_t out_shift,
  637. const q7_t * bias,
  638. q7_t * pOut,
  639. q15_t * vec_buffer);
  640. /**
  641. * @brief Q7 opt fully-connected layer function
  642. * @param[in] pV pointer to input vector
  643. * @param[in] pM pointer to matrix weights
  644. * @param[in] dim_vec length of the vector
  645. * @param[in] num_of_rows number of rows in weight matrix
  646. * @param[in] bias_shift amount of left-shift for bias
  647. * @param[in] out_shift amount of right-shift for output
  648. * @param[in] bias pointer to bias
  649. * @param[in,out] pOut pointer to output vector
  650. * @param[in,out] vec_buffer pointer to buffer space for input
  651. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  652. *
  653. */
  654. arm_status arm_fully_connected_q7_opt(const q7_t * pV,
  655. const q7_t * pM,
  656. const uint16_t dim_vec,
  657. const uint16_t num_of_rows,
  658. const uint16_t bias_shift,
  659. const uint16_t out_shift,
  660. const q7_t * bias,
  661. q7_t * pOut,
  662. q15_t * vec_buffer);
  663. /**
  664. * @brief Q15 basic fully-connected layer function
  665. * @param[in] pV pointer to input vector
  666. * @param[in] pM pointer to matrix weights
  667. * @param[in] dim_vec length of the vector
  668. * @param[in] num_of_rows number of rows in weight matrix
  669. * @param[in] bias_shift amount of left-shift for bias
  670. * @param[in] out_shift amount of right-shift for output
  671. * @param[in] bias pointer to bias
  672. * @param[in,out] pOut pointer to output vector
  673. * @param[in,out] vec_buffer pointer to buffer space for input
  674. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  675. *
  676. */
  677. arm_status arm_fully_connected_q15(const q15_t * pV,
  678. const q15_t * pM,
  679. const uint16_t dim_vec,
  680. const uint16_t num_of_rows,
  681. const uint16_t bias_shift,
  682. const uint16_t out_shift,
  683. const q15_t * bias,
  684. q15_t * pOut,
  685. q15_t * vec_buffer);
/**
 * @brief Q15 opt fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the input vector
 * @param[in]       num_of_rows number of rows in the weight matrix
 * @param[in]       bias_shift  amount of left-shift applied to the bias
 * @param[in]       out_shift   amount of right-shift applied to the output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for the input
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @note NOTE(review): the "opt" variant presumably requires the weights in
 *       a reordered/interleaved layout -- confirm against the implementation.
 */
arm_status arm_fully_connected_q15_opt(const q15_t * pV,
                                       const q15_t * pM,
                                       const uint16_t dim_vec,
                                       const uint16_t num_of_rows,
                                       const uint16_t bias_shift,
                                       const uint16_t out_shift,
                                       const q15_t * bias,
                                       q15_t * pOut,
                                       q15_t * vec_buffer);
/**
 * @brief Mixed Q15-Q7 fully-connected layer function
 *        (Q15 input/output vectors with Q7 weights and bias)
 * @param[in]       pV          pointer to input vector (Q15)
 * @param[in]       pM          pointer to matrix weights (Q7)
 * @param[in]       dim_vec     length of the input vector
 * @param[in]       num_of_rows number of rows in the weight matrix
 * @param[in]       bias_shift  amount of left-shift applied to the bias
 * @param[in]       out_shift   amount of right-shift applied to the output
 * @param[in]       bias        pointer to bias (Q7)
 * @param[in,out]   pOut        pointer to output vector (Q15)
 * @param[in,out]   vec_buffer  pointer to buffer space for the input
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
                                              const q7_t * pM,
                                              const uint16_t dim_vec,
                                              const uint16_t num_of_rows,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              const q7_t * bias,
                                              q15_t * pOut,
                                              q15_t * vec_buffer);
/**
 * @brief Mixed Q15-Q7 opt fully-connected layer function
 *        (Q15 input/output vectors with Q7 weights and bias)
 * @param[in]       pV          pointer to input vector (Q15)
 * @param[in]       pM          pointer to matrix weights (Q7)
 * @param[in]       dim_vec     length of the input vector
 * @param[in]       num_of_rows number of rows in the weight matrix
 * @param[in]       bias_shift  amount of left-shift applied to the bias
 * @param[in]       out_shift   amount of right-shift applied to the output
 * @param[in]       bias        pointer to bias (Q7)
 * @param[in,out]   pOut        pointer to output vector (Q15)
 * @param[in,out]   vec_buffer  pointer to buffer space for the input
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @note NOTE(review): the "opt" variant presumably requires the weights in
 *       a reordered/interleaved layout -- confirm against the implementation.
 */
arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
                                                  const q7_t * pM,
                                                  const uint16_t dim_vec,
                                                  const uint16_t num_of_rows,
                                                  const uint16_t bias_shift,
                                                  const uint16_t out_shift,
                                                  const q7_t * bias,
                                                  q15_t * pOut,
                                                  q15_t * vec_buffer);
/**
 * @brief Matrix-Multiplication Kernels for Convolution
 *
 * These functions are used within the convolution layer functions for
 * matrix multiplication.
 *
 * The implementation is similar to the CMSIS-DSP arm_mat_mult functions
 * with one Q7 and one Q15 operand. The Q15 operand is the im2col
 * output, which always consists of 2 columns.
 *
 */
/**
 * @brief Matrix-multiplication function for convolution
 * @param[in]       pA          pointer to operand A (the Q7 weight matrix)
 * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
 * @param[in]       ch_im_out   numRow of A
 * @param[in]       numCol_A    numCol of A
 * @param[in]       bias_shift  amount of left-shift applied to the bias
 * @param[in]       out_shift   amount of right-shift applied to the output
 * @param[in]       bias        the bias
 * @param[in,out]   pOut        pointer to output
 * @return The function returns the incremented output pointer
 */
q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
                                    const q15_t * pInBuffer,
                                    const uint16_t ch_im_out,
                                    const uint16_t numCol_A,
                                    const uint16_t bias_shift,
                                    const uint16_t out_shift,
                                    const q7_t * bias,
                                    q7_t * pOut);
/**
 * @brief Matrix-multiplication function for convolution with reordered columns
 * @param[in]       pA          pointer to operand A (the Q7 weight matrix)
 * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
 * @param[in]       ch_im_out   numRow of A
 * @param[in]       numCol_A    numCol of A
 * @param[in]       bias_shift  amount of left-shift applied to the bias
 * @param[in]       out_shift   amount of right-shift applied to the output
 * @param[in]       bias        the bias
 * @param[in,out]   pOut        pointer to output
 * @return The function returns the incremented output pointer
 */
q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
                                              const q15_t * pInBuffer,
                                              const uint16_t ch_im_out,
                                              const uint16_t numCol_A,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              const q7_t * bias,
                                              q7_t * pOut);
  806. #ifdef __cplusplus
  807. }
  808. #endif
/*
 * Other functions
 * These layers are typically not timing-critical,
 * so only basic implementations are provided here.
 */
  814. #ifdef __cplusplus
  815. extern "C"
  816. {
  817. #endif
/**
 * @defgroup Acti Neural Network Activation Functions
 *
 * Perform activation layers, including ReLU (Rectified Linear Unit),
 * sigmoid and tanh
 *
 */

/**
 * @brief Q7 RELU function
 * @param[in,out]   data        pointer to input; updated in place
 * @param[in]       size        number of elements
 * @return none.
 */
void arm_relu_q7(q7_t * data, uint16_t size);
/**
 * @brief Q15 RELU function
 * @param[in,out]   data        pointer to input; updated in place
 * @param[in]       size        number of elements
 * @return none.
 */
void arm_relu_q15(q15_t * data, uint16_t size);
/**
 * @brief Q7 neural network activation function using direct table look-up
 * @param[in,out]   data        pointer to input; updated in place
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
 * @param[in]       type        type of activation function (sigmoid or tanh)
 * @return none.
 */
void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
                                  arm_nn_activation_type type);
/**
 * @brief Q15 neural network activation function using direct table look-up
 * @param[in,out]   data        pointer to input; updated in place
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
 * @param[in]       type        type of activation function (sigmoid or tanh)
 * @return none.
 */
void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
                                   arm_nn_activation_type type);
/**
 * @defgroup Pooling Neural Network Pooling Functions
 *
 * Perform pooling functions, including max pooling and average pooling
 *
 */

/**
 * @brief Q7 max pooling function (HWC data layout)
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for the input
 * @param[in,out]   Im_out      pointer to output tensor
 * @return none.
 *
 */
void arm_maxpool_q7_HWC(q7_t * Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t * bufferA,
                        q7_t * Im_out);
/**
 * @brief Q7 average pooling function (HWC data layout)
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for the input
 * @param[in,out]   Im_out      pointer to output tensor
 * @return none.
 *
 */
void arm_avepool_q7_HWC(q7_t * Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t * bufferA,
                        q7_t * Im_out);
/**
 * @defgroup Softmax Softmax Functions
 *
 * EXP(2) based softmax function
 *
 */

/**
 * @brief Q7 softmax function
 * @param[in]       vec_in      pointer to input vector
 * @param[in]       dim_vec     input vector dimension
 * @param[out]      p_out       pointer to output vector
 * @return none.
 *
 */
void arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
/**
 * @brief Q15 softmax function
 * @param[in]       vec_in      pointer to input vector
 * @param[in]       dim_vec     input vector dimension
 * @param[out]      p_out       pointer to output vector
 * @return none.
 *
 */
void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
  935. #ifdef __cplusplus
  936. }
  937. #endif
  938. #endif