1 /*
2  * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_nnfunctions.h
22  * Description:  Public header file for CMSIS NN Library
23  *
24  * $Date:        19 March 2021
25  * $Revision:    V.7.0.0
26  *
27  * Target Processor:  Cortex-M CPUs
28  * -------------------------------------------------------------------- */
29 
30 /**
31    \mainpage CMSIS NN Software Library
32    *
33    * Introduction
34    * ------------
35    *
36    * This user manual describes the CMSIS NN software library,
37    * a collection of efficient neural network kernels developed to maximize the
38    * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
39    *
   * The library is divided into a number of functions, each covering a specific category:
41    * - Convolution Functions
42    * - Activation Functions
43    * - Fully-connected Layer Functions
44    * - SVDF Layer Functions
45    * - Pooling Functions
46    * - Softmax Functions
47    * - Basic math Functions
48    *
   * The library has separate functions for operating on different weight and activation data
   * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of each
   * kernel is included in its function documentation. The implementation details are also
   * described in this paper [1].
53    *
54    * Function Classification
55    * --------
   * The functions can be classified into two segments:
   * - Legacy functions supporting Arm's internal symmetric quantization (8 bits).
   * - Functions that support the TensorFlow Lite framework with symmetric quantization (8 bits).
59    *
   * The legacy functions can be identified by their _q7 or _q15 suffix, and no new development is done on them.
61    * The article in [2] describes in detail how to run a network using the legacy functions.
62    *
   * The functions that support the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from
   * TFLite Micro. The functions are bit-exact with TensorFlow Lite. Refer to TensorFlow's documentation in [3] on how
   * to run a TensorFlow Lite model using optimized CMSIS-NN kernels.
66    *
67    * Block Diagram
68    * --------
69    * \image html CMSIS-NN-OVERVIEW.PNG
70    *
71    * Examples
72    * --------
73    *
74    * The library ships with a number of examples which demonstrate how to use the library functions.
75    *
76    * Pre-processor Macros
77    * ------------
78    *
   * Each library project has different pre-processor macros.
80    *
81    * - ARM_MATH_DSP:
82    *
   * Define the macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
84    *
85    * - ARM_MATH_MVEI:
86    *
   * Define the macro ARM_MATH_MVEI if the silicon supports the M-Profile Vector Extension.
   *
   * - ARM_MATH_AUTOVECTORIZE:
   *
   * Used in conjunction with ARM_MATH_MVEI to let the compiler auto-vectorize the functions that use inline
   * assembly. It does not affect functions that use C or intrinsics.
   *
92    * - ARM_MATH_BIG_ENDIAN:
93    *
   * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the
   * legacy functions, i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the
   * library builds for little endian targets.
97    *
98    * - ARM_NN_TRUNCATE:
99    *
100    * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
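   *
   * These macros are typically passed on the compiler command line rather than edited into the
   * sources. A minimal sketch, assuming a GCC-based toolchain (the exact flags depend on the
   * target and build system):
   *
   * <pre>
   *     arm-none-eabi-gcc -mcpu=cortex-m4 -DARM_MATH_DSP -c arm_convolve_s8.c
   * </pre>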
101    *
102    *
103    * Copyright Notice
104    * ------------
105    *
106    * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
107    *
108    * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
109    *
   * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
   *     https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
   *
   * [3] https://www.tensorflow.org/lite/microcontrollers/library
114    *
115    * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
116    */
117 
118 /**
119  * @defgroup groupNN Neural Network Functions
120  * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
121  * TensorFlow Lite framework.
122  */
123 
124 #ifndef _ARM_NNFUNCTIONS_H
125 #define _ARM_NNFUNCTIONS_H
126 
127 #include "arm_math_types.h"
128 #include "arm_nn_types.h"
129 
130 #define USE_INTRINSIC
131 
//#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor instead of round-to-the-nearest-int */
133 
134 #ifdef __cplusplus
135 extern "C" {
136 #endif
137 
138 /**
139  * @brief Struct for specifying activation function types
140  *
141  */
142 typedef enum
143 {
144     ARM_SIGMOID = 0,
145     /**< Sigmoid activation function */
146     ARM_TANH = 1,
147     /**< Tanh activation function */
148 } arm_nn_activation_type;
149 
150 /**
151  * @defgroup NNConv Convolution Functions
152  *
 * Collection of convolution and depthwise convolution functions and their variants.
154  *
155  * The convolution is implemented in 2 steps: im2col and GEMM
156  *
157  * im2col is a process of converting each patch of image data into
158  * a column. After im2col, the convolution is computed as matrix-matrix
159  * multiplication.
160  *
161  * To reduce the memory footprint, the im2col is performed partially.
 * In each iteration, only a few columns (i.e., patches) are generated and
163  * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
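 *
 * As a rough conceptual sketch (not the actual kernel code; the names and the scalar loop are
 * illustrative only), the computation for one output pixel after im2col reduces to a dot product
 * of the im2col column with each flattened filter:
 *
 * @code
 * // col[k]: one im2col column (a single input patch), k = kernel_h * kernel_w * ch_in entries
 * for (int co = 0; co < ch_out; co++)
 * {
 *     int32_t sum = bias[co];
 *     for (int k = 0; k < kernel_h * kernel_w * ch_in; k++)
 *     {
 *         sum += weights[co][k] * col[k];
 *     }
 *     out[co] = requantize(sum); // scale/shift back to the output data type (hypothetical helper)
 * }
 * @endcode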
164  *
165  */
166 
167 /**
 * @brief s8 convolution layer wrapper function with the main purpose of calling the optimal kernel available in
 *        CMSIS-NN to perform the convolution.
 *
 * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer size if required.
174  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
175  *                                Range of conv_params->input_offset  : [-127, 128]
176  *                                Range of conv_params->output_offset : [-128, 127]
177  * @param[in]      quant_params   Per-channel quantization info.
178  *                                It contains the multiplier and shift values to be applied to each output channel
179  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
180  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
181  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
182  *                                spatial filter dimensions
183  * @param[in]      filter_data    Filter data pointer. Data type: int8
184  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
185  * @param[in]      bias_data      Bias data pointer. Data type: int32
186  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
187  * @param[out]     output_data    Output data pointer. Data type: int8
188  *
189  * @return     The function returns either
 *                  <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
191  *                  <code>ARM_MATH_SUCCESS</code> on successful completion.
192  *
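 * A minimal usage sketch (the parameter structs are assumed to be already populated; malloc is
 * for illustration only, and the struct fields are those defined in arm_nn_types.h):
 *
 * @code
 * cmsis_nn_context ctx;
 * ctx.size = arm_convolve_wrapper_s8_get_buffer_size(&conv_params, &input_dims, &filter_dims, &output_dims);
 * ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL; // any suitably sized scratch memory works
 *
 * arm_status status = arm_convolve_wrapper_s8(&ctx, &conv_params, &quant_params,
 *                                             &input_dims, input_data,
 *                                             &filter_dims, filter_data,
 *                                             &bias_dims, bias_data,
 *                                             &output_dims, output_data);
 * @endcode
 *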
193  */
194 arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
195                                    const cmsis_nn_conv_params *conv_params,
196                                    const cmsis_nn_per_channel_quant_params *quant_params,
197                                    const cmsis_nn_dims *input_dims,
198                                    const q7_t *input_data,
199                                    const cmsis_nn_dims *filter_dims,
200                                    const q7_t *filter_data,
201                                    const cmsis_nn_dims *bias_dims,
202                                    const int32_t *bias_data,
203                                    const cmsis_nn_dims *output_dims,
204                                    q7_t *output_data);
205 
206 /**
207  * @brief Get the required buffer size for arm_convolve_wrapper_s8
208  *
209  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
210  *                                Range of conv_params->input_offset  : [-127, 128]
211  *                                Range of conv_params->output_offset : [-128, 127]
212  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
213  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
214  *                                filter dimensions
215  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
216  *
 * @return         The function returns the required buffer size in bytes
218  *
219  */
220 int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
221                                                 const cmsis_nn_dims *input_dims,
222                                                 const cmsis_nn_dims *filter_dims,
223                                                 const cmsis_nn_dims *output_dims);
224 
225 /**
226  * @brief Basic s8 convolution function
227  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_s8_get_buffer_size will return the buffer size if required.
229  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
230  *                                Range of conv_params->input_offset  : [-127, 128]
231  *                                Range of conv_params->output_offset : [-128, 127]
232  * @param[in]      quant_params   Per-channel quantization info.
233  *                                It contains the multiplier and shift values to be applied to each output channel
234  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
235  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
236  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
237  *                                spatial filter dimensions
238  * @param[in]      filter_data    Filter data pointer. Data type: int8
239  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
240  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
241  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
242  * @param[out]     output_data    Output data pointer. Data type: int8
 *
244  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
245  *
246  * @details
247  *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
249  *    3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
250  *
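 *    As with the wrapper above, the scratch buffer can be sized with the matching helper,
 *    declared below (malloc is used for illustration only; any suitably sized memory works):
 *
 * @code
 * cmsis_nn_context ctx;
 * ctx.size = arm_convolve_s8_get_buffer_size(&input_dims, &filter_dims);
 * ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
 * @endcode
 *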
251  */
252 arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
253                            const cmsis_nn_conv_params *conv_params,
254                            const cmsis_nn_per_channel_quant_params *quant_params,
255                            const cmsis_nn_dims *input_dims,
256                            const q7_t *input_data,
257                            const cmsis_nn_dims *filter_dims,
258                            const q7_t *filter_data,
259                            const cmsis_nn_dims *bias_dims,
260                            const int32_t *bias_data,
261                            const cmsis_nn_dims *output_dims,
262                            q7_t *output_data);
263 
264 /**
265  * @brief Get the required buffer size for s8 convolution function
266  *
267  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
268  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
269  * are the spatial filter dimensions
 * @return          The function returns the required buffer size in bytes
271  *
272  */
273 int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
274 
275 /**
276  * @brief Basic Q7 convolution function
277  * @param[in]       Im_in       pointer to input tensor
278  * @param[in]       dim_im_in   input tensor dimension
279  * @param[in]       ch_im_in    number of input tensor channels
280  * @param[in]       wt          pointer to kernel weights
281  * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
282  * @param[in]       dim_kernel  filter kernel size
283  * @param[in]       padding     padding sizes
284  * @param[in]       stride      convolution stride
285  * @param[in]       bias        pointer to bias
286  * @param[in]       bias_shift  amount of left-shift for bias
287  * @param[in]       out_shift   amount of right-shift for output
288  * @param[in,out]   Im_out      pointer to output tensor
289  * @param[in]       dim_im_out  output tensor dimension
290  * @param[in,out]   bufferA     pointer to buffer space for input
291  * @param[in,out]   bufferB     pointer to buffer space for output
292  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
293  *
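 * Conceptually, ignoring the implementation's rounding behaviour, each output element is computed
 * from the 32-bit accumulator as follows (a sketch; conv_sum and co are illustrative names):
 *
 * @code
 * // conv_sum: accumulated q7 multiply results for one output element of channel co
 * Im_out[i] = (q7_t)__SSAT((conv_sum + ((q31_t)bias[co] << bias_shift)) >> out_shift, 8);
 * @endcode
 *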
294  */
295 arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
296                                      const uint16_t dim_im_in,
297                                      const uint16_t ch_im_in,
298                                      const q7_t *wt,
299                                      const uint16_t ch_im_out,
300                                      const uint16_t dim_kernel,
301                                      const uint16_t padding,
302                                      const uint16_t stride,
303                                      const q7_t *bias,
304                                      const uint16_t bias_shift,
305                                      const uint16_t out_shift,
306                                      q7_t *Im_out,
307                                      const uint16_t dim_im_out,
308                                      q15_t *bufferA,
309                                      q7_t *bufferB);
310 
311 /**
312  * @brief Basic Q7 convolution function (non-square shape)
313  * @param[in]       Im_in        pointer to input tensor
314  * @param[in]       dim_im_in_x  input tensor dimension x
315  * @param[in]       dim_im_in_y  input tensor dimension y
316  * @param[in]       ch_im_in     number of input tensor channels
317  * @param[in]       wt           pointer to kernel weights
318  * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
319  * @param[in]       dim_kernel_x filter kernel size x
320  * @param[in]       dim_kernel_y filter kernel size y
321  * @param[in]       padding_x    padding size x
322  * @param[in]       padding_y    padding size y
323  * @param[in]       stride_x     convolution stride x
324  * @param[in]       stride_y     convolution stride y
325  * @param[in]       bias         pointer to bias
326  * @param[in]       bias_shift   amount of left-shift for bias
327  * @param[in]       out_shift    amount of right-shift for output
328  * @param[in,out]   Im_out       pointer to output tensor
329  * @param[in]       dim_im_out_x output tensor dimension x
330  * @param[in]       dim_im_out_y output tensor dimension y
331  * @param[in,out]   bufferA      pointer to buffer space for input
332  * @param[in,out]   bufferB      pointer to buffer space for output
333  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
334  */
335 arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
336                                                const uint16_t dim_im_in_x,
337                                                const uint16_t dim_im_in_y,
338                                                const uint16_t ch_im_in,
339                                                const q7_t *wt,
340                                                const uint16_t ch_im_out,
341                                                const uint16_t dim_kernel_x,
342                                                const uint16_t dim_kernel_y,
343                                                const uint16_t padding_x,
344                                                const uint16_t padding_y,
345                                                const uint16_t stride_x,
346                                                const uint16_t stride_y,
347                                                const q7_t *bias,
348                                                const uint16_t bias_shift,
349                                                const uint16_t out_shift,
350                                                q7_t *Im_out,
351                                                const uint16_t dim_im_out_x,
352                                                const uint16_t dim_im_out_y,
353                                                q15_t *bufferA,
354                                                q7_t *bufferB);
355 
356 /**
357  * @brief Basic Q15 convolution function
358  * @param[in]       Im_in       pointer to input tensor
359  * @param[in]       dim_im_in   input tensor dimension
360  * @param[in]       ch_im_in    number of input tensor channels
361  * @param[in]       wt          pointer to kernel weights
362  * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
363  * @param[in]       dim_kernel  filter kernel size
364  * @param[in]       padding     padding sizes
365  * @param[in]       stride      convolution stride
366  * @param[in]       bias        pointer to bias
367  * @param[in]       bias_shift  amount of left-shift for bias
368  * @param[in]       out_shift   amount of right-shift for output
369  * @param[in,out]   Im_out      pointer to output tensor
370  * @param[in]       dim_im_out  output tensor dimension
371  * @param[in,out]   bufferA     pointer to buffer space for input
372  * @param[in,out]   bufferB     pointer to buffer space for output
373  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
374  *
375  */
376 arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
377                                       const uint16_t dim_im_in,
378                                       const uint16_t ch_im_in,
379                                       const q15_t *wt,
380                                       const uint16_t ch_im_out,
381                                       const uint16_t dim_kernel,
382                                       const uint16_t padding,
383                                       const uint16_t stride,
384                                       const q15_t *bias,
385                                       const uint16_t bias_shift,
386                                       const uint16_t out_shift,
387                                       q15_t *Im_out,
388                                       const uint16_t dim_im_out,
389                                       q15_t *bufferA,
390                                       q7_t *bufferB);
391 
392 /**
393  * @brief Fast Q7 convolution function
394  * @param[in]       Im_in       pointer to input tensor
395  * @param[in]       dim_im_in   input tensor dimension
396  * @param[in]       ch_im_in    number of input tensor channels
397  * @param[in]       wt          pointer to kernel weights
398  * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
399  * @param[in]       dim_kernel  filter kernel size
400  * @param[in]       padding     padding sizes
401  * @param[in]       stride      convolution stride
402  * @param[in]       bias        pointer to bias
403  * @param[in]       bias_shift  amount of left-shift for bias
404  * @param[in]       out_shift   amount of right-shift for output
405  * @param[in,out]   Im_out      pointer to output tensor
406  * @param[in]       dim_im_out  output tensor dimension
407  * @param[in,out]   bufferA     pointer to buffer space for input
408  * @param[in,out]   bufferB     pointer to buffer space for output
409  * @return     The function returns either
410  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
411  *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
416  */
417 arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
418                                     const uint16_t dim_im_in,
419                                     const uint16_t ch_im_in,
420                                     const q7_t *wt,
421                                     const uint16_t ch_im_out,
422                                     const uint16_t dim_kernel,
423                                     const uint16_t padding,
424                                     const uint16_t stride,
425                                     const q7_t *bias,
426                                     const uint16_t bias_shift,
427                                     const uint16_t out_shift,
428                                     q7_t *Im_out,
429                                     const uint16_t dim_im_out,
430                                     q15_t *bufferA,
431                                     q7_t *bufferB);
432 
433 /**
 * @brief Fast Q7 convolution function (non-square shape)
435  * @param[in]       Im_in        pointer to input tensor
436  * @param[in]       dim_im_in_x  input tensor dimension x
437  * @param[in]       dim_im_in_y  input tensor dimension y
438  * @param[in]       ch_im_in     number of input tensor channels
439  * @param[in]       wt           pointer to kernel weights
440  * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
441  * @param[in]       dim_kernel_x filter kernel size x
442  * @param[in]       dim_kernel_y filter kernel size y
443  * @param[in]       padding_x    padding size x
444  * @param[in]       padding_y    padding size y
445  * @param[in]       stride_x     convolution stride x
446  * @param[in]       stride_y     convolution stride y
447  * @param[in]       bias         pointer to bias
448  * @param[in]       bias_shift   amount of left-shift for bias
449  * @param[in]       out_shift    amount of right-shift for output
450  * @param[in,out]   Im_out       pointer to output tensor
451  * @param[in]       dim_im_out_x output tensor dimension x
452  * @param[in]       dim_im_out_y output tensor dimension y
453  * @param[in,out]   bufferA      pointer to buffer space for input
454  * @param[in,out]   bufferB      pointer to buffer space for output
455  * @return     The function returns either
456  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
457  *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
462  */
463 
464 arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
465                                               const uint16_t dim_im_in_x,
466                                               const uint16_t dim_im_in_y,
467                                               const uint16_t ch_im_in,
468                                               const q7_t *wt,
469                                               const uint16_t ch_im_out,
470                                               const uint16_t dim_kernel_x,
471                                               const uint16_t dim_kernel_y,
472                                               const uint16_t padding_x,
473                                               const uint16_t padding_y,
474                                               const uint16_t stride_x,
475                                               const uint16_t stride_y,
476                                               const q7_t *bias,
477                                               const uint16_t bias_shift,
478                                               const uint16_t out_shift,
479                                               q7_t *Im_out,
480                                               const uint16_t dim_im_out_x,
481                                               const uint16_t dim_im_out_y,
482                                               q15_t *bufferA,
483                                               q7_t *bufferB);
484 
485 /**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
487  * @param[in]       Im_in        pointer to input tensor
488  * @param[in]       dim_im_in_x  input tensor dimension x
489  * @param[in]       dim_im_in_y  input tensor dimension y
490  * @param[in]       ch_im_in     number of input tensor channels
491  * @param[in]       wt           pointer to kernel weights
492  * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
493  * @param[in]       dim_kernel_x filter kernel size x
494  * @param[in]       dim_kernel_y filter kernel size y
495  * @param[in]       padding_x    padding size x
496  * @param[in]       padding_y    padding size y
497  * @param[in]       stride_x     convolution stride x
498  * @param[in]       stride_y     convolution stride y
499  * @param[in]       bias         pointer to bias
500  * @param[in]       bias_shift   amount of left-shift for bias
501  * @param[in]       out_shift    amount of right-shift for output
502  * @param[in,out]   Im_out       pointer to output tensor
503  * @param[in]       dim_im_out_x output tensor dimension x
504  * @param[in]       dim_im_out_y output tensor dimension y
505  * @param[in,out]   bufferA      pointer to buffer space for input
506  * @param[in,out]   bufferB      pointer to buffer space for output
507  * @return     The function returns either
 *                          <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
509  *                          <code>ARM_MATH_SUCCESS</code> on successful completion.
510  *
 * This function implements convolution with a 1x1 kernel size (i.e., dim_kernel_x=1
 * and dim_kernel_y=1). It can be used for the
 * second half of MobileNets after the depthwise separable convolution.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
519  */
520 arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
521                                                   const uint16_t dim_im_in_x,
522                                                   const uint16_t dim_im_in_y,
523                                                   const uint16_t ch_im_in,
524                                                   const q7_t *wt,
525                                                   const uint16_t ch_im_out,
526                                                   const uint16_t dim_kernel_x,
527                                                   const uint16_t dim_kernel_y,
528                                                   const uint16_t padding_x,
529                                                   const uint16_t padding_y,
530                                                   const uint16_t stride_x,
531                                                   const uint16_t stride_y,
532                                                   const q7_t *bias,
533                                                   const uint16_t bias_shift,
534                                                   const uint16_t out_shift,
535                                                   q7_t *Im_out,
536                                                   const uint16_t dim_im_out_x,
537                                                   const uint16_t dim_im_out_y,
538                                                   q15_t *bufferA,
539                                                   q7_t *bufferB);
540 
541 /**
542  * @brief Fast s8 version for 1x1 convolution (non-square shape)
543  *
544  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer size if required.
546  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
547  *                                Range of conv_params->input_offset  : [-127, 128]
548  *                                Range of conv_params->output_offset : [-128, 127]
549  * @param[in]      quant_params   Per-channel quantization info.
550  *                                It contains the multiplier and shift values to be applied to each output channel
551  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
552  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
553  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
554  * @param[in]      filter_data    Filter data pointer. Data type: int8
555  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
556  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
557  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
558  * @param[out]     output_data    Output data pointer. Data type: int8
559  *
560  * @return     The function returns either
 *                  <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
562  *                  <code>ARM_MATH_SUCCESS</code> on successful completion.
563  *
564  * @details
565  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
567  *      -# input_dims->c is a multiple of 4
568  *      -# conv_params->padding.w = conv_params->padding.h = 0
569  *      -# conv_params->stride.w = conv_params->stride.h = 1
570  *
571  */
572 arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
573                                     const cmsis_nn_conv_params *conv_params,
574                                     const cmsis_nn_per_channel_quant_params *quant_params,
575                                     const cmsis_nn_dims *input_dims,
576                                     const q7_t *input_data,
577                                     const cmsis_nn_dims *filter_dims,
578                                     const q7_t *filter_data,
579                                     const cmsis_nn_dims *bias_dims,
580                                     const int32_t *bias_data,
581                                     const cmsis_nn_dims *output_dims,
582                                     q7_t *output_data);
583 
584 /**
585  * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
586  *
587  * @param[in]       input_dims            Input (activation) dimensions
588  * @return          The function returns the required buffer size in bytes
589  *
590  */
591 int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
592 
593 /**
594  * @brief 1xn convolution
595  *
596  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_1_x_n_s8_get_buffer_size will return the buffer size if required.
598  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
599  *                                Range of conv_params->input_offset  : [-127, 128]
600  *                                Range of conv_params->output_offset : [-128, 127]
601  * @param[in]      quant_params   Per-channel quantization info.
602  *                                It contains the multiplier and shift values to be applied to each output channel
603  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
604  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
605  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
606  *                                spatial filter dimension
607  * @param[in]      filter_data    Filter data pointer. Data type: int8
608  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
609  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
610  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
611  * @param[out]     output_data    Output data pointer. Data type: int8
612  *
613  * @return     The function returns either
 *                  <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
615  *                  <code>ARM_MATH_SUCCESS</code> on successful completion.
616  *
617  * @details
618  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# input_dims->n equals 1
 *      -# output_dims->w is a multiple of 4
 *      -# Explicit constraints (since it is for 1xN convolution)
 *         -# input_dims->h equals 1
 *         -# output_dims->h equals 1
 *         -# filter_dims->h equals 1
 * @todo  Remove constraint on output_dims->w to make the function generic.
627  *
628  */
629 arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
630                                  const cmsis_nn_conv_params *conv_params,
631                                  const cmsis_nn_per_channel_quant_params *quant_params,
632                                  const cmsis_nn_dims *input_dims,
633                                  const q7_t *input_data,
634                                  const cmsis_nn_dims *filter_dims,
635                                  const q7_t *filter_data,
636                                  const cmsis_nn_dims *bias_dims,
637                                  const int32_t *bias_data,
638                                  const cmsis_nn_dims *output_dims,
639                                  q7_t *output_data);
640 
641 /**
642  * @brief Get the required additional buffer size for 1xn convolution
643  *
644  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
645  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
646  *                                        horizontal spatial filter dimension
 * @return          The function returns the required buffer size in bytes
648  *
649  */
650 int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
651 
652 /**
653  * @brief Q7 version of convolution for RGB image
654  * @param[in]       Im_in       pointer to input tensor
655  * @param[in]       dim_im_in   input tensor dimension
656  * @param[in]       ch_im_in    number of input tensor channels
657  * @param[in]       wt          pointer to kernel weights
658  * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
659  * @param[in]       dim_kernel  filter kernel size
660  * @param[in]       padding     padding sizes
661  * @param[in]       stride      convolution stride
662  * @param[in]       bias        pointer to bias
663  * @param[in]       bias_shift  amount of left-shift for bias
664  * @param[in]       out_shift   amount of right-shift for output
665  * @param[in,out]   Im_out      pointer to output tensor
666  * @param[in]       dim_im_out  output tensor dimension
667  * @param[in,out]   bufferA     pointer to buffer space for input
668  * @param[in,out]   bufferB     pointer to buffer space for output
669  * @return     The function returns either
670  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
671  *
 * This kernel is written exclusively for convolutions where ch_im_in
 * equals 3. This applies to the first layer of CNNs, whose input is
 * an RGB-format image.
675  */
676 
677 arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
678                                    const uint16_t dim_im_in,
679                                    const uint16_t ch_im_in,
680                                    const q7_t *wt,
681                                    const uint16_t ch_im_out,
682                                    const uint16_t dim_kernel,
683                                    const uint16_t padding,
684                                    const uint16_t stride,
685                                    const q7_t *bias,
686                                    const uint16_t bias_shift,
687                                    const uint16_t out_shift,
688                                    q7_t *Im_out,
689                                    const uint16_t dim_im_out,
690                                    q15_t *bufferA,
691                                    q7_t *bufferB);
692 
693 /**
694  * @brief Fast Q15 convolution function
695  * @param[in]       Im_in       pointer to input tensor
696  * @param[in]       dim_im_in   input tensor dimension
697  * @param[in]       ch_im_in    number of input tensor channels
698  * @param[in]       wt          pointer to kernel weights
699  * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
700  * @param[in]       dim_kernel  filter kernel size
701  * @param[in]       padding     padding sizes
702  * @param[in]       stride      convolution stride
703  * @param[in]       bias        pointer to bias
704  * @param[in]       bias_shift  amount of left-shift for bias
705  * @param[in]       out_shift   amount of right-shift for output
706  * @param[in,out]   Im_out      pointer to output tensor
707  * @param[in]       dim_im_out  output tensor dimension
708  * @param[in,out]   bufferA     pointer to buffer space for input
709  * @param[in,out]   bufferB     pointer to buffer space for output
710  * @return     The function returns either
711  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
712  *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 2
 *   ch_im_out is a multiple of 2
717  */
718 
719 arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
720                                      const uint16_t dim_im_in,
721                                      const uint16_t ch_im_in,
722                                      const q15_t *wt,
723                                      const uint16_t ch_im_out,
724                                      const uint16_t dim_kernel,
725                                      const uint16_t padding,
726                                      const uint16_t stride,
727                                      const q15_t *bias,
728                                      const uint16_t bias_shift,
729                                      const uint16_t out_shift,
730                                      q15_t *Im_out,
731                                      const uint16_t dim_im_out,
732                                      q15_t *bufferA,
733                                      q7_t *bufferB);
734 
735 /**
 * @brief Fast Q15 convolution function (non-square shape)
737  * @param[in]       Im_in        pointer to input tensor
738  * @param[in]       dim_im_in_x  input tensor dimension x
739  * @param[in]       dim_im_in_y  input tensor dimension y
740  * @param[in]       ch_im_in     number of input tensor channels
741  * @param[in]       wt           pointer to kernel weights
742  * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
743  * @param[in]       dim_kernel_x filter kernel size x
744  * @param[in]       dim_kernel_y filter kernel size y
745  * @param[in]       padding_x    padding size x
746  * @param[in]       padding_y    padding size y
747  * @param[in]       stride_x     convolution stride x
748  * @param[in]       stride_y     convolution stride y
749  * @param[in]       bias         pointer to bias
750  * @param[in]       bias_shift   amount of left-shift for bias
751  * @param[in]       out_shift    amount of right-shift for output
752  * @param[in,out]   Im_out       pointer to output tensor
753  * @param[in]       dim_im_out_x output tensor dimension x
754  * @param[in]       dim_im_out_y output tensor dimension y
755  * @param[in,out]   bufferA      pointer to buffer space for input
756  * @param[in,out]   bufferB      pointer to buffer space for output
757  * @return     The function returns either
758  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
759  *
760  * @details
761  *
762  * <b>Buffer size:</b>
763  *
 * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
765  *
766  * bufferB size: 0
767  *
768  * <b>Input dimension constraints:</b>
769  *
 * ch_im_in is a multiple of 2
 *
 * ch_im_out is a multiple of 2
773  *
774  */
775 
776 arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
777                                                const uint16_t dim_im_in_x,
778                                                const uint16_t dim_im_in_y,
779                                                const uint16_t ch_im_in,
780                                                const q15_t *wt,
781                                                const uint16_t ch_im_out,
782                                                const uint16_t dim_kernel_x,
783                                                const uint16_t dim_kernel_y,
784                                                const uint16_t padding_x,
785                                                const uint16_t padding_y,
786                                                const uint16_t stride_x,
787                                                const uint16_t stride_y,
788                                                const q15_t *bias,
789                                                const uint16_t bias_shift,
790                                                const uint16_t out_shift,
791                                                q15_t *Im_out,
792                                                const uint16_t dim_im_out_x,
793                                                const uint16_t dim_im_out_y,
794                                                q15_t *bufferA,
795                                                q7_t *bufferB);
796 
797 /**
798  * @brief Q7 depthwise separable convolution function
799  * @param[in]       Im_in       pointer to input tensor
800  * @param[in]       dim_im_in   input tensor dimension
801  * @param[in]       ch_im_in    number of input tensor channels
802  * @param[in]       wt          pointer to kernel weights
803  * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
804  * @param[in]       dim_kernel  filter kernel size
805  * @param[in]       padding     padding sizes
806  * @param[in]       stride      convolution stride
807  * @param[in]       bias        pointer to bias
808  * @param[in]       bias_shift  amount of left-shift for bias
809  * @param[in]       out_shift   amount of right-shift for output
810  * @param[in,out]   Im_out      pointer to output tensor
811  * @param[in]       dim_im_out  output tensor dimension
812  * @param[in,out]   bufferA     pointer to buffer space for input
813  * @param[in,out]   bufferB     pointer to buffer space for output
814  * @return     The function returns either
815  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
816  *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 2
 *   ch_im_out is a multiple of 2
821  */
822 
823 arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
824                                                const uint16_t dim_im_in,
825                                                const uint16_t ch_im_in,
826                                                const q7_t *wt,
827                                                const uint16_t ch_im_out,
828                                                const uint16_t dim_kernel,
829                                                const uint16_t padding,
830                                                const uint16_t stride,
831                                                const q7_t *bias,
832                                                const uint16_t bias_shift,
833                                                const uint16_t out_shift,
834                                                q7_t *Im_out,
835                                                const uint16_t dim_im_out,
836                                                q15_t *bufferA,
837                                                q7_t *bufferB);
838 
839 /**
840  * @brief Q7 depthwise separable convolution function (non-square shape)
841  * @param[in]       Im_in         pointer to input tensor
842  * @param[in]       dim_im_in_x   input tensor dimension x
843  * @param[in]       dim_im_in_y   input tensor dimension y
844  * @param[in]       ch_im_in      number of input tensor channels
845  * @param[in]       wt            pointer to kernel weights
846  * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
847  * @param[in]       dim_kernel_x  filter kernel size x
848  * @param[in]       dim_kernel_y  filter kernel size y
849  * @param[in]       padding_x     padding sizes x
850  * @param[in]       padding_y     padding sizes y
851  * @param[in]       stride_x      convolution stride x
852  * @param[in]       stride_y      convolution stride y
853  * @param[in]       bias          pointer to bias
854  * @param[in]       bias_shift    amount of left-shift for bias
855  * @param[in]       out_shift     amount of right-shift for output
856  * @param[in,out]   Im_out        pointer to output tensor
857  * @param[in]       dim_im_out_x  output tensor dimension x
858  * @param[in]       dim_im_out_y  output tensor dimension y
859  * @param[in,out]   bufferA       pointer to buffer space for input
860  * @param[in,out]   bufferB       pointer to buffer space for output
861  * @return     The function returns either
862  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
863  *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 2
 *   ch_im_out is a multiple of 2
868  */
869 arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
870                                                          const uint16_t dim_im_in_x,
871                                                          const uint16_t dim_im_in_y,
872                                                          const uint16_t ch_im_in,
873                                                          const q7_t *wt,
874                                                          const uint16_t ch_im_out,
875                                                          const uint16_t dim_kernel_x,
876                                                          const uint16_t dim_kernel_y,
877                                                          const uint16_t padding_x,
878                                                          const uint16_t padding_y,
879                                                          const uint16_t stride_x,
880                                                          const uint16_t stride_y,
881                                                          const q7_t *bias,
882                                                          const uint16_t bias_shift,
883                                                          const uint16_t out_shift,
884                                                          q7_t *Im_out,
885                                                          const uint16_t dim_im_out_x,
886                                                          const uint16_t dim_im_out_y,
887                                                          q15_t *bufferA,
888                                                          q7_t *bufferB);
889 
890 /**
891  * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
892  *
893  * @param[in, out] ctx            Function context (e.g. temporary buffer). Check the function
894  *                                definition file to see if an additional buffer is required.
895  *                                Optional function {API}_get_buffer_size() provides the buffer
896  *                                size if required.
897  * @param[in]      dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
898  *                                dw_conv_params->dilation is not used.
899  *                                Range of dw_conv_params->input_offset : [-127, 128]
900  *                                Range of dw_conv_params->output_offset : [-128, 127]
901  * @param[in]      quant_params   Per-channel quantization info.
902  *                               It contains the multiplier and shift values to be applied to each
903  *                               output channel
904  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [H, W, C_IN]
905  *                                Batch argument N is not used and assumed to be 1.
906  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
907  * @param[in]      filter_dims    Filter tensor dimensions. Format: [1, H, W, C_OUT]
908  * @param[in]      filter_data    Filter data pointer. Data type: int8
909  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
910  * @param[in]      bias_data      Bias data pointer. Data type: int32
911  * @param[in]      output_dims    Output tensor dimensions. Format: [1, H, W, C_OUT]
912  * @param[in, out] output_data    Output data pointer. Data type: int8
913  * @return     The function returns
914  *                <code>ARM_MATH_SUCCESS</code>   -  Successful completion.
915  *
916  * @details
917  *    - Supported framework: TensorFlow Lite
 *    - Picks one of the following functions
919  *        -# arm_depthwise_conv_s8()
920  *        -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
921  *        -# arm_depthwise_conv_s8_opt()
 *    - q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
 *    - Check the details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
924  * boundary.
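 *
 *    A minimal usage sketch (malloc is for illustration only; any suitably sized scratch memory
 *    can be passed via ctx):
 *
 * @code
 * cmsis_nn_context ctx;
 * ctx.size = arm_depthwise_conv_wrapper_s8_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims, &output_dims);
 * ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
 *
 * arm_status status = arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params, &quant_params,
 *                                                   &input_dims, input_data,
 *                                                   &filter_dims, filter_data,
 *                                                   &bias_dims, bias_data,
 *                                                   &output_dims, output_data);
 * @endcode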
925  */
926 arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
927                                          const cmsis_nn_dw_conv_params *dw_conv_params,
928                                          const cmsis_nn_per_channel_quant_params *quant_params,
929                                          const cmsis_nn_dims *input_dims,
930                                          const q7_t *input_data,
931                                          const cmsis_nn_dims *filter_dims,
932                                          const q7_t *filter_data,
933                                          const cmsis_nn_dims *bias_dims,
934                                          const int32_t *bias_data,
935                                          const cmsis_nn_dims *output_dims,
936                                          q7_t *output_data);
937 
938 /**
939  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
940  *
941  * @param[in]      dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
942  *                                dw_conv_params->dilation is not used.
943  *                                Range of dw_conv_params->input_offset : [-127, 128]
 *                                Range of dw_conv_params->output_offset : [-128, 127]
945  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [H, W, C_IN]
946  *                                Batch argument N is not used and assumed to be 1.
947  * @param[in]      filter_dims    Filter tensor dimensions. Format: [1, H, W, C_OUT]
948  * @param[in]      output_dims    Output tensor dimensions. Format: [1, H, W, C_OUT]
949  * @return                        Size of additional memory required for optimizations in bytes.
950  *
951  */
952 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
953                                                       const cmsis_nn_dims *input_dims,
954                                                       const cmsis_nn_dims *filter_dims,
955                                                       const cmsis_nn_dims *output_dims);
956 
957 /**
958  * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
959  *
960  * @param[in, out] ctx            Function context (e.g. temporary buffer). Check the function
961  *                                definition file to see if an additional buffer is required.
962  *                                Optional function {API}_get_buffer_size() provides the buffer
963  *                                size if an additional buffer is required.
965  * @param[in]      dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
966  *                                dw_conv_params->dilation is not used.
967  *                                Range of dw_conv_params->input_offset : [-127, 128]
 *                                Range of dw_conv_params->output_offset : [-128, 127]
969  * @param[in]      quant_params   Per-channel quantization info.
970  *                               It contains the multiplier and shift values to be applied to each
971  *                               output channel
972  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
973  *                                Batch argument N is not used.
974  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
975  * @param[in]      filter_dims    Filter tensor dimensions. Format: [1, H, W, C_OUT]
976  * @param[in]      filter_data    Filter data pointer. Data type: int8
977  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
978  * @param[in]      bias_data      Bias data pointer. Data type: int32
979  * @param[in]      output_dims    Output tensor dimensions. Format: [1, H, W, C_OUT]
980  * @param[in, out] output_data    Output data pointer. Data type: int8
981  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
982  *
983  * @details
984  *    - Supported framework: TensorFlow Lite
 *    - q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
986  */
987 arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
988                                  const cmsis_nn_dw_conv_params *dw_conv_params,
989                                  const cmsis_nn_per_channel_quant_params *quant_params,
990                                  const cmsis_nn_dims *input_dims,
991                                  const q7_t *input_data,
992                                  const cmsis_nn_dims *filter_dims,
993                                  const q7_t *filter_data,
994                                  const cmsis_nn_dims *bias_dims,
995                                  const int32_t *bias_data,
996                                  const cmsis_nn_dims *output_dims,
997                                  q7_t *output_data);
998 
999 /**
1000  * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
 *        the input arguments (documented below). Refer to arm_depthwise_conv_s8() for function
 *        argument details.
1003  *
1004  * @return     The function returns one of the following
1005  *                <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimension of tensors
1006  *                <code>ARM_MATH_ARGUMENT_ERROR</code> - Unsupported pad size along the x axis
1007  *                <code>ARM_MATH_SUCCESS</code> - Successful operation
1008  *
1009  * @details
1010  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# Number of input channels equals number of output channels
 *      -# Filter height and width equal 3
1014  *      -# Padding along x is either 0 or 1.
1015  *
1016  */
1017 arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
1018                                      const cmsis_nn_dw_conv_params *dw_conv_params,
1019                                      const cmsis_nn_per_channel_quant_params *quant_params,
1020                                      const cmsis_nn_dims *input_dims,
1021                                      const q7_t *input_data,
1022                                      const cmsis_nn_dims *filter_dims,
1023                                      const q7_t *filter_data,
1024                                      const cmsis_nn_dims *bias_dims,
1025                                      const int32_t *bias_data,
1026                                      const cmsis_nn_dims *output_dims,
1027                                      q7_t *output_data);
1028 
1029 /**
 * @brief Optimized s8 depthwise convolution function with the constraint that in_channel equals out_channel.
 *        Refer to arm_depthwise_conv_s8() for function argument details.
1032  *
1033  * @return     The function returns one of the following
1034  *                <code>ARM_MATH_SIZE_MISMATCH</code> - input channel != output channel or
1035  *                                                      ch_mult != 1
1036  *                <code>ARM_MATH_SUCCESS</code> - Successful operation
1037  *
 * @note       If the number of channels is not a multiple of 4, up to 3 elements outside the boundary of the
 *             following arrays will be read if MVE optimizations (Arm Helium Technology) are used.
1040  *               - Output shift
1041  *               - Output multiplier
1042  *               - Output bias
1043  *               - kernel
1044  * @details
1045  *    - Supported framework: TensorFlow Lite
 *    - The following constraints on the arguments apply
 *        -# Number of input channels equals number of output channels or ch_mult equals 1
 *    - q7 is used as the data type even though it is s8 data. This is done to be consistent with existing APIs.
 *    - Recommended when the number of channels is 4 or greater.
1050  *
1051  */
1052 arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
1053                                      const cmsis_nn_dw_conv_params *dw_conv_params,
1054                                      const cmsis_nn_per_channel_quant_params *quant_params,
1055                                      const cmsis_nn_dims *input_dims,
1056                                      const q7_t *input_data,
1057                                      const cmsis_nn_dims *filter_dims,
1058                                      const q7_t *filter_data,
1059                                      const cmsis_nn_dims *bias_dims,
1060                                      const int32_t *bias_data,
1061                                      const cmsis_nn_dims *output_dims,
1062                                      q7_t *output_data);
1063 
1064 /**
 * @brief Get the required buffer size for the optimized s8 depthwise convolution
 * function with the constraint that in_channel equals out_channel.
1067  * @param[in]       input_dims     Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1068  *                                 Batch argument N is not used.
1069  * @param[in]       filter_dims    Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @return          The function returns the required buffer size in bytes
1071  *
1072  */
1073 int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
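
/*
 * Example: pairing the _opt kernel with its buffer-size query. A minimal
 * sketch under the constraint C_IN == C_OUT; the parameter structs are
 * assumed to be populated elsewhere and all names are illustrative.
 *
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_depthwise_conv_s8_opt_get_buffer_size(&input_dims, &filter_dims);
 *     ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
 *     arm_status status = arm_depthwise_conv_s8_opt(&ctx, &dw_conv_params, &quant_params,
 *                                                   &input_dims, input_data,
 *                                                   &filter_dims, filter_data,
 *                                                   &bias_dims, bias_data,
 *                                                   &output_dims, output_data);
 */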
1074 
1075 /**
1076  * @defgroup FC Fully-connected Layer Functions
1077  *
1078  * Collection of fully-connected and matrix multiplication functions.
1079  *
 * A fully-connected layer is basically a matrix-vector multiplication
 * with bias. The matrix is the weights and the input/output vectors
 * are the activation values. Supported {weight, activation} precisions
 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
 *
 * There are two types of kernel functions: the basic functions implement
 * the regular GEMV approach, while the opt functions operate on weights
 * in an interleaved format.
1088  *
1089  */
1090 
1091 /**
1092  *@brief Q7 basic fully-connected layer function
1093  *@param[in]       pV          pointer to input vector
1094  *@param[in]       pM          pointer to matrix weights
1095  *@param[in]       dim_vec     length of the vector
1096  *@param[in]       num_of_rows number of rows in weight matrix
1097  *@param[in]       bias_shift  amount of left-shift for bias
1098  *@param[in]       out_shift   amount of right-shift for output
1099  *@param[in]       bias        pointer to bias
1100  *@param[in,out]   pOut        pointer to output vector
1101  *@param[in,out]   vec_buffer  pointer to buffer space for input
1102  *@return     The function returns <code>ARM_MATH_SUCCESS</code>
1103  *
1104  */
1105 
1106 arm_status arm_fully_connected_q7(const q7_t *pV,
1107                                   const q7_t *pM,
1108                                   const uint16_t dim_vec,
1109                                   const uint16_t num_of_rows,
1110                                   const uint16_t bias_shift,
1111                                   const uint16_t out_shift,
1112                                   const q7_t *bias,
1113                                   q7_t *pOut,
1114                                   q15_t *vec_buffer);
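
/*
 * Shift semantics of the legacy q7/q15 fully-connected kernels (a sketch that
 * ignores the rounding applied by the optimized kernels; saturate_q7() and
 * dot() are notational, not library functions):
 *
 *     out[i] = saturate_q7(((bias[i] << bias_shift) + dot(pV, row_i(pM))) >> out_shift);
 *
 * bias_shift aligns the bias with the accumulator's fixed-point format and
 * out_shift scales the accumulator back down to the output format.
 */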
1115 
1116 /**
1117  * @brief Basic s8 Fully Connected function.
1118  *
1119  * @param[in, out] ctx            Function context (e.g. temporary buffer). Check the function
1120  *                                definition file to see if an additional buffer is required.
1121  *                                Optional function {API}_get_buffer_size() provides the buffer
1122  *                                size if an additional buffer is required.
 * @param[in]      fc_params      Fully Connected layer parameters (e.g. input and output offsets)
1124  *                                Range of fc_params->input_offset  : [-127, 128]
1125  *                                fc_params->filter_offset : 0
1126  *                                Range of fc_params->output_offset : [-128, 127]
1127  * @param[in]      quant_params   Per-tensor quantization info.
1128  *                                It contains the multiplier and shift values to be applied to the output tensor.
1129  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1130  *                                Input dimension is taken as Nx(H * W * C_IN)
1131  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
1132  * @param[in]      filter_dims    Two dimensional filter dimensions. Format: [N, C]
1133  *                                N : accumulation depth and equals (H * W * C_IN) from input_dims
1134  *                                C : output depth and equals C_OUT in output_dims
1135  *                                H & W : Not used
1136  * @param[in]      filter_data    Filter data pointer. Data type: int8
1137  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
1138  *                                N, H, W : Not used
1139  * @param[in]      bias_data      Bias data pointer. Data type: int32
1140  * @param[in]      output_dims    Output tensor dimensions. Format: [N, C_OUT]
1141  *                                N : Batches
1142  *                                C_OUT : Output depth
1143  *                                H & W : Not used.
1144  * @param[in, out] output_data    Output data pointer. Data type: int8
1145  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
1146  *
1147  * @details
1148  *    - Supported framework: TensorFlow Lite
 *    - q7 is used as the data type even though it is s8 data. This is done to be consistent with existing APIs.
1150  */
1151 arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
1152                                   const cmsis_nn_fc_params *fc_params,
1153                                   const cmsis_nn_per_tensor_quant_params *quant_params,
1154                                   const cmsis_nn_dims *input_dims,
1155                                   const q7_t *input_data,
1156                                   const cmsis_nn_dims *filter_dims,
1157                                   const q7_t *filter_data,
1158                                   const cmsis_nn_dims *bias_dims,
1159                                   const int32_t *bias_data,
1160                                   const cmsis_nn_dims *output_dims,
1161                                   q7_t *output_data);
1162 
1163 /**
1164  * @brief Get the required buffer size for S8 basic fully-connected and
1165  * matrix multiplication layer function for TF Lite
1166  * @param[in]      filter_dims             dimension of filter
 * @return         The function returns the required buffer size in bytes
1168  *
1169  */
1170 int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
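
/*
 * Example: running a fully-connected layer on a flattened activation. A
 * minimal sketch with illustrative names; the dims follow the formats
 * documented above (input [N, H, W, C_IN], filter [N, C]).
 *
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
 *     ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
 *     arm_status status = arm_fully_connected_s8(&ctx, &fc_params, &quant_params,
 *                                                &input_dims, input_data,
 *                                                &filter_dims, filter_data,
 *                                                &bias_dims, bias_data,
 *                                                &output_dims, output_data);
 */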
1171 
1172 /**
1173  * @brief Q7 opt fully-connected layer function
1174  * @param[in]       pV          pointer to input vector
1175  * @param[in]       pM          pointer to matrix weights
1176  * @param[in]       dim_vec     length of the vector
1177  * @param[in]       num_of_rows number of rows in weight matrix
1178  * @param[in]       bias_shift  amount of left-shift for bias
1179  * @param[in]       out_shift   amount of right-shift for output
1180  * @param[in]       bias        pointer to bias
1181  * @param[in,out]   pOut        pointer to output vector
1182  * @param[in,out]   vec_buffer  pointer to buffer space for input
1183  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
1184  *
1185  */
1186 
1187 arm_status arm_fully_connected_q7_opt(const q7_t *pV,
1188                                       const q7_t *pM,
1189                                       const uint16_t dim_vec,
1190                                       const uint16_t num_of_rows,
1191                                       const uint16_t bias_shift,
1192                                       const uint16_t out_shift,
1193                                       const q7_t *bias,
1194                                       q7_t *pOut,
1195                                       q15_t *vec_buffer);
1196 
1197 /**
1198  * @brief Q15 basic fully-connected layer function
1199  * @param[in]       pV          pointer to input vector
1200  * @param[in]       pM          pointer to matrix weights
1201  * @param[in]       dim_vec     length of the vector
1202  * @param[in]       num_of_rows number of rows in weight matrix
1203  * @param[in]       bias_shift  amount of left-shift for bias
1204  * @param[in]       out_shift   amount of right-shift for output
1205  * @param[in]       bias        pointer to bias
1206  * @param[in,out]   pOut        pointer to output vector
1207  * @param[in,out]   vec_buffer  pointer to buffer space for input
1208  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
1209  *
1210  */
1211 
1212 arm_status arm_fully_connected_q15(const q15_t *pV,
1213                                    const q15_t *pM,
1214                                    const uint16_t dim_vec,
1215                                    const uint16_t num_of_rows,
1216                                    const uint16_t bias_shift,
1217                                    const uint16_t out_shift,
1218                                    const q15_t *bias,
1219                                    q15_t *pOut,
1220                                    q15_t *vec_buffer);
1221 
1222 /**
1223  * @brief Q15 opt fully-connected layer function
1224  * @param[in]       pV          pointer to input vector
1225  * @param[in]       pM          pointer to matrix weights
1226  * @param[in]       dim_vec     length of the vector
1227  * @param[in]       num_of_rows number of rows in weight matrix
1228  * @param[in]       bias_shift  amount of left-shift for bias
1229  * @param[in]       out_shift   amount of right-shift for output
1230  * @param[in]       bias        pointer to bias
1231  * @param[in,out]   pOut        pointer to output vector
1232  * @param[in,out]   vec_buffer  pointer to buffer space for input
1233  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
1234  *
1235  */
1236 
1237 arm_status arm_fully_connected_q15_opt(const q15_t *pV,
1238                                        const q15_t *pM,
1239                                        const uint16_t dim_vec,
1240                                        const uint16_t num_of_rows,
1241                                        const uint16_t bias_shift,
1242                                        const uint16_t out_shift,
1243                                        const q15_t *bias,
1244                                        q15_t *pOut,
1245                                        q15_t *vec_buffer);
1246 
1247 /**
1248  * @brief Mixed Q15-Q7 fully-connected layer function
1249  * @param[in]       pV          pointer to input vector
1250  * @param[in]       pM          pointer to matrix weights
1251  * @param[in]       dim_vec     length of the vector
1252  * @param[in]       num_of_rows number of rows in weight matrix
1253  * @param[in]       bias_shift  amount of left-shift for bias
1254  * @param[in]       out_shift   amount of right-shift for output
1255  * @param[in]       bias        pointer to bias
1256  * @param[in,out]   pOut        pointer to output vector
1257  * @param[in,out]   vec_buffer  pointer to buffer space for input
1258  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
1259  *
1260  */
1261 
1262 arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
1263                                               const q7_t *pM,
1264                                               const uint16_t dim_vec,
1265                                               const uint16_t num_of_rows,
1266                                               const uint16_t bias_shift,
1267                                               const uint16_t out_shift,
1268                                               const q7_t *bias,
1269                                               q15_t *pOut,
1270                                               q15_t *vec_buffer);
1271 
1272 /**
1273  * @brief Mixed Q15-Q7 opt fully-connected layer function
1274  * @param[in]       pV          pointer to input vector
1275  * @param[in]       pM          pointer to matrix weights
1276  * @param[in]       dim_vec     length of the vector
1277  * @param[in]       num_of_rows number of rows in weight matrix
1278  * @param[in]       bias_shift  amount of left-shift for bias
1279  * @param[in]       out_shift   amount of right-shift for output
1280  * @param[in]       bias        pointer to bias
1281  * @param[in,out]   pOut        pointer to output vector
1282  * @param[in,out]   vec_buffer  pointer to buffer space for input
1283  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
1284  *
1285  */
1286 
1287 arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
1288                                                   const q7_t *pM,
1289                                                   const uint16_t dim_vec,
1290                                                   const uint16_t num_of_rows,
1291                                                   const uint16_t bias_shift,
1292                                                   const uint16_t out_shift,
1293                                                   const q7_t *bias,
1294                                                   q15_t *pOut,
1295                                                   q15_t *vec_buffer);
1296 
1297 /**
1298  * @brief Matrix-Multiplication Kernels for Convolution
1299  *
1300  * These functions are used within convolution layer functions for
1301  * matrix multiplication.
1302  *
 * The implementation is similar to the CMSIS-DSP arm_mat_mult functions
 * with one Q7 and one Q15 operand. The Q15 operand is the im2col
 * output, which always has 2 columns.
1306  *
1307  */
1308 
1309 /**
1310  * @brief Matrix-multiplication function for convolution
1311  * @param[in]       pA          pointer to operand A
 * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
1313  * @param[in]       ch_im_out   numRow of A
1314  * @param[in]       numCol_A    numCol of A
1315  * @param[in]       bias_shift  amount of left-shift for bias
1316  * @param[in]       out_shift   amount of right-shift for output
1317  * @param[in]       bias        the bias
1318  * @param[in,out]   pOut        pointer to output
1319  * @return     The function returns the incremented output pointer
1320  */
1321 
1322 q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
1323                                     const q15_t *pInBuffer,
1324                                     const uint16_t ch_im_out,
1325                                     const uint16_t numCol_A,
1326                                     const uint16_t bias_shift,
1327                                     const uint16_t out_shift,
1328                                     const q7_t *bias,
1329                                     q7_t *pOut);
1330 /**
1331  * @brief Matrix-multiplication function for convolution with per-channel requantization.
1332  * @param[in]       input_a     pointer to operand A
1333  * @param[in]       input_b     pointer to operand B, always consists of 2 vectors.
1334  * @param[in]       output_ch   number of rows of A
1335  * @param[in]       out_shift  pointer to per output channel requantization shift parameter.
1336  * @param[in]       out_mult   pointer to per output channel requantization multiplier parameter.
1337  * @param[in]       out_offset      output tensor offset.
1338  * @param[in]       activation_min   minimum value to clamp the output to. Range : int8
1339  * @param[in]       activation_max   maximum value to clamp the output to. Range : int8
1340  * @param[in]       num_col_a   number of columns of A
1341  * @param[in]       output_bias per output channel bias. Range : int32
1342  * @param[in,out]   out_0       pointer to output
 * @return     The function returns one of the following:
 *              1. The incremented output pointer for a successful operation, or
 *              2. NULL if the implementation is not available.
1346  *
 * @details   This function does the matrix multiplication of the weight matrix for all output channels
 *            with 2 columns from im2col and produces two elements per output channel. The outputs are
 *            clamped to the range provided by activation min and max.
 *            Supported framework: TensorFlow Lite micro.
1351  */
1352 q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
1353                                     const q15_t *input_b,
1354                                     const uint16_t output_ch,
1355                                     const int32_t *out_shift,
1356                                     const int32_t *out_mult,
1357                                     const int32_t out_offset,
1358                                     const int16_t activation_min,
1359                                     const int16_t activation_max,
1360                                     const uint16_t num_col_a,
1361                                     const int32_t *const output_bias,
1362                                     q7_t *out_0);
1363 
1364 /**
1365  * @brief Matrix-multiplication of re-ordered input B with A.
1366  *
 * @details  For arguments, refer to arm_nn_mat_mult_kernel_s8_s16. The re-ordering is a consequence
 *           of the sign extension done by the SXTB16 instruction on input_b. The outputs are clamped to
 *           the range provided by activation min and max.
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# num_col_a is a multiple of 4
 *      -# output_ch is a multiple of 2
1375  *
1376  */
1377 q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
1378                                               const q15_t *input_b,
1379                                               const uint16_t output_ch,
1380                                               const int32_t *out_shift,
1381                                               const int32_t *out_mult,
1382                                               const int32_t out_offset,
1383                                               const int16_t activation_min,
1384                                               const int16_t activation_max,
1385                                               const uint16_t num_col_a,
1386                                               const int32_t *const output_bias,
1387                                               q7_t *out_0);
1388 
1389 /**
1390  *@brief Matrix-multiplication function for convolution with reordered columns
1391  *@param[in]       pA          pointer to operand A
 *@param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
1393  *@param[in]       ch_im_out   numRow of A
1394  *@param[in]       numCol_A    numCol of A
1395  *@param[in]       bias_shift  amount of left-shift for bias
1396  *@param[in]       out_shift   amount of right-shift for output
1397  *@param[in]       bias        the bias
1398  *@param[in,out]   pOut        pointer to output
1399  *@return     The function returns the incremented output pointer
1400  *
1401  *@details  This function assumes that data in pInBuffer are reordered
1402  */
1403 q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
1404                                               const q15_t *pInBuffer,
1405                                               const uint16_t ch_im_out,
1406                                               const uint16_t numCol_A,
1407                                               const uint16_t bias_shift,
1408                                               const uint16_t out_shift,
1409                                               const q7_t *bias,
1410                                               q7_t *pOut);
1411 
1412 #ifdef __cplusplus
1413 }
1414 #endif
1415 
1416 /*
1417  *  Other functions
 *  These layers are typically not timing critical.
 *  Basic implementations are provided here.
1420  */
1421 
1422 #ifdef __cplusplus
1423 extern "C" {
1424 #endif
1425 
1426 /**
1427  * @defgroup BasicMath Basic math functions
1428  *
1429  * Element wise add and multiplication functions.
1430  *
1431  */
1432 
1433 /**
1434  * @brief s8 element wise add of two vectors
1435  * @param[in]       input_1_vect            pointer to input vector 1
1436  * @param[in]       input_2_vect            pointer to input vector 2
 * @param[in]       input_1_offset          offset for input 1. Range: -127 to 128
1438  * @param[in]       input_1_mult            multiplier for input 1
1439  * @param[in]       input_1_shift           shift for input 1
 * @param[in]       input_2_offset          offset for input 2. Range: -127 to 128
1441  * @param[in]       input_2_mult            multiplier for input 2
1442  * @param[in]       input_2_shift           shift for input 2
1443  * @param[in]       left_shift              input left shift
1444  * @param[in,out]   output                  pointer to output vector
1445  * @param[in]       out_offset              output offset
1446  * @param[in]       out_mult                output multiplier
1447  * @param[in]       out_shift               output shift
1448  * @param[in]       out_activation_min      minimum value to clamp output to
1449  * @param[in]       out_activation_max      maximum value to clamp output to
1450  * @param[in]       block_size              number of samples
1451  * @return          The function returns    ARM_MATH_SUCCESS
1452  */
1453 arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
1454                                   const int8_t *input_2_vect,
1455                                   const int32_t input_1_offset,
1456                                   const int32_t input_1_mult,
1457                                   const int32_t input_1_shift,
1458                                   const int32_t input_2_offset,
1459                                   const int32_t input_2_mult,
1460                                   const int32_t input_2_shift,
1461                                   const int32_t left_shift,
1462                                   int8_t *output,
1463                                   const int32_t out_offset,
1464                                   const int32_t out_mult,
1465                                   const int32_t out_shift,
1466                                   const int32_t out_activation_min,
1467                                   const int32_t out_activation_max,
1468                                   const uint32_t block_size);
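
/*
 * Requantization flow of the element-wise add (a sketch, not the exact
 * fixed-point arithmetic; scale(), requantize() and clamp() are notational):
 *
 *     a   = scale((input_1 + input_1_offset) << left_shift, input_1_mult, input_1_shift);
 *     b   = scale((input_2 + input_2_offset) << left_shift, input_2_mult, input_2_shift);
 *     out = clamp(requantize(a + b, out_mult, out_shift) + out_offset,
 *                 out_activation_min, out_activation_max);
 */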
1469 
1470 /**
1471  * @brief s8 element wise multiplication
1472  * @param[in]       input_1_vect            pointer to input vector 1
1473  * @param[in]       input_2_vect            pointer to input vector 2
 * @param[in]       input_1_offset          offset for input 1. Range: -127 to 128
 * @param[in]       input_2_offset          offset for input 2. Range: -127 to 128
1476  * @param[in,out]   output                  pointer to output vector
1477  * @param[in]       out_offset              output offset
1478  * @param[in]       out_mult                output multiplier
1479  * @param[in]       out_shift               output shift
1480  * @param[in]       out_activation_min      minimum value to clamp output to
1481  * @param[in]       out_activation_max      maximum value to clamp output to
1482  * @param[in]       block_size              number of samples
1483  * @return          The function returns    ARM_MATH_SUCCESS
1484  *
1485  * @details   Supported framework: TensorFlow Lite micro
1486  */
1487 arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
1488                                   const int8_t *input_2_vect,
1489                                   const int32_t input_1_offset,
1490                                   const int32_t input_2_offset,
1491                                   int8_t *output,
1492                                   const int32_t out_offset,
1493                                   const int32_t out_mult,
1494                                   const int32_t out_shift,
1495                                   const int32_t out_activation_min,
1496                                   const int32_t out_activation_max,
1497                                   const uint32_t block_size);
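
/*
 * Example: element-wise multiply of two int8 vectors of equal length. A
 * minimal sketch; the offsets and requantization parameters come from the
 * framework, and all names here are illustrative.
 *
 *     arm_elementwise_mul_s8(in1, in2, in1_offset, in2_offset,
 *                            out, out_offset, out_mult, out_shift,
 *                            -128, 127, block_size);
 */
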
1498 /**
1499  * @defgroup Acti Activation Functions
1500  *
1501  * Perform activation layers, including ReLU (Rectified Linear Unit),
1502  * sigmoid and tanh
1503  *
1504  */
1505 
1506 /**
1507  * @brief Q7 RELU function
1508  * @param[in,out]   data        pointer to input
1509  * @param[in]       size        number of elements
1510  * @return none.
1511  */
1512 
1513 void arm_relu_q7(q7_t *data, uint16_t size);
1514 
1515 /**
1516  * @brief s8 ReLU6 function
1517  * @param[in,out]   data        pointer to input
1518  * @param[in]       size        number of elements
1519  */
1520 
1521 void arm_relu6_s8(q7_t *data, uint16_t size);
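
/*
 * Example: the activation functions operate in place on the data buffer.
 * A minimal sketch with an illustrative buffer:
 *
 *     q7_t buf[64];
 *     // ... fill buf with the previous layer's output ...
 *     arm_relu6_s8(buf, 64);
 */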
1522 
1523 /**
1524  * @brief Q15 RELU function
1525  * @param[in,out]   data        pointer to input
1526  * @param[in]       size        number of elements
1527  * @return none.
1528  */
1529 
1530 void arm_relu_q15(q15_t *data, uint16_t size);
1531 
1532 /**
1533  * @brief Q7 neural network activation function using direct table look-up
1534  * @param[in,out]   data        pointer to input
1535  * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
1537  * @param[in]       type        type of activation functions
1538  * @return none.
1539  */
1540 
1541 void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
1542 
1543 /**
1544  * @brief Q15 neural network activation function using direct table look-up
1545  * @param[in,out]   data        pointer to input
1546  * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
1548  * @param[in]       type        type of activation functions
1549  * @return none.
1550  *
1551  * @details
1552  *
1553  * This is the direct table look-up approach.
1554  *
 * Here the integer part of the fixed-point is assumed to be <= 3.
 * A larger integer part makes little difference, since saturation followed
 * by any of these activation functions yields the same result.
1558  */
1559 
1560 void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
1561 
1562 /**
1563  * @defgroup Pooling Pooling Functions
1564  *
1565  * Perform pooling functions, including max pooling and average pooling
1566  *
1567  */
1568 
1569 /**
1570  * @brief Q7 max pooling function
1571  * @param[in]       Im_in       pointer to input tensor
1572  * @param[in]       dim_im_in   input tensor dimension
1573  * @param[in]       ch_im_in    number of input tensor channels
1574  * @param[in]       dim_kernel  filter kernel size
1575  * @param[in]       padding     padding sizes
1576  * @param[in]       stride      convolution stride
1577  * @param[in]       dim_im_out  output tensor dimension
1578  * @param[in,out]   bufferA     pointer to buffer space for input
1579  * @param[in,out]   Im_out      pointer to output tensor
1580  * @return none.
1581  *
1582  */
1583 
1584 void arm_maxpool_q7_HWC(q7_t *Im_in,
1585                         const uint16_t dim_im_in,
1586                         const uint16_t ch_im_in,
1587                         const uint16_t dim_kernel,
1588                         const uint16_t padding,
1589                         const uint16_t stride,
1590                         const uint16_t dim_im_out,
1591                         q7_t *bufferA,
1592                         q7_t *Im_out);
1593 
1594 /**
1595  * @brief Q7 average pooling function
1596  * @param[in]       Im_in       pointer to input tensor
1597  * @param[in]       dim_im_in   input tensor dimension
1598  * @param[in]       ch_im_in    number of input tensor channels
1599  * @param[in]       dim_kernel  filter kernel size
1600  * @param[in]       padding     padding sizes
1601  * @param[in]       stride      convolution stride
1602  * @param[in]       dim_im_out  output tensor dimension
1603  * @param[in,out]   bufferA     pointer to buffer space for input
1604  * @param[in,out]   Im_out      pointer to output tensor
1605  * @return none.
1606  *
1607  */
1608 
1609 void arm_avepool_q7_HWC(q7_t *Im_in,
1610                         const uint16_t dim_im_in,
1611                         const uint16_t ch_im_in,
1612                         const uint16_t dim_kernel,
1613                         const uint16_t padding,
1614                         const uint16_t stride,
1615                         const uint16_t dim_im_out,
1616                         q7_t *bufferA,
1617                         q7_t *Im_out);
1618 
1619 /**
1620  * @brief s8 average pooling function.
1621  *
1622  * @param[in, out] ctx            Function context (e.g. temporary buffer). Check the function
1623  *                                definition file to see if an additional buffer is required.
1624  *                                Optional function {API}_get_buffer_size() provides the buffer
1625  *                                size if an additional buffer is required.
1626  * @param[in]      pool_params    Pooling parameters
1627  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [H, W, C_IN]
1628  *                                Argument 'N' is not used.
1629  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
1630  * @param[in]      filter_dims    Filter tensor dimensions. Format: [H, W]
 *                                Arguments N and C are not used.
1632  * @param[in]      output_dims    Output tensor dimensions. Format: [H, W, C_OUT]
1633  *                                Argument N is not used.
1634  *                                C_OUT equals C_IN.
1635  * @param[in, out] output_data    Output data pointer. Data type: int8
1636  * @return                        The function returns
1637  *                                    <code>ARM_MATH_SUCCESS</code> - Successful operation
1638  *
1639  * @details
1640  *    - Supported Framework: TensorFlow Lite
1641  *
1642  */
1643 arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
1644                           const cmsis_nn_pool_params *pool_params,
1645                           const cmsis_nn_dims *input_dims,
1646                           const q7_t *input_data,
1647                           const cmsis_nn_dims *filter_dims,
1648                           const cmsis_nn_dims *output_dims,
1649                           q7_t *output_data);
1650 
1651 /**
1652  * @brief Get the required buffer size for S8 average pooling function
1653  * @param[in]       dim_dst_width         output tensor dimension
1654  * @param[in]       ch_src                number of input tensor channels
 * @return          The function returns the required buffer size in bytes
1656  *
1657  */
1658 int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
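
/*
 * Example: average pooling with its optional scratch buffer (only needed on
 * some targets; a zero size means no buffer is required). A minimal sketch
 * with illustrative names:
 *
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_avgpool_s8_get_buffer_size(output_dims.w, input_dims.c);
 *     ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
 *     arm_status status = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data,
 *                                        &filter_dims, &output_dims, output_data);
 */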
1659 
1660 /**
1661  * @brief s8 max pooling function.
1662  *
1663  * @param[in, out] ctx            Function context (e.g. temporary buffer). Check the function
1664  *                                definition file to see if an additional buffer is required.
1665  *                                Optional function {API}_get_buffer_size() provides the buffer
1666  *                                size if an additional buffer is required.
1667  * @param[in]      pool_params    Pooling parameters
1668  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [H, W, C_IN]
1669  *                                Argument 'N' is not used.
1670  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
1671  * @param[in]      filter_dims    Filter tensor dimensions. Format: [H, W]
 *                                Arguments N and C are not used.
1673  * @param[in]      output_dims    Output tensor dimensions. Format: [H, W, C_OUT]
1674  *                                Argument N is not used.
1675  *                                C_OUT equals C_IN.
1676  * @param[in, out] output_data    Output data pointer. Data type: int8
1677  * @return                        The function returns
1678  *                                    <code>ARM_MATH_SUCCESS</code> - Successful operation
1679  *
1680  * @details
1681  *    - Supported Framework: TensorFlow Lite
1682  *
1683  */
1684 arm_status arm_max_pool_s8(const cmsis_nn_context *ctx,
1685                            const cmsis_nn_pool_params *pool_params,
1686                            const cmsis_nn_dims *input_dims,
1687                            const q7_t *input_data,
1688                            const cmsis_nn_dims *filter_dims,
1689                            const cmsis_nn_dims *output_dims,
1690                            q7_t *output_data);
1691 /**
1692  * @defgroup Softmax Softmax Functions
1693  *
 * Softmax functions based on a base-2 exponential, EXP(2).
1695  *
1696  */
1697 
1698 /**
1699  * @brief Q7 softmax function
1700  * @param[in]       vec_in      pointer to input vector
1701  * @param[in]       dim_vec     input vector dimension
1702  * @param[out]      p_out       pointer to output vector
1703  *
1704  * @note This function is an optimized version which is not bit-accurate with
1705  *       TensorFlow Lite's kernel
1706  *
1707  */
1708 
1709 void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out);
1710 
1711 /**
1712  * @brief Q7 softmax function with batch parameter
1713  * @param[in]       vec_in      pointer to input vector
1714  * @param[in]       nb_batches  number of batches
1715  * @param[in]       dim_vec     input vector dimension
1716  * @param[out]      p_out       pointer to output vector
1717  * @return none.
1718  *
1719  * @note This function is an optimized version which is not bit-accurate with
1720  *       TensorFlow Lite's kernel
1721  *
1722  */
1723 
1724 void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out);
1725 /**
1726  * @brief Q15 softmax function
1727  * @param[in]       vec_in      pointer to input vector
1728  * @param[in]       dim_vec     input vector dimension
1729  * @param[out]      p_out       pointer to output vector
1730  * @return none.
1731  *
1732  * @note This function is an optimized version which is not bit-accurate with
1733  *       TensorFlow Lite's kernel
1734  *
1735  */
1736 
1737 void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out);
1738 
1739 /**
1740  * @brief S8 softmax function
1741  * @param[in]  input     Pointer to the input tensor
1742  * @param[in]  num_rows  Number of rows in the input tensor
1743  * @param[in]  row_size  Number of elements in each input row
1744  * @param[in]  mult      Input quantization multiplier
1745  * @param[in]  shift     Input quantization shift within the range [0, 31]
1746  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
1747  *                       the quantized exponential operation can be performed
1748  * @param[out] output    Pointer to the output tensor
1749  *
1750  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
1751  *
1752  */
1753 
1754 void arm_softmax_s8(const int8_t *input,
1755                     const int32_t num_rows,
1756                     const int32_t row_size,
1757                     const int32_t mult,
1758                     const int32_t shift,
1759                     const int32_t diff_min,
1760                     int8_t *output);
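
/*
 * Example: softmax over a batch of rows. A minimal sketch; mult, shift and
 * diff_min come from the framework's quantization parameters (the names below
 * are placeholders, not defaults).
 *
 *     int8_t logits[2 * 10];   // 2 rows of 10 classes each
 *     int8_t probs[2 * 10];
 *     arm_softmax_s8(logits, 2, 10, softmax_mult, softmax_shift, softmax_diff_min, probs);
 */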
1761 
1762 /**
1763  * @brief U8 softmax function
1764  * @param[in]  input     Pointer to the input tensor
1765  * @param[in]  num_rows  Number of rows in the input tensor
1766  * @param[in]  row_size  Number of elements in each input row
1767  * @param[in]  mult      Input quantization multiplier
1768  * @param[in]  shift     Input quantization shift within the range [0, 31]
1769  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
1770  *                       the quantized exponential operation can be performed
1771  * @param[out] output    Pointer to the output tensor
1772  *
1773  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
1774  *
1775  */
1776 
1777 void arm_softmax_u8(const uint8_t *input,
1778                     const int32_t num_rows,
1779                     const int32_t row_size,
1780                     const int32_t mult,
1781                     const int32_t shift,
1782                     const int32_t diff_min,
1783                     uint8_t *output);
1784 
1785 /**
 * @brief uint8 depthwise convolution function with asymmetric quantization.
 *        Unless specified otherwise, arguments are mandatory.
1788  *
1789  * @param[in]     input     Pointer to input tensor
1790  * @param[in]     input_x   Width of input tensor
1791  * @param[in]     input_y   Height of input tensor
1792  * @param[in]     input_ch  Channels in input tensor
1793  * @param[in]     kernel    Pointer to kernel weights
1794  * @param[in]     kernel_x  Width of kernel
1795  * @param[in]     kernel_y  Height of kernel
 * @param[in]     ch_mult   Channel multiplier, i.e. number of output channels per input channel
1797  * @param[in]     pad_x     Padding sizes x
1798  * @param[in]     pad_y     Padding sizes y
1799  * @param[in]     stride_x  stride along the width
1800  * @param[in]     stride_y  stride along the height
1801  * @param[in]     dilation_x Dilation along width. Not used and intended for future enhancement.
1802  * @param[in]     dilation_y Dilation along height. Not used and intended for future enhancement.
 * @param[in]     bias       Pointer to optional bias values. If no bias is
 *                           available, NULL is expected
1805  * @param[in]     input_offset  Input tensor zero offset
1806  * @param[in]     filter_offset Kernel tensor zero offset
1807  * @param[in]     output_offset Output tensor zero offset
1808  * @param[in,out] output        Pointer to output tensor
1809  * @param[in]     output_x  Width of output tensor
1810  * @param[in]     output_y  Height of output tensor
1811  * @param[in]     output_activation_min   Minimum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_activation_max   Maximum value to clamp the output to. Range : {0, 255}
1813  * @param[in]     out_shift  Amount of right-shift for output
1814  * @param[in]     out_mult   Output multiplier for requantization
1815  * @return        The function returns the following
1816  *                <code>ARM_MATH_SUCCESS</code> - Successful operation
1817  *
1818  */
1819 arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
1820                                             const uint16_t input_x,
1821                                             const uint16_t input_y,
1822                                             const uint16_t input_ch,
1823                                             const uint8_t *kernel,
1824                                             const uint16_t kernel_x,
1825                                             const uint16_t kernel_y,
1826                                             const int16_t ch_mult,
1827                                             const int16_t pad_x,
1828                                             const int16_t pad_y,
1829                                             const int16_t stride_x,
1830                                             const int16_t stride_y,
1831                                             const int16_t dilation_x,
1832                                             const int16_t dilation_y,
1833                                             const int32_t *bias,
1834                                             const int32_t input_offset,
1835                                             const int32_t filter_offset,
1836                                             const int32_t output_offset,
1837                                             uint8_t *output,
1838                                             const uint16_t output_x,
1839                                             const uint16_t output_y,
1840                                             const int32_t output_activation_min,
1841                                             const int32_t output_activation_max,
1842                                             const int32_t out_shift,
1843                                             const int32_t out_mult);
1844 
1845 /**
1846  * @defgroup Reshape Reshape Functions
1847  *
1848  */
1849 
1850 /**
1851  * @brief Reshape a s8 vector into another with different shape
1852  * @param[in]  input      points to the s8 input vector
1853  * @param[out] output     points to the s8 output vector
1854  * @param[in]  total_size total size of the input and output vectors in bytes
1855  *
1856  * @note The output is expected to be in a memory area that does not overlap with the input's
1857  *
1858  */
1859 void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
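
/*
 * Example: reshape is a pure copy, since the underlying data layout does not
 * change. A minimal sketch with illustrative buffers:
 *
 *     int8_t in[2 * 3 * 4];
 *     int8_t out[24];          // same element count, different logical shape
 *     arm_reshape_s8(in, out, sizeof(in));
 */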
1860 
1861 /**
1862  * @defgroup Concatenation Concatenation Functions
1863  *
1864  */
1865 
1866 /**
1867  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
1868  *        This function should be called for each input tensor to concatenate. The argument offset_x
1869  *        will be used to store the input tensor in the correct position in the output tensor
1870  *
 *        e.g.    offset_x = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x);
 *                    offset_x += input_x[i];
 *                }
 *        (a complete version of this loop follows the declaration below)
1877  *
1878  *        This function assumes that the output tensor has:
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
1882  *
1883  *        Unless specified otherwise, arguments are mandatory.
1884  *
 * @note This function, being data-layout independent, can be used to concatenate either int8 or uint8 tensors
 *      because it does not involve any arithmetic operation
1887  *
1888  * @param[in]  input    Pointer to input tensor
1889  * @param[in]  input_x  Width of input tensor
1890  * @param[in]  input_y  Height of input tensor
1891  * @param[in]  input_z  Channels in input tensor
1892  * @param[in]  input_w  Batch size in input tensor
1893  * @param[out] output   Pointer to output tensor
1894  * @param[in]  output_x Width of output tensor
1895  * @param[in]  offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
1897  *
1898  * <b> Input constraints</b>
1899  * offset_x is less than output_x
1900  *
1901  */
1902 void arm_concatenation_s8_x(const int8_t *input,
1903                             const uint16_t input_x,
1904                             const uint16_t input_y,
1905                             const uint16_t input_z,
1906                             const uint16_t input_w,
1907                             int8_t *output,
1908                             const uint16_t output_x,
1909                             const uint32_t offset_x);
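
/*
 * The concatenation loop from the description above, written out. A minimal
 * sketch assuming num_input_tensors inputs that share height, channel count
 * and batch size; the arrays and dimensions are illustrative.
 *
 *     uint32_t offset_x = 0;
 *     for (int i = 0; i < num_input_tensors; ++i)
 *     {
 *         arm_concatenation_s8_x(input[i], input_x[i], input_y, input_z, input_w,
 *                                output, output_x, offset_x);
 *         offset_x += input_x[i];
 *     }
 */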
1910 
1911 /**
1912  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
1913  *        This function should be called for each input tensor to concatenate. The argument offset_y
1914  *        will be used to store the input tensor in the correct position in the output tensor
1915  *
 *        e.g.    offset_y = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y);
 *                    offset_y += input_y[i];
 *                }
1922  *
1923  *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
1927  *
1928  *        Unless specified otherwise, arguments are mandatory.
1929  *
 * @note This function, being data-layout independent, can be used to concatenate either int8 or uint8 tensors
 *       because it does not involve any arithmetic operation
1932  *
1933  * @param[in]  input    Pointer to input tensor
1934  * @param[in]  input_x  Width of input tensor
1935  * @param[in]  input_y  Height of input tensor
1936  * @param[in]  input_z  Channels in input tensor
1937  * @param[in]  input_w  Batch size in input tensor
1938  * @param[out] output   Pointer to output tensor
1939  * @param[in]  output_y Height of output tensor
1940  * @param[in]  offset_y The offset on the Y axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
1942  *
1943  * <b> Input constraints</b>
1944  * offset_y is less than output_y
1945  *
1946  */
1947 void arm_concatenation_s8_y(const int8_t *input,
1948                             const uint16_t input_x,
1949                             const uint16_t input_y,
1950                             const uint16_t input_z,
1951                             const uint16_t input_w,
1952                             int8_t *output,
1953                             const uint16_t output_y,
1954                             const uint32_t offset_y);
1955 
1956 /**
1957  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
1958  *        This function should be called for each input tensor to concatenate. The argument offset_z
1959  *        will be used to store the input tensor in the correct position in the output tensor
1960  *
 *        e.g.    offset_z = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z);
 *                    offset_z += input_z[i];
 *                }
1967  *
1968  *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same batch size as the input tensor
1972  *
1973  *        Unless specified otherwise, arguments are mandatory.
1974  *
 * @note This function, being data-layout independent, can be used to concatenate either int8 or uint8 tensors
 *       because it does not involve any arithmetic operation
1977  *
1978  * @param[in]  input    Pointer to input tensor
1979  * @param[in]  input_x  Width of input tensor
1980  * @param[in]  input_y  Height of input tensor
1981  * @param[in]  input_z  Channels in input tensor
1982  * @param[in]  input_w  Batch size in input tensor
1983  * @param[out] output   Pointer to output tensor
1984  * @param[in]  output_z Channels in output tensor
1985  * @param[in]  offset_z The offset on the Z axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
1987  *
1988  * <b> Input constraints</b>
1989  * offset_z is less than output_z
1990  *
1991  */
1992 void arm_concatenation_s8_z(const int8_t *input,
1993                             const uint16_t input_x,
1994                             const uint16_t input_y,
1995                             const uint16_t input_z,
1996                             const uint16_t input_w,
1997                             int8_t *output,
1998                             const uint16_t output_z,
1999                             const uint32_t offset_z);
2000 
2001 /**
2002  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
2003  *        This function should be called for each input tensor to concatenate. The argument offset_w
2004  *        will be used to store the input tensor in the correct position in the output tensor
2005  *
 *        e.g.    offset_w = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w);
 *                    offset_w += input_w[i];
 *                }
2012  *
2013  *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
2017  *
2018  *        Unless specified otherwise, arguments are mandatory.
2019  *
 * @note This function, being data-layout independent, can be used to concatenate either int8 or uint8 tensors
 *       because it does not involve any arithmetic operation
2022  *
2023  * @param[in]  input    Pointer to input tensor
2024  * @param[in]  input_x  Width of input tensor
2025  * @param[in]  input_y  Height of input tensor
2026  * @param[in]  input_z  Channels in input tensor
2027  * @param[in]  input_w  Batch size in input tensor
2028  * @param[out] output   Pointer to output tensor
2029  * @param[in]  offset_w The offset on the W axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
2031  *
2032  */
2033 void arm_concatenation_s8_w(const int8_t *input,
2034                             const uint16_t input_x,
2035                             const uint16_t input_y,
2036                             const uint16_t input_z,
2037                             const uint16_t input_w,
2038                             int8_t *output,
2039                             const uint32_t offset_w);
2040 /**
2041  * @defgroup SVDF SVDF Layer Functions
2042  *
2043  */
2044 
2045 /**
2046  * @brief s8 SVDF function
2047  *
2048  * @param[in]   input_ctx Temporary scratch buffer
2049  * @param[in]   output_ctx Temporary output scratch buffer
2050  * @param[in]   svdf_params SVDF Parameters
2051  *              Range of svdf_params->input_offset  : [-128, 127]
2052  *              Range of svdf_params->output_offset  : [-128, 127]
2053  * @param[in]   input_quant_params Input quantization parameters
2054  * @param[in]   output_quant_params Output quantization parameters
2055  * @param[in]   input_dims Input tensor dimensions
2056  * @param[in]   input_data Pointer to input tensor
2057  * @param[in]   state_dims State tensor dimensions
2058  * @param[in]   state_data Pointer to state tensor
2059  * @param[in]   weights_feature_dims Weights (feature) tensor dimensions
2060  * @param[in]   weights_feature_data Pointer to the weights (feature) tensor
2061  * @param[in]   weights_time_dims Weights (time) tensor dimensions
2062  * @param[in]   weights_time_data Pointer to the weights (time) tensor
2063  * @param[in]   bias_dims Bias tensor dimensions
2064  * @param[in]   bias_data Pointer to bias tensor
2065  * @param[in]   output_dims Output tensor dimensions
2066  * @param[out]  output_data Pointer to the output tensor
2067  *
2068  * @return     The function returns <code>ARM_MATH_SUCCESS</code>
2069  *
2070  * @details
2071  *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as the data type even though it is s8 data. This is done to be consistent with existing APIs.
2073  *
2074  */
2075 arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
2076                        const cmsis_nn_context *output_ctx,
2077                        const cmsis_nn_svdf_params *svdf_params,
2078                        const cmsis_nn_per_tensor_quant_params *input_quant_params,
2079                        const cmsis_nn_per_tensor_quant_params *output_quant_params,
2080                        const cmsis_nn_dims *input_dims,
2081                        const q7_t *input_data,
2082                        const cmsis_nn_dims *state_dims,
2083                        q15_t *state_data,
2084                        const cmsis_nn_dims *weights_feature_dims,
2085                        const q7_t *weights_feature_data,
2086                        const cmsis_nn_dims *weights_time_dims,
2087                        const q15_t *weights_time_data,
2088                        const cmsis_nn_dims *bias_dims,
2089                        const q31_t *bias_data,
2090                        const cmsis_nn_dims *output_dims,
2091                        q7_t *output_data);
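
/*
 * Note: input_ctx and output_ctx are two independent scratch areas. A minimal
 * allocation sketch (the sizes are placeholders chosen by the integrating
 * framework, not values derived from this API):
 *
 *     cmsis_nn_context input_ctx  = { .buf = malloc(input_scratch_bytes),  .size = input_scratch_bytes };
 *     cmsis_nn_context output_ctx = { .buf = malloc(output_scratch_bytes), .size = output_scratch_bytes };
 */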
2092 
2093 #ifdef __cplusplus
2094 }
2095 #endif
2096 
2097 #endif
2098