1 /*
2  * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_nnfunctions.h
22  * Description:  Public header file for CMSIS NN Library
23  *
24  * $Date:        13 January 2023
25  * $Revision:    V.11.3.0
26  *
27  * Target :  Arm(R) M-Profile Architecture
28  * -------------------------------------------------------------------- */
29 
30 /**
31    \mainpage CMSIS NN Software Library
32    *
33    * \tableofcontents
34    * \section Introduction
35    *
36    *
37    * This user manual describes the CMSIS NN software library,
38    * a collection of efficient neural network kernels developed to maximize the
39    * performance and minimize the memory footprint of neural networks on Arm Cortex-M processors.
40    *
41    * The library is divided into a number of functions each covering a specific category:
42    * - \ref NNConv
43    * - \ref Acti
44    * - \ref FC
45    * - \ref SVDF
46    * - \ref Pooling
47    * - \ref Softmax
48    * - \ref groupElementwise
49    * - \ref LSTM
50    *
51    * \section Processors Supported Processors
52    *
53    * CMSIS-NN targets Cortex-M processors with typically three different implementations for each function. Each
54    * targets a different group of processors.
55    *  - Processors without Single Instruction Multiple Data(SIMD) capability (e.g, Cortex-M0)
56    *  - Processors with DSP extension (e.g Cortex-M4)
57    *  - Processors with Arm M-Profile Vector Extension(MVE) instructions (e.g Cortex-M55)
58    * The right implementation is picked through feature flags and the user does not have to explicit set it.
59    *
60    * \section Framework Quantization Specification
61    * The library follows the [int8](https://www.tensorflow.org/lite/performance/quantization_spec) and int16
62    *  quantization specification of TensorFlow Lite for Microcontrollers.
63    * \section Overview Block Diagram
64    *
65    * \image html CMSIS-NN-OVERVIEW.PNG
66    *
67    * \section Examples
68    *
69    *
70    * An example image recognition application using TensorFlow Flow Lite for Microcontrollers as an inference engine
71    * and CMSIS-NN as the optimized library can be found in the Examples directory.
72    *
73    * \section Macros Pre-processor Macros
74    *
75    * \subsection Feature Feature flag based
76    * The macros below are defined in a build system based on feature flags for a chosen processor or architecture
77    * input to a compiler.
78    * These tie in to the classification in \ref Macros.
79    *
80    * For a CMSIS-NN file compiled as *armclang -mcpu=cortex-m4 --target=arm-arm-none-eabi -I<CMSIS Core Include>
81    * -Ofast -O file.c* , ARM_MATH_DSP is enabled as Cortex-M4 has the DSP extension as a feature.
82    *
83    * - `ARM_MATH_DSP`  - Selects code for processors with DSP extension.
84    *
85    * - `ARM_MATH_MVEI`  - Selects code for processors which supports MVE instructions.
86    *
87    * \subsection MiscFlags User Set
88    * - `ARM_MATH_AUTOVECTORIZE`
89    *  Applicable when ARM_MATH_MVEI is active to let the compiler auto vectorize functions, if available, that uses
90    inline
91    *  assembly. This has to be explicitly set at compile time.
92    *
93    * \section Inclusive Inclusive Language
94    * This product confirms to Arm’s inclusive language policy and, to the best of our knowledge,
95    * does not contain any non-inclusive language. If you find something that concerns you, email terms@arm.com.
96    *
97    * \section Copyright Copyright Notice
98    *
99    *
100    * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
101    *
102    *
103    */
104 
105 /**
106  * @defgroup Public Public
107  * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
108  * TensorFlow Lite framework.
109  */
110 
111 #ifndef _ARM_NNFUNCTIONS_H
112 #define _ARM_NNFUNCTIONS_H
113 
114 #include "arm_nn_math_types.h"
115 #include "arm_nn_types.h"
116 
117 #define USE_INTRINSIC
118 
119 #ifdef __cplusplus
120 extern "C" {
121 #endif
122 
123 /**
124  * @defgroup NNConv Convolution Functions
125  *
126  * Collection of convolution, depthwise convolution functions and their variants.
127  *
128  * The convolution is implemented in 2 steps: im2col and General Matrix Multiplication(GEMM)
129  *
130  * im2col is a process of converting each patch of image data into
131  * a column. After im2col, the convolution is computed as matrix-matrix
132  * multiplication.
133  *
134  * To reduce the memory footprint, the im2col is performed partially.
135  * Each iteration, only a few column (i.e., patches) are generated followed
136  * by GEMM.
137  *
138  */
139 
140 /**
141  * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
142  *        cmsis-nn  to perform the convolution.
143  *
144  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
145  *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required.
146  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
147  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
148  *                                Range of conv_params->input_offset  : [-127, 128]
149  *                                Range of conv_params->output_offset : [-128, 127]
150  * @param[in]      quant_params   Per-channel quantization info.
151  *                                It contains the multiplier and shift values to be applied to each output channel
152  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
153  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
154  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
155  *                                spatial filter dimensions
156  * @param[in]      filter_data    Filter data pointer. Data type: int8
157  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
158  * @param[in]      bias_data      Bias data pointer. Data type: int32
159  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
160  * @param[out]     output_data    Output data pointer. Data type: int8
161  *
162  * @return     The function returns either
163  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
164  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
165  *
166  */
167 arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
168                                             const cmsis_nn_conv_params *conv_params,
169                                             const cmsis_nn_per_channel_quant_params *quant_params,
170                                             const cmsis_nn_dims *input_dims,
171                                             const int8_t *input_data,
172                                             const cmsis_nn_dims *filter_dims,
173                                             const int8_t *filter_data,
174                                             const cmsis_nn_dims *bias_dims,
175                                             const int32_t *bias_data,
176                                             const cmsis_nn_dims *output_dims,
177                                             int8_t *output_data);
178 
179 /**
180  * @brief Get the required buffer size for arm_convolve_wrapper_s8
181  *
182  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
183  *                                Range of conv_params->input_offset  : [-127, 128]
184  *                                Range of conv_params->output_offset : [-128, 127]
185  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
186  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
187  *                                filter dimensions
188  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
189  *
190  * @return         The function returns required buffer size(bytes)
191  *
192  */
193 int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
194                                                 const cmsis_nn_dims *input_dims,
195                                                 const cmsis_nn_dims *filter_dims,
196                                                 const cmsis_nn_dims *output_dims);
197 
198 /**
199  * @brief Get the required buffer size for arm_convolve_wrapper_s8 for Arm(R) Helium Architecture case.
200  *        Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
201  *
202  * @note       Intended for compilation on Host. If compiling for an Arm target, use
203  *             arm_convolve_wrapper_s8_get_buffer_size().
204  *
205  */
206 int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
207                                                     const cmsis_nn_dims *input_dims,
208                                                     const cmsis_nn_dims *filter_dims,
209                                                     const cmsis_nn_dims *output_dims);
210 
211 /**
212  * @brief Get the required buffer size for arm_convolve_wrapper_s8 for processors with DSP extension.
213  *        Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
214  *
215  * @note       Intended for compilation on Host. If compiling for an Arm target, use
216  *             arm_convolve_wrapper_s8_get_buffer_size().
217  *
218  */
219 int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
220                                                     const cmsis_nn_dims *input_dims,
221                                                     const cmsis_nn_dims *filter_dims,
222                                                     const cmsis_nn_dims *output_dims);
223 
224 /**
225  * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
226  *        cmsis-nn to perform the convolution.
227  *
228  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
229  *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required
230  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
231  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
232  *                                conv_params->input_offset  : Not used
233  *                                conv_params->output_offset : Not used
234  * @param[in]      quant_params   Per-channel quantization info.
235  *                                It contains the multiplier and shift values to be applied to each output channel
236  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
237  * @param[in]      input_data     Input (activation) data pointer. Data type: int16
238  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
239  *                                spatial filter dimensions
240  * @param[in]      filter_data    Filter data pointer. Data type: int8
241  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
242  * @param[in]      bias_data      Bias data pointer. Data type: int64
243  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
244  * @param[out]     output_data    Output data pointer. Data type: int16
245  *
246  * @return     The function returns either
247  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
248  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
249  *
250  */
251 arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
252                                              const cmsis_nn_conv_params *conv_params,
253                                              const cmsis_nn_per_channel_quant_params *quant_params,
254                                              const cmsis_nn_dims *input_dims,
255                                              const int16_t *input_data,
256                                              const cmsis_nn_dims *filter_dims,
257                                              const int8_t *filter_data,
258                                              const cmsis_nn_dims *bias_dims,
259                                              const int64_t *bias_data,
260                                              const cmsis_nn_dims *output_dims,
261                                              int16_t *output_data);
262 
263 /**
264  * @brief Get the required buffer size for arm_convolve_wrapper_s16.
265  *
266  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
267  *                                conv_params->input_offset  : Not used
268  *                                conv_params->output_offset : Not used
269  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
270  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
271  *                                filter dimensions
272  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
273  *
274  * @return         The function returns required buffer size(bytes)
275  *
276  */
277 int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
278                                                  const cmsis_nn_dims *input_dims,
279                                                  const cmsis_nn_dims *filter_dims,
280                                                  const cmsis_nn_dims *output_dims);
281 
282 /**
283  * @brief Get the required buffer size for arm_convolve_wrapper_s16 for for processors with DSP extension.
284  *        Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
285  *
286  * @note       Intended for compilation on Host. If compiling for an Arm target, use
287  *             arm_convolve_wrapper_s16_get_buffer_size().
288  *
289  */
290 int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
291                                                      const cmsis_nn_dims *input_dims,
292                                                      const cmsis_nn_dims *filter_dims,
293                                                      const cmsis_nn_dims *output_dims);
294 
295 /**
296  * @brief Get the required buffer size for arm_convolve_wrapper_s16 for Arm(R) Helium Architecture case.
297  *        Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
298  *
299  * @note       Intended for compilation on Host. If compiling for an Arm target, use
300  *             arm_convolve_wrapper_s16_get_buffer_size().
301  *
302  */
303 int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
304                                                      const cmsis_nn_dims *input_dims,
305                                                      const cmsis_nn_dims *filter_dims,
306                                                      const cmsis_nn_dims *output_dims);
307 
308 /**
309  * @brief Basic s8 convolution function
310  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
311  *                                arm_convolve_s8_get_buffer_size will return the buffer_size if required.
312  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
313  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
314  *                                Range of conv_params->input_offset  : [-127, 128]
315  *                                Range of conv_params->output_offset : [-128, 127]
316  * @param[in]      quant_params   Per-channel quantization info.
317  *                                It contains the multiplier and shift values to be applied to each output channel
318  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
319  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
320  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
321  *                                spatial filter dimensions
322  * @param[in]      filter_data    Filter data pointer. Data type: int8
323  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
324  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
325  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
326  * @param[out]     output_data    Output data pointer. Data type: int8
327 
328  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
329  *
330  * @details
331  *    1. Supported framework: TensorFlow Lite micro
332  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
333  *
334  */
335 arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx,
336                                     const cmsis_nn_conv_params *conv_params,
337                                     const cmsis_nn_per_channel_quant_params *quant_params,
338                                     const cmsis_nn_dims *input_dims,
339                                     const int8_t *input_data,
340                                     const cmsis_nn_dims *filter_dims,
341                                     const int8_t *filter_data,
342                                     const cmsis_nn_dims *bias_dims,
343                                     const int32_t *bias_data,
344                                     const cmsis_nn_dims *output_dims,
345                                     int8_t *output_data);
346 
347 /**
348  * @brief Get the required buffer size for s8 convolution function
349  *
350  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
351  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
352  * are the spatial filter dimensions
353  * @return          The function returns required buffer size(bytes)
354  *
355  */
356 int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
357 
358 /**
359  * @brief Basic s16 convolution function
360  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
361  *                                arm_convolve_s16_get_buffer_size will return the buffer_size if required.
362  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
363  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
364  *                                conv_params->input_offset  : Not used
365  *                                conv_params->output_offset : Not used
366  * @param[in]      quant_params   Per-channel quantization info.
367  *                                It contains the multiplier and shift values to be applied to each output channel
368  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
369  * @param[in]      input_data     Input (activation) data pointer. Data type: int16
370  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
371  *                                spatial filter dimensions
372  * @param[in]      filter_data    Filter data pointer. Data type: int8
373  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
374  * @param[in]      bias_data      Optional bias data pointer. Data type: int64
375  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
376  * @param[out]     output_data    Output data pointer. Data type: int16
377 
378  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
379  *
380  * @details
381  *    1. Supported framework: TensorFlow Lite micro
382  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
383  *
384  */
385 arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx,
386                                      const cmsis_nn_conv_params *conv_params,
387                                      const cmsis_nn_per_channel_quant_params *quant_params,
388                                      const cmsis_nn_dims *input_dims,
389                                      const int16_t *input_data,
390                                      const cmsis_nn_dims *filter_dims,
391                                      const int8_t *filter_data,
392                                      const cmsis_nn_dims *bias_dims,
393                                      const int64_t *bias_data,
394                                      const cmsis_nn_dims *output_dims,
395                                      int16_t *output_data);
396 /**
397  * @brief Optimized s16 convolution function
398  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
399  *                                arm_convolve_fast_s16_get_buffer_size will return the buffer_size if required.
400  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
401  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
402  *                                conv_params->input_offset  : Not used
403  *                                conv_params->output_offset : Not used
404  * @param[in]      quant_params   Per-channel quantization info.
405  *                                It contains the multiplier and shift values to be applied to each output channel
406  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
407  * @param[in]      input_data     Input (activation) data pointer. Data type: int16
408  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
409  *                                spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must not
410  exceed 512
411  * @param[in]      filter_data    Filter data pointer. Data type: int8
412  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
413  * @param[in]      bias_data      Optional bias data pointer. Data type: int64
414  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
415  * @param[out]     output_data    Output data pointer. Data type: int16
416 
417  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
418  *
419  * @details
420  *    1. Supported framework: TensorFlow Lite micro
421  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
422  *    3. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
423  *
424  */
425 
426 arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
427                                           const cmsis_nn_conv_params *conv_params,
428                                           const cmsis_nn_per_channel_quant_params *quant_params,
429                                           const cmsis_nn_dims *input_dims,
430                                           const int16_t *input_data,
431                                           const cmsis_nn_dims *filter_dims,
432                                           const int8_t *filter_data,
433                                           const cmsis_nn_dims *bias_dims,
434                                           const int64_t *bias_data,
435                                           const cmsis_nn_dims *output_dims,
436                                           int16_t *output_data);
437 
438 /**
439  * @brief Get the required buffer size for s16 convolution function
440  *
441  * @param[in]       input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
442  * @param[in]       filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
443  *                                are the spatial filter dimensions
444  * @return          The function returns required buffer size(bytes)
445  *
446  */
447 int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
448 
449 /**
450  * @brief Get the required buffer size for fast s16 convolution function
451  *
452  * @param[in]       input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
453  * @param[in]       filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
454  *                                are the spatial filter dimensions
455  * @return          The function returns required buffer size(bytes)
456  *
457  */
458 int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
459 
460 /**
461  * @brief Fast s8 version for 1x1 convolution (non-square shape)
462  *
463  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
464  *                               arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required.
465  *                               The caller is expected to clear the buffer ,if applicable, for security reasons.
466  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
467  *                               Range of conv_params->input_offset  : [-127, 128]
468  *                               Range of conv_params->output_offset : [-128, 127]
469  * @param[in]      quant_params  Per-channel quantization info.
470  *                               It contains the multiplier and shift values to be applied to each output channel
471  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
472  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
473  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
474  * @param[in]      filter_data   Filter data pointer. Data type: int8
475  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
476  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
477  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
478  * @param[out]     output_data   Output data pointer. Data type: int8
479  *
480  * @return     The function returns either
481  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
482  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
483  *
484  * @details
485  *   - Supported framework : TensorFlow Lite Micro
486  *   - The following constrains on the arguments apply
487  *      -# conv_params->padding.w = conv_params->padding.h = 0
488  *      -# conv_params->stride.w = conv_params->stride.h = 1
489  *
490  */
491 arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
492                                              const cmsis_nn_conv_params *conv_params,
493                                              const cmsis_nn_per_channel_quant_params *quant_params,
494                                              const cmsis_nn_dims *input_dims,
495                                              const int8_t *input_data,
496                                              const cmsis_nn_dims *filter_dims,
497                                              const int8_t *filter_data,
498                                              const cmsis_nn_dims *bias_dims,
499                                              const int32_t *bias_data,
500                                              const cmsis_nn_dims *output_dims,
501                                              int8_t *output_data);
502 
503 /**
504  * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
505  *
506  * @param[in]       input_dims            Input (activation) dimensions
507  * @return          The function returns the required buffer size in bytes
508  *
509  */
510 int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
511 
512 /**
513  * @brief s8 version for 1x1 convolution with support for non-unity stride values
514  *
515  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
516  *                               None is required by this function.
517  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
518  *                               Range of conv_params->input_offset  : [-127, 128]
519  *                               Range of conv_params->output_offset : [-128, 127]
520  * @param[in]      quant_params  Per-channel quantization info.
521  *                               It contains the multiplier and shift values to be applied to each output channel
522  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
523  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
524  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
525  * @param[in]      filter_data   Filter data pointer. Data type: int8
526  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
527  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
528  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
529  * @param[out]     output_data   Output data pointer. Data type: int8
530  *
531  * @return     The function returns either
532  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
533  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
534  * @details
535  *   - Supported framework : TensorFlow Lite Micro
536  *   - The following constrains on the arguments apply
537  *      -# conv_params->padding.w = conv_params->padding.h = 0
538  *
539  */
540 arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx,
541                                         const cmsis_nn_conv_params *conv_params,
542                                         const cmsis_nn_per_channel_quant_params *quant_params,
543                                         const cmsis_nn_dims *input_dims,
544                                         const int8_t *input_data,
545                                         const cmsis_nn_dims *filter_dims,
546                                         const int8_t *filter_data,
547                                         const cmsis_nn_dims *bias_dims,
548                                         const int32_t *bias_data,
549                                         const cmsis_nn_dims *output_dims,
550                                         int8_t *output_data);
551 
552 /**
553  * @brief 1xn convolution
554  *
555  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
556  *                               arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
557  *                               The caller is expected to clear the buffer ,if applicable, for security reasons.
558  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
559  *                               Range of conv_params->input_offset  : [-127, 128]
560  *                               Range of conv_params->output_offset : [-128, 127]
561  * @param[in]      quant_params  Per-channel quantization info.
562  *                               It contains the multiplier and shift values to be applied to each output channel
563  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
564  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
565  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
566  *                               spatial filter dimension
567  * @param[in]      filter_data   Filter data pointer. Data type: int8
568  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
569  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
570  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
571  * @param[out]     output_data   Output data pointer. Data type: int8
572  *
573  * @return     The function returns either
574  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
575  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
576  *
577  * @details
578  *   - Supported framework : TensorFlow Lite Micro
579  *   - The following constrains on the arguments apply
580  *      -# input_dims->n equals 1
581  *      -# ouput_dims->w is a multiple of 4
582  *      -# Explicit constraints(since it is for 1xN convolution)
583  *      -## input_dims->h equals 1
584  *      -## output_dims->h equals 1
585  *      -## filter_dims->h equals 1
586  *@todo  Remove constraint on output_dims->w to make the function generic.
587  *
588  */
589 arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
590                                           const cmsis_nn_conv_params *conv_params,
591                                           const cmsis_nn_per_channel_quant_params *quant_params,
592                                           const cmsis_nn_dims *input_dims,
593                                           const int8_t *input_data,
594                                           const cmsis_nn_dims *filter_dims,
595                                           const int8_t *filter_data,
596                                           const cmsis_nn_dims *bias_dims,
597                                           const int32_t *bias_data,
598                                           const cmsis_nn_dims *output_dims,
599                                           int8_t *output_data);
600 
601 /**
602  * @brief Get the required additional buffer size for 1xn convolution
603  *
604  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
605  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
606  *                                        horizontal spatial filter dimension
607  * @return          The function returns required buffer size(bytes)
608  *
609  */
610 int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
611 
612 /**
613  * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
614  *
615  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
616  *                                 definition file to see if an additional buffer is required.
617  *                                 Optional function {API}_get_buffer_size() provides the buffer
618  *                                 size if required.
619  *                                 The caller is expected to clear the buffer ,if applicable, for security reasons.
620  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
621  *                                 dw_conv_params->dilation is not used.
622  *                                 Range of dw_conv_params->input_offset : [-127, 128]
623  *                                 Range of dw_conv_params->output_offset : [-128, 127]
624  * @param[in]      quant_params    Per-channel quantization info.
625  *                                 It contains the multiplier and shift values to be applied to each
626  *                                 output channel
627  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
628  *                                 Batch argument N is not used and assumed to be 1.
629  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
630  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
631  * @param[in]      filter_data     Filter data pointer. Data type: int8
632  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
633  * @param[in]      bias_data       Bias data pointer. Data type: int32
634  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
635  * @param[in, out] output_data     Output data pointer. Data type: int8
636  * @return     The function returns
637  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
638  *
639  * @details
640  *    - Supported framework: TensorFlow Lite
641  *    - Picks one of the the following functions
642  *        -# arm_depthwise_conv_s8()
643  *        -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
644  *        -# arm_depthwise_conv_s8_opt()
645  *    - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
646  * boundary.
647  */
648 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
649                                                   const cmsis_nn_dw_conv_params *dw_conv_params,
650                                                   const cmsis_nn_per_channel_quant_params *quant_params,
651                                                   const cmsis_nn_dims *input_dims,
652                                                   const int8_t *input_data,
653                                                   const cmsis_nn_dims *filter_dims,
654                                                   const int8_t *filter_data,
655                                                   const cmsis_nn_dims *bias_dims,
656                                                   const int32_t *bias_data,
657                                                   const cmsis_nn_dims *output_dims,
658                                                   int8_t *output_data);
659 
660 /**
661  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
662  *
663  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
664  *                                 Range of dw_conv_params->input_offset : [-127, 128]
665  *                                 Range of dw_conv_params->input_offset : [-128, 127]
666  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
667  *                                 Batch argument N is not used and assumed to be 1.
668  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
669  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
670  * @return                         Size of additional memory required for optimizations in bytes.
671  *
672  */
673 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
674                                                       const cmsis_nn_dims *input_dims,
675                                                       const cmsis_nn_dims *filter_dims,
676                                                       const cmsis_nn_dims *output_dims);
677 
678 /**
679  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for processors with DSP extension.
680  *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
681  *
682  * @note       Intended for compilation on Host. If compiling for an Arm target, use
683  *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
684  *
685  */
686 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
687                                                           const cmsis_nn_dims *input_dims,
688                                                           const cmsis_nn_dims *filter_dims,
689                                                           const cmsis_nn_dims *output_dims);
690 
691 /**
692  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for Arm(R) Helium Architecture case.
693  *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
694  *
695  * @note       Intended for compilation on Host. If compiling for an Arm target, use
696  *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
697  *
698  */
699 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
700                                                           const cmsis_nn_dims *input_dims,
701                                                           const cmsis_nn_dims *filter_dims,
702                                                           const cmsis_nn_dims *output_dims);
703 
704 /**
705  * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
706  *
707  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
708  *                                 definition file to see if an additional buffer is required.
709  *                                 Optional function {API}_get_buffer_size() provides the buffer
710  *                                 size if an additional buffer is required exists if additional memory is.
711  *                                 The caller is expected to clear the buffer ,if applicable, for security reasons.
712  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
713  *                                 dw_conv_params->dilation is not used.
714  *                                 Range of dw_conv_params->input_offset : [-127, 128]
715  *                                 Range of dw_conv_params->input_offset : [-128, 127]
716  * @param[in]      quant_params    Per-channel quantization info.
717  *                                 It contains the multiplier and shift values to be applied to each
718  *                                 output channel
719  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
720  *                                 Batch argument N is not used.
721  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
722  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
723  * @param[in]      filter_data     Filter data pointer. Data type: int8
724  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
725  * @param[in]      bias_data       Bias data pointer. Data type: int32
726  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
727  * @param[in, out] output_data     Output data pointer. Data type: int8
728  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
729  *
730  * @details
731  *    - Supported framework: TensorFlow Lite
732  */
733 arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
734                                           const cmsis_nn_dw_conv_params *dw_conv_params,
735                                           const cmsis_nn_per_channel_quant_params *quant_params,
736                                           const cmsis_nn_dims *input_dims,
737                                           const int8_t *input_data,
738                                           const cmsis_nn_dims *filter_dims,
739                                           const int8_t *filter_data,
740                                           const cmsis_nn_dims *bias_dims,
741                                           const int32_t *bias_data,
742                                           const cmsis_nn_dims *output_dims,
743                                           int8_t *output_data);
744 
745 /**
746  * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
747  *
748  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
749  *                                 definition file to see if an additional buffer is required.
750  *                                 Optional function {API}_get_buffer_size() provides the buffer
751  *                                 size if an additional buffer is required.
752  *                                 exists if additional memory is.
753  *                                 The caller is expected to clear the buffer ,if applicable, for security reasons.
754  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
755  *                                 conv_params->input_offset  : Not used
756  *                                 conv_params->output_offset : Not used
757  * @param[in]      quant_params    Per-channel quantization info.
758  *                                 It contains the multiplier and shift values to be applied to each
759  *                                 output channel
760  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
761  *                                 Batch argument N is not used.
762  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
763  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
764  * @param[in]      filter_data     Filter data pointer. Data type: int8
765  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
766  * @param[in]      bias_data       Bias data pointer. Data type: int64
767  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
768  * @param[in, out] output_data     Output data pointer. Data type: int16
769  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
770  *
771  * @details
772  *    - Supported framework: TensorFlow Lite
773  */
774 arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
775                                            const cmsis_nn_dw_conv_params *dw_conv_params,
776                                            const cmsis_nn_per_channel_quant_params *quant_params,
777                                            const cmsis_nn_dims *input_dims,
778                                            const int16_t *input_data,
779                                            const cmsis_nn_dims *filter_dims,
780                                            const int8_t *filter_data,
781                                            const cmsis_nn_dims *bias_dims,
782                                            const int64_t *bias_data,
783                                            const cmsis_nn_dims *output_dims,
784                                            int16_t *output_data);
785 
786 /**
787  * @brief Wrapper function to pick the right optimized s16 depthwise convolution function
788  *
789  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
790  *                                 definition file to see if an additional buffer is required.
791  *                                 Optional function {API}_get_buffer_size() provides the buffer
792  *                                 size if required.
793  *                                 The caller is expected to clear the buffer ,if applicable, for security reasons.
794  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
795  *                                 dw_conv_params->dilation is not used.
796  *                                 Range of dw_conv_params->input_offset : Not used
797  *                                 Range of dw_conv_params->output_offset : Not used
798  * @param[in]      quant_params    Per-channel quantization info.
799  *                                 It contains the multiplier and shift values to be applied to each
800  *                                 output channel
801  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
802  *                                 Batch argument N is not used and assumed to be 1.
803  * @param[in]      input_data      Input (activation) data pointer. Data type: int16
804  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
805  * @param[in]      filter_data     Filter data pointer. Data type: int8
806  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
807  * @param[in]      bias_data       Bias data pointer. Data type: int64
808  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
809  * @param[in, out] output_data     Output data pointer. Data type: int16
810  * @return     The function returns
811  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
812  *
813  * @details
814  *    - Supported framework: TensorFlow Lite
815  *    - Picks one of the the following functions
816  *        -# arm_depthwise_conv_s16()
817  *        -# arm_depthwise_conv_fast_s16()  - Cortex-M CPUs with DSP extension only
818  */
819 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx,
820                                                    const cmsis_nn_dw_conv_params *dw_conv_params,
821                                                    const cmsis_nn_per_channel_quant_params *quant_params,
822                                                    const cmsis_nn_dims *input_dims,
823                                                    const int16_t *input_data,
824                                                    const cmsis_nn_dims *filter_dims,
825                                                    const int8_t *filter_data,
826                                                    const cmsis_nn_dims *bias_dims,
827                                                    const int64_t *bias_data,
828                                                    const cmsis_nn_dims *output_dims,
829                                                    int16_t *output_data);
830 
831 /**
832  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16()
833  *
834  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
835  *                                 Range of dw_conv_params->input_offset : Not used
836  *                                 Range of dw_conv_params->input_offset : Not used
837  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
838  *                                 Batch argument N is not used and assumed to be 1.
839  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
840  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
841  * @return                         Size of additional memory required for optimizations in bytes.
842  *
843  */
844 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
845                                                        const cmsis_nn_dims *input_dims,
846                                                        const cmsis_nn_dims *filter_dims,
847                                                        const cmsis_nn_dims *output_dims);
848 
849 /**
850  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for processors with DSP extension.
851  *        Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
852  *
853  * @note       Intended for compilation on Host. If compiling for an Arm target, use
854  *             arm_depthwise_conv_wrapper_s16_get_buffer_size().
855  *
856  */
857 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
858                                                            const cmsis_nn_dims *input_dims,
859                                                            const cmsis_nn_dims *filter_dims,
860                                                            const cmsis_nn_dims *output_dims);
861 
862 /**
863  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for Arm(R) Helium Architecture
864  * case. Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
865  *
866  * @note       Intended for compilation on Host. If compiling for an Arm target, use
867  *             arm_depthwise_conv_wrapper_s16_get_buffer_size().
868  *
869  */
870 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
871                                                            const cmsis_nn_dims *input_dims,
872                                                            const cmsis_nn_dims *filter_dims,
873                                                            const cmsis_nn_dims *output_dims);
874 
875 /**
876  * @brief Optimized s16 depthwise convolution function with constraint that in_channel equals out_channel.
877  *        Refer arm_depthwise_conv_s16() for function argument details.
878  *
879  * @return     The function returns one of the following
880  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - ctx-buff == NULL and
881  *                                                      arm_depthwise_conv_fast_s16_get_buffer_size() > 0 or
882  *                                                      input channel != output channel or
883  *                                                      ch_mult != 1
884  *
885  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
886  *
887  * @details
888  *    - Supported framework: TensorFlow Lite
889  *    - The following constrains on the arguments apply
890  *        -# Number of input channel equals number of output channels or ch_mult equals 1
891  *    - Reccomended when number of channels is 4 or greater.
892  *
893  */
894 arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
895                                                 const cmsis_nn_dw_conv_params *dw_conv_params,
896                                                 const cmsis_nn_per_channel_quant_params *quant_params,
897                                                 const cmsis_nn_dims *input_dims,
898                                                 const int16_t *input_data,
899                                                 const cmsis_nn_dims *filter_dims,
900                                                 const int8_t *filter_data,
901                                                 const cmsis_nn_dims *bias_dims,
902                                                 const int64_t *bias_data,
903                                                 const cmsis_nn_dims *output_dims,
904                                                 int16_t *output_data);
905 
906 /**
907  * @brief Get the required buffer size for optimized s16 depthwise convolution
908  * function with constraint that in_channel equals out_channel.
909  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
910  *                               Batch argument N is not used.
911  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
912  * @return          The function returns required buffer size in bytes
913  *
914  */
915 int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
916 
917 /**
918  * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
919  *        the input arguments(documented below). Refer arm_depthwise_conv_s8() for function
920  *        argument details.
921  *
922  * @return     The function returns one of the following
923  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - Unsupported dimension of tensors
924  *                                                    - Unsupported pad size along the x axis
925  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
926  *
927  * @details
928  *   - Supported framework : TensorFlow Lite Micro
929  *   - The following constrains on the arguments apply
930  *      -# Number of input channel equals number of output channels
931  *      -# Filter height and width equals 3
932  *      -# Padding along x is either 0 or 1.
933  *
934  */
935 arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
936                                               const cmsis_nn_dw_conv_params *dw_conv_params,
937                                               const cmsis_nn_per_channel_quant_params *quant_params,
938                                               const cmsis_nn_dims *input_dims,
939                                               const int8_t *input_data,
940                                               const cmsis_nn_dims *filter_dims,
941                                               const int8_t *filter_data,
942                                               const cmsis_nn_dims *bias_dims,
943                                               const int32_t *bias_data,
944                                               const cmsis_nn_dims *output_dims,
945                                               int8_t *output_data);
946 
947 /**
948  * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
949  *        Refer arm_depthwise_conv_s8() for function argument details.
950  *
951  * @return     The function returns one of the following
952  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or
953  *                                                      ch_mult != 1
954  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
955  *
956  * @note       If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
957  *             for the following if MVE optimizations(Arm Helium Technology) are used.
958  *               - Output shift
959  *               - Output multiplier
960  *               - Output bias
961  *               - kernel
962  * @details
963  *    - Supported framework: TensorFlow Lite
964  *    - The following constrains on the arguments apply
965  *        -# Number of input channel equals number of output channels or ch_mult equals 1
966  *    - Reccomended when number of channels is 4 or greater.
967  *
968  */
969 arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
970                                               const cmsis_nn_dw_conv_params *dw_conv_params,
971                                               const cmsis_nn_per_channel_quant_params *quant_params,
972                                               const cmsis_nn_dims *input_dims,
973                                               const int8_t *input_data,
974                                               const cmsis_nn_dims *filter_dims,
975                                               const int8_t *filter_data,
976                                               const cmsis_nn_dims *bias_dims,
977                                               const int32_t *bias_data,
978                                               const cmsis_nn_dims *output_dims,
979                                               int8_t *output_data);
980 
981 /**
982  * @brief Get the required buffer size for optimized s8 depthwise convolution
983  * function with constraint that in_channel equals out_channel.
984  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
985  *                               Batch argument N is not used.
986  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
987  * @return          The function returns required buffer size in bytes
988  *
989  */
990 int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
991 
992 /**
993  * @defgroup FC Fully-connected Layer Functions
994  *
995  * Collection of fully-connected and matrix multiplication functions.
996  *
997  * Fully-connected layer is basically a matrix-vector multiplication
998  * with bias. The matrix is the weights and the input/output vectors
999  * are the activation values. Supported {weight, activation} precisions
1000  * include {8-bit, 8-bit} and {8-bit, 16-bit}
1001  *
1002  *
1003  */
1004 
1005 /**
1006  * @brief Basic s8 Fully Connected function.
1007  *
1008  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1009  *                               definition file to see if an additional buffer is required.
1010  *                               Optional function {API}_get_buffer_size() provides the buffer
1011  *                               size if an additional buffer is required.
1012  *                               The caller is expected to clear the buffer ,if applicable, for security reasons.
1013  * @param[in]      fc_params     Fully Connected layer parameters.
1014  *                               Range of fc_params->input_offset  : [-127, 128]
1015  *                               fc_params->filter_offset : 0
1016  *                               Range of fc_params->output_offset : [-128, 127]
1017  * @param[in]      quant_params  Per-tensor quantization info.
1018  *                               It contains the multiplier and shift values to be applied to the output tensor.
1019  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1020  *                               Input dimension is taken as Nx(H * W * C_IN)
1021  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
1022  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1023  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1024  *                               C : output depth and equals C_OUT in output_dims
1025  *                               H & W : Not used
1026  * @param[in]      filter_data   Filter data pointer. Data type: int8
1027  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1028  *                               N, H, W : Not used
1029  * @param[in]      bias_data     Bias data pointer. Data type: int32
1030  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1031  *                               N : Batches
1032  *                               C_OUT : Output depth
1033  *                               H & W : Not used.
1034  * @param[in, out] output_data    Output data pointer. Data type: int8
1035  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1036  *
1037  * @details
1038  *    - Supported framework: TensorFlow Lite
1039  */
1040 arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
1041                                            const cmsis_nn_fc_params *fc_params,
1042                                            const cmsis_nn_per_tensor_quant_params *quant_params,
1043                                            const cmsis_nn_dims *input_dims,
1044                                            const int8_t *input_data,
1045                                            const cmsis_nn_dims *filter_dims,
1046                                            const int8_t *filter_data,
1047                                            const cmsis_nn_dims *bias_dims,
1048                                            const int32_t *bias_data,
1049                                            const cmsis_nn_dims *output_dims,
1050                                            int8_t *output_data);
1051 
1052 /**
1053  * @brief Get size of additional buffer required by arm_fully_connected_s8().
1054  * @param[in]      filter_dims             dimension of filter
1055  * @return         The function returns    required buffer size in bytes
1056  *
1057  */
1058 int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
1059 
1060 /**
1061  * @brief Get size of additional buffer required by arm_fully_connected_s8() for processors with DSP extension.
1062  *        Refer to arm_fully_connected_s8_get_buffer_size() for function argument details.
1063  *
1064  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1065  *             arm_fully_connected_s8_get_buffer_size().
1066  *
1067  */
1068 int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
1069 
1070 /**
1071  * @brief Get size of additional buffer required by arm_fully_connected_s8() for Arm(R) Helium Architecture case.
1072  *        Refer to arm_fully_connected_s8_get_buffer_size() for function argument details.
1073  *
1074  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1075  *             arm_fully_connected_s8_get_buffer_size().
1076  *
1077  */
1078 int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
1079 
1080 /**
1081  * @brief Basic s16 Fully Connected function.
1082  *
1083  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1084  *                               definition file to see if an additional buffer is required.
1085  *                               Optional function {API}_get_buffer_size() provides the buffer
1086  *                               size if an additional buffer is required.
1087  *                               The caller is expected to clear the buffer ,if applicable, for security reasons.
1088  * @param[in]      fc_params     Fully Connected layer parameters.
1089  *                               fc_params->input_offset  : 0
1090  *                               fc_params->filter_offset : 0
1091  *                               fc_params->output_offset : 0
1092  * @param[in]      quant_params  Per-tensor quantization info.
1093  *                               It contains the multiplier and shift values to be applied to the output tensor.
1094  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1095  *                               Input dimension is taken as Nx(H * W * C_IN)
1096  * @param[in]      input_data    Input (activation) data pointer. Data type: int16
1097  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1098  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1099  *                               C : output depth and equals C_OUT in output_dims
1100  *                               H & W : Not used
1101  * @param[in]      filter_data   Filter data pointer. Data type: int8
1102  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1103  *                               N, H, W : Not used
1104  * @param[in]      bias_data     Bias data pointer. Data type: int64
1105  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1106  *                               N : Batches
1107  *                               C_OUT : Output depth
1108  *                               H & W : Not used.
1109  * @param[in, out] output_data    Output data pointer. Data type: int16
1110  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1111  *
1112  * @details
1113  *    - Supported framework: TensorFlow Lite
1114  */
1115 arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx,
1116                                             const cmsis_nn_fc_params *fc_params,
1117                                             const cmsis_nn_per_tensor_quant_params *quant_params,
1118                                             const cmsis_nn_dims *input_dims,
1119                                             const int16_t *input_data,
1120                                             const cmsis_nn_dims *filter_dims,
1121                                             const int8_t *filter_data,
1122                                             const cmsis_nn_dims *bias_dims,
1123                                             const int64_t *bias_data,
1124                                             const cmsis_nn_dims *output_dims,
1125                                             int16_t *output_data);
1126 
1127 /**
1128  * @brief Get size of additional buffer required by arm_fully_connected_s16().
1129  * @param[in]      filter_dims             dimension of filter
1130  * @return         The function returns    required buffer size in bytes
1131  *
1132  */
1133 int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims);
1134 
1135 /**
1136  * @brief Get size of additional buffer required by arm_fully_connected_s16() for processors with DSP extension.
1137  *        Refer to arm_fully_connected_s16_get_buffer_size() for function argument details.
1138  *
1139  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1140  *             arm_fully_connected_s16_get_buffer_size().
1141  *
1142  */
1143 int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
1144 
1145 /**
1146  * @brief Get size of additional buffer required by arm_fully_connected_s16() for Arm(R) Helium Architecture case.
1147  *        Refer to arm_fully_connected_s16_get_buffer_size() for function argument details.
1148  *
1149  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1150  *             arm_fully_connected_s16_get_buffer_size().
1151  *
1152  */
1153 int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
1154 
1155 /**
1156  * @defgroup groupElementwise Elementwise Functions
1157  *
1158  * Elementwise add and multiplication functions.
1159  *
1160  */
1161 
1162 /**
1163  * @brief s8 elementwise add of two vectors
1164  * @param[in]       input_1_vect        pointer to input vector 1
1165  * @param[in]       input_2_vect        pointer to input vector 2
1166  * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
1167  * @param[in]       input_1_mult        multiplier for input 1
1168  * @param[in]       input_1_shift       shift for input 1
1169  * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
1170  * @param[in]       input_2_mult        multiplier for input 2
1171  * @param[in]       input_2_shift       shift for input 2
1172  * @param[in]       left_shift          input left shift
1173  * @param[in,out]   output              pointer to output vector
1174  * @param[in]       out_offset          output offset.  Range: -128 to 127
1175  * @param[in]       out_mult            output multiplier
1176  * @param[in]       out_shift           output shift
1177  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -128
1178  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 127
1179  * @param[in]       block_size          number of samples
1180  * @return          The function returns    ARM_CMSIS_NN_SUCCESS
1181  */
1182 arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect,
1183                                            const int8_t *input_2_vect,
1184                                            const int32_t input_1_offset,
1185                                            const int32_t input_1_mult,
1186                                            const int32_t input_1_shift,
1187                                            const int32_t input_2_offset,
1188                                            const int32_t input_2_mult,
1189                                            const int32_t input_2_shift,
1190                                            const int32_t left_shift,
1191                                            int8_t *output,
1192                                            const int32_t out_offset,
1193                                            const int32_t out_mult,
1194                                            const int32_t out_shift,
1195                                            const int32_t out_activation_min,
1196                                            const int32_t out_activation_max,
1197                                            const int32_t block_size);
1198 
1199 /**
1200  * @brief s16 elementwise add of two vectors
1201  * @param[in]       input_1_vect        pointer to input vector 1
1202  * @param[in]       input_2_vect        pointer to input vector 2
1203  * @param[in]       input_1_offset      offset for input 1. Not used.
1204  * @param[in]       input_1_mult        multiplier for input 1
1205  * @param[in]       input_1_shift       shift for input 1
1206  * @param[in]       input_2_offset      offset for input 2. Not used.
1207  * @param[in]       input_2_mult        multiplier for input 2
1208  * @param[in]       input_2_shift       shift for input 2
1209  * @param[in]       left_shift          input left shift
1210  * @param[in,out]   output              pointer to output vector
1211  * @param[in]       out_offset          output offset. Not used.
1212  * @param[in]       out_mult            output multiplier
1213  * @param[in]       out_shift           output shift
1214  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -32768
1215  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 32767
1216  * @param[in]       block_size          number of samples
1217  * @return          The function returns  ARM_CMSIS_NN_SUCCESS
1218  */
1219 arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect,
1220                                             const int16_t *input_2_vect,
1221                                             const int32_t input_1_offset,
1222                                             const int32_t input_1_mult,
1223                                             const int32_t input_1_shift,
1224                                             const int32_t input_2_offset,
1225                                             const int32_t input_2_mult,
1226                                             const int32_t input_2_shift,
1227                                             const int32_t left_shift,
1228                                             int16_t *output,
1229                                             const int32_t out_offset,
1230                                             const int32_t out_mult,
1231                                             const int32_t out_shift,
1232                                             const int32_t out_activation_min,
1233                                             const int32_t out_activation_max,
1234                                             const int32_t block_size);
1235 
1236 /**
1237  * @brief s8 elementwise multiplication
1238  * @param[in]       input_1_vect        pointer to input vector 1
1239  * @param[in]       input_2_vect        pointer to input vector 2
1240  * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
1241  * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
1242  * @param[in,out]   output              pointer to output vector
1243  * @param[in]       out_offset          output offset. Range: -128 to 127
1244  * @param[in]       out_mult            output multiplier
1245  * @param[in]       out_shift           output shift
1246  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -128
1247  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 127
1248  * @param[in]       block_size          number of samples
1249  * @return          The function returns ARM_CMSIS_NN_SUCCESS
1250  *
1251  * @details   Supported framework: TensorFlow Lite micro
1252  */
1253 arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
1254                                            const int8_t *input_2_vect,
1255                                            const int32_t input_1_offset,
1256                                            const int32_t input_2_offset,
1257                                            int8_t *output,
1258                                            const int32_t out_offset,
1259                                            const int32_t out_mult,
1260                                            const int32_t out_shift,
1261                                            const int32_t out_activation_min,
1262                                            const int32_t out_activation_max,
1263                                            const int32_t block_size);
1264 
1265 /**
1266  * @brief s16 elementwise multiplication
1267  * @param[in]       input_1_vect        pointer to input vector 1
1268  * @param[in]       input_2_vect        pointer to input vector 2
1269  * @param[in]       input_1_offset      offset for input 1. Not used.
1270  * @param[in]       input_2_offset      offset for input 2. Not used.
1271  * @param[in,out]   output              pointer to output vector
1272  * @param[in]       out_offset          output offset. Not used.
1273  * @param[in]       out_mult            output multiplier
1274  * @param[in]       out_shift           output shift
1275  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -32768
1276  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 32767
1277  * @param[in]       block_size          number of samples
1278  * @return          The function returns ARM_CMSIS_NN_SUCCESS
1279  *
1280  * @details   Supported framework: TensorFlow Lite micro
1281  */
1282 arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
1283                                             const int16_t *input_2_vect,
1284                                             const int32_t input_1_offset,
1285                                             const int32_t input_2_offset,
1286                                             int16_t *output,
1287                                             const int32_t out_offset,
1288                                             const int32_t out_mult,
1289                                             const int32_t out_shift,
1290                                             const int32_t out_activation_min,
1291                                             const int32_t out_activation_max,
1292                                             const int32_t block_size);
1293 
1294 /**
1295  * @defgroup Acti Activation Functions
1296  *
1297  * Perform activation layers, including ReLU (Rectified Linear Unit),
1298  * sigmoid and tanh
1299  *
1300  */
1301 
1302 /**
1303  * @brief Q7 RELU function
1304  * @param[in,out]   data        pointer to input
1305  * @param[in]       size        number of elements
1306  */
1307 void arm_relu_q7(int8_t *data, uint16_t size);
1308 
1309 /**
1310  * @brief s8 ReLU6 function
1311  * @param[in,out]   data        pointer to input
1312  * @param[in]       size        number of elements
1313  */
1314 void arm_relu6_s8(int8_t *data, uint16_t size);
1315 
1316 /**
1317  * @brief Q15 RELU function
1318  * @param[in,out]   data        pointer to input
1319  * @param[in]       size        number of elements
1320  */
1321 void arm_relu_q15(int16_t *data, uint16_t size);
1322 
1323 /**
1324  * @brief s16 neural network activation function using direct table look-up
1325  * @param[in]       input        pointer to input data
1326  * @param[out]      output      pointer to output
1327  * @param[in]       size        number of elements
1328  * @param[in]       left_shift  bit-width of the integer part, assume to be smaller than 3
1329  * @param[in]       type        type of activation functions
1330  *
1331  * @details Supported framework: TensorFlow Lite for Microcontrollers.
1332  * This activation function must be bit precise congruent with the corresponding TFLM tanh and sigmoid actication
1333  * functions
1334  */
1335 void arm_nn_activation_s16(const int16_t *input,
1336                            int16_t *output,
1337                            const uint16_t size,
1338                            const uint16_t left_shift,
1339                            const arm_nn_activation_type type);
1340 
1341 /**
1342  * @defgroup Pooling Pooling Functions
1343  *
1344  * Perform max and average pooling operations
1345  *
1346  */
1347 
1348 /**
1349  * @brief s8 average pooling function.
1350  *
1351  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
1352  *                              definition file to see if an additional buffer is required.
1353  *                              Optional function {API}_get_buffer_size() provides the buffer
1354  *                              size if an additional buffer is required.
1355  *                              The caller is expected to clear the buffer ,if applicable, for security reasons.
1356  * @param[in]      pool_params  Pooling parameters
1357  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
1358  *                              Argument 'N' is not used.
1359  * @param[in]      input_data   Input (activation) data pointer. Data type: int8
1360  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
1361  *                              Argument N and C are not used.
1362  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
1363  *                              Argument N is not used.
1364  *                              C_OUT equals C_IN.
1365  * @param[in, out] output_data Output data pointer. Data type: int8
1366  * @return                     The function returns
1367  *                             <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1368  *
1369  * @details
1370  *    - Supported Framework: TensorFlow Lite
1371  *
1372  */
1373 arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx,
1374                                    const cmsis_nn_pool_params *pool_params,
1375                                    const cmsis_nn_dims *input_dims,
1376                                    const int8_t *input_data,
1377                                    const cmsis_nn_dims *filter_dims,
1378                                    const cmsis_nn_dims *output_dims,
1379                                    int8_t *output_data);
1380 
1381 /**
1382  * @brief Get the required buffer size for S8 average pooling function
1383  * @param[in]       dim_dst_width         output tensor dimension
1384  * @param[in]       ch_src                number of input tensor channels
1385  * @return          The function returns required buffer size in bytes
1386  *
1387  */
1388 int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
1389 
1390 /**
1391  * @brief Get the required buffer size for S8 average pooling function for processors with DSP extension.
1392  *        Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
1393  *
1394  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1395  *             arm_avgpool_s8_get_buffer_size().
1396  *
1397  */
1398 int32_t arm_avgpool_s8_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
1399 
1400 /**
1401  * @brief Get the required buffer size for S8 average pooling function for Arm(R) Helium Architecture case.
1402  *        Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
1403  *
1404  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1405  *             arm_avgpool_s8_get_buffer_size().
1406  *
1407  */
1408 int32_t arm_avgpool_s8_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
1409 
1410 /**
1411  * @brief s16 average pooling function.
1412  *
1413  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
1414  *                              definition file to see if an additional buffer is required.
1415  *                              Optional function {API}_get_buffer_size() provides the buffer
1416  *                              size if an additional buffer is required.
1417  *                              The caller is expected to clear the buffer ,if applicable, for security reasons.
1418  * @param[in]      pool_params  Pooling parameters
1419  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
1420  *                              Argument 'N' is not used.
1421  * @param[in]      input_data   Input (activation) data pointer. Data type: int16
1422  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
1423  *                              Argument N and C are not used.
1424  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
1425  *                              Argument N is not used.
1426  *                              C_OUT equals C_IN.
1427  * @param[in, out] output_data  Output data pointer. Data type: int16
1428  * @return                        The function returns
1429  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1430  *                                    <code>ARM_CMSIS_NN_ARG_ERROR</code> - In case of invalid arguments
1431  *
1432  * @details
1433  *    - Supported Framework: TensorFlow Lite
1434  *
1435  */
1436 arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx,
1437                                     const cmsis_nn_pool_params *pool_params,
1438                                     const cmsis_nn_dims *input_dims,
1439                                     const int16_t *input_data,
1440                                     const cmsis_nn_dims *filter_dims,
1441                                     const cmsis_nn_dims *output_dims,
1442                                     int16_t *output_data);
1443 
1444 /**
1445  * @brief Get the required buffer size for S16 average pooling function
1446  * @param[in]       dim_dst_width         output tensor dimension
1447  * @param[in]       ch_src                number of input tensor channels
1448  * @return          The function returns required buffer size in bytes
1449  *
1450  */
1451 int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src);
1452 
1453 /**
1454  * @brief Get the required buffer size for S16 average pooling function for processors with DSP extension.
1455  *        Refer to arm_avgpool_s16_get_buffer_size() for function argument details.
1456  *
1457  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1458  *             arm_avgpool_s16_get_buffer_size().
1459  *
1460  */
1461 int32_t arm_avgpool_s16_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
1462 
1463 /**
1464  * @brief Get the required buffer size for S16 average pooling function for Arm(R) Helium Architecture case.
1465  *        Refer to arm_avgpool_s16_get_buffer_size() for function argument details.
1466  *
1467  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1468  *             arm_avgpool_s16_get_buffer_size().
1469  *
1470  */
1471 int32_t arm_avgpool_s16_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
1472 
1473 /**
1474  * @brief s8 max pooling function.
1475  *
1476  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
1477  *                              definition file to see if an additional buffer is required.
1478  *                              Optional function {API}_get_buffer_size() provides the buffer
1479  *                              size if an additional buffer is required.
1480  *                              The caller is expected to clear the buffer ,if applicable, for security reasons.
1481  * @param[in]      pool_params  Pooling parameters
1482  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
1483  *                              Argument 'N' is not used.
1484  * @param[in]      input_data   Input (activation) data pointer. The input tensor must not
1485  *                              overlap with the output tensor. Data type: int8
1486  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
1487  *                              Argument N and C are not used.
1488  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
1489  *                              Argument N is not used.
1490  *                              C_OUT equals C_IN.
1491  * @param[in, out] output_data    Output data pointer. Data type: int8
1492  * @return                        The function returns
1493  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1494  *
1495  * @details
1496  *    - Supported Framework: TensorFlow Lite
1497  *
1498  */
1499 arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx,
1500                                     const cmsis_nn_pool_params *pool_params,
1501                                     const cmsis_nn_dims *input_dims,
1502                                     const int8_t *input_data,
1503                                     const cmsis_nn_dims *filter_dims,
1504                                     const cmsis_nn_dims *output_dims,
1505                                     int8_t *output_data);
1506 
1507 /**
1508  * @brief s16 max pooling function.
1509  *
1510  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
1511  *                              definition file to see if an additional buffer is required.
1512  *                              Optional function {API}_get_buffer_size() provides the buffer
1513  *                              size if an additional buffer is required.
1514  *                              The caller is expected to clear the buffer ,if applicable, for security reasons.
1515  * @param[in]      pool_params  Pooling parameters
1516  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
1517  *                              Argument 'N' is not used.
1518  * @param[in]      src          Input (activation) data pointer. The input tensor must not
1519  *                              overlap with the output tensor. Data type: int16
1520  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
1521  *                              Argument N and C are not used.
1522  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
1523  *                              Argument N is not used.
1524  *                              C_OUT equals C_IN.
1525  * @param[in, out] dst          Output data pointer. Data type: int16
1526  * @return                        The function returns
1527  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1528  *
1529  * @details
1530  *    - Supported Framework: TensorFlow Lite
1531  *
1532  */
1533 arm_cmsis_nn_status arm_max_pool_s16(const cmsis_nn_context *ctx,
1534                                      const cmsis_nn_pool_params *pool_params,
1535                                      const cmsis_nn_dims *input_dims,
1536                                      const int16_t *src,
1537                                      const cmsis_nn_dims *filter_dims,
1538                                      const cmsis_nn_dims *output_dims,
1539                                      int16_t *dst);
1540 
1541 /**
1542  * @defgroup Softmax Softmax Functions
1543  *
1544  *
1545  */
1546 
1547 /**
1548  * @brief S8 softmax function
1549  * @param[in]  input     Pointer to the input tensor
1550  * @param[in]  num_rows  Number of rows in the input tensor
1551  * @param[in]  row_size  Number of elements in each input row
1552  * @param[in]  mult      Input quantization multiplier
1553  * @param[in]  shift     Input quantization shift within the range [0, 31]
1554  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
1555  *                       the quantized exponential operation can be performed
1556  * @param[out] output    Pointer to the output tensor
1557  *
1558  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
1559  *
1560  */
1561 void arm_softmax_s8(const int8_t *input,
1562                     const int32_t num_rows,
1563                     const int32_t row_size,
1564                     const int32_t mult,
1565                     const int32_t shift,
1566                     const int32_t diff_min,
1567                     int8_t *output);
1568 
1569 /**
1570  * @brief S8 to s16 softmax function
1571  * @param[in]  input     Pointer to the input tensor
1572  * @param[in]  num_rows  Number of rows in the input tensor
1573  * @param[in]  row_size  Number of elements in each input row
1574  * @param[in]  mult      Input quantization multiplier
1575  * @param[in]  shift     Input quantization shift within the range [0, 31]
1576  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
1577  *                       the quantized exponential operation can be performed
1578  * @param[out] output    Pointer to the output tensor
1579  *
1580  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
1581  *
1582  */
1583 void arm_softmax_s8_s16(const int8_t *input,
1584                         const int32_t num_rows,
1585                         const int32_t row_size,
1586                         const int32_t mult,
1587                         const int32_t shift,
1588                         const int32_t diff_min,
1589                         int16_t *output);
1590 
1591 /**
1592  * @brief S16 softmax function
1593  * @param[in]  input           Pointer to the input tensor
1594  * @param[in]  num_rows        Number of rows in the input tensor
1595  * @param[in]  row_size        Number of elements in each input row
1596  * @param[in]  mult            Input quantization multiplier
1597  * @param[in]  shift           Input quantization shift within the range [0, 31]
1598  * @param[in]  softmax_params  Softmax s16 layer parameters with two pointers to LUTs speficied below.
1599  *                             For indexing the high 9 bits are used and 7 remaining for interpolation.
1600  *                             That means 512 entries for the 9-bit indexing and 1 extra for interpolation, i.e. 513
1601  *                             values for each LUT.
1602  *                             - Lookup table for exp(x), where x uniform distributed between [-10.0 , 0.0]
1603  *                             - Lookup table for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0]
1604  * @param[out] output          Pointer to the output tensor
1605  * @return                        The function returns
1606  *                                    <code>ARM_CMSIS_NN_ARG_ERROR</code> Argument error check failed
1607  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1608  *
1609  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
1610  *
1611  */
1612 arm_cmsis_nn_status arm_softmax_s16(const int16_t *input,
1613                                     const int32_t num_rows,
1614                                     const int32_t row_size,
1615                                     const int32_t mult,
1616                                     const int32_t shift,
1617                                     const cmsis_nn_softmax_lut_s16 *softmax_params,
1618                                     int16_t *output);
1619 
1620 /**
1621  * @brief U8 softmax function
1622  * @param[in]  input     Pointer to the input tensor
1623  * @param[in]  num_rows  Number of rows in the input tensor
1624  * @param[in]  row_size  Number of elements in each input row
1625  * @param[in]  mult      Input quantization multiplier
1626  * @param[in]  shift     Input quantization shift within the range [0, 31]
1627  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
1628  *                       the quantized exponential operation can be performed
1629  * @param[out] output    Pointer to the output tensor
1630  *
1631  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
1632  *
1633  */
1634 
1635 void arm_softmax_u8(const uint8_t *input,
1636                     const int32_t num_rows,
1637                     const int32_t row_size,
1638                     const int32_t mult,
1639                     const int32_t shift,
1640                     const int32_t diff_min,
1641                     uint8_t *output);
1642 
1643 /**
1644  * @defgroup Reshape Reshape Functions
1645  *
1646  */
1647 
1648 /**
1649  * @brief Reshape a s8 vector into another with different shape
1650  * @param[in]  input      points to the s8 input vector
1651  * @param[out] output     points to the s8 output vector
1652  * @param[in]  total_size total size of the input and output vectors in bytes
1653  *
1654  * @note The output is expected to be in a memory area that does not overlap with the input's
1655  *
1656  */
1657 void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
1658 
1659 /**
1660  * @defgroup Concatenation Concatenation Functions
1661  *
1662  */
1663 
1664 /**
1665  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
1666  *        This function should be called for each input tensor to concatenate. The argument offset_x
1667  *        will be used to store the input tensor in the correct position in the output tensor
1668  *
1669  *        i.e.    offset_x = 0
1670  *                for(i = 0 i < num_input_tensors; ++i)
1671  *                {
1672  *                    arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x)
1673  *                    offset_x += input_x[i]
1674  *                }
1675  *
1676  *        This function assumes that the output tensor has:
1677  *        -# The same height of the input tensor
1678  *        -# The same number of channels of the input tensor
1679  *        -# The same batch size of the input tensor
1680  *
1681  *        Unless specified otherwise, arguments are mandatory.
1682  *
1683  * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
1684  *      does not involve any arithmetic operation
1685  *
1686  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
1687  * @param[in]  input_x  Width of input tensor
1688  * @param[in]  input_y  Height of input tensor
1689  * @param[in]  input_z  Channels in input tensor
1690  * @param[in]  input_w  Batch size in input tensor
1691  * @param[out] output   Pointer to output tensor. Expected to be at least
1692  *                          (input_x * input_y * input_z * input_w) + offset_x
1693  *                      bytes.
1694  * @param[in]  output_x Width of output tensor
1695  * @param[in]  offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor
1696  *                      It is user responsibility to provide the correct value
1697  *
1698  * <b> Input constraints</b>
1699  * offset_x is less than output_x
1700  *
1701  */
1702 void arm_concatenation_s8_x(const int8_t *input,
1703                             const uint16_t input_x,
1704                             const uint16_t input_y,
1705                             const uint16_t input_z,
1706                             const uint16_t input_w,
1707                             int8_t *output,
1708                             const uint16_t output_x,
1709                             const uint32_t offset_x);
1710 
1711 /**
1712  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
1713  *        This function should be called for each input tensor to concatenate. The argument offset_y
1714  *        will be used to store the input tensor in the correct position in the output tensor
1715  *
1716  *        i.e.    offset_y = 0
1717  *                for(i = 0 i < num_input_tensors; ++i)
1718  *                {
1719  *                    arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y)
1720  *                    offset_y += input_y[i]
1721  *                }
1722  *
1723  *        This function assumes that the output tensor has:
1724  *        -# The same width of the input tensor
1725  *        -# The same number of channels of the input tensor
1726  *        -# The same batch size of the input tensor
1727  *
1728  *        Unless specified otherwise, arguments are mandatory.
1729  *
1730  * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
1731  *       does not involve any arithmetic operation
1732  *
1733  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
1734  * @param[in]  input_x  Width of input tensor
1735  * @param[in]  input_y  Height of input tensor
1736  * @param[in]  input_z  Channels in input tensor
1737  * @param[in]  input_w  Batch size in input tensor
1738  * @param[out] output   Pointer to output tensor. Expected to be at least
1739  *                          (input_z * input_w * input_x * input_y) + offset_y
1740  *                      bytes.
1741  * @param[in]  output_y Height of output tensor
1742  * @param[in]  offset_y The offset on the Y axis to start concatenating the input tensor
1743  *                      It is user responsibility to provide the correct value
1744  *
1745  * <b> Input constraints</b>
1746  * offset_y is less than output_y
1747  *
1748  */
1749 void arm_concatenation_s8_y(const int8_t *input,
1750                             const uint16_t input_x,
1751                             const uint16_t input_y,
1752                             const uint16_t input_z,
1753                             const uint16_t input_w,
1754                             int8_t *output,
1755                             const uint16_t output_y,
1756                             const uint32_t offset_y);
1757 
1758 /**
1759  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
1760  *        This function should be called for each input tensor to concatenate. The argument offset_z
1761  *        will be used to store the input tensor in the correct position in the output tensor
1762  *
1763  *        i.e.    offset_z = 0
1764  *                for(i = 0 i < num_input_tensors; ++i)
1765  *                {
1766  *                    arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z)
1767  *                    offset_z += input_z[i]
1768  *                }
1769  *
1770  *        This function assumes that the output tensor has:
1771  *        -# The same width of the input tensor
1772  *        -# The same height of the input tensor
1773  *        -# The same batch size of the input tensor
1774  *
1775  *        Unless specified otherwise, arguments are mandatory.
1776  *
1777  * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
1778  *       does not involve any arithmetic operation
1779  *
1780  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with output tensor.
1781  * @param[in]  input_x  Width of input tensor
1782  * @param[in]  input_y  Height of input tensor
1783  * @param[in]  input_z  Channels in input tensor
1784  * @param[in]  input_w  Batch size in input tensor
1785  * @param[out] output   Pointer to output tensor. Expected to be at least
1786  *                          (input_x * input_y * input_z * input_w) + offset_z
1787  *                      bytes.
1788  * @param[in]  output_z Channels in output tensor
1789  * @param[in]  offset_z The offset on the Z axis to start concatenating the input tensor
1790  *                      It is user responsibility to provide the correct value
1791  *
1792  * <b> Input constraints</b>
1793  * offset_z is less than output_z
1794  *
1795  */
1796 void arm_concatenation_s8_z(const int8_t *input,
1797                             const uint16_t input_x,
1798                             const uint16_t input_y,
1799                             const uint16_t input_z,
1800                             const uint16_t input_w,
1801                             int8_t *output,
1802                             const uint16_t output_z,
1803                             const uint32_t offset_z);
1804 
1805 /**
1806  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
1807  *        This function should be called for each input tensor to concatenate. The argument offset_w
1808  *        will be used to store the input tensor in the correct position in the output tensor
1809  *
1810  *        i.e.    offset_w = 0
1811  *                for(i = 0 i < num_input_tensors; ++i)
1812  *                {
1813  *                    arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w)
1814  *                    offset_w += input_w[i]
1815  *                }
1816  *
1817  *        This function assumes that the output tensor has:
1818  *        -# The same width of the input tensor
1819  *        -# The same height of the input tensor
1820  *        -# The same number o channels of the input tensor
1821  *
1822  *        Unless specified otherwise, arguments are mandatory.
1823  *
1824  * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
1825  *       does not involve any arithmetic operation
1826  *
1827  * @param[in]  input    Pointer to input tensor
1828  * @param[in]  input_x  Width of input tensor
1829  * @param[in]  input_y  Height of input tensor
1830  * @param[in]  input_z  Channels in input tensor
1831  * @param[in]  input_w  Batch size in input tensor
1832  * @param[out] output   Pointer to output tensor. Expected to be at least
1833  *                          input_x * input_y * input_z * input_w
1834  *                      bytes.
1835  * @param[in]  offset_w The offset on the W axis to start concatenating the input tensor
1836  *                      It is user responsibility to provide the correct value
1837  *
1838  */
1839 void arm_concatenation_s8_w(const int8_t *input,
1840                             const uint16_t input_x,
1841                             const uint16_t input_y,
1842                             const uint16_t input_z,
1843                             const uint16_t input_w,
1844                             int8_t *output,
1845                             const uint32_t offset_w);
1846 /**
1847  * @defgroup SVDF SVDF Functions
1848  *
1849  */
1850 
1851 /**
1852  * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
1853  *
1854  * @param[in]   input_ctx             Temporary scratch buffer
1855  *                                    The caller is expected to clear the buffer ,if applicable, for security reasons.
1856  * @param[in]   output_ctx            Temporary output scratch buffer
1857  *                                    The caller is expected to clear the buffer ,if applicable, for security reasons.
1858  * @param[in]   svdf_params           SVDF Parameters
1859  *                                    Range of svdf_params->input_offset  : [-128, 127]
1860  *                                    Range of svdf_params->output_offset  : [-128, 127]
1861  * @param[in]   input_quant_params    Input quantization parameters
1862  * @param[in]   output_quant_params   Output quantization parameters
1863  * @param[in]   input_dims            Input tensor dimensions
1864  * @param[in]   input_data            Pointer to input tensor
1865  * @param[in]   state_dims            State tensor dimensions
1866  * @param[in]   state_data            Pointer to state tensor
1867  * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
1868  * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
1869  * @param[in]   weights_time_dims     Weights (time) tensor dimensions
1870  * @param[in]   weights_time_data     Pointer to the weights (time) tensor
1871  * @param[in]   bias_dims             Bias tensor dimensions
1872  * @param[in]   bias_data             Pointer to bias tensor
1873  * @param[in]   output_dims           Output tensor dimensions
1874  * @param[out]  output_data           Pointer to the output tensor
1875  *
1876  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1877  *
1878  * @details
1879  *    1. Supported framework: TensorFlow Lite micro
1880  */
1881 arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
1882                                 const cmsis_nn_context *output_ctx,
1883                                 const cmsis_nn_svdf_params *svdf_params,
1884                                 const cmsis_nn_per_tensor_quant_params *input_quant_params,
1885                                 const cmsis_nn_per_tensor_quant_params *output_quant_params,
1886                                 const cmsis_nn_dims *input_dims,
1887                                 const int8_t *input_data,
1888                                 const cmsis_nn_dims *state_dims,
1889                                 int8_t *state_data,
1890                                 const cmsis_nn_dims *weights_feature_dims,
1891                                 const int8_t *weights_feature_data,
1892                                 const cmsis_nn_dims *weights_time_dims,
1893                                 const int8_t *weights_time_data,
1894                                 const cmsis_nn_dims *bias_dims,
1895                                 const int32_t *bias_data,
1896                                 const cmsis_nn_dims *output_dims,
1897                                 int8_t *output_data);
1898 
1899 /**
1900  * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
1901  *
1902  * @param[in]   input_ctx             Temporary scratch buffer
1903  *                                    The caller is expected to clear the buffer ,if applicable, for security reasons.
1904  * @param[in]   output_ctx            Temporary output scratch buffer
1905  *                                    The caller is expected to clear the buffer ,if applicable, for security reasons.
1906  * @param[in]   svdf_params           SVDF Parameters
1907  *                                    Range of svdf_params->input_offset  : [-128, 127]
1908  *                                    Range of svdf_params->output_offset  : [-128, 127]
1909  * @param[in]   input_quant_params    Input quantization parameters
1910  * @param[in]   output_quant_params   Output quantization parameters
1911  * @param[in]   input_dims            Input tensor dimensions
1912  * @param[in]   input_data            Pointer to input tensor
1913  * @param[in]   state_dims            State tensor dimensions
1914  * @param[in]   state_data            Pointer to state tensor
1915  * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
1916  * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
1917  * @param[in]   weights_time_dims     Weights (time) tensor dimensions
1918  * @param[in]   weights_time_data     Pointer to the weights (time) tensor
1919  * @param[in]   bias_dims             Bias tensor dimensions
1920  * @param[in]   bias_data             Pointer to bias tensor
1921  * @param[in]   output_dims           Output tensor dimensions
1922  * @param[out]  output_data           Pointer to the output tensor
1923  *
1924  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1925  *
1926  * @details
1927  *    1. Supported framework: TensorFlow Lite micro
1928  */
1929 arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
1930                                           const cmsis_nn_context *output_ctx,
1931                                           const cmsis_nn_svdf_params *svdf_params,
1932                                           const cmsis_nn_per_tensor_quant_params *input_quant_params,
1933                                           const cmsis_nn_per_tensor_quant_params *output_quant_params,
1934                                           const cmsis_nn_dims *input_dims,
1935                                           const int8_t *input_data,
1936                                           const cmsis_nn_dims *state_dims,
1937                                           int16_t *state_data,
1938                                           const cmsis_nn_dims *weights_feature_dims,
1939                                           const int8_t *weights_feature_data,
1940                                           const cmsis_nn_dims *weights_time_dims,
1941                                           const int16_t *weights_time_data,
1942                                           const cmsis_nn_dims *bias_dims,
1943                                           const int32_t *bias_data,
1944                                           const cmsis_nn_dims *output_dims,
1945                                           int8_t *output_data);
1946 
1947 /**
1948  * @defgroup LSTM LSTM Layer Functions
1949  *
1950  */
1951 
1952 /**
1953  * @brief LSTM unidirectional function with 8 bit input and output and 16 bit gate output
1954  * Peephole connections, projection, clipping, combined input/forget gate and layer normalization are not supported.
1955  *
1956  * @param[in]   scratch_buffers                 Struct containing scratch buffers
1957  *                                              Expected size for each scratch buffer is
1958  *                                              lstm_dims->num_batches * lstm_dims->num_outputs.
1959  * @param[in]   input_data                      Pointer to input data
1960  * @param[in]   lstm_dims                       LSTM input parameters related to dimensions
1961  * @param[in]   input_to_input_weights          Input to input weights
1962  * @param[in]   input_to_forget_weights         Input to forget weights
1963  * @param[in]   input_to_cell_weights           Input to cell weights
1964  * @param[in]   input_to_output_weights         Input to output weights
1965  * @param[in]   recurrent_to_input_weights      Recurrent to input weights
1966  * @param[in]   recurrent_to_forget_weights     Recurrent to forget weights
1967  * @param[in]   recurrent_to_cell_weights       Recurrent to cell weights
1968  * @param[in]   recurrent_to_output_weights     Recurrent to output weights
1969  * @param[in]   cell_to_input_weights           Cell to input weights. Not used.
1970  * @param[in]   cell_to_forget_weights          Cell to forget weights. Not used.
1971  * @param[in]   cell_to_output_weights          Cell to output weights. Not used.
1972  * @param[in]   projection_weights              Projection weights. Not used.
1973  * @param[in]   lstm                            LSTM parameters. See struct declaration
1974  * @param[in]   output_state                    Pointer to (recurrent) output state
1975  * @param[in]   cell_state                      Pointer to cell state
1976  * @param[in]   output_data                     Pointer to output state
1977  *
1978  * @note Following assumptions are done based on LSTM functionality as supported by
1979  *       Keras version 2.9.0 at the time of development. As stated here,
1980  *       https://github.com/tensorflow/community/blob/master/rfcs/20180920-unify-rnn-interface.md
1981  *       Keras's LSTMCell is equivalent to TensorFlow's BasicLSTMCell,
1982  *       which does not support peephole, clipping or projection.
1983  *       Layer normalization and combined input/forget gate are not supported either.
1984  *
1985  *       1 Input to input weight can not be nullptr. Otherwise nullptr for combined input/forgat gate.
1986  *       2 Cell weights are not used and should be nullptr. Otherwise needed for peephole connections.
1987  *       3 Projection weight is not used and should be nullpr. Otherwise needed for projection.
1988  *
1989  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1990  *
1991  * @details
1992  *    1. Supported framework: TensorFlow Lite micro
1993  *
1994  */
1995 arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratch_buffers,
1996                                                    const int8_t *input_data,
1997                                                    const cmsis_nn_lstm_dims *lstm_dims,
1998                                                    const int8_t *input_to_input_weights,
1999                                                    const int8_t *input_to_forget_weights,
2000                                                    const int8_t *input_to_cell_weights,
2001                                                    const int8_t *input_to_output_weights,
2002                                                    const int8_t *recurrent_to_input_weights,
2003                                                    const int8_t *recurrent_to_forget_weights,
2004                                                    const int8_t *recurrent_to_cell_weights,
2005                                                    const int8_t *recurrent_to_output_weights,
2006                                                    const int16_t *cell_to_input_weights,
2007                                                    const int16_t *cell_to_forget_weights,
2008                                                    const int16_t *cell_to_output_weights,
2009                                                    const int8_t *projection_weights,
2010                                                    const cmsis_nn_lstm_params *lstm,
2011                                                    int8_t *output_state,
2012                                                    int16_t *cell_state,
2013                                                    int8_t *output_data);
2014 
2015 #ifdef __cplusplus
2016 }
2017 #endif
2018 
2019 #endif
2020