1 /*
2  * SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_nnfunctions.h
22  * Description:  Public header file for CMSIS NN Library
23  *
24  * $Date:        04 November 2024
25  * $Revision:    V.18.0.0
26  *
27  * Target :  Arm(R) M-Profile Architecture
28  * -------------------------------------------------------------------- */
29 
30 /**
31  * @defgroup Public Public
32  * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
33  * TensorFlow Lite framework.
34  */
35 
36 #ifndef ARM_NNFUNCTIONS_H
37 #define ARM_NNFUNCTIONS_H
38 
39 #include "arm_nn_math_types.h"
40 #include "arm_nn_types.h"
41 
42 #define USE_INTRINSIC
43 
44 #ifdef __cplusplus
45 extern "C" {
46 #endif
47 
48 /**
49  * @defgroup NNConv Convolution Functions
50  *
51  * Collection of convolution, depthwise convolution functions and their variants.
52  *
53  * The convolution is implemented in 2 steps: im2col and General Matrix Multiplication(GEMM)
54  *
55  * im2col is a process of converting each patch of image data into
56  * a column. After im2col, the convolution is computed as matrix-matrix
57  * multiplication.
58  *
59  * To reduce the memory footprint, the im2col is performed partially.
60  * Each iteration, only a few column (i.e., patches) are generated followed
61  * by GEMM.
62  *
63  */
64 
65 /**
66  * @brief s4 convolution layer wrapper function with the main purpose to call the optimal kernel available in
67  *        cmsis-nn  to perform the convolution.
68  *
69  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
70  *                                arm_convolve_wrapper_s4_get_buffer_size will return the buffer_size if required.
71  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
72  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
73  *                                Range of conv_params->input_offset  : [-127, 128]
74  *                                Range of conv_params->output_offset : [-128, 127]
75  * @param[in]      quant_params   Per-channel quantization info.
76  *                                It contains the multiplier and shift values to be applied to each output channel
77  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
78  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
79  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
80  *                                spatial filter dimensions
81  * @param[in]      filter_data    Filter data pointer. Data type: int8 packed with 2x int4
82  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
83  * @param[in]      bias_data      Bias data pointer. Data type: int32
84  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
85  * @param[out]     output_data    Output data pointer. Data type: int8
86  *
87  * @return     The function returns either
88  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
89  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
90  *
91  */
92 arm_cmsis_nn_status arm_convolve_wrapper_s4(const cmsis_nn_context *ctx,
93                                             const cmsis_nn_conv_params *conv_params,
94                                             const cmsis_nn_per_channel_quant_params *quant_params,
95                                             const cmsis_nn_dims *input_dims,
96                                             const int8_t *input_data,
97                                             const cmsis_nn_dims *filter_dims,
98                                             const int8_t *filter_data,
99                                             const cmsis_nn_dims *bias_dims,
100                                             const int32_t *bias_data,
101                                             const cmsis_nn_dims *output_dims,
102                                             int8_t *output_data);
103 
104 /**
105  * @brief Get the required buffer size for arm_convolve_wrapper_s4
106  *
107  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
108  *                                Range of conv_params->input_offset  : [-127, 128]
109  *                                Range of conv_params->output_offset : [-128, 127]
110  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
111  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
112  *                                filter dimensions
113  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
114  *
115  * @return         The function returns required buffer size(bytes)
116  *
117  */
118 int32_t arm_convolve_wrapper_s4_get_buffer_size(const cmsis_nn_conv_params *conv_params,
119                                                 const cmsis_nn_dims *input_dims,
120                                                 const cmsis_nn_dims *filter_dims,
121                                                 const cmsis_nn_dims *output_dims);
122 
123 /**
124  * @brief Get the required buffer size for arm_convolve_wrapper_s4 for Arm(R) Helium Architecture case.
125  *        Refer to arm_convolve_wrapper_s4_get_buffer_size() for function argument details.
126  *
127  * @note       Intended for compilation on Host. If compiling for an Arm target, use
128  *             arm_convolve_wrapper_s4_get_buffer_size(). Currently this operator does not have an
129  *             mve implementation, so dsp will be used.
130  *
131  */
132 int32_t arm_convolve_wrapper_s4_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
133                                                     const cmsis_nn_dims *input_dims,
134                                                     const cmsis_nn_dims *filter_dims,
135                                                     const cmsis_nn_dims *output_dims);
136 
137 /**
138  * @brief Get the required buffer size for arm_convolve_wrapper_s4 for processors with DSP extension.
139  *        Refer to arm_convolve_wrapper_s4_get_buffer_size() for function argument details.
140  *
141  * @note       Intended for compilation on Host. If compiling for an Arm target, use
142  *             arm_convolve_wrapper_s4_get_buffer_size().
143  *
144  */
145 int32_t arm_convolve_wrapper_s4_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
146                                                     const cmsis_nn_dims *input_dims,
147                                                     const cmsis_nn_dims *filter_dims,
148                                                     const cmsis_nn_dims *output_dims);
149 
150 /**
151  * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
152  *        cmsis-nn  to perform the convolution.
153  *
154  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
155  *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required.
156  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
157  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
158  *                                Range of conv_params->input_offset  : [-127, 128]
159  *                                Range of conv_params->output_offset : [-128, 127]
160  * @param[in]      quant_params   Per-channel quantization info.
161  *                                It contains the multiplier and shift values to be applied to each output channel
162  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
163  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
164  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
165  *                                spatial filter dimensions
166  * @param[in]      filter_data    Filter data pointer. Data type: int8
167  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
168  * @param[in]      bias_data      Bias data pointer. Data type: int32
169  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
170  * @param[out]     output_data    Output data pointer. Data type: int8
171  *
172  * @return     The function returns either
173  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
174  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
175  *
176  */
177 arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
178                                             const cmsis_nn_conv_params *conv_params,
179                                             const cmsis_nn_per_channel_quant_params *quant_params,
180                                             const cmsis_nn_dims *input_dims,
181                                             const int8_t *input_data,
182                                             const cmsis_nn_dims *filter_dims,
183                                             const int8_t *filter_data,
184                                             const cmsis_nn_dims *bias_dims,
185                                             const int32_t *bias_data,
186                                             const cmsis_nn_dims *output_dims,
187                                             int8_t *output_data);
188 
189 /**
190  * @brief Get the required buffer size for arm_convolve_wrapper_s8
191  *
192  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
193  *                                Range of conv_params->input_offset  : [-127, 128]
194  *                                Range of conv_params->output_offset : [-128, 127]
195  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
196  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
197  *                                filter dimensions
198  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
199  *
200  * @return         The function returns required buffer size(bytes)
201  *
202  */
203 int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
204                                                 const cmsis_nn_dims *input_dims,
205                                                 const cmsis_nn_dims *filter_dims,
206                                                 const cmsis_nn_dims *output_dims);
207 
208 /**
209  * @brief Get the required buffer size for arm_convolve_wrapper_s8 for Arm(R) Helium Architecture case.
210  *        Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
211  *
212  * @note       Intended for compilation on Host. If compiling for an Arm target, use
213  *             arm_convolve_wrapper_s8_get_buffer_size().
214  *
215  */
216 int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
217                                                     const cmsis_nn_dims *input_dims,
218                                                     const cmsis_nn_dims *filter_dims,
219                                                     const cmsis_nn_dims *output_dims);
220 
221 /**
222  * @brief Get the required buffer size for arm_convolve_wrapper_s8 for processors with DSP extension.
223  *        Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
224  *
225  * @note       Intended for compilation on Host. If compiling for an Arm target, use
226  *             arm_convolve_wrapper_s8_get_buffer_size().
227  *
228  */
229 int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
230                                                     const cmsis_nn_dims *input_dims,
231                                                     const cmsis_nn_dims *filter_dims,
232                                                     const cmsis_nn_dims *output_dims);
233 
234 /**
235  * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
236  *        cmsis-nn to perform the convolution.
237  *
238  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
239  *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required
240  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
241  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
242  *                                conv_params->input_offset  : Not used
243  *                                conv_params->output_offset : Not used
244  * @param[in]      quant_params   Per-channel quantization info.
245  *                                It contains the multiplier and shift values to be applied to each output channel
246  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
247  * @param[in]      input_data     Input (activation) data pointer. Data type: int16
248  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
249  *                                spatial filter dimensions
250  * @param[in]      filter_data    Filter data pointer. Data type: int8
251  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
252  * @param[in]      bias_data      Struct with optional bias data pointer. Bias data type can be int64 or int32 depending
253  *                                flag in struct.
254  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
255  * @param[out]     output_data    Output data pointer. Data type: int16
256  *
257  * @return     The function returns either
258  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
259  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
260  *
261  */
262 arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
263                                              const cmsis_nn_conv_params *conv_params,
264                                              const cmsis_nn_per_channel_quant_params *quant_params,
265                                              const cmsis_nn_dims *input_dims,
266                                              const int16_t *input_data,
267                                              const cmsis_nn_dims *filter_dims,
268                                              const int8_t *filter_data,
269                                              const cmsis_nn_dims *bias_dims,
270                                              const cmsis_nn_bias_data *bias_data,
271                                              const cmsis_nn_dims *output_dims,
272                                              int16_t *output_data);
273 
274 /**
275  * @brief Get the required buffer size for arm_convolve_wrapper_s16.
276  *
277  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
278  *                                conv_params->input_offset  : Not used
279  *                                conv_params->output_offset : Not used
280  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
281  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
282  *                                filter dimensions
283  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
284  *
285  * @return         The function returns required buffer size(bytes)
286  *
287  */
288 int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
289                                                  const cmsis_nn_dims *input_dims,
290                                                  const cmsis_nn_dims *filter_dims,
291                                                  const cmsis_nn_dims *output_dims);
292 
293 /**
294  * @brief Get the required buffer size for arm_convolve_wrapper_s16 for for processors with DSP extension.
295  *        Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
296  *
297  * @note       Intended for compilation on Host. If compiling for an Arm target, use
298  *             arm_convolve_wrapper_s16_get_buffer_size().
299  *
300  */
301 int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
302                                                      const cmsis_nn_dims *input_dims,
303                                                      const cmsis_nn_dims *filter_dims,
304                                                      const cmsis_nn_dims *output_dims);
305 
306 /**
307  * @brief Get the required buffer size for arm_convolve_wrapper_s16 for Arm(R) Helium Architecture case.
308  *        Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
309  *
310  * @note       Intended for compilation on Host. If compiling for an Arm target, use
311  *             arm_convolve_wrapper_s16_get_buffer_size().
312  *
313  */
314 int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
315                                                      const cmsis_nn_dims *input_dims,
316                                                      const cmsis_nn_dims *filter_dims,
317                                                      const cmsis_nn_dims *output_dims);
318 
319 /**
320  * @brief Basic s4 convolution function
321  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
322  *                                arm_convolve_s4_get_buffer_size will return the buffer_size if required.
323  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
324  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
325  *                                Range of conv_params->input_offset  : [-127, 128]
326  *                                Range of conv_params->output_offset : [-128, 127]
327  * @param[in]      quant_params   Per-channel quantization info.
328  *                                It contains the multiplier and shift values to be applied to each output channel
329  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
330  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
331  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
332  *                                spatial filter dimensions
333  * @param[in]      filter_data    Packed Filter data pointer. Data type: int8 packed with 2x int4
334  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
335  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
336  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
337  * @param[out]     output_data    Output data pointer. Data type: int8
338 
339  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
340  *
341  * @details
342  *    1. Supported framework: TensorFlow Lite micro
343  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
344  *
345  */
346 arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx,
347                                     const cmsis_nn_conv_params *conv_params,
348                                     const cmsis_nn_per_channel_quant_params *quant_params,
349                                     const cmsis_nn_dims *input_dims,
350                                     const int8_t *input_data,
351                                     const cmsis_nn_dims *filter_dims,
352                                     const int8_t *filter_data,
353                                     const cmsis_nn_dims *bias_dims,
354                                     const int32_t *bias_data,
355                                     const cmsis_nn_dims *output_dims,
356                                     int8_t *output_data);
357 
358 /**
359  * @brief Basic s4 convolution function with a requirement of even number of kernels.
360  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
361  *                                arm_convolve_s4_get_buffer_size will return the buffer_size if required.
362  *                                The caller is expected to clear the buffer ,if applicable, for security reasons.
363  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
364  *                                Range of conv_params->input_offset  : [-127, 128]
365  *                                Range of conv_params->output_offset : [-128, 127]
366  * @param[in]      quant_params   Per-channel quantization info.
367  *                                It contains the multiplier and shift values to be applied to each output channel
368  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
369  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
370  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
371  *                                spatial filter dimensions. Note the product must be even.
372  * @param[in]      filter_data    Packed Filter data pointer. Data type: int8 packed with 2x int4
373  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
374  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
375  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
376  * @param[out]     output_data    Output data pointer. Data type: int8
377  *
378  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or
379  *                                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if incorrect arguments or
380  *                                  <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if not for MVE
381  *
382  * @details
383  *    1. Supported framework: TensorFlow Lite micro
384  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
385  *
386  */
387 arm_cmsis_nn_status arm_convolve_even_s4(const cmsis_nn_context *ctx,
388                                          const cmsis_nn_conv_params *conv_params,
389                                          const cmsis_nn_per_channel_quant_params *quant_params,
390                                          const cmsis_nn_dims *input_dims,
391                                          const int8_t *input_data,
392                                          const cmsis_nn_dims *filter_dims,
393                                          const int8_t *filter_data,
394                                          const cmsis_nn_dims *bias_dims,
395                                          const int32_t *bias_data,
396                                          const cmsis_nn_dims *output_dims,
397                                          int8_t *output_data);
398 
399 /**
400  * @brief Basic s8 convolution function
401  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
402  *                                arm_convolve_s8_get_buffer_size will return the buffer_size if required.
403  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
404  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
405  *                                Range of conv_params->input_offset  : [-127, 128]
406  *                                Range of conv_params->output_offset : [-128, 127]
407  * @param[in]      quant_params   Per-channel quantization info.
408  *                                It contains the multiplier and shift values to be applied to each output channel
409  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
410  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
411  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, CK] where HK, WK and CK are the
412  *                                spatial filter dimensions. CK != C_IN is used for grouped convolution, in which
413  *                                case the required conditions are C_IN = N * CK and C_OUT = N * M for N groups of
414  *                                size M.
415  * @param[in]      filter_data    Filter data pointer. Data type: int8
416  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
417  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
418  * @param[in]      upscale_dims   Inserts zeroes to upscale the input in h/w dimensions if set to 2. This is used for
419  * tranposed convolution.
420  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
421  * @param[out]     output_data    Output data pointer. Data type: int8
422  *
423  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or
424  *                                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if incorrect arguments or
425  *                                  <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code>
426  *
427  * @details
428  *    1. Supported framework: TensorFlow Lite micro
429  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
430  *
431  */
432 arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx,
433                                     const cmsis_nn_conv_params *conv_params,
434                                     const cmsis_nn_per_channel_quant_params *quant_params,
435                                     const cmsis_nn_dims *input_dims,
436                                     const int8_t *input_data,
437                                     const cmsis_nn_dims *filter_dims,
438                                     const int8_t *filter_data,
439                                     const cmsis_nn_dims *bias_dims,
440                                     const int32_t *bias_data,
441                                     const cmsis_nn_dims *upscale_dims,
442                                     const cmsis_nn_dims *output_dims,
443                                     int8_t *output_data);
444 
445 /**
446  * @brief Get the required buffer size for s4 convolution function
447  *
448  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
449  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
450  * are the spatial filter dimensions
451  * @return          The function returns required buffer size(bytes)
452  *
453  */
454 int32_t arm_convolve_s4_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
455 
456 /**
457  * @brief Get the required buffer size for s8 convolution function
458  *
459  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
460  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
461  * are the spatial filter dimensions
462  * @return          The function returns required buffer size(bytes)
463  *
464  */
465 int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
466 
467 /**
468  * @brief Wrapper to select optimal transposed convolution algorithm depending on parameters.
469  * @param[in, out] ctx                   Function context that contains the additional buffer if required by the
470  *                                       function.
471  *                                       arm_transpose_conv_s8_get_buffer_size will return the buffer_size if required.
472  *                                       The caller is expected to clear the buffer, if applicable, for security
473  reasons.
474  * @param[in, out] output_ctx            Temporary scratch buffer.
475  *                                       The size required size is: output width * output height * output channel * 4
476  *                                       The caller is expected to clear the buffer, if applicable, for security
477  *                                        reasons.
478  * @param[in]      transpose_conv_params Convolution parameters (e.g. strides, dilations, pads,...).
479  *                                       Range of transpose_conv_params->input_offset  : [-127, 128]
480  *                                       Range of transpose_conv_params->output_offset : [-128, 127]
481  * @param[in]      quant_params          Per-channel quantization info.
482  *                                       It contains the multiplier and shift values to be applied to each out channel.
483  * @param[in]      input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
484  * @param[in]      input_data            Input (activation) data pointer. Data type: int8
485  * @param[in]      filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
486  *                                       spatial filter dimensions
487  * @param[in]      filter_data           Filter data pointer. Data type: int8
488  * @param[in]      bias_dims             Bias tensor dimensions. Format: [C_OUT]
489  * @param[in]      bias_data             Optional bias data pointer. Data type: int32
490  * @param[in]      output_dims           Output tensor dimensions. Format: [N, H, W, C_OUT]
491  * @param[out]     output_data           Output data pointer. Data type: int8
492 
493  * @return     The function returns either
494  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
495  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
496  *
497  * @details
498  *    1. Supported framework: TensorFlow Lite micro
499  *    2. Additional memory is required for optimization. Refer to arguments 'ctx' and 'output_ctx' for details.
500  *
501  */
502 arm_cmsis_nn_status arm_transpose_conv_wrapper_s8(const cmsis_nn_context *ctx,
503                                                   const cmsis_nn_context *output_ctx,
504                                                   const cmsis_nn_transpose_conv_params *transpose_conv_params,
505                                                   const cmsis_nn_per_channel_quant_params *quant_params,
506                                                   const cmsis_nn_dims *input_dims,
507                                                   const int8_t *input_data,
508                                                   const cmsis_nn_dims *filter_dims,
509                                                   const int8_t *filter_data,
510                                                   const cmsis_nn_dims *bias_dims,
511                                                   const int32_t *bias_data,
512                                                   const cmsis_nn_dims *output_dims,
513                                                   int8_t *output_data);
514 
515 /**
516  * @brief Basic s8 transpose convolution function
517  * @param[in, out] ctx                   Function context that contains the additional buffer if required by the
518  *                                       function.
519  *                                       arm_transpose_conv_s8_get_buffer_size will return the buffer_size if required.
520  *                                       The caller is expected to clear the buffer, if applicable, for security
521  reasons.
522  * @param[in, out] output_ctx            Temporary scratch buffer.
523  *                                       The size required size is: output width * output height * output channel * 4
524  *                                       The caller is expected to clear the buffer, if applicable, for security
525  *                                        reasons.
526  * @param[in]      transpose_conv_params Convolution parameters (e.g. strides, dilations, pads,...).
527  *                                       Range of transpose_conv_params->input_offset  : [-127, 128]
528  *                                       Range of transpose_conv_params->output_offset : [-128, 127]
529  * @param[in]      quant_params          Per-channel quantization info.
530  *                                       It contains the multiplier and shift values to be applied to each out channel.
531  * @param[in]      input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
532  * @param[in]      input_data            Input (activation) data pointer. Data type: int8
533  * @param[in]      filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
534  *                                       spatial filter dimensions
535  * @param[in]      filter_data           Filter data pointer. Data type: int8
536  * @param[in]      bias_dims             Bias tensor dimensions. Format: [C_OUT]
537  * @param[in]      bias_data             Optional bias data pointer. Data type: int32
538  * @param[in]      output_dims           Output tensor dimensions. Format: [N, H, W, C_OUT]
539  * @param[out]     output_data           Output data pointer. Data type: int8
540 
541  * @return     The function returns either
542  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
543  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
544  *
545  * @details
546  *    1. Supported framework: TensorFlow Lite micro
547  *    2. Additional memory is required for optimization. Refer to arguments 'ctx' and 'output_ctx' for details.
548  *
549  */
550 arm_cmsis_nn_status arm_transpose_conv_s8(const cmsis_nn_context *ctx,
551                                           const cmsis_nn_context *output_ctx,
552                                           const cmsis_nn_transpose_conv_params *transpose_conv_params,
553                                           const cmsis_nn_per_channel_quant_params *quant_params,
554                                           const cmsis_nn_dims *input_dims,
555                                           const int8_t *input_data,
556                                           const cmsis_nn_dims *filter_dims,
557                                           const int8_t *filter_data,
558                                           const cmsis_nn_dims *bias_dims,
559                                           const int32_t *bias_data,
560                                           const cmsis_nn_dims *output_dims,
561                                           int8_t *output_data);
562 
563 /**
564  * @brief Get the required buffer size for ctx in s8 transpose conv function
565  *
566  * @param[in]       transposed_conv_params  Transposed convolution parameters
567  * @param[in]       input_dims              Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
568  * @param[in]       filter_dims             Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
569  *                                          are the spatial filter dimensions
570  * @param[in]       out_dims                Output tensor dimensions. Format: [N, H, W, C_OUT]
571  * @return          The function returns required buffer size(bytes)
572  *
573  */
574 int32_t arm_transpose_conv_s8_get_buffer_size(const cmsis_nn_transpose_conv_params *transposed_conv_params,
575                                               const cmsis_nn_dims *input_dims,
576                                               const cmsis_nn_dims *filter_dims,
577                                               const cmsis_nn_dims *out_dims);
578 
579 /**
580  * @brief Get the required buffer size for output_ctx in s8 transpose conv function
581  *
582  * @param[in]       transposed_conv_params  Transposed convolution parameters
583  * @param[in]       input_dims              Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
584  * @param[in]       filter_dims             Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
585  *                                        are the spatial filter dimensions
586  * @return          The function returns required buffer size(bytes)
587  *
588  */
589 int32_t arm_transpose_conv_s8_get_reverse_conv_buffer_size(const cmsis_nn_transpose_conv_params *transposed_conv_params,
590                                                            const cmsis_nn_dims *input_dims,
591                                                            const cmsis_nn_dims *filter_dims);
592 
593 /**
594  * @brief Get size of additional buffer required by arm_transpose_conv_s8() for processors with DSP extension.
595  *        Refer to arm_transpose_conv_s8_get_buffer_size() for function argument details.
596  *
597  * @note       Intended for compilation on Host. If compiling for an Arm target, use
598  *             arm_transpose_conv_s8_get_buffer_size().
599  *
600  */
601 int32_t arm_transpose_conv_s8_get_buffer_size_dsp(const cmsis_nn_dims *input_dims,
602                                                   const cmsis_nn_dims *filter_dims,
603                                                   const cmsis_nn_dims *out_dims);
604 
605 /**
606  * @brief Get size of additional buffer required by arm_transpose_conv_s8() for Arm(R) Helium Architecture case.
607  *        Refer to arm_transpose_conv_s8_get_buffer_size() for function argument details.
608  *
609  * @note       Intended for compilation on Host. If compiling for an Arm target, use
610  *             arm_transpose_conv_s8_get_buffer_size().
611  *
612  */
613 int32_t arm_transpose_conv_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dims,
614                                                   const cmsis_nn_dims *filter_dims,
615                                                   const cmsis_nn_dims *out_dims);
616 
617 /**
618  * @brief Basic s16 convolution function
619  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
620  *                                arm_convolve_s16_get_buffer_size will return the buffer_size if required.
621  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
622  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
623  *                                conv_params->input_offset  : Not used
624  *                                conv_params->output_offset : Not used
625  * @param[in]      quant_params   Per-channel quantization info.
626  *                                It contains the multiplier and shift values to be applied to each output channel
627  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
628  * @param[in]      input_data     Input (activation) data pointer. Data type: int16
629  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
630  *                                spatial filter dimensions
631  * @param[in]      filter_data    Filter data pointer. Data type: int8
632  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
633  * @param[in]      bias_data      Struct with optional bias data pointer. Bias data type can be int64 or int32 depending
634  *                                flag in struct.
635  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
636  * @param[out]     output_data    Output data pointer. Data type: int16
637  *
638  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or
639  *                                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if incorrect arguments or
640  *                                  <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code>
641  *
642  * @details
643  *    1. Supported framework: TensorFlow Lite micro
644  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
645  *
646  */
647 arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx,
648                                      const cmsis_nn_conv_params *conv_params,
649                                      const cmsis_nn_per_channel_quant_params *quant_params,
650                                      const cmsis_nn_dims *input_dims,
651                                      const int16_t *input_data,
652                                      const cmsis_nn_dims *filter_dims,
653                                      const int8_t *filter_data,
654                                      const cmsis_nn_dims *bias_dims,
655                                      const cmsis_nn_bias_data *bias_data,
656                                      const cmsis_nn_dims *output_dims,
657                                      int16_t *output_data);
658 
659 /**
660  * @brief Get the required buffer size for s16 convolution function
661  *
662  * @param[in]       input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
663  * @param[in]       filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
664  *                                are the spatial filter dimensions
665  * @return          The function returns required buffer size(bytes)
666  *
667  */
668 int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
669 
670 /**
671  * @brief Fast s4 version for 1x1 convolution (non-square shape)
672  *
673  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
674  *                               arm_convolve_1x1_s4_fast_get_buffer_size will return the buffer_size if required.
675  *                               The caller is expected to clear the buffer ,if applicable, for security reasons.
676  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
677  *                               Range of conv_params->input_offset  : [-127, 128]
678  *                               Range of conv_params->output_offset : [-128, 127]
679  * @param[in]      quant_params  Per-channel quantization info.
680  *                               It contains the multiplier and shift values to be applied to each output channel
681  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
682  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
683  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
684  * @param[in]      filter_data   Filter data pointer. Data type: int8 packed with 2x int4
685  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
686  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
687  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
688  * @param[out]     output_data   Output data pointer. Data type: int8
689  *
690  * @return     The function returns either
691  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
692  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
693  *
694  * @details
695  *   - Supported framework : TensorFlow Lite Micro
696  *   - The following constrains on the arguments apply
697  *      -# conv_params->padding.w = conv_params->padding.h = 0
698  *      -# conv_params->stride.w = conv_params->stride.h = 1
699  *
700  */
701 arm_cmsis_nn_status arm_convolve_1x1_s4_fast(const cmsis_nn_context *ctx,
702                                              const cmsis_nn_conv_params *conv_params,
703                                              const cmsis_nn_per_channel_quant_params *quant_params,
704                                              const cmsis_nn_dims *input_dims,
705                                              const int8_t *input_data,
706                                              const cmsis_nn_dims *filter_dims,
707                                              const int8_t *filter_data,
708                                              const cmsis_nn_dims *bias_dims,
709                                              const int32_t *bias_data,
710                                              const cmsis_nn_dims *output_dims,
711                                              int8_t *output_data);
712 
713 /**
714  * @brief s4 version for 1x1 convolution with support for non-unity stride values
715  *
716  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
717  *                               None is required by this function.
718  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
719  *                               Range of conv_params->input_offset  : [-127, 128]
720  *                               Range of conv_params->output_offset : [-128, 127]
721  * @param[in]      quant_params  Per-channel quantization info.
722  *                               It contains the multiplier and shift values to be applied to each output channel
723  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
724  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
725  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
726  * @param[in]      filter_data   Filter data pointer. Data type: int8 packed with 2x int4
727  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
728  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
729  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
730  * @param[out]     output_data   Output data pointer. Data type: int8
731  *
732  * @return     The function returns either
733  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
734  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
735  * @details
736  *   - Supported framework : TensorFlow Lite Micro
737  *   - The following constrains on the arguments apply
738  *      -# conv_params->padding.w = conv_params->padding.h = 0
739  *
740  */
741 arm_cmsis_nn_status arm_convolve_1x1_s4(const cmsis_nn_context *ctx,
742                                         const cmsis_nn_conv_params *conv_params,
743                                         const cmsis_nn_per_channel_quant_params *quant_params,
744                                         const cmsis_nn_dims *input_dims,
745                                         const int8_t *input_data,
746                                         const cmsis_nn_dims *filter_dims,
747                                         const int8_t *filter_data,
748                                         const cmsis_nn_dims *bias_dims,
749                                         const int32_t *bias_data,
750                                         const cmsis_nn_dims *output_dims,
751                                         int8_t *output_data);
752 
753 /**
754  * @brief Fast s8 version for 1x1 convolution (non-square shape)
755  *
756  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
757  *                               arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required.
758  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
759  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
760  *                               Range of conv_params->input_offset  : [-127, 128]
761  *                               Range of conv_params->output_offset : [-128, 127]
762  * @param[in]      quant_params  Per-channel quantization info.
763  *                               It contains the multiplier and shift values to be applied to each output channel
764  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
765  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
766  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
767  * @param[in]      filter_data   Filter data pointer. Data type: int8
768  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
769  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
770  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
771  * @param[out]     output_data   Output data pointer. Data type: int8
772  *
773  * @return     The function returns either
774  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
775  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
776  *
777  * @details
778  *   - Supported framework : TensorFlow Lite Micro
779  *   - The following constrains on the arguments apply
780  *      -# conv_params->padding.w = conv_params->padding.h = 0
781  *      -# conv_params->stride.w = conv_params->stride.h = 1
782  *
783  */
784 arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
785                                              const cmsis_nn_conv_params *conv_params,
786                                              const cmsis_nn_per_channel_quant_params *quant_params,
787                                              const cmsis_nn_dims *input_dims,
788                                              const int8_t *input_data,
789                                              const cmsis_nn_dims *filter_dims,
790                                              const int8_t *filter_data,
791                                              const cmsis_nn_dims *bias_dims,
792                                              const int32_t *bias_data,
793                                              const cmsis_nn_dims *output_dims,
794                                              int8_t *output_data);
795 
796 /**
797  * @brief Get the required buffer size for arm_convolve_1x1_s4_fast
798  *
799  * @param[in]       input_dims            Input (activation) dimensions
800  * @return          The function returns the required buffer size in bytes
801  *
802  */
803 int32_t arm_convolve_1x1_s4_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
804 
805 /**
806  * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
807  *
808  * @param[in]       input_dims            Input (activation) dimensions
809  * @return          The function returns the required buffer size in bytes
810  *
811  */
812 int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
813 
814 /**
815  * @brief s8 version for 1x1 convolution with support for non-unity stride values
816  *
817  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
818  *                               None is required by this function.
819  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
820  *                               Range of conv_params->input_offset  : [-127, 128]
821  *                               Range of conv_params->output_offset : [-128, 127]
822  * @param[in]      quant_params  Per-channel quantization info.
823  *                               It contains the multiplier and shift values to be applied to each output channel
824  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
825  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
826  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
827  * @param[in]      filter_data   Filter data pointer. Data type: int8
828  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
829  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
830  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
831  * @param[out]     output_data   Output data pointer. Data type: int8
832  *
833  * @return     The function returns either
834  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
835  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
836  * @details
837  *   - Supported framework : TensorFlow Lite Micro
838  *   - The following constrains on the arguments apply
839  *      -# conv_params->padding.w = conv_params->padding.h = 0
840  *
841  */
842 arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx,
843                                         const cmsis_nn_conv_params *conv_params,
844                                         const cmsis_nn_per_channel_quant_params *quant_params,
845                                         const cmsis_nn_dims *input_dims,
846                                         const int8_t *input_data,
847                                         const cmsis_nn_dims *filter_dims,
848                                         const int8_t *filter_data,
849                                         const cmsis_nn_dims *bias_dims,
850                                         const int32_t *bias_data,
851                                         const cmsis_nn_dims *output_dims,
852                                         int8_t *output_data);
853 
854 /**
855  * @brief 1xn convolution
856  *
857  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
858  *                               arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
859  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
860  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
861  *                               Range of conv_params->input_offset  : [-127, 128]
862  *                               Range of conv_params->output_offset : [-128, 127]
863  * @param[in]      quant_params  Per-channel quantization info.
864  *                               It contains the multiplier and shift values to be applied to each output channel
865  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
866  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
867  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
868  *                               spatial filter dimension
869  * @param[in]      filter_data   Filter data pointer. Data type: int8
870  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
871  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
872  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
873  * @param[out]     output_data   Output data pointer. Data type: int8
874  *
875  * @return     The function returns either
876  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
877  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
878  *
879  * @details
880  *   - Supported framework : TensorFlow Lite Micro
881  *   - The following constrains on the arguments apply
882  *      -# input_dims->n equals 1
883  *      -# ouput_dims->w is a multiple of 4
884  *      -# Explicit constraints(since it is for 1xN convolution)
885  *      -## input_dims->h equals 1
886  *      -## output_dims->h equals 1
887  *      -## filter_dims->h equals 1
888  *@todo  Remove constraint on output_dims->w to make the function generic.
889  *
890  */
891 arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
892                                           const cmsis_nn_conv_params *conv_params,
893                                           const cmsis_nn_per_channel_quant_params *quant_params,
894                                           const cmsis_nn_dims *input_dims,
895                                           const int8_t *input_data,
896                                           const cmsis_nn_dims *filter_dims,
897                                           const int8_t *filter_data,
898                                           const cmsis_nn_dims *bias_dims,
899                                           const int32_t *bias_data,
900                                           const cmsis_nn_dims *output_dims,
901                                           int8_t *output_data);
902 
903 /**
904  * @brief 1xn convolution for s4 weights
905  *
906  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
907  *                               arm_convolve_1_x_n_s4_get_buffer_size will return the buffer_size if required
908  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
909  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
910  *                               Range of conv_params->input_offset  : [-127, 128]
911  *                               Range of conv_params->output_offset : [-128, 127]
912  * @param[in]      quant_params  Per-channel quantization info.
913  *                               It contains the multiplier and shift values to be applied to each output channel
914  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
915  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
916  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
917  *                               spatial filter dimension
918  * @param[in]      filter_data   Filter data pointer. Data type: int8 as packed int4
919  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
920  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
921  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
922  * @param[out]     output_data   Output data pointer. Data type: int8
923  *
924  * @return     The function returns either
925  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
926  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
927  *
928  * @details
929  *   - Supported framework : TensorFlow Lite Micro
930  *   - The following constrains on the arguments apply
931  *      -# stride.w * input_dims->c is a multiple of 4
932  *      -# Explicit constraints(since it is for 1xN convolution)
933  *      -## input_dims->h equals 1
934  *      -## output_dims->h equals 1
935  *      -## filter_dims->h equals 1
936  *@todo  Remove constraint on output_dims->w to make the function generic.
937  *
938  */
939 arm_cmsis_nn_status arm_convolve_1_x_n_s4(const cmsis_nn_context *ctx,
940                                           const cmsis_nn_conv_params *conv_params,
941                                           const cmsis_nn_per_channel_quant_params *quant_params,
942                                           const cmsis_nn_dims *input_dims,
943                                           const int8_t *input_data,
944                                           const cmsis_nn_dims *filter_dims,
945                                           const int8_t *filter_data,
946                                           const cmsis_nn_dims *bias_dims,
947                                           const int32_t *bias_data,
948                                           const cmsis_nn_dims *output_dims,
949                                           int8_t *output_data);
950 
951 /**
952  * @brief Get the required additional buffer size for 1xn convolution
953  *
954  * @param[in]       conv_params           Convolution parameters (e.g. strides, dilations, pads,...).
955  *                                        Range of conv_params->input_offset  : [-127, 128]
956  *                                        Range of conv_params->output_offset : [-128, 127]
957  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
958  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
959  *                                        horizontal spatial filter dimension
960  * @param[in]       output_dims           Output tensor dimensions. Format: [N, H, W, C_OUT]
961  *
962  * @return          The function returns required buffer size(bytes)
963  *
964  */
965 int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
966                                               const cmsis_nn_dims *input_dims,
967                                               const cmsis_nn_dims *filter_dims,
968                                               const cmsis_nn_dims *output_dims);
969 
970 /**
971  * @brief Get the required additional buffer size for 1xn convolution
972  *
973  * @param[in]       conv_params           Convolution parameters (e.g. strides, dilations, pads,...).
974  *                                        Range of conv_params->input_offset  : [-127, 128]
975  *                                        Range of conv_params->output_offset : [-128, 127]
976  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
977  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
978  *                                        horizontal spatial filter dimension
979  * @param[in]       output_dims           Output tensor dimensions. Format: [N, H, W, C_OUT]
980  *
981  * @return          The function returns required buffer size(bytes)
982  *
983  */
984 int32_t arm_convolve_1_x_n_s4_get_buffer_size(const cmsis_nn_conv_params *conv_params,
985                                               const cmsis_nn_dims *input_dims,
986                                               const cmsis_nn_dims *filter_dims,
987                                               const cmsis_nn_dims *output_dims);
988 
989 /**
990  * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
991  *
992  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
993  *                                 definition file to see if an additional buffer is required.
994  *                                 Optional function {API}_get_buffer_size() provides the buffer
995  *                                 size if required.
996  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
997  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
998  *                                 dw_conv_params->dilation is not used.
999  *                                 Range of dw_conv_params->input_offset : [-127, 128]
1000  *                                 Range of dw_conv_params->output_offset : [-128, 127]
1001  * @param[in]      quant_params    Per-channel quantization info.
1002  *                                 It contains the multiplier and shift values to be applied to each
1003  *                                 output channel
1004  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1005  *                                 Batch argument N is not used and assumed to be 1.
1006  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
1007  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1008  * @param[in]      filter_data     Filter data pointer. Data type: int8
1009  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1010  * @param[in]      bias_data       Bias data pointer. Data type: int32
1011  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1012  * @param[in, out] output_data     Output data pointer. Data type: int8
1013  * @return     The function returns
1014  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
1015  *
1016  * @details
1017  *    - Supported framework: TensorFlow Lite
1018  *    - Picks one of the the following functions
1019  *        -# arm_depthwise_conv_s8()
1020  *        -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
1021  *        -# arm_depthwise_conv_s8_opt()
1022  *    - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
1023  * boundary.
1024  */
1025 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
1026                                                   const cmsis_nn_dw_conv_params *dw_conv_params,
1027                                                   const cmsis_nn_per_channel_quant_params *quant_params,
1028                                                   const cmsis_nn_dims *input_dims,
1029                                                   const int8_t *input_data,
1030                                                   const cmsis_nn_dims *filter_dims,
1031                                                   const int8_t *filter_data,
1032                                                   const cmsis_nn_dims *bias_dims,
1033                                                   const int32_t *bias_data,
1034                                                   const cmsis_nn_dims *output_dims,
1035                                                   int8_t *output_data);
1036 
1037 /**
1038  * @brief Wrapper function to pick the right optimized s4 depthwise convolution function
1039  *
1040  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1041  *                                 definition file to see if an additional buffer is required.
1042  *                                 Optional function {API}_get_buffer_size() provides the buffer
1043  *                                 size if required.
1044  *                                 The caller is expected to clear the buffer ,if applicable, for security reasons.
1045  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1046  *                                 dw_conv_params->dilation is not used.
1047  *                                 Range of dw_conv_params->input_offset : [-127, 128]
1048  *                                 Range of dw_conv_params->output_offset : [-128, 127]
1049  * @param[in]      quant_params    Per-channel quantization info.
1050  *                                 It contains the multiplier and shift values to be applied to each
1051  *                                 output channel
1052  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1053  *                                 Batch argument N is not used and assumed to be 1.
1054  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
1055  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1056  * @param[in]      filter_data     Filter data pointer. Data type: int8_t packed 4-bit weights, e.g four sequential
1057  *                                 weights [0x1, 0x2, 0x3, 0x4]  packed as [0x21, 0x43].
1058  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1059  * @param[in]      bias_data       Bias data pointer. Data type: int32
1060  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1061  * @param[in, out] output_data     Output data pointer. Data type: int8
1062  * @return     The function returns
1063  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
1064  *
1065  * @details
1066  *    - Supported framework: TensorFlow Lite
1067  */
1068 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s4(const cmsis_nn_context *ctx,
1069                                                   const cmsis_nn_dw_conv_params *dw_conv_params,
1070                                                   const cmsis_nn_per_channel_quant_params *quant_params,
1071                                                   const cmsis_nn_dims *input_dims,
1072                                                   const int8_t *input_data,
1073                                                   const cmsis_nn_dims *filter_dims,
1074                                                   const int8_t *filter_data,
1075                                                   const cmsis_nn_dims *bias_dims,
1076                                                   const int32_t *bias_data,
1077                                                   const cmsis_nn_dims *output_dims,
1078                                                   int8_t *output_data);
1079 
1080 /**
1081  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
1082  *
1083  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1084  *                                 Range of dw_conv_params->input_offset : [-127, 128]
1085  *                                 Range of dw_conv_params->input_offset : [-128, 127]
1086  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1087  *                                 Batch argument N is not used and assumed to be 1.
1088  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1089  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1090  * @return                         Size of additional memory required for optimizations in bytes.
1091  *
1092  */
1093 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
1094                                                       const cmsis_nn_dims *input_dims,
1095                                                       const cmsis_nn_dims *filter_dims,
1096                                                       const cmsis_nn_dims *output_dims);
1097 
1098 /**
1099  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for processors with DSP extension.
1100  *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
1101  *
1102  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1103  *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
1104  *
1105  */
1106 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
1107                                                           const cmsis_nn_dims *input_dims,
1108                                                           const cmsis_nn_dims *filter_dims,
1109                                                           const cmsis_nn_dims *output_dims);
1110 
1111 /**
1112  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for Arm(R) Helium Architecture case.
1113  *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
1114  *
1115  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1116  *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
1117  *
1118  */
1119 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
1120                                                           const cmsis_nn_dims *input_dims,
1121                                                           const cmsis_nn_dims *filter_dims,
1122                                                           const cmsis_nn_dims *output_dims);
1123 
1124 /**
1125  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4()
1126  *
1127  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1128  *                                 Range of dw_conv_params->input_offset : [-127, 128]
1129  *                                 Range of dw_conv_params->input_offset : [-128, 127]
1130  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1131  *                                 Batch argument N is not used and assumed to be 1.
1132  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1133  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1134  * @return                         Size of additional memory required for optimizations in bytes.
1135  *
1136  */
1137 int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
1138                                                       const cmsis_nn_dims *input_dims,
1139                                                       const cmsis_nn_dims *filter_dims,
1140                                                       const cmsis_nn_dims *output_dims);
1141 
1142 /**
1143  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for processors with DSP extension.
1144  *        Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
1145  *
1146  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1147  *             arm_depthwise_conv_wrapper_s4_get_buffer_size().
1148  *
1149  */
1150 int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
1151                                                           const cmsis_nn_dims *input_dims,
1152                                                           const cmsis_nn_dims *filter_dims,
1153                                                           const cmsis_nn_dims *output_dims);
1154 
1155 /**
1156  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for Arm(R) Helium Architecture case.
1157  *        Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
1158  *
1159  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1160  *             arm_depthwise_conv_wrapper_s4_get_buffer_size().
1161  *
1162  */
1163 int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
1164                                                           const cmsis_nn_dims *input_dims,
1165                                                           const cmsis_nn_dims *filter_dims,
1166                                                           const cmsis_nn_dims *output_dims);
1167 
1168 /**
1169  * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
1170  *
1171  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1172  *                                 definition file to see if an additional buffer is required.
1173  *                                 Optional function {API}_get_buffer_size() provides the buffer
1174  *                                 size if an additional buffer is required exists if additional memory is.
1175  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
1176  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1177  *                                 dw_conv_params->dilation is not used.
1178  *                                 Range of dw_conv_params->input_offset : [-127, 128]
1179  *                                 Range of dw_conv_params->input_offset : [-128, 127]
1180  * @param[in]      quant_params    Per-channel quantization info.
1181  *                                 It contains the multiplier and shift values to be applied to each
1182  *                                 output channel
1183  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1184  *                                 Batch argument N is not used.
1185  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
1186  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1187  * @param[in]      filter_data     Filter data pointer. Data type: int8
1188  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1189  * @param[in]      bias_data       Bias data pointer. Data type: int32
1190  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
1191  * @param[in, out] output_data     Output data pointer. Data type: int8
1192  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1193  *
1194  * @details
1195  *    - Supported framework: TensorFlow Lite
1196  */
1197 arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
1198                                           const cmsis_nn_dw_conv_params *dw_conv_params,
1199                                           const cmsis_nn_per_channel_quant_params *quant_params,
1200                                           const cmsis_nn_dims *input_dims,
1201                                           const int8_t *input_data,
1202                                           const cmsis_nn_dims *filter_dims,
1203                                           const int8_t *filter_data,
1204                                           const cmsis_nn_dims *bias_dims,
1205                                           const int32_t *bias_data,
1206                                           const cmsis_nn_dims *output_dims,
1207                                           int8_t *output_data);
1208 
1209 /**
1210  * @brief Basic s4 depthwise convolution function that doesn't have any constraints on the input dimensions.
1211  *
1212  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1213  *                                 definition file to see if an additional buffer is required.
1214  *                                 Optional function {API}_get_buffer_size() provides the buffer
1215  *                                 size if an additional buffer is required exists if additional memory is.
1216  *                                 The caller is expected to clear the buffer ,if applicable, for security reasons.
1217  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1218  *                                 dw_conv_params->dilation is not used.
1219  *                                 Range of dw_conv_params->input_offset : [-127, 128]
1220  *                                 Range of dw_conv_params->input_offset : [-128, 127]
1221  * @param[in]      quant_params    Per-channel quantization info.
1222  *                                 It contains the multiplier and shift values to be applied to each
1223  *                                 output channel
1224  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1225  *                                 Batch argument N is not used.
1226  * @param[in]      input           Input (activation) data pointer. Data type: int8
1227  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1228  * @param[in]      kernel          Filter data pointer. Data type: int8_t packed 4-bit weights, e.g four sequential
1229  *                                 weights [0x1, 0x2, 0x3, 0x4]  packed as [0x21, 0x43].
1230  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1231  * @param[in]      bias            Bias data pointer. Data type: int32
1232  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
1233  * @param[in, out] output          Output data pointer. Data type: int8
1234  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1235  *
1236  * @details
1237  *    - Supported framework: TensorFlow Lite
1238  */
1239 arm_cmsis_nn_status arm_depthwise_conv_s4(const cmsis_nn_context *ctx,
1240                                           const cmsis_nn_dw_conv_params *dw_conv_params,
1241                                           const cmsis_nn_per_channel_quant_params *quant_params,
1242                                           const cmsis_nn_dims *input_dims,
1243                                           const int8_t *input,
1244                                           const cmsis_nn_dims *filter_dims,
1245                                           const int8_t *kernel,
1246                                           const cmsis_nn_dims *bias_dims,
1247                                           const int32_t *bias,
1248                                           const cmsis_nn_dims *output_dims,
1249                                           int8_t *output);
1250 
1251 /**
1252  * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
1253  *
1254  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1255  *                                 definition file to see if an additional buffer is required.
1256  *                                 Optional function {API}_get_buffer_size() provides the buffer
1257  *                                 size if an additional buffer is required.
1258  *                                 exists if additional memory is.
1259  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
1260  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1261  *                                 conv_params->input_offset  : Not used
1262  *                                 conv_params->output_offset : Not used
1263  * @param[in]      quant_params    Per-channel quantization info.
1264  *                                 It contains the multiplier and shift values to be applied to each
1265  *                                 output channel
1266  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1267  *                                 Batch argument N is not used.
1268  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
1269  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1270  * @param[in]      filter_data     Filter data pointer. Data type: int8
1271  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1272  * @param[in]      bias_data       Bias data pointer. Data type: int64
1273  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
1274  * @param[in, out] output_data     Output data pointer. Data type: int16
1275  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1276  *
1277  * @details
1278  *    - Supported framework: TensorFlow Lite
1279  */
1280 arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
1281                                            const cmsis_nn_dw_conv_params *dw_conv_params,
1282                                            const cmsis_nn_per_channel_quant_params *quant_params,
1283                                            const cmsis_nn_dims *input_dims,
1284                                            const int16_t *input_data,
1285                                            const cmsis_nn_dims *filter_dims,
1286                                            const int8_t *filter_data,
1287                                            const cmsis_nn_dims *bias_dims,
1288                                            const int64_t *bias_data,
1289                                            const cmsis_nn_dims *output_dims,
1290                                            int16_t *output_data);
1291 
1292 /**
1293  * @brief Wrapper function to pick the right optimized s16 depthwise convolution function
1294  *
1295  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1296  *                                 definition file to see if an additional buffer is required.
1297  *                                 Optional function {API}_get_buffer_size() provides the buffer
1298  *                                 size if required.
1299  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
1300  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1301  *                                 dw_conv_params->dilation is not used.
1302  *                                 Range of dw_conv_params->input_offset : Not used
1303  *                                 Range of dw_conv_params->output_offset : Not used
1304  * @param[in]      quant_params    Per-channel quantization info.
1305  *                                 It contains the multiplier and shift values to be applied to each
1306  *                                 output channel
1307  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1308  *                                 Batch argument N is not used and assumed to be 1.
1309  * @param[in]      input_data      Input (activation) data pointer. Data type: int16
1310  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1311  * @param[in]      filter_data     Filter data pointer. Data type: int8
1312  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1313  * @param[in]      bias_data       Bias data pointer. Data type: int64
1314  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1315  * @param[in, out] output_data     Output data pointer. Data type: int16
1316  * @return     The function returns
1317  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
1318  *
1319  * @details
1320  *    - Supported framework: TensorFlow Lite
1321  *    - Picks one of the the following functions
1322  *        -# arm_depthwise_conv_s16()
1323  *        -# arm_depthwise_conv_fast_s16()  - Cortex-M CPUs with DSP extension only
1324  */
1325 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx,
1326                                                    const cmsis_nn_dw_conv_params *dw_conv_params,
1327                                                    const cmsis_nn_per_channel_quant_params *quant_params,
1328                                                    const cmsis_nn_dims *input_dims,
1329                                                    const int16_t *input_data,
1330                                                    const cmsis_nn_dims *filter_dims,
1331                                                    const int8_t *filter_data,
1332                                                    const cmsis_nn_dims *bias_dims,
1333                                                    const int64_t *bias_data,
1334                                                    const cmsis_nn_dims *output_dims,
1335                                                    int16_t *output_data);
1336 
1337 /**
1338  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16()
1339  *
1340  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1341  *                                 Range of dw_conv_params->input_offset : Not used
1342  *                                 Range of dw_conv_params->input_offset : Not used
1343  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1344  *                                 Batch argument N is not used and assumed to be 1.
1345  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1346  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1347  * @return                         Size of additional memory required for optimizations in bytes.
1348  *
1349  */
1350 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
1351                                                        const cmsis_nn_dims *input_dims,
1352                                                        const cmsis_nn_dims *filter_dims,
1353                                                        const cmsis_nn_dims *output_dims);
1354 
1355 /**
1356  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for processors with DSP extension.
1357  *        Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
1358  *
1359  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1360  *             arm_depthwise_conv_wrapper_s16_get_buffer_size().
1361  *
1362  */
1363 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
1364                                                            const cmsis_nn_dims *input_dims,
1365                                                            const cmsis_nn_dims *filter_dims,
1366                                                            const cmsis_nn_dims *output_dims);
1367 
1368 /**
1369  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for Arm(R) Helium Architecture
1370  * case. Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
1371  *
1372  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1373  *             arm_depthwise_conv_wrapper_s16_get_buffer_size().
1374  *
1375  */
1376 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
1377                                                            const cmsis_nn_dims *input_dims,
1378                                                            const cmsis_nn_dims *filter_dims,
1379                                                            const cmsis_nn_dims *output_dims);
1380 
1381 /**
1382  * @brief Optimized s16 depthwise convolution function with constraint that in_channel equals out_channel.
1383  *        Refer arm_depthwise_conv_s16() for function argument details.
1384  *
1385  * @return     The function returns one of the following
1386  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - ctx-buff == NULL and
1387  *                                                      arm_depthwise_conv_fast_s16_get_buffer_size() > 0 or
1388  *                                                      input channel != output channel or
1389  *                                                      ch_mult != 1
1390  *
1391  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1392  *
1393  * @details
1394  *    - Supported framework: TensorFlow Lite
1395  *    - The following constrains on the arguments apply
1396  *        -# Number of input channel equals number of output channels or ch_mult equals 1
1397  *    - Reccomended when number of channels is 4 or greater.
1398  *
1399  */
1400 arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
1401                                                 const cmsis_nn_dw_conv_params *dw_conv_params,
1402                                                 const cmsis_nn_per_channel_quant_params *quant_params,
1403                                                 const cmsis_nn_dims *input_dims,
1404                                                 const int16_t *input_data,
1405                                                 const cmsis_nn_dims *filter_dims,
1406                                                 const int8_t *filter_data,
1407                                                 const cmsis_nn_dims *bias_dims,
1408                                                 const int64_t *bias_data,
1409                                                 const cmsis_nn_dims *output_dims,
1410                                                 int16_t *output_data);
1411 
1412 /**
1413  * @brief Get the required buffer size for optimized s16 depthwise convolution
1414  * function with constraint that in_channel equals out_channel.
1415  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1416  *                               Batch argument N is not used.
1417  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
1418  * @return          The function returns required buffer size in bytes
1419  *
1420  */
1421 int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1422 
1423 /**
1424  * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
1425  *        the input arguments(documented below). Refer arm_depthwise_conv_s8() for function
1426  *        argument details.
1427  *
1428  * @return     The function returns one of the following
1429  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - Unsupported dimension of tensors
1430  *                                                    - Unsupported pad size along the x axis
1431  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1432  *
1433  * @details
1434  *   - Supported framework : TensorFlow Lite Micro
1435  *   - The following constrains on the arguments apply
1436  *      -# Number of input channel equals number of output channels
1437  *      -# Filter height and width equals 3
1438  *      -# Padding along x is either 0 or 1.
1439  *
1440  */
1441 arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
1442                                               const cmsis_nn_dw_conv_params *dw_conv_params,
1443                                               const cmsis_nn_per_channel_quant_params *quant_params,
1444                                               const cmsis_nn_dims *input_dims,
1445                                               const int8_t *input_data,
1446                                               const cmsis_nn_dims *filter_dims,
1447                                               const int8_t *filter_data,
1448                                               const cmsis_nn_dims *bias_dims,
1449                                               const int32_t *bias_data,
1450                                               const cmsis_nn_dims *output_dims,
1451                                               int8_t *output_data);
1452 
1453 /**
1454  * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
1455  *        Refer arm_depthwise_conv_s8() for function argument details.
1456  *
1457  * @return     The function returns one of the following
1458  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or
1459  *                                                      ch_mult != 1
1460  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1461  *
1462  * @note       If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
1463  *             for the following if MVE optimizations(Arm Helium Technology) are used.
1464  *               - Output shift
1465  *               - Output multiplier
1466  *               - Output bias
1467  *               - kernel
1468  * @details
1469  *    - Supported framework: TensorFlow Lite
1470  *    - The following constrains on the arguments apply
1471  *        -# Number of input channel equals number of output channels or ch_mult equals 1
1472  *    - Reccomended when number of channels is 4 or greater.
1473  *
1474  */
1475 arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
1476                                               const cmsis_nn_dw_conv_params *dw_conv_params,
1477                                               const cmsis_nn_per_channel_quant_params *quant_params,
1478                                               const cmsis_nn_dims *input_dims,
1479                                               const int8_t *input_data,
1480                                               const cmsis_nn_dims *filter_dims,
1481                                               const int8_t *filter_data,
1482                                               const cmsis_nn_dims *bias_dims,
1483                                               const int32_t *bias_data,
1484                                               const cmsis_nn_dims *output_dims,
1485                                               int8_t *output_data);
1486 
1487 /**
1488  * @brief Optimized s4 depthwise convolution function with constraint that in_channel equals out_channel.
1489  *        Refer arm_depthwise_conv_s4() for function argument details.
1490  *
1491  * @return     The function returns one of the following
1492  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or
1493  *                                                      ch_mult != 1
1494  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1495  *
1496  * @note       If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
1497  *             for the following if MVE optimizations(Arm Helium Technology) are used.
1498  *               - Output shift
1499  *               - Output multiplier
1500  *               - Output bias
1501  *               - kernel
1502  * @details
1503  *    - Supported framework: TensorFlow Lite
1504  *    - The following constrains on the arguments apply
1505  *        -# Number of input channel equals number of output channels or ch_mult equals 1
1506  *    - Reccomended when number of channels is 4 or greater.
1507  *
1508  */
1509 arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
1510                                               const cmsis_nn_dw_conv_params *dw_conv_params,
1511                                               const cmsis_nn_per_channel_quant_params *quant_params,
1512                                               const cmsis_nn_dims *input_dims,
1513                                               const int8_t *input_data,
1514                                               const cmsis_nn_dims *filter_dims,
1515                                               const int8_t *filter_data,
1516                                               const cmsis_nn_dims *bias_dims,
1517                                               const int32_t *bias_data,
1518                                               const cmsis_nn_dims *output_dims,
1519                                               int8_t *output_data);
1520 
1521 /**
1522  * @brief Get the required buffer size for optimized s8 depthwise convolution
1523  * function with constraint that in_channel equals out_channel.
1524  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1525  *                               Batch argument N is not used.
1526  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
1527  * @return          The function returns required buffer size in bytes
1528  *
1529  */
1530 int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1531 
1532 /**
1533  * @brief Get the required buffer size for optimized s4 depthwise convolution
1534  * function with constraint that in_channel equals out_channel.
1535  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1536  *                               Batch argument N is not used.
1537  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
1538  * @return          The function returns required buffer size in bytes
1539  *
1540  */
1541 int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1542 
1543 /**
1544  * @defgroup FC Fully-connected Layer Functions
1545  *
1546  * Collection of fully-connected and matrix multiplication functions.
1547  *
 * Fully-connected layer is basically a matrix-vector multiplication
 * with bias. The matrix is the weights and the input/output vectors
 * are the activation values. Supported {weight, activation} precisions
 * include {8-bit, 8-bit} and {8-bit, 16-bit}.
1552  *
1553  *
1554  */
1555 
1556 /**
1557  * @brief Basic s4 Fully Connected function.
1558  *
1559  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1560  *                               definition file to see if an additional buffer is required.
1561  *                               Optional function {API}_get_buffer_size() provides the buffer
1562  *                               size if an additional buffer is required.
1563  *                               The caller is expected to clear the buffer ,if applicable, for security reasons.
1564  * @param[in]      fc_params     Fully Connected layer parameters.
1565  *                               Range of fc_params->input_offset  : [-127, 128]
1566  *                               fc_params->filter_offset : 0
1567  *                               Range of fc_params->output_offset : [-128, 127]
1568  * @param[in]      quant_params  Per-tensor quantization info.
1569  *                               It contains the multiplier and shift value to be applied to the output tensor.
1570  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1571  *                               Input dimension is taken as Nx(H * W * C_IN)
1572  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
1573  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1574  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1575  *                               C : output depth and equals C_OUT in output_dims
1576  *                               H & W : Not used
1577  * @param[in]      filter_data   Filter data pointer. Data type: int8_t packed 4-bit weights, e.g four sequential
1578  *                               weights [0x1, 0x2, 0x3, 0x4]  packed as [0x21, 0x43].
1579  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1580  *                               N, H, W : Not used
1581  * @param[in]      bias_data     Bias data pointer. Data type: int32
1582  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1583  *                               N : Batches
1584  *                               C_OUT : Output depth
1585  *                               H & W : Not used.
1586  * @param[in, out] output_data    Output data pointer. Data type: int8
1587  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1588  *
1589  * @details
1590  *    - Supported framework: TensorFlow Lite
1591  */
1592 arm_cmsis_nn_status arm_fully_connected_s4(const cmsis_nn_context *ctx,
1593                                            const cmsis_nn_fc_params *fc_params,
1594                                            const cmsis_nn_per_tensor_quant_params *quant_params,
1595                                            const cmsis_nn_dims *input_dims,
1596                                            const int8_t *input_data,
1597                                            const cmsis_nn_dims *filter_dims,
1598                                            const int8_t *filter_data,
1599                                            const cmsis_nn_dims *bias_dims,
1600                                            const int32_t *bias_data,
1601                                            const cmsis_nn_dims *output_dims,
1602                                            int8_t *output_data);
1603 
1604 /**
1605  * @brief Basic s8 Fully Connected function.
1606  *
1607  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1608  *                               definition file to see if an additional buffer is required.
1609  *                               Optional function {API}_get_buffer_size() provides the buffer
1610  *                               size if an additional buffer is required.
1611  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
1612  * @param[in]      fc_params     Fully Connected layer parameters.
1613  *                               Range of fc_params->input_offset  : [-127, 128]
1614  *                               fc_params->filter_offset : 0
1615  *                               Range of fc_params->output_offset : [-128, 127]
1616  * @param[in]      quant_params  Per-tensor quantization info.
1617  *                               It contains the multiplier and shift value to be applied to the output tensor.
1618  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1619  *                               Input dimension is taken as Nx(H * W * C_IN)
1620  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
1621  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1622  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1623  *                               C : output depth and equals C_OUT in output_dims
1624  *                               H & W : Not used
1625  * @param[in]      filter_data   Filter data pointer. Data type: int8
1626  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1627  *                               N, H, W : Not used
1628  * @param[in]      bias_data     Bias data pointer. Data type: int32
1629  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1630  *                               N : Batches
1631  *                               C_OUT : Output depth
1632  *                               H & W : Not used.
1633  * @param[in, out] output_data    Output data pointer. Data type: int8
1634  *
1635  * @return     The function returns either
1636  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
1637  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
1638  *
1639  * @details
1640  *    - Supported framework: TensorFlow Lite
1641  */
1642 arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
1643                                            const cmsis_nn_fc_params *fc_params,
1644                                            const cmsis_nn_per_tensor_quant_params *quant_params,
1645                                            const cmsis_nn_dims *input_dims,
1646                                            const int8_t *input_data,
1647                                            const cmsis_nn_dims *filter_dims,
1648                                            const int8_t *filter_data,
1649                                            const cmsis_nn_dims *bias_dims,
1650                                            const int32_t *bias_data,
1651                                            const cmsis_nn_dims *output_dims,
1652                                            int8_t *output_data);
1653 
1654 /**
1655  * @brief Basic s8 Fully Connected function using per channel quantization.
1656  *
1657  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1658  *                               definition file to see if an additional buffer is required.
1659  *                               Optional function {API}_get_buffer_size() provides the buffer
1660  *                               size if an additional buffer is required.
1661  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
1662  * @param[in]      fc_params     Fully Connected layer parameters.
1663  *                               Range of fc_params->input_offset  : [-127, 128]
1664  *                               fc_params->filter_offset : 0
1665  *                               Range of fc_params->output_offset : [-128, 127]
1666  * @param[in]      quant_params  Per-channel quantization info.
1667  *                               It contains the multiplier and shift values to be applied to each output channel
1668  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1669  *                               Input dimension is taken as Nx(H * W * C_IN)
1670  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
1671  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1672  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1673  *                               C : output depth and equals C_OUT in output_dims
1674  *                               H & W : Not used
1675  * @param[in]      filter_data   Filter data pointer. Data type: int8
1676  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1677  *                               N, H, W : Not used
1678  * @param[in]      bias_data     Bias data pointer. Data type: int32
1679  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1680  *                               N : Batches
1681  *                               C_OUT : Output depth
1682  *                               H & W : Not used.
1683  * @param[in, out] output_data    Output data pointer. Data type: int8
1684  *
1685  * @return     The function returns either
1686  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
1687  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
1688  *
1689  * @details
1690  *    - Supported framework: TensorFlow Lite
1691  */
1692 arm_cmsis_nn_status arm_fully_connected_per_channel_s8(const cmsis_nn_context *ctx,
1693                                                        const cmsis_nn_fc_params *fc_params,
1694                                                        const cmsis_nn_per_channel_quant_params *quant_params,
1695                                                        const cmsis_nn_dims *input_dims,
1696                                                        const int8_t *input_data,
1697                                                        const cmsis_nn_dims *filter_dims,
1698                                                        const int8_t *filter_data,
1699                                                        const cmsis_nn_dims *bias_dims,
1700                                                        const int32_t *bias_data,
1701                                                        const cmsis_nn_dims *output_dims,
1702                                                        int8_t *output_data);
1703 
1704 /**
1705  * @brief s8 Fully Connected layer wrapper function
1706  *
1707  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1708  *                               definition file to see if an additional buffer is required.
1709  *                               Optional function {API}_get_buffer_size() provides the buffer
1710  *                               size if an additional buffer is required.
1711  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
1712  * @param[in]      fc_params     Fully Connected layer parameters.
1713  *                               Range of fc_params->input_offset  : [-127, 128]
1714  *                               fc_params->filter_offset : 0
1715  *                               Range of fc_params->output_offset : [-128, 127]
1716  * @param[in]      quant_params  Per-channel or per-tensor quantization info. Check struct defintion for details.
1717  *                               It contains the multiplier and shift value(s) to be applied to each output channel
1718  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1719  *                               Input dimension is taken as Nx(H * W * C_IN)
1720  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
1721  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1722  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1723  *                               C : output depth and equals C_OUT in output_dims
1724  *                               H & W : Not used
1725  * @param[in]      filter_data   Filter data pointer. Data type: int8
1726  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1727  *                               N, H, W : Not used
1728  * @param[in]      bias_data     Bias data pointer. Data type: int32
1729  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1730  *                               N : Batches
1731  *                               C_OUT : Output depth
1732  *                               H & W : Not used.
1733  * @param[in, out] output_data    Output data pointer. Data type: int8
1734  *
1735  * @return     The function returns either
1736  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
1737  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
1738  *
1739  * @details
1740  *    - Supported framework: TensorFlow Lite
1741  */
1742 arm_cmsis_nn_status arm_fully_connected_wrapper_s8(const cmsis_nn_context *ctx,
1743                                                    const cmsis_nn_fc_params *fc_params,
1744                                                    const cmsis_nn_quant_params *quant_params,
1745                                                    const cmsis_nn_dims *input_dims,
1746                                                    const int8_t *input_data,
1747                                                    const cmsis_nn_dims *filter_dims,
1748                                                    const int8_t *filter_data,
1749                                                    const cmsis_nn_dims *bias_dims,
1750                                                    const int32_t *bias_data,
1751                                                    const cmsis_nn_dims *output_dims,
1752                                                    int8_t *output_data);
1753 
1754 /**
1755  * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s32 bias_data.
1756  * @param[in, out]      vector_sum_buf              Buffer for vector sums
1757  * @param[in]           vector_cols                 Number of vector columns
1758  * @param[in]           vector_rows                 Number of vector rows
1759  * @param[in]           vector_data                 Vector of weigths data
1760  * @param[in]           lhs_offset                  Constant multiplied with each sum
1761  * @param[in]           rhs_offset                  Constant added to each vector element before sum
1762  * @param[in]           bias_data                   Vector of bias data, added to each sum.
1763  * @return              The function returns
1764  *                         <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1765  */
1766 arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf,
1767                                       const int32_t vector_cols,
1768                                       const int32_t vector_rows,
1769                                       const int8_t *vector_data,
1770                                       const int32_t lhs_offset,
1771                                       const int32_t rhs_offset,
1772                                       const int32_t *bias_data);
1773 
1774 /**
1775  * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s64 bias_data.
1776  * @param[in, out]      vector_sum_buf              Buffer for vector sums
1777  * @param[in]           vector_cols                 Number of vector columns
1778  * @param[in]           vector_rows                 Number of vector rows
1779  * @param[in]           vector_data                 Vector of weigths data
1780  * @param[in]           lhs_offset                  Constant multiplied with each sum
1781  * @param[in]           bias_data                   Vector of bias data, added to each sum.
1782  * @return              The function returns
1783  *                         <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1784  */
1785 arm_cmsis_nn_status arm_vector_sum_s8_s64(int64_t *vector_sum_buf,
1786                                           const int32_t vector_cols,
1787                                           const int32_t vector_rows,
1788                                           const int8_t *vector_data,
1789                                           const int32_t lhs_offset,
1790                                           const int64_t *bias_data);
1791 
1792 /**
1793  * @brief Get size of additional buffer required by arm_fully_connected_s8().
1794  *        See also arm_vector_sum_s8, which is required if buffer size is > 0.
1795  * @param[in]      filter_dims             dimension of filter
1796  * @return         The function returns    required buffer size in bytes
1797  *
1798  */
1799 int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
1800 
1801 /**
1802  * @brief Get size of additional buffer required by arm_fully_connected_s8() for processors with DSP extension.
1803  *        Refer to arm_fully_connected_s8_get_buffer_size() for function argument details.
1804  *
1805  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1806  *             arm_fully_connected_s8_get_buffer_size().
1807  *
1808  */
1809 int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
1810 
1811 /**
1812  * @brief Get size of additional buffer required by arm_fully_connected_s8() for Arm(R) Helium Architecture case.
1813  *        Refer to arm_fully_connected_s8_get_buffer_size() for function argument details.
1814  *
1815  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1816  *             arm_fully_connected_s8_get_buffer_size().
1817  *
1818  */
1819 int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
1820 
1821 /**
1822  * @brief Basic s16 Fully Connected function.
1823  *
1824  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1825  *                               definition file to see if an additional buffer is required.
1826  *                               Optional function {API}_get_buffer_size() provides the buffer
1827  *                               size if an additional buffer is required.
1828  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
1829  * @param[in]      fc_params     Fully Connected layer parameters.
1830  *                               fc_params->input_offset  : 0
1831  *                               fc_params->filter_offset : 0
1832  *                               fc_params->output_offset : 0
1833  * @param[in]      quant_params  Per-tensor quantization info.
1834  *                               It contains the multiplier and shift value to be applied to the output tensor.
1835  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1836  *                               Input dimension is taken as Nx(H * W * C_IN)
1837  * @param[in]      input_data    Input (activation) data pointer. Data type: int16
1838  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1839  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1840  *                               C : output depth and equals C_OUT in output_dims
1841  *                               H & W : Not used
1842  * @param[in]      filter_data   Filter data pointer. Data type: int8
1843  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1844  *                               N, H, W : Not used
1845  * @param[in]      bias_data     Bias data pointer. Data type: int64
1846  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1847  *                               N : Batches
1848  *                               C_OUT : Output depth
1849  *                               H & W : Not used.
1850  * @param[in, out] output_data    Output data pointer. Data type: int16
1851  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1852  *
1853  * @details
1854  *    - Supported framework: TensorFlow Lite
1855  */
1856 arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx,
1857                                             const cmsis_nn_fc_params *fc_params,
1858                                             const cmsis_nn_per_tensor_quant_params *quant_params,
1859                                             const cmsis_nn_dims *input_dims,
1860                                             const int16_t *input_data,
1861                                             const cmsis_nn_dims *filter_dims,
1862                                             const int8_t *filter_data,
1863                                             const cmsis_nn_dims *bias_dims,
1864                                             const int64_t *bias_data,
1865                                             const cmsis_nn_dims *output_dims,
1866                                             int16_t *output_data);
1867 
1868 /**
1869  * @brief Get size of additional buffer required by arm_fully_connected_s16().
1870  * @param[in]      filter_dims             dimension of filter
1871  * @return         The function returns    required buffer size in bytes
1872  *
1873  */
1874 int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims);
1875 
1876 /**
1877  * @brief Get size of additional buffer required by arm_fully_connected_s16() for processors with DSP extension.
1878  *        Refer to arm_fully_connected_s16_get_buffer_size() for function argument details.
1879  *
1880  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1881  *             arm_fully_connected_s16_get_buffer_size().
1882  *
1883  */
1884 int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
1885 
1886 /**
1887  * @brief Get size of additional buffer required by arm_fully_connected_s16() for Arm(R) Helium Architecture case.
1888  *        Refer to arm_fully_connected_s16_get_buffer_size() for function argument details.
1889  *
1890  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1891  *             arm_fully_connected_s16_get_buffer_size().
1892  *
1893  */
1894 int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
1895 
1896 /**
1897  * @defgroup groupElementwise Elementwise Functions
1898  *
1899  * Elementwise add and multiplication functions.
1900  *
1901  */
1902 
1903 /**
1904  * @brief s8 elementwise add of two vectors
1905  * @param[in]       input_1_vect        pointer to input vector 1
1906  * @param[in]       input_2_vect        pointer to input vector 2
1907  * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
1908  * @param[in]       input_1_mult        multiplier for input 1
1909  * @param[in]       input_1_shift       shift for input 1
1910  * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
1911  * @param[in]       input_2_mult        multiplier for input 2
1912  * @param[in]       input_2_shift       shift for input 2
1913  * @param[in]       left_shift          input left shift
1914  * @param[in,out]   output              pointer to output vector
1915  * @param[in]       out_offset          output offset.  Range: -128 to 127
1916  * @param[in]       out_mult            output multiplier
1917  * @param[in]       out_shift           output shift
1918  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -128
1919  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 127
1920  * @param[in]       block_size          number of samples
1921  * @return          The function returns    ARM_CMSIS_NN_SUCCESS
1922  */
1923 arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect,
1924                                            const int8_t *input_2_vect,
1925                                            const int32_t input_1_offset,
1926                                            const int32_t input_1_mult,
1927                                            const int32_t input_1_shift,
1928                                            const int32_t input_2_offset,
1929                                            const int32_t input_2_mult,
1930                                            const int32_t input_2_shift,
1931                                            const int32_t left_shift,
1932                                            int8_t *output,
1933                                            const int32_t out_offset,
1934                                            const int32_t out_mult,
1935                                            const int32_t out_shift,
1936                                            const int32_t out_activation_min,
1937                                            const int32_t out_activation_max,
1938                                            const int32_t block_size);
1939 
1940 /**
1941  * @brief s16 elementwise add of two vectors
1942  * @param[in]       input_1_vect        pointer to input vector 1
1943  * @param[in]       input_2_vect        pointer to input vector 2
1944  * @param[in]       input_1_offset      offset for input 1. Not used.
1945  * @param[in]       input_1_mult        multiplier for input 1
1946  * @param[in]       input_1_shift       shift for input 1
1947  * @param[in]       input_2_offset      offset for input 2. Not used.
1948  * @param[in]       input_2_mult        multiplier for input 2
1949  * @param[in]       input_2_shift       shift for input 2
1950  * @param[in]       left_shift          input left shift
1951  * @param[in,out]   output              pointer to output vector
1952  * @param[in]       out_offset          output offset. Not used.
1953  * @param[in]       out_mult            output multiplier
1954  * @param[in]       out_shift           output shift
1955  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -32768
1956  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 32767
1957  * @param[in]       block_size          number of samples
1958  * @return          The function returns  ARM_CMSIS_NN_SUCCESS
1959  */
1960 arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect,
1961                                             const int16_t *input_2_vect,
1962                                             const int32_t input_1_offset,
1963                                             const int32_t input_1_mult,
1964                                             const int32_t input_1_shift,
1965                                             const int32_t input_2_offset,
1966                                             const int32_t input_2_mult,
1967                                             const int32_t input_2_shift,
1968                                             const int32_t left_shift,
1969                                             int16_t *output,
1970                                             const int32_t out_offset,
1971                                             const int32_t out_mult,
1972                                             const int32_t out_shift,
1973                                             const int32_t out_activation_min,
1974                                             const int32_t out_activation_max,
1975                                             const int32_t block_size);
1976 
1977 /**
1978  * @brief s8 elementwise multiplication
1979  * @param[in]       input_1_vect        pointer to input vector 1
1980  * @param[in]       input_2_vect        pointer to input vector 2
1981  * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
1982  * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
1983  * @param[in,out]   output              pointer to output vector
1984  * @param[in]       out_offset          output offset. Range: -128 to 127
1985  * @param[in]       out_mult            output multiplier
1986  * @param[in]       out_shift           output shift
1987  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -128
1988  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 127
1989  * @param[in]       block_size          number of samples
1990  * @return          The function returns ARM_CMSIS_NN_SUCCESS
1991  *
1992  * @details   Supported framework: TensorFlow Lite micro
1993  */
1994 arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
1995                                            const int8_t *input_2_vect,
1996                                            const int32_t input_1_offset,
1997                                            const int32_t input_2_offset,
1998                                            int8_t *output,
1999                                            const int32_t out_offset,
2000                                            const int32_t out_mult,
2001                                            const int32_t out_shift,
2002                                            const int32_t out_activation_min,
2003                                            const int32_t out_activation_max,
2004                                            const int32_t block_size);
2005 
2006 /**
2007  * @brief s16 elementwise multiplication
2008  * @param[in]       input_1_vect        pointer to input vector 1
2009  * @param[in]       input_2_vect        pointer to input vector 2
2010  * @param[in]       input_1_offset      offset for input 1. Not used.
2011  * @param[in]       input_2_offset      offset for input 2. Not used.
2012  * @param[in,out]   output              pointer to output vector
2013  * @param[in]       out_offset          output offset. Not used.
2014  * @param[in]       out_mult            output multiplier
2015  * @param[in]       out_shift           output shift
2016  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -32768
2017  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 32767
2018  * @param[in]       block_size          number of samples
2019  * @return          The function returns ARM_CMSIS_NN_SUCCESS
2020  *
2021  * @details   Supported framework: TensorFlow Lite micro
2022  */
2023 arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
2024                                             const int16_t *input_2_vect,
2025                                             const int32_t input_1_offset,
2026                                             const int32_t input_2_offset,
2027                                             int16_t *output,
2028                                             const int32_t out_offset,
2029                                             const int32_t out_mult,
2030                                             const int32_t out_shift,
2031                                             const int32_t out_activation_min,
2032                                             const int32_t out_activation_max,
2033                                             const int32_t block_size);
2034 
2035 /**
2036  * @defgroup Acti Activation Functions
2037  *
2038  * Perform activation layers, including ReLU (Rectified Linear Unit),
2039  * sigmoid and tanh
2040  *
2041  */
2042 
2043 /**
2044  * @brief Q7 RELU function
2045  * @param[in,out]   data        pointer to input
2046  * @param[in]       size        number of elements
2047  */
2048 void arm_relu_q7(int8_t *data, uint16_t size);
2049 
2050 /**
2051  * @brief s8 ReLU6 function
2052  * @param[in,out]   data        pointer to input
2053  * @param[in]       size        number of elements
2054  */
2055 void arm_relu6_s8(int8_t *data, uint16_t size);
2056 
2057 /**
2058  * @brief Q15 RELU function
2059  * @param[in,out]   data        pointer to input
2060  * @param[in]       size        number of elements
2061  */
2062 void arm_relu_q15(int16_t *data, uint16_t size);
2063 
2064 /**
2065  * @brief s16 neural network activation function using direct table look-up
2066  * @param[in]       input       pointer to input data
2067  * @param[out]      output      pointer to output
2068  * @param[in]       size        number of elements
2069  * @param[in]       left_shift  bit-width of the integer part, assumed to be smaller than 3.
2070  * @param[in]       type        type of activation functions
2071  * @return                      The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
2072 
2073  *
2074  * @details Supported framework: TensorFlow Lite for Microcontrollers.
2075  * This activation function must be bit precise congruent with the corresponding TFLM tanh and sigmoid activation
2076  * functions
2077  */
2078 arm_cmsis_nn_status arm_nn_activation_s16(const int16_t *input,
2079                                           int16_t *output,
2080                                           const int32_t size,
2081                                           const int32_t left_shift,
2082                                           const arm_nn_activation_type type);
2083 
2084 /**
2085  * @defgroup Pooling Pooling Functions
2086  *
2087  * Perform max and average pooling operations
2088  *
2089  */
2090 
2091 /**
2092  * @brief s8 average pooling function.
2093  *
2094  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
2095  *                              definition file to see if an additional buffer is required.
2096  *                              Optional function {API}_get_buffer_size() provides the buffer
2097  *                              size if an additional buffer is required.
2098  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
2099  * @param[in]      pool_params  Pooling parameters
2100  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
2101  * @param[in]      input_data   Input (activation) data pointer. Data type: int8
2102  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
2103  *                              Argument N and C are not used.
2104  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
2105  *                              Argument N is not used.
2106  *                              C_OUT equals C_IN.
2107  * @param[in, out] output_data Output data pointer. Data type: int8
2108  *
2109  * @return     The function returns either
2110  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
2111  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
2112  *
2113  * @details
2114  *    - Supported Framework: TensorFlow Lite
2115  *
2116  */
2117 arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx,
2118                                    const cmsis_nn_pool_params *pool_params,
2119                                    const cmsis_nn_dims *input_dims,
2120                                    const int8_t *input_data,
2121                                    const cmsis_nn_dims *filter_dims,
2122                                    const cmsis_nn_dims *output_dims,
2123                                    int8_t *output_data);
2124 
2125 /**
2126  * @brief Get the required buffer size for S8 average pooling function
2127  * @param[in]       dim_dst_width         output tensor dimension
2128  * @param[in]       ch_src                number of input tensor channels
2129  * @return          The function returns required buffer size in bytes
2130  *
2131  */
2132 int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
2133 
2134 /**
2135  * @brief Get the required buffer size for S8 average pooling function for processors with DSP extension.
2136  *        Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
2137  *
2138  * @note       Intended for compilation on Host. If compiling for an Arm target, use
2139  *             arm_avgpool_s8_get_buffer_size().
2140  *
2141  */
2142 int32_t arm_avgpool_s8_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
2143 
2144 /**
2145  * @brief Get the required buffer size for S8 average pooling function for Arm(R) Helium Architecture case.
2146  *        Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
2147  *
2148  * @note       Intended for compilation on Host. If compiling for an Arm target, use
2149  *             arm_avgpool_s8_get_buffer_size().
2150  *
2151  */
2152 int32_t arm_avgpool_s8_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
2153 
2154 /**
2155  * @brief s16 average pooling function.
2156  *
2157  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
2158  *                              definition file to see if an additional buffer is required.
2159  *                              Optional function {API}_get_buffer_size() provides the buffer
2160  *                              size if an additional buffer is required.
2161  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
2162  * @param[in]      pool_params  Pooling parameters
2163  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
2164  * @param[in]      input_data   Input (activation) data pointer. Data type: int16
2165  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
2166  *                              Argument N and C are not used.
2167  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
2168  *                              Argument N is not used.
2169  *                              C_OUT equals C_IN.
2170  * @param[in, out] output_data  Output data pointer. Data type: int16
2171  *
2172  * @return                        The function returns
2173  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
2174  *                                    <code>ARM_CMSIS_NN_ARG_ERROR</code> - In case of invalid arguments
2175  *
2176  * @details
2177  *    - Supported Framework: TensorFlow Lite
2178  *
2179  */
2180 arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx,
2181                                     const cmsis_nn_pool_params *pool_params,
2182                                     const cmsis_nn_dims *input_dims,
2183                                     const int16_t *input_data,
2184                                     const cmsis_nn_dims *filter_dims,
2185                                     const cmsis_nn_dims *output_dims,
2186                                     int16_t *output_data);
2187 
2188 /**
2189  * @brief Get the required buffer size for S16 average pooling function
2190  * @param[in]       dim_dst_width         output tensor dimension
2191  * @param[in]       ch_src                number of input tensor channels
2192  * @return          The function returns required buffer size in bytes
2193  *
2194  */
2195 int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src);
2196 
2197 /**
2198  * @brief Get the required buffer size for S16 average pooling function for processors with DSP extension.
2199  *        Refer to arm_avgpool_s16_get_buffer_size() for function argument details.
2200  *
2201  * @note       Intended for compilation on Host. If compiling for an Arm target, use
2202  *             arm_avgpool_s16_get_buffer_size().
2203  *
2204  */
2205 int32_t arm_avgpool_s16_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
2206 
2207 /**
2208  * @brief Get the required buffer size for S16 average pooling function for Arm(R) Helium Architecture case.
2209  *        Refer to arm_avgpool_s16_get_buffer_size() for function argument details.
2210  *
2211  * @note       Intended for compilation on Host. If compiling for an Arm target, use
2212  *             arm_avgpool_s16_get_buffer_size().
2213  *
2214  */
2215 int32_t arm_avgpool_s16_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
2216 
2217 /**
2218  * @brief s8 max pooling function.
2219  *
2220  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
2221  *                              definition file to see if an additional buffer is required.
2222  *                              Optional function {API}_get_buffer_size() provides the buffer
2223  *                              size if an additional buffer is required.
2224  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
2225  * @param[in]      pool_params  Pooling parameters
2226  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
2227  * @param[in]      input_data   Input (activation) data pointer. The input tensor must not
2228  *                              overlap with the output tensor. Data type: int8
2229  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
2230  *                              Argument N and C are not used.
2231  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
2232  *                              Argument N is not used.
2233  *                              C_OUT equals C_IN.
2234  * @param[in, out] output_data    Output data pointer. Data type: int8
2235  *
2236  * @return     The function returns either
2237  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
2238  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
2239  *
2240  * @details
2241  *    - Supported Framework: TensorFlow Lite
2242  *
2243  */
2244 arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx,
2245                                     const cmsis_nn_pool_params *pool_params,
2246                                     const cmsis_nn_dims *input_dims,
2247                                     const int8_t *input_data,
2248                                     const cmsis_nn_dims *filter_dims,
2249                                     const cmsis_nn_dims *output_dims,
2250                                     int8_t *output_data);
2251 
2252 /**
2253  * @brief s16 max pooling function.
2254  *
2255  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
2256  *                              definition file to see if an additional buffer is required.
2257  *                              Optional function {API}_get_buffer_size() provides the buffer
2258  *                              size if an additional buffer is required.
2259  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
2260  * @param[in]      pool_params  Pooling parameters
2261  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
2262  * @param[in]      src          Input (activation) data pointer. The input tensor must not
2263  *                              overlap with the output tensor. Data type: int16
2264  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
2265  *                              Argument N and C are not used.
2266  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
2267  *                              Argument N is not used.
2268  *                              C_OUT equals C_IN.
2269  * @param[in, out] dst          Output data pointer. Data type: int16
2270  *
2271  * @return     The function returns either
2272  *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
2273  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
2274  *
2275  * @details
2276  *    - Supported Framework: TensorFlow Lite
2277  *
2278  */
2279 arm_cmsis_nn_status arm_max_pool_s16(const cmsis_nn_context *ctx,
2280                                      const cmsis_nn_pool_params *pool_params,
2281                                      const cmsis_nn_dims *input_dims,
2282                                      const int16_t *src,
2283                                      const cmsis_nn_dims *filter_dims,
2284                                      const cmsis_nn_dims *output_dims,
2285                                      int16_t *dst);
2286 
2287 /**
2288  * @defgroup Softmax Softmax Functions
2289  *
2290  *
2291  */
2292 
2293 /**
2294  * @brief S8 softmax function
2295  * @param[in]  input     Pointer to the input tensor
2296  * @param[in]  num_rows  Number of rows in the input tensor
2297  * @param[in]  row_size  Number of elements in each input row
2298  * @param[in]  mult      Input quantization multiplier
2299  * @param[in]  shift     Input quantization shift within the range [0, 31]
2300  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
2301  *                       the quantized exponential operation can be performed
2302  * @param[out] output    Pointer to the output tensor
2303  *
2304  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2305  *
2306  */
2307 void arm_softmax_s8(const int8_t *input,
2308                     const int32_t num_rows,
2309                     const int32_t row_size,
2310                     const int32_t mult,
2311                     const int32_t shift,
2312                     const int32_t diff_min,
2313                     int8_t *output);
2314 
2315 /**
2316  * @brief S8 to s16 softmax function
2317  * @param[in]  input     Pointer to the input tensor
2318  * @param[in]  num_rows  Number of rows in the input tensor
2319  * @param[in]  row_size  Number of elements in each input row
2320  * @param[in]  mult      Input quantization multiplier
2321  * @param[in]  shift     Input quantization shift within the range [0, 31]
2322  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
2323  *                       the quantized exponential operation can be performed
2324  * @param[out] output    Pointer to the output tensor
2325  *
2326  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2327  *
2328  */
2329 void arm_softmax_s8_s16(const int8_t *input,
2330                         const int32_t num_rows,
2331                         const int32_t row_size,
2332                         const int32_t mult,
2333                         const int32_t shift,
2334                         const int32_t diff_min,
2335                         int16_t *output);
2336 
2337 /**
2338  * @brief S16 softmax function
2339  * @param[in]  input           Pointer to the input tensor
2340  * @param[in]  num_rows        Number of rows in the input tensor
2341  * @param[in]  row_size        Number of elements in each input row
2342  * @param[in]  mult            Input quantization multiplier
2343  * @param[in]  shift           Input quantization shift within the range [0, 31]
2344  * @param[in]  softmax_params  Softmax s16 layer parameters with two pointers to LUTs speficied below.
2345  *                             For indexing the high 9 bits are used and 7 remaining for interpolation.
2346  *                             That means 512 entries for the 9-bit indexing and 1 extra for interpolation, i.e. 513
2347  *                             values for each LUT.
2348  *                             - Lookup table for exp(x), where x uniform distributed between [-10.0 , 0.0]
2349  *                             - Lookup table for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0]
2350  * @param[out] output          Pointer to the output tensor
2351  * @return                        The function returns
2352  *                                    <code>ARM_CMSIS_NN_ARG_ERROR</code> Argument error check failed
2353  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
2354  *
2355  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2356  *
2357  */
2358 arm_cmsis_nn_status arm_softmax_s16(const int16_t *input,
2359                                     const int32_t num_rows,
2360                                     const int32_t row_size,
2361                                     const int32_t mult,
2362                                     const int32_t shift,
2363                                     const cmsis_nn_softmax_lut_s16 *softmax_params,
2364                                     int16_t *output);
2365 
2366 /**
2367  * @brief U8 softmax function
2368  * @param[in]  input     Pointer to the input tensor
2369  * @param[in]  num_rows  Number of rows in the input tensor
2370  * @param[in]  row_size  Number of elements in each input row
2371  * @param[in]  mult      Input quantization multiplier
2372  * @param[in]  shift     Input quantization shift within the range [0, 31]
2373  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
2374  *                       the quantized exponential operation can be performed
2375  * @param[out] output    Pointer to the output tensor
2376  *
2377  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2378  *
2379  */
2380 
2381 void arm_softmax_u8(const uint8_t *input,
2382                     const int32_t num_rows,
2383                     const int32_t row_size,
2384                     const int32_t mult,
2385                     const int32_t shift,
2386                     const int32_t diff_min,
2387                     uint8_t *output);
2388 
2389 /**
2390  * @defgroup Reshape Reshape Functions
2391  *
2392  */
2393 
2394 /**
2395  * @brief Reshape a s8 vector into another with different shape
2396  * @param[in]  input      points to the s8 input vector
2397  * @param[out] output     points to the s8 output vector
2398  * @param[in]  total_size total size of the input and output vectors in bytes
2399  *
2400  * @note The output is expected to be in a memory area that does not overlap with the input's
2401  *
2402  */
2403 void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
2404 
2405 /**
2406  * @defgroup Transpose Transpose Functions
2407  *
2408  */
2409 
2410 /**
2411  * @brief Basic transpose function
2412  *
2413  * @param[in]       input_data            Input (activation) data pointer. Data type: int8
2414  * @param[out]      output_data           Output data pointer. Data type: int8
2415  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
2416  * @param[in]       output_dims           Output tensor dimensions. Format may be arbitrary relative to input format.
2417  *                                        The output dimension will depend on the permutation dimensions.
2418  *                                        In other words the out dimensions are the result of applying the permutation
2419  *                                        to the input dimensions.
2420  * @param[in]       transpose_params      Transpose parameters. Contains permutation dimensions.
2421  *
2422  * @return          The function returns either
2423  *                      <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
2424  *                      <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
2425  *
2426  */
2427 arm_cmsis_nn_status arm_transpose_s8(const int8_t *input_data,
2428                                      int8_t *const output_data,
2429                                      const cmsis_nn_dims *const input_dims,
2430                                      const cmsis_nn_dims *const output_dims,
2431                                      const cmsis_nn_transpose_params *const transpose_params);
2432 
2433 /**
2434  * @defgroup Concatenation Concatenation Functions
2435  *
2436  */
2437 
2438 /**
2439  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
2440  *        This function should be called for each input tensor to concatenate. The argument offset_x
2441  *        will be used to store the input tensor in the correct position in the output tensor
2442  *
2443  *        i.e.    offset_x = 0
2444  *                for(i = 0 i < num_input_tensors; ++i)
2445  *                {
2446  *                    arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x)
2447  *                    offset_x += input_x[i]
2448  *                }
2449  *
2450  *        This function assumes that the output tensor has:
2451  *        -# The same height of the input tensor
2452  *        -# The same number of channels of the input tensor
2453  *        -# The same batch size of the input tensor
2454  *
2455  *        Unless specified otherwise, arguments are mandatory.
2456  *
2457  * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
2458  *      does not involve any arithmetic operation
2459  *
2460  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
2461  * @param[in]  input_x  Width of input tensor
2462  * @param[in]  input_y  Height of input tensor
2463  * @param[in]  input_z  Channels in input tensor
2464  * @param[in]  input_w  Batch size in input tensor
2465  * @param[out] output   Pointer to output tensor. Expected to be at least
2466  *                          (input_x * input_y * input_z * input_w) + offset_x
2467  *                      bytes.
2468  * @param[in]  output_x Width of output tensor
2469  * @param[in]  offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor
2470  *                      It is user responsibility to provide the correct value
2471  *
2472  * <b> Input constraints</b>
2473  * offset_x is less than output_x
2474  *
2475  */
2476 void arm_concatenation_s8_x(const int8_t *input,
2477                             const uint16_t input_x,
2478                             const uint16_t input_y,
2479                             const uint16_t input_z,
2480                             const uint16_t input_w,
2481                             int8_t *output,
2482                             const uint16_t output_x,
2483                             const uint32_t offset_x);
2484 
2485 /**
2486  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
2487  *        This function should be called for each input tensor to concatenate. The argument offset_y
2488  *        will be used to store the input tensor in the correct position in the output tensor
2489  *
2490  *        i.e.    offset_y = 0
2491  *                for(i = 0 i < num_input_tensors; ++i)
2492  *                {
2493  *                    arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y)
2494  *                    offset_y += input_y[i]
2495  *                }
2496  *
2497  *        This function assumes that the output tensor has:
2498  *        -# The same width of the input tensor
2499  *        -# The same number of channels of the input tensor
2500  *        -# The same batch size of the input tensor
2501  *
2502  *        Unless specified otherwise, arguments are mandatory.
2503  *
2504  * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
2505  *       does not involve any arithmetic operation
2506  *
2507  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
2508  * @param[in]  input_x  Width of input tensor
2509  * @param[in]  input_y  Height of input tensor
2510  * @param[in]  input_z  Channels in input tensor
2511  * @param[in]  input_w  Batch size in input tensor
2512  * @param[out] output   Pointer to output tensor. Expected to be at least
2513  *                          (input_z * input_w * input_x * input_y) + offset_y
2514  *                      bytes.
2515  * @param[in]  output_y Height of output tensor
2516  * @param[in]  offset_y The offset on the Y axis to start concatenating the input tensor
2517  *                      It is user responsibility to provide the correct value
2518  *
2519  * <b> Input constraints</b>
2520  * offset_y is less than output_y
2521  *
2522  */
2523 void arm_concatenation_s8_y(const int8_t *input,
2524                             const uint16_t input_x,
2525                             const uint16_t input_y,
2526                             const uint16_t input_z,
2527                             const uint16_t input_w,
2528                             int8_t *output,
2529                             const uint16_t output_y,
2530                             const uint32_t offset_y);
2531 
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
 *        This function should be called for each input tensor to concatenate. The argument offset_z
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e.    offset_z = 0
 *                for(i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z)
 *                    offset_z += input_z[i]
 *                }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width of the input tensor
 *        -# The same height of the input tensor
 *        -# The same batch size of the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with output tensor.
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          (input_x * input_y * input_z * input_w) + offset_z
 *                      bytes.
 * @param[in]  output_z Channels in output tensor
 * @param[in]  offset_z The offset on the Z axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_z is less than output_z
 *
 */
void arm_concatenation_s8_z(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_z,
                            const uint32_t offset_z);
2578 
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
 *        This function should be called for each input tensor to concatenate. The argument offset_w
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e.    offset_w = 0
 *                for(i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w)
 *                    offset_w += input_w[i]
 *                }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width of the input tensor
 *        -# The same height of the input tensor
 *        -# The same number of channels of the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor. Expected to be at least
 *                          input_x * input_y * input_z * input_w
 *                      bytes.
 * @param[in]  offset_w The offset on the W axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 */
void arm_concatenation_s8_w(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint32_t offset_w);
/**
 * @defgroup SVDF SVDF Functions
 *
 */

/**
 * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
 *
 * @param[in, out] ctx                Function context (e.g. temporary buffer). Check the function
 *                                    definition file to see if an additional buffer is required.
 *                                    Optional function arm_svdf_s8_get_buffer_size() provides the buffer
 *                                    size if an additional buffer is required.
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]   input_ctx             Temporary scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]   output_ctx            Temporary output scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]   svdf_params           SVDF Parameters
 *                                    Range of svdf_params->input_offset : [-128, 127]
 *                                    Range of svdf_params->output_offset : [-128, 127]
 * @param[in]   input_quant_params    Input quantization parameters
 * @param[in]   output_quant_params   Output quantization parameters
 * @param[in]   input_dims            Input tensor dimensions
 * @param[in]   input_data            Pointer to input tensor
 * @param[in]   state_dims            State tensor dimensions
 * @param[in]   state_data            Pointer to state tensor (updated in place across invocations)
 * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]   weights_time_dims     Weights (time) tensor dimensions
 * @param[in]   weights_time_data     Pointer to the weights (time) tensor
 * @param[in]   bias_dims             Bias tensor dimensions
 * @param[in]   bias_data             Pointer to bias tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 *
 * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
 *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 */
arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx,
                                const cmsis_nn_context *input_ctx,
                                const cmsis_nn_context *output_ctx,
                                const cmsis_nn_svdf_params *svdf_params,
                                const cmsis_nn_per_tensor_quant_params *input_quant_params,
                                const cmsis_nn_per_tensor_quant_params *output_quant_params,
                                const cmsis_nn_dims *input_dims,
                                const int8_t *input_data,
                                const cmsis_nn_dims *state_dims,
                                int8_t *state_data,
                                const cmsis_nn_dims *weights_feature_dims,
                                const int8_t *weights_feature_data,
                                const cmsis_nn_dims *weights_time_dims,
                                const int8_t *weights_time_data,
                                const cmsis_nn_dims *bias_dims,
                                const int32_t *bias_data,
                                const cmsis_nn_dims *output_dims,
                                int8_t *output_data);
2680 
/**
 * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
 *
 * @param[in]   input_ctx             Temporary scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]   output_ctx            Temporary output scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]   svdf_params           SVDF Parameters
 *                                    Range of svdf_params->input_offset : [-128, 127]
 *                                    Range of svdf_params->output_offset : [-128, 127]
 * @param[in]   input_quant_params    Input quantization parameters
 * @param[in]   output_quant_params   Output quantization parameters
 * @param[in]   input_dims            Input tensor dimensions
 * @param[in]   input_data            Pointer to input tensor
 * @param[in]   state_dims            State tensor dimensions
 * @param[in]   state_data            Pointer to int16 state tensor (updated in place across invocations)
 * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]   weights_time_dims     Weights (time) tensor dimensions
 * @param[in]   weights_time_data     Pointer to the int16 weights (time) tensor
 * @param[in]   bias_dims             Bias tensor dimensions
 * @param[in]   bias_data             Pointer to bias tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 */
arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
                                          const cmsis_nn_context *output_ctx,
                                          const cmsis_nn_svdf_params *svdf_params,
                                          const cmsis_nn_per_tensor_quant_params *input_quant_params,
                                          const cmsis_nn_per_tensor_quant_params *output_quant_params,
                                          const cmsis_nn_dims *input_dims,
                                          const int8_t *input_data,
                                          const cmsis_nn_dims *state_dims,
                                          int16_t *state_data,
                                          const cmsis_nn_dims *weights_feature_dims,
                                          const int8_t *weights_feature_data,
                                          const cmsis_nn_dims *weights_time_dims,
                                          const int16_t *weights_time_data,
                                          const cmsis_nn_dims *bias_dims,
                                          const int32_t *bias_data,
                                          const cmsis_nn_dims *output_dims,
                                          int8_t *output_data);
2728 
/**
 * @brief Get size of additional buffer required by arm_svdf_s8().
 * @param[in]      filter_dims             Dimension of the filter tensor
 * @return         The function returns the required buffer size in bytes
 *
 */
int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
2736 
/**
 * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension.
 *        Refer to arm_svdf_s8_get_buffer_size() for function argument details.
 *
 * @return     The function returns the required buffer size in bytes
 *
 * @note       Intended for compilation on Host. If compiling for an Arm target, use
 *             arm_svdf_s8_get_buffer_size().
 *
 */
int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
2746 
/**
 * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case.
 *        Refer to arm_svdf_s8_get_buffer_size() for function argument details.
 *
 * @return     The function returns the required buffer size in bytes
 *
 * @note       Intended for compilation on Host. If compiling for an Arm target, use
 *             arm_svdf_s8_get_buffer_size().
 *
 */
int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
2756 
/**
 * @defgroup LSTM LSTM Layer Functions
 *
 */

/**
 * @brief LSTM unidirectional function with 8 bit input and output and 16 bit gate output, 32 bit bias.
 *
 * @param[in]   input                      Pointer to input data
 * @param[out]  output                     Pointer to output data
 * @param[in]   params                     Struct containing all information about the lstm operator,
 *                                         see arm_nn_types.
 * @param[in]   buffers                    Struct containing pointers to all temporary scratch buffers
 *                                         needed for the lstm operator, see arm_nn_types.
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite Micro
 *
 */
arm_cmsis_nn_status arm_lstm_unidirectional_s8(const int8_t *input,
                                               int8_t *output,
                                               const cmsis_nn_lstm_params *params,
                                               cmsis_nn_lstm_context *buffers);
2782 
/**
 * @brief LSTM unidirectional function with 16 bit input and output and 16 bit gate output, 64 bit bias.
 *
 * @param[in]   input                      Pointer to input data
 * @param[out]  output                     Pointer to output data
 * @param[in]   params                     Struct containing all information about the lstm operator,
 *                                         see arm_nn_types.
 * @param[in]   buffers                    Struct containing pointers to all temporary scratch buffers
 *                                         needed for the lstm operator, see arm_nn_types.
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite Micro
 *
 */
arm_cmsis_nn_status arm_lstm_unidirectional_s16(const int16_t *input,
                                                int16_t *output,
                                                const cmsis_nn_lstm_params *params,
                                                cmsis_nn_lstm_context *buffers);
2803 
/**
 * @brief Batch matmul function with 8 bit input and output.
 *
 * @param[in]   ctx                   Temporary scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 *                                    Optional function arm_fully_connected_s8_get_buffer_size() provides the buffer
 *                                    size if an additional buffer is required.
 * @param[in]   bmm_params            Batch matmul Parameters
 *                                    Adjoint flags are currently unused.
 * @param[in]   quant_params          Quantization parameters
 * @param[in]   input_lhs_dims        Input lhs tensor dimensions.
 *                                    This should be NHWC where lhs C = rhs C
 * @param[in]   input_lhs             Pointer to input tensor
 * @param[in]   input_rhs_dims        Input rhs tensor dimensions.
 *                                    This is expected to be transposed so
 *                                    should be NHWC where lhs C = rhs C
 * @param[in]   input_rhs             Pointer to transposed input tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output                Pointer to the output tensor
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite Micro
 *    2. Performs row * row matrix multiplication with the RHS transposed.
 *
 */
arm_cmsis_nn_status arm_batch_matmul_s8(const cmsis_nn_context *ctx,
                                        const cmsis_nn_bmm_params *bmm_params,
                                        const cmsis_nn_per_tensor_quant_params *quant_params,
                                        const cmsis_nn_dims *input_lhs_dims,
                                        const int8_t *input_lhs,
                                        const cmsis_nn_dims *input_rhs_dims,
                                        const int8_t *input_rhs,
                                        const cmsis_nn_dims *output_dims,
                                        int8_t *output);
2840 
/**
 * @brief Batch matmul function with 16 bit input and output.
 *
 * @param[in]   ctx                   Temporary scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 *                                    Optional function arm_fully_connected_s8_get_buffer_size() provides the buffer
 *                                    size if an additional buffer is required.
 * @param[in]   bmm_params            Batch matmul Parameters
 *                                    Adjoint flags are currently unused.
 * @param[in]   quant_params          Quantization parameters
 * @param[in]   input_lhs_dims        Input lhs tensor dimensions.
 *                                    This should be NHWC where LHS.C = RHS.C
 * @param[in]   input_lhs             Pointer to input tensor
 * @param[in]   input_rhs_dims        Input rhs tensor dimensions.
 *                                    This is expected to be transposed so
 *                                    should be NHWC where LHS.C = RHS.C
 * @param[in]   input_rhs             Pointer to transposed input tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output                Pointer to the output tensor
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite Micro
 *    2. Performs row * row matrix multiplication with the RHS transposed.
 *
 */
arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx,
                                         const cmsis_nn_bmm_params *bmm_params,
                                         const cmsis_nn_per_tensor_quant_params *quant_params,
                                         const cmsis_nn_dims *input_lhs_dims,
                                         const int16_t *input_lhs,
                                         const cmsis_nn_dims *input_rhs_dims,
                                         const int16_t *input_rhs,
                                         const cmsis_nn_dims *output_dims,
                                         int16_t *output);
2877 
/**
 * @defgroup Pad Pad Layer Functions
 *
 */

/**
 * @brief Expands the size of the input by adding constant values before and after the data, in all dimensions.
 *
 * @param[in]   input                      Pointer to input data
 * @param[out]  output                     Pointer to output data
 * @param[in]   pad_value                  Value to pad with
 * @param[in]   input_size                 Input tensor dimensions
 * @param[in]   pre_pad                    Padding to apply before data in each dimension
 * @param[in]   post_pad                   Padding to apply after data in each dimension
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 */
arm_cmsis_nn_status arm_pad_s8(const int8_t *input,
                               int8_t *output,
                               const int8_t pad_value,
                               const cmsis_nn_dims *input_size,
                               const cmsis_nn_dims *pre_pad,
                               const cmsis_nn_dims *post_pad);
2902 
/**
 * @brief Elementwise binary minimum with 8-bit data.
 *
 * @param[in]   ctx                   Temporary scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]   input_1_data          Pointer to input1 tensor
 * @param[in]   input_1_dims          Input1 tensor dimensions
 * @param[in]   input_2_data          Pointer to input2 tensor
 * @param[in]   input_2_dims          Input2 tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 * @param[in]   output_dims           Output tensor dimensions
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite Micro
 *
 */
arm_cmsis_nn_status arm_minimum_s8(const cmsis_nn_context *ctx,
                                   const int8_t *input_1_data,
                                   const cmsis_nn_dims *input_1_dims,
                                   const int8_t *input_2_data,
                                   const cmsis_nn_dims *input_2_dims,
                                   int8_t *output_data,
                                   const cmsis_nn_dims *output_dims);
2928 
/**
 * @brief Elementwise binary maximum with 8-bit data.
 *
 * @param[in]   ctx                   Temporary scratch buffer
 *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]   input_1_data          Pointer to input1 tensor
 * @param[in]   input_1_dims          Input1 tensor dimensions
 * @param[in]   input_2_data          Pointer to input2 tensor
 * @param[in]   input_2_dims          Input2 tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 * @param[in]   output_dims           Output tensor dimensions
 *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite Micro
 *
 */
arm_cmsis_nn_status arm_maximum_s8(const cmsis_nn_context *ctx,
                                   const int8_t *input_1_data,
                                   const cmsis_nn_dims *input_1_dims,
                                   const int8_t *input_2_data,
                                   const cmsis_nn_dims *input_2_dims,
                                   int8_t *output_data,
                                   const cmsis_nn_dims *output_dims);
2954 
2955 #ifdef __cplusplus
2956 }
2957 #endif
2958 
2959 #endif /* ARM_NNFUNCTIONS_H */
2960