1 /*
2  * SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_nnfunctions.h
22  * Description:  Public header file for CMSIS NN Library
23  *
24  * $Date:        23 April 2024
25  * $Revision:    V.16.0.0
26  *
27  * Target :  Arm(R) M-Profile Architecture
28  * -------------------------------------------------------------------- */
29 
30 /**
31  * @defgroup Public Public
 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix
 * support the TensorFlow Lite framework.
34  */
35 
36 #ifndef ARM_NNFUNCTIONS_H
37 #define ARM_NNFUNCTIONS_H
38 
39 #include "arm_nn_math_types.h"
40 #include "arm_nn_types.h"
41 
42 #define USE_INTRINSIC
43 
44 #ifdef __cplusplus
45 extern "C" {
46 #endif
47 
48 /**
49  * @defgroup NNConv Convolution Functions
50  *
 * Collection of convolution and depthwise convolution functions and their variants.
 *
 * The convolution is implemented in two steps: im2col and General Matrix Multiplication (GEMM).
 *
 * im2col converts each patch of image data into a column. After im2col,
 * the convolution is computed as a matrix-matrix multiplication.
 *
 * To reduce the memory footprint, im2col is performed partially:
 * in each iteration, only a few columns (i.e., patches) are generated and
 * immediately consumed by GEMM (see the sketch following this block).
62  *
63  */
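
/*
 * For illustration, a minimal im2col sketch (not a library function; the names
 * and the no-padding assumption are the author's own). It copies the receptive
 * field of one output position of an NHWC image into a single column, the
 * per-patch step that the kernels below interleave with GEMM:
 *
 *     static void im2col_one_column(const int8_t *input, int32_t in_w, int32_t in_c,
 *                                   int32_t k_h, int32_t k_w, int32_t stride_h,
 *                                   int32_t stride_w, int32_t out_y, int32_t out_x,
 *                                   int8_t *column) // holds k_h * k_w * in_c bytes
 *     {
 *         int32_t idx = 0;
 *         for (int32_t ky = 0; ky < k_h; ky++)
 *         {
 *             for (int32_t kx = 0; kx < k_w; kx++)
 *             {
 *                 const int32_t y = out_y * stride_h + ky; // padding not handled here
 *                 const int32_t x = out_x * stride_w + kx;
 *                 for (int32_t c = 0; c < in_c; c++)
 *                 {
 *                     column[idx++] = input[(y * in_w + x) * in_c + c];
 *                 }
 *             }
 *         }
 *     }
 */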
64 
65 /**
 * @brief s4 convolution layer wrapper function whose main purpose is to call the optimal kernel available in
 *        cmsis-nn to perform the convolution.
68  *
69  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
70  *                                arm_convolve_wrapper_s4_get_buffer_size will return the buffer_size if required.
 *                                The caller is expected to clear the buffer, if applicable, for security reasons.
72  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
73  *                                Range of conv_params->input_offset  : [-127, 128]
74  *                                Range of conv_params->output_offset : [-128, 127]
75  * @param[in]      quant_params   Per-channel quantization info.
76  *                                It contains the multiplier and shift values to be applied to each output channel
77  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
78  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
79  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
80  *                                spatial filter dimensions
81  * @param[in]      filter_data    Filter data pointer. Data type: int8 packed with 2x int4
82  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
83  * @param[in]      bias_data      Bias data pointer. Data type: int32
84  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
85  * @param[out]     output_data    Output data pointer. Data type: int8
86  *
87  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
89  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
90  *
91  */
92 arm_cmsis_nn_status arm_convolve_wrapper_s4(const cmsis_nn_context *ctx,
93                                             const cmsis_nn_conv_params *conv_params,
94                                             const cmsis_nn_per_channel_quant_params *quant_params,
95                                             const cmsis_nn_dims *input_dims,
96                                             const int8_t *input_data,
97                                             const cmsis_nn_dims *filter_dims,
98                                             const int8_t *filter_data,
99                                             const cmsis_nn_dims *bias_dims,
100                                             const int32_t *bias_data,
101                                             const cmsis_nn_dims *output_dims,
102                                             int8_t *output_data);
103 
104 /**
105  * @brief Get the required buffer size for arm_convolve_wrapper_s4
106  *
107  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
108  *                                Range of conv_params->input_offset  : [-127, 128]
109  *                                Range of conv_params->output_offset : [-128, 127]
110  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
111  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
112  *                                filter dimensions
113  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
114  *
 * @return         The function returns the required buffer size (bytes)
116  *
117  */
118 int32_t arm_convolve_wrapper_s4_get_buffer_size(const cmsis_nn_conv_params *conv_params,
119                                                 const cmsis_nn_dims *input_dims,
120                                                 const cmsis_nn_dims *filter_dims,
121                                                 const cmsis_nn_dims *output_dims);
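
/*
 * A hedged usage sketch for the wrapper above (the heap allocation and the
 * pre-populated parameter structs are assumptions for illustration; on many
 * embedded targets a static arena would replace malloc):
 *
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_convolve_wrapper_s4_get_buffer_size(&conv_params, &input_dims,
 *                                                        &filter_dims, &output_dims);
 *     ctx.buf = (ctx.size > 0) ? malloc((size_t)ctx.size) : NULL;
 *     if (ctx.buf != NULL)
 *     {
 *         memset(ctx.buf, 0, (size_t)ctx.size); // clear the buffer, as noted above
 *     }
 *     arm_cmsis_nn_status status = arm_convolve_wrapper_s4(&ctx, &conv_params, &quant_params,
 *                                                          &input_dims, input_data, &filter_dims,
 *                                                          filter_data, &bias_dims, bias_data,
 *                                                          &output_dims, output_data);
 *     free(ctx.buf);
 */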
122 
123 /**
124  * @brief Get the required buffer size for arm_convolve_wrapper_s4 for Arm(R) Helium Architecture case.
125  *        Refer to arm_convolve_wrapper_s4_get_buffer_size() for function argument details.
126  *
127  * @note       Intended for compilation on Host. If compiling for an Arm target, use
128  *             arm_convolve_wrapper_s4_get_buffer_size(). Currently this operator does not have an
 *             MVE implementation, so the DSP implementation will be used.
130  *
131  */
132 int32_t arm_convolve_wrapper_s4_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
133                                                     const cmsis_nn_dims *input_dims,
134                                                     const cmsis_nn_dims *filter_dims,
135                                                     const cmsis_nn_dims *output_dims);
136 
137 /**
138  * @brief Get the required buffer size for arm_convolve_wrapper_s4 for processors with DSP extension.
139  *        Refer to arm_convolve_wrapper_s4_get_buffer_size() for function argument details.
140  *
141  * @note       Intended for compilation on Host. If compiling for an Arm target, use
142  *             arm_convolve_wrapper_s4_get_buffer_size().
143  *
144  */
145 int32_t arm_convolve_wrapper_s4_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
146                                                     const cmsis_nn_dims *input_dims,
147                                                     const cmsis_nn_dims *filter_dims,
148                                                     const cmsis_nn_dims *output_dims);
149 
150 /**
 * @brief s8 convolution layer wrapper function whose main purpose is to call the optimal kernel available in
 *        cmsis-nn to perform the convolution.
153  *
154  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
155  *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required.
156  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
157  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
158  *                                Range of conv_params->input_offset  : [-127, 128]
159  *                                Range of conv_params->output_offset : [-128, 127]
160  * @param[in]      quant_params   Per-channel quantization info.
161  *                                It contains the multiplier and shift values to be applied to each output channel
162  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
163  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
164  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
165  *                                spatial filter dimensions
166  * @param[in]      filter_data    Filter data pointer. Data type: int8
167  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
168  * @param[in]      bias_data      Bias data pointer. Data type: int32
169  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
170  * @param[out]     output_data    Output data pointer. Data type: int8
171  *
172  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
174  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
175  *
176  */
177 arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
178                                             const cmsis_nn_conv_params *conv_params,
179                                             const cmsis_nn_per_channel_quant_params *quant_params,
180                                             const cmsis_nn_dims *input_dims,
181                                             const int8_t *input_data,
182                                             const cmsis_nn_dims *filter_dims,
183                                             const int8_t *filter_data,
184                                             const cmsis_nn_dims *bias_dims,
185                                             const int32_t *bias_data,
186                                             const cmsis_nn_dims *output_dims,
187                                             int8_t *output_data);
188 
189 /**
190  * @brief Get the required buffer size for arm_convolve_wrapper_s8
191  *
192  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
193  *                                Range of conv_params->input_offset  : [-127, 128]
194  *                                Range of conv_params->output_offset : [-128, 127]
195  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
196  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
197  *                                filter dimensions
198  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
199  *
 * @return         The function returns the required buffer size (bytes)
201  *
202  */
203 int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
204                                                 const cmsis_nn_dims *input_dims,
205                                                 const cmsis_nn_dims *filter_dims,
206                                                 const cmsis_nn_dims *output_dims);
207 
208 /**
209  * @brief Get the required buffer size for arm_convolve_wrapper_s8 for Arm(R) Helium Architecture case.
210  *        Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
211  *
212  * @note       Intended for compilation on Host. If compiling for an Arm target, use
213  *             arm_convolve_wrapper_s8_get_buffer_size().
214  *
215  */
216 int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
217                                                     const cmsis_nn_dims *input_dims,
218                                                     const cmsis_nn_dims *filter_dims,
219                                                     const cmsis_nn_dims *output_dims);
220 
221 /**
222  * @brief Get the required buffer size for arm_convolve_wrapper_s8 for processors with DSP extension.
223  *        Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details.
224  *
225  * @note       Intended for compilation on Host. If compiling for an Arm target, use
226  *             arm_convolve_wrapper_s8_get_buffer_size().
227  *
228  */
229 int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
230                                                     const cmsis_nn_dims *input_dims,
231                                                     const cmsis_nn_dims *filter_dims,
232                                                     const cmsis_nn_dims *output_dims);
233 
234 /**
 * @brief s16 convolution layer wrapper function whose main purpose is to call the optimal kernel available in
 *        cmsis-nn to perform the convolution.
237  *
238  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_wrapper_s16_get_buffer_size will return the buffer_size if required.
240  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
241  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
242  *                                conv_params->input_offset  : Not used
243  *                                conv_params->output_offset : Not used
244  * @param[in]      quant_params   Per-channel quantization info.
245  *                                It contains the multiplier and shift values to be applied to each output channel
246  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
247  * @param[in]      input_data     Input (activation) data pointer. Data type: int16
248  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
249  *                                spatial filter dimensions
250  * @param[in]      filter_data    Filter data pointer. Data type: int8
251  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data      Struct with optional bias data pointer. The bias data type can be int64 or int32,
 *                                depending on the flag in the struct (see the sketch after this function).
254  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
255  * @param[out]     output_data    Output data pointer. Data type: int16
256  *
257  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
259  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
260  *
261  */
262 arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
263                                              const cmsis_nn_conv_params *conv_params,
264                                              const cmsis_nn_per_channel_quant_params *quant_params,
265                                              const cmsis_nn_dims *input_dims,
266                                              const int16_t *input_data,
267                                              const cmsis_nn_dims *filter_dims,
268                                              const int8_t *filter_data,
269                                              const cmsis_nn_dims *bias_dims,
270                                              const cmsis_nn_bias_data *bias_data,
271                                              const cmsis_nn_dims *output_dims,
272                                              int16_t *output_data);
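
/*
 * A hedged sketch of setting up the cmsis_nn_bias_data argument (the field
 * names follow the author's reading of arm_nn_types.h; verify them against
 * your CMSIS-NN version). A single pointer field carries either int32 or
 * int64 bias values, selected by the flag:
 *
 *     static const int64_t bias_s64[3] = { 100, -42, 7 };  // C_OUT = 3, illustrative values
 *     const cmsis_nn_bias_data bias_data = {
 *         .data = bias_s64,        // points at int64 values here
 *         .is_int32_bias = false,  // set to true when .data points at int32 values
 *     };
 */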
273 
274 /**
275  * @brief Get the required buffer size for arm_convolve_wrapper_s16.
276  *
277  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
278  *                                conv_params->input_offset  : Not used
279  *                                conv_params->output_offset : Not used
280  * @param[in]      input_dims     Input (activation) dimensions. Format: [N, H, W, C_IN]
281  * @param[in]      filter_dims    Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
282  *                                filter dimensions
283  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
284  *
 * @return         The function returns the required buffer size (bytes)
286  *
287  */
288 int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
289                                                  const cmsis_nn_dims *input_dims,
290                                                  const cmsis_nn_dims *filter_dims,
291                                                  const cmsis_nn_dims *output_dims);
292 
293 /**
 * @brief Get the required buffer size for arm_convolve_wrapper_s16 for processors with DSP extension.
295  *        Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
296  *
297  * @note       Intended for compilation on Host. If compiling for an Arm target, use
298  *             arm_convolve_wrapper_s16_get_buffer_size().
299  *
300  */
301 int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params,
302                                                      const cmsis_nn_dims *input_dims,
303                                                      const cmsis_nn_dims *filter_dims,
304                                                      const cmsis_nn_dims *output_dims);
305 
306 /**
307  * @brief Get the required buffer size for arm_convolve_wrapper_s16 for Arm(R) Helium Architecture case.
308  *        Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details.
309  *
310  * @note       Intended for compilation on Host. If compiling for an Arm target, use
311  *             arm_convolve_wrapper_s16_get_buffer_size().
312  *
313  */
314 int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params,
315                                                      const cmsis_nn_dims *input_dims,
316                                                      const cmsis_nn_dims *filter_dims,
317                                                      const cmsis_nn_dims *output_dims);
318 
319 /**
320  * @brief Basic s4 convolution function
321  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
322  *                                arm_convolve_s4_get_buffer_size will return the buffer_size if required.
 *                                The caller is expected to clear the buffer, if applicable, for security reasons.
324  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
325  *                                Range of conv_params->input_offset  : [-127, 128]
326  *                                Range of conv_params->output_offset : [-128, 127]
327  * @param[in]      quant_params   Per-channel quantization info.
328  *                                It contains the multiplier and shift values to be applied to each output channel
329  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
330  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
331  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
332  *                                spatial filter dimensions
333  * @param[in]      filter_data    Packed Filter data pointer. Data type: int8 packed with 2x int4
334  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
335  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
336  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
337  * @param[out]     output_data    Output data pointer. Data type: int8
 *
339  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
340  *
341  * @details
342  *    1. Supported framework: TensorFlow Lite micro
343  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
344  *
345  */
346 arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx,
347                                     const cmsis_nn_conv_params *conv_params,
348                                     const cmsis_nn_per_channel_quant_params *quant_params,
349                                     const cmsis_nn_dims *input_dims,
350                                     const int8_t *input_data,
351                                     const cmsis_nn_dims *filter_dims,
352                                     const int8_t *filter_data,
353                                     const cmsis_nn_dims *bias_dims,
354                                     const int32_t *bias_data,
355                                     const cmsis_nn_dims *output_dims,
356                                     int8_t *output_data);
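
/*
 * A hedged sketch of the "int8 packed with 2x int4" filter layout used above.
 * The nibble order follows the author's understanding of the TFLite int4
 * convention (element 0 in the low nibble, element 1 in the high nibble);
 * verify this against your converter before relying on it:
 *
 *     static int8_t pack_two_int4(int8_t w0, int8_t w1) // each in [-8, 7]
 *     {
 *         return (int8_t)((w0 & 0x0F) | ((w1 & 0x0F) << 4));
 *     }
 */
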
357 /**
358  * @brief Basic s8 convolution function
359  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
360  *                                arm_convolve_s8_get_buffer_size will return the buffer_size if required.
361  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
362  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
363  *                                Range of conv_params->input_offset  : [-127, 128]
364  *                                Range of conv_params->output_offset : [-128, 127]
365  * @param[in]      quant_params   Per-channel quantization info.
366  *                                It contains the multiplier and shift values to be applied to each output channel
367  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
368  * @param[in]      input_data     Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, CK] where HK, WK and CK are the
 *                                spatial filter dimensions. CK != C_IN is used for grouped convolution, in which
 *                                case the required conditions are C_IN = N * CK and C_OUT = N * M for N groups of
 *                                size M (see the worked example after this function).
373  * @param[in]      filter_data    Filter data pointer. Data type: int8
374  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
375  * @param[in]      bias_data      Optional bias data pointer. Data type: int32
376  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
377  * @param[out]     output_data    Output data pointer. Data type: int8
378  *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful,
 *                                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if the arguments are incorrect, or
 *                                  <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if no implementation is available.
382  *
383  * @details
384  *    1. Supported framework: TensorFlow Lite micro
385  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
386  *
387  */
388 arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx,
389                                     const cmsis_nn_conv_params *conv_params,
390                                     const cmsis_nn_per_channel_quant_params *quant_params,
391                                     const cmsis_nn_dims *input_dims,
392                                     const int8_t *input_data,
393                                     const cmsis_nn_dims *filter_dims,
394                                     const int8_t *filter_data,
395                                     const cmsis_nn_dims *bias_dims,
396                                     const int32_t *bias_data,
397                                     const cmsis_nn_dims *output_dims,
398                                     int8_t *output_data);
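
/*
 * A worked example of the grouped-convolution shape rule above (all values are
 * illustrative): with C_IN = 8 and a filter channel count CK = 4, the group
 * count is N = C_IN / CK = 2, so C_OUT must equal N * M, e.g. 2 * 16 = 32 for
 * M = 16 filters per group. A hedged pre-check could look like:
 *
 *     const int32_t groups = input_dims->c / filter_dims->c;          // N
 *     const bool grouped_ok = (input_dims->c % filter_dims->c == 0)   // C_IN = N * CK
 *                          && (output_dims->c % groups == 0);         // C_OUT = N * M
 */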
399 
400 /**
401  * @brief Get the required buffer size for s4 convolution function
402  *
403  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
404  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
405  * are the spatial filter dimensions
 * @return          The function returns the required buffer size (bytes)
407  *
408  */
409 int32_t arm_convolve_s4_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
410 
411 /**
412  * @brief Get the required buffer size for s8 convolution function
413  *
414  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
415  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
416  * are the spatial filter dimensions
 * @return          The function returns the required buffer size (bytes)
418  *
419  */
420 int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
421 
422 /**
423  * @brief Basic s8 transpose convolution function
424  * @param[in, out] ctx                   Function context that contains the additional buffer if required by the
425  *                                       function.
426  *                                       arm_transpose_conv_s8_get_buffer_size will return the buffer_size if required.
427  *                                       The caller is expected to clear the buffer, if applicable, for security
 *                                       reasons.
 * @param[in, out] output_ctx            Temporary scratch buffer.
 *                                       The required size is: output width * output height * output channel * 4.
 *                                       The caller is expected to clear the buffer, if applicable, for security
 *                                       reasons. A set-up sketch follows the buffer-size functions below.
433  * @param[in]      transpose_conv_params Convolution parameters (e.g. strides, dilations, pads,...).
434  *                                       Range of transpose_conv_params->input_offset  : [-127, 128]
435  *                                       Range of transpose_conv_params->output_offset : [-128, 127]
436  * @param[in]      quant_params          Per-channel quantization info.
437  *                                       It contains the multiplier and shift values to be applied to each out channel.
438  * @param[in]      input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
439  * @param[in]      input_data            Input (activation) data pointer. Data type: int8
440  * @param[in]      filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
441  *                                       spatial filter dimensions
442  * @param[in]      filter_data           Filter data pointer. Data type: int8
443  * @param[in]      bias_dims             Bias tensor dimensions. Format: [C_OUT]
444  * @param[in]      bias_data             Optional bias data pointer. Data type: int32
445  * @param[in]      output_dims           Output tensor dimensions. Format: [N, H, W, C_OUT]
446  * @param[out]     output_data           Output data pointer. Data type: int8
 *
448  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
450  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
451  *
452  * @details
453  *    1. Supported framework: TensorFlow Lite micro
454  *    2. Additional memory is required for optimization. Refer to arguments 'ctx' and 'output_ctx' for details.
455  *
456  */
457 arm_cmsis_nn_status arm_transpose_conv_s8(const cmsis_nn_context *ctx,
458                                           const cmsis_nn_context *output_ctx,
459                                           const cmsis_nn_transpose_conv_params *transpose_conv_params,
460                                           const cmsis_nn_per_channel_quant_params *quant_params,
461                                           const cmsis_nn_dims *input_dims,
462                                           const int8_t *input_data,
463                                           const cmsis_nn_dims *filter_dims,
464                                           const int8_t *filter_data,
465                                           const cmsis_nn_dims *bias_dims,
466                                           const int32_t *bias_data,
467                                           const cmsis_nn_dims *output_dims,
468                                           int8_t *output_data);
469 
470 /**
471  * @brief Get the required buffer size for s8 transpose conv function
472  *
473  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
474  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
475  *                                        are the spatial filter dimensions
476  * @param[in]       out_dims              Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @return          The function returns the required buffer size (bytes)
478  *
479  */
480 int32_t arm_transpose_conv_s8_get_buffer_size(const cmsis_nn_dims *input_dims,
481                                               const cmsis_nn_dims *filter_dims,
482                                               const cmsis_nn_dims *out_dims);
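
/*
 * A hedged set-up sketch for the two scratch buffers taken by
 * arm_transpose_conv_s8() (the calloc-based allocation is an assumption for
 * illustration). The secondary buffer holds one 4-byte accumulator per output
 * element, matching the sizing stated in the function documentation above:
 *
 *     cmsis_nn_context ctx, output_ctx;
 *     ctx.size = arm_transpose_conv_s8_get_buffer_size(&input_dims, &filter_dims, &output_dims);
 *     ctx.buf = (ctx.size > 0) ? calloc(1, (size_t)ctx.size) : NULL;
 *
 *     output_ctx.size = output_dims.w * output_dims.h * output_dims.c * (int32_t)sizeof(int32_t);
 *     output_ctx.buf = calloc(1, (size_t)output_ctx.size);
 */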
483 
484 /**
485  * @brief Get size of additional buffer required by arm_transpose_conv_s8() for processors with DSP extension.
486  *        Refer to arm_transpose_conv_s8_get_buffer_size() for function argument details.
487  *
488  * @note       Intended for compilation on Host. If compiling for an Arm target, use
489  *             arm_transpose_conv_s8_get_buffer_size().
490  *
491  */
492 int32_t arm_transpose_conv_s8_get_buffer_size_dsp(const cmsis_nn_dims *input_dims,
493                                                   const cmsis_nn_dims *filter_dims,
494                                                   const cmsis_nn_dims *out_dims);
495 
496 /**
497  * @brief Get size of additional buffer required by arm_transpose_conv_s8() for Arm(R) Helium Architecture case.
498  *        Refer to arm_transpose_conv_s8_get_buffer_size() for function argument details.
499  *
500  * @note       Intended for compilation on Host. If compiling for an Arm target, use
501  *             arm_transpose_conv_s8_get_buffer_size().
502  *
503  */
504 int32_t arm_transpose_conv_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dims,
505                                                   const cmsis_nn_dims *filter_dims,
506                                                   const cmsis_nn_dims *out_dims);
507 
508 /**
509  * @brief Basic s16 convolution function
510  * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
511  *                                arm_convolve_s16_get_buffer_size will return the buffer_size if required.
512  *                                The caller is expected to clear the buffer, if applicable, for security reasons.
513  * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
514  *                                conv_params->input_offset  : Not used
515  *                                conv_params->output_offset : Not used
516  * @param[in]      quant_params   Per-channel quantization info.
517  *                                It contains the multiplier and shift values to be applied to each output channel
518  * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
519  * @param[in]      input_data     Input (activation) data pointer. Data type: int16
520  * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
521  *                                spatial filter dimensions
522  * @param[in]      filter_data    Filter data pointer. Data type: int8
523  * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data      Struct with optional bias data pointer. The bias data type can be int64 or int32,
 *                                depending on the flag in the struct.
526  * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
527  * @param[out]     output_data    Output data pointer. Data type: int16
528  *
 * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful,
 *                                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if the arguments are incorrect, or
 *                                  <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if no implementation is available.
532  *
533  * @details
534  *    1. Supported framework: TensorFlow Lite micro
535  *    2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
536  *
537  */
538 arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx,
539                                      const cmsis_nn_conv_params *conv_params,
540                                      const cmsis_nn_per_channel_quant_params *quant_params,
541                                      const cmsis_nn_dims *input_dims,
542                                      const int16_t *input_data,
543                                      const cmsis_nn_dims *filter_dims,
544                                      const int8_t *filter_data,
545                                      const cmsis_nn_dims *bias_dims,
546                                      const cmsis_nn_bias_data *bias_data,
547                                      const cmsis_nn_dims *output_dims,
548                                      int16_t *output_data);
549 
550 /**
551  * @brief Get the required buffer size for s16 convolution function
552  *
553  * @param[in]       input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
554  * @param[in]       filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
555  *                                are the spatial filter dimensions
 * @return          The function returns the required buffer size (bytes)
557  *
558  */
559 int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
560 
561 /**
562  * @brief Fast s4 version for 1x1 convolution (non-square shape)
563  *
564  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
565  *                               arm_convolve_1x1_s4_fast_get_buffer_size will return the buffer_size if required.
 *                               The caller is expected to clear the buffer, if applicable, for security reasons.
567  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
568  *                               Range of conv_params->input_offset  : [-127, 128]
569  *                               Range of conv_params->output_offset : [-128, 127]
570  * @param[in]      quant_params  Per-channel quantization info.
571  *                               It contains the multiplier and shift values to be applied to each output channel
572  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
573  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
574  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
575  * @param[in]      filter_data   Filter data pointer. Data type: int8 packed with 2x int4
576  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
577  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
578  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
579  * @param[out]     output_data   Output data pointer. Data type: int8
580  *
581  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
583  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
584  *
585  * @details
586  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
588  *      -# conv_params->padding.w = conv_params->padding.h = 0
589  *      -# conv_params->stride.w = conv_params->stride.h = 1
590  *
591  */
592 arm_cmsis_nn_status arm_convolve_1x1_s4_fast(const cmsis_nn_context *ctx,
593                                              const cmsis_nn_conv_params *conv_params,
594                                              const cmsis_nn_per_channel_quant_params *quant_params,
595                                              const cmsis_nn_dims *input_dims,
596                                              const int8_t *input_data,
597                                              const cmsis_nn_dims *filter_dims,
598                                              const int8_t *filter_data,
599                                              const cmsis_nn_dims *bias_dims,
600                                              const int32_t *bias_data,
601                                              const cmsis_nn_dims *output_dims,
602                                              int8_t *output_data);
603 
604 /**
605  * @brief s4 version for 1x1 convolution with support for non-unity stride values
606  *
607  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
608  *                               None is required by this function.
609  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
610  *                               Range of conv_params->input_offset  : [-127, 128]
611  *                               Range of conv_params->output_offset : [-128, 127]
612  * @param[in]      quant_params  Per-channel quantization info.
613  *                               It contains the multiplier and shift values to be applied to each output channel
614  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
615  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
616  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
617  * @param[in]      filter_data   Filter data pointer. Data type: int8 packed with 2x int4
618  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
619  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
620  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
621  * @param[out]     output_data   Output data pointer. Data type: int8
622  *
623  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
625  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
626  * @details
627  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
629  *      -# conv_params->padding.w = conv_params->padding.h = 0
630  *
631  */
632 arm_cmsis_nn_status arm_convolve_1x1_s4(const cmsis_nn_context *ctx,
633                                         const cmsis_nn_conv_params *conv_params,
634                                         const cmsis_nn_per_channel_quant_params *quant_params,
635                                         const cmsis_nn_dims *input_dims,
636                                         const int8_t *input_data,
637                                         const cmsis_nn_dims *filter_dims,
638                                         const int8_t *filter_data,
639                                         const cmsis_nn_dims *bias_dims,
640                                         const int32_t *bias_data,
641                                         const cmsis_nn_dims *output_dims,
642                                         int8_t *output_data);
643 
644 /**
645  * @brief Fast s8 version for 1x1 convolution (non-square shape)
646  *
647  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
648  *                               arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required.
649  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
650  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
651  *                               Range of conv_params->input_offset  : [-127, 128]
652  *                               Range of conv_params->output_offset : [-128, 127]
653  * @param[in]      quant_params  Per-channel quantization info.
654  *                               It contains the multiplier and shift values to be applied to each output channel
655  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
656  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
657  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
658  * @param[in]      filter_data   Filter data pointer. Data type: int8
659  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
660  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
661  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
662  * @param[out]     output_data   Output data pointer. Data type: int8
663  *
664  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
666  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
667  *
668  * @details
669  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
671  *      -# conv_params->padding.w = conv_params->padding.h = 0
672  *      -# conv_params->stride.w = conv_params->stride.h = 1
673  *
674  */
675 arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
676                                              const cmsis_nn_conv_params *conv_params,
677                                              const cmsis_nn_per_channel_quant_params *quant_params,
678                                              const cmsis_nn_dims *input_dims,
679                                              const int8_t *input_data,
680                                              const cmsis_nn_dims *filter_dims,
681                                              const int8_t *filter_data,
682                                              const cmsis_nn_dims *bias_dims,
683                                              const int32_t *bias_data,
684                                              const cmsis_nn_dims *output_dims,
685                                              int8_t *output_data);
686 
687 /**
688  * @brief Get the required buffer size for arm_convolve_1x1_s4_fast
689  *
690  * @param[in]       input_dims            Input (activation) dimensions
691  * @return          The function returns the required buffer size in bytes
692  *
693  */
694 int32_t arm_convolve_1x1_s4_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
695 
696 /**
697  * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
698  *
699  * @param[in]       input_dims            Input (activation) dimensions
700  * @return          The function returns the required buffer size in bytes
701  *
702  */
703 int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
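
/*
 * A hedged dispatch sketch between the two 1x1 s8 kernels (illustrative only;
 * the s8 wrapper above performs a similar selection internally). Both kernels
 * require zero padding; the fast variant additionally needs unit strides:
 *
 *     const bool unit_stride = (conv_params.stride.w == 1) && (conv_params.stride.h == 1);
 *     arm_cmsis_nn_status status =
 *         unit_stride ? arm_convolve_1x1_s8_fast(&ctx, &conv_params, &quant_params, &input_dims,
 *                                                input_data, &filter_dims, filter_data, &bias_dims,
 *                                                bias_data, &output_dims, output_data)
 *                     : arm_convolve_1x1_s8(&ctx, &conv_params, &quant_params, &input_dims,
 *                                           input_data, &filter_dims, filter_data, &bias_dims,
 *                                           bias_data, &output_dims, output_data);
 */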
704 
705 /**
706  * @brief s8 version for 1x1 convolution with support for non-unity stride values
707  *
708  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
709  *                               None is required by this function.
710  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
711  *                               Range of conv_params->input_offset  : [-127, 128]
712  *                               Range of conv_params->output_offset : [-128, 127]
713  * @param[in]      quant_params  Per-channel quantization info.
714  *                               It contains the multiplier and shift values to be applied to each output channel
715  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
716  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
717  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
718  * @param[in]      filter_data   Filter data pointer. Data type: int8
719  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
720  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
721  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
722  * @param[out]     output_data   Output data pointer. Data type: int8
723  *
724  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
726  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
727  * @details
728  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
730  *      -# conv_params->padding.w = conv_params->padding.h = 0
731  *
732  */
733 arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx,
734                                         const cmsis_nn_conv_params *conv_params,
735                                         const cmsis_nn_per_channel_quant_params *quant_params,
736                                         const cmsis_nn_dims *input_dims,
737                                         const int8_t *input_data,
738                                         const cmsis_nn_dims *filter_dims,
739                                         const int8_t *filter_data,
740                                         const cmsis_nn_dims *bias_dims,
741                                         const int32_t *bias_data,
742                                         const cmsis_nn_dims *output_dims,
743                                         int8_t *output_data);
744 
745 /**
746  * @brief 1xn convolution
747  *
748  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required.
750  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
751  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
752  *                               Range of conv_params->input_offset  : [-127, 128]
753  *                               Range of conv_params->output_offset : [-128, 127]
754  * @param[in]      quant_params  Per-channel quantization info.
755  *                               It contains the multiplier and shift values to be applied to each output channel
756  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
757  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
758  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
759  *                               spatial filter dimension
760  * @param[in]      filter_data   Filter data pointer. Data type: int8
761  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
762  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
763  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
764  * @param[out]     output_data   Output data pointer. Data type: int8
765  *
766  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
768  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
769  *
770  * @details
771  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply (an illustrative shape set-up follows this function)
 *      -# input_dims->n equals 1
 *      -# output_dims->w is a multiple of 4
 *      -# Explicit constraints (since it is for 1xN convolution)
 *      -## input_dims->h equals 1
 *      -## output_dims->h equals 1
 *      -## filter_dims->h equals 1
 * @todo Remove constraint on output_dims->w to make the function generic.
780  *
781  */
782 arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
783                                           const cmsis_nn_conv_params *conv_params,
784                                           const cmsis_nn_per_channel_quant_params *quant_params,
785                                           const cmsis_nn_dims *input_dims,
786                                           const int8_t *input_data,
787                                           const cmsis_nn_dims *filter_dims,
788                                           const int8_t *filter_data,
789                                           const cmsis_nn_dims *bias_dims,
790                                           const int32_t *bias_data,
791                                           const cmsis_nn_dims *output_dims,
792                                           int8_t *output_data);
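
/*
 * An illustrative shape set-up satisfying the 1xN constraints listed above
 * (all values are examples only): a 64-sample, 8-channel 1-D signal convolved
 * with a width-5 kernel, all heights fixed at 1 and the output width a
 * multiple of 4:
 *
 *     const cmsis_nn_dims input_dims  = {.n = 1, .h = 1, .w = 64, .c = 8};
 *     const cmsis_nn_dims filter_dims = {.n = 16, .h = 1, .w = 5, .c = 8};   // C_OUT = 16
 *     const cmsis_nn_dims output_dims = {.n = 1, .h = 1, .w = 64, .c = 16};  // e.g. stride 1 with padding
 */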
793 
794 /**
795  * @brief 1xn convolution for s4 weights
796  *
797  * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_1_x_n_s4_get_buffer_size will return the buffer_size if required.
799  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
800  * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
801  *                               Range of conv_params->input_offset  : [-127, 128]
802  *                               Range of conv_params->output_offset : [-128, 127]
803  * @param[in]      quant_params  Per-channel quantization info.
804  *                               It contains the multiplier and shift values to be applied to each output channel
805  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
806  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
807  * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
808  *                               spatial filter dimension
 * @param[in]      filter_data   Filter data pointer. Data type: int8 packed with 2x int4
810  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
811  * @param[in]      bias_data     Optional bias data pointer. Data type: int32
812  * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
813  * @param[out]     output_data   Output data pointer. Data type: int8
814  *
815  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
817  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
818  *
819  * @details
820  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# stride.w * input_dims->c is a multiple of 4
 *      -# Explicit constraints (since it is for 1xN convolution)
 *      -## input_dims->h equals 1
 *      -## output_dims->h equals 1
 *      -## filter_dims->h equals 1
 * @todo Remove constraint on output_dims->w to make the function generic.
828  *
829  */
830 arm_cmsis_nn_status arm_convolve_1_x_n_s4(const cmsis_nn_context *ctx,
831                                           const cmsis_nn_conv_params *conv_params,
832                                           const cmsis_nn_per_channel_quant_params *quant_params,
833                                           const cmsis_nn_dims *input_dims,
834                                           const int8_t *input_data,
835                                           const cmsis_nn_dims *filter_dims,
836                                           const int8_t *filter_data,
837                                           const cmsis_nn_dims *bias_dims,
838                                           const int32_t *bias_data,
839                                           const cmsis_nn_dims *output_dims,
840                                           int8_t *output_data);
841 
842 /**
 * @brief Get the required additional buffer size for 1xn convolution (arm_convolve_1_x_n_s8)
844  *
845  * @param[in]       conv_params           Convolution parameters (e.g. strides, dilations, pads,...).
846  *                                        Range of conv_params->input_offset  : [-127, 128]
847  *                                        Range of conv_params->output_offset : [-128, 127]
848  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
849  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
850  *                                        horizontal spatial filter dimension
851  * @param[in]       output_dims           Output tensor dimensions. Format: [N, H, W, C_OUT]
852  *
 * @return          The function returns the required buffer size (bytes)
854  *
855  */
856 int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
857                                               const cmsis_nn_dims *input_dims,
858                                               const cmsis_nn_dims *filter_dims,
859                                               const cmsis_nn_dims *output_dims);
860 
861 /**
 * @brief Get the required additional buffer size for 1xn convolution (arm_convolve_1_x_n_s4)
863  *
864  * @param[in]       conv_params           Convolution parameters (e.g. strides, dilations, pads,...).
865  *                                        Range of conv_params->input_offset  : [-127, 128]
866  *                                        Range of conv_params->output_offset : [-128, 127]
867  * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
868  * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
869  *                                        horizontal spatial filter dimension
870  * @param[in]       output_dims           Output tensor dimensions. Format: [N, H, W, C_OUT]
871  *
 * @return          The function returns the required buffer size (bytes)
873  *
874  */
875 int32_t arm_convolve_1_x_n_s4_get_buffer_size(const cmsis_nn_conv_params *conv_params,
876                                               const cmsis_nn_dims *input_dims,
877                                               const cmsis_nn_dims *filter_dims,
878                                               const cmsis_nn_dims *output_dims);
879 
880 /**
881  * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
882  *
883  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
884  *                                 definition file to see if an additional buffer is required.
885  *                                 Optional function {API}_get_buffer_size() provides the buffer
886  *                                 size if required.
887  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
888  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
889  *                                 dw_conv_params->dilation is not used.
890  *                                 Range of dw_conv_params->input_offset : [-127, 128]
891  *                                 Range of dw_conv_params->output_offset : [-128, 127]
892  * @param[in]      quant_params    Per-channel quantization info.
893  *                                 It contains the multiplier and shift values to be applied to each
894  *                                 output channel
895  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
896  *                                 Batch argument N is not used and assumed to be 1.
897  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
898  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
899  * @param[in]      filter_data     Filter data pointer. Data type: int8
900  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
901  * @param[in]      bias_data       Bias data pointer. Data type: int32
902  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
903  * @param[in, out] output_data     Output data pointer. Data type: int8
904  * @return     The function returns
905  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
906  *
907  * @details
908  *    - Supported framework: TensorFlow Lite
 *    - Picks one of the following functions
910  *        -# arm_depthwise_conv_s8()
911  *        -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
912  *        -# arm_depthwise_conv_s8_opt()
 *    - See arm_depthwise_conv_s8_opt() for details on data that may be read outside the
 *      tensor boundary.
915  */
916 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
917                                                   const cmsis_nn_dw_conv_params *dw_conv_params,
918                                                   const cmsis_nn_per_channel_quant_params *quant_params,
919                                                   const cmsis_nn_dims *input_dims,
920                                                   const int8_t *input_data,
921                                                   const cmsis_nn_dims *filter_dims,
922                                                   const int8_t *filter_data,
923                                                   const cmsis_nn_dims *bias_dims,
924                                                   const int32_t *bias_data,
925                                                   const cmsis_nn_dims *output_dims,
926                                                   int8_t *output_data);
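
/*
 * Usage sketch for arm_depthwise_conv_wrapper_s8() (illustrative only; the
 * struct fields follow arm_nn_types.h and all numeric values, as well as the
 * scratch/input/weights/bias/output buffers, are hypothetical placeholders):
 *
 *   // 16x16x8 input, 3x3 depthwise kernel, stride 1, pad 1, ch_mult 1
 *   cmsis_nn_dims input_dims = {.n = 1, .h = 16, .w = 16, .c = 8};
 *   cmsis_nn_dims filter_dims = {.n = 1, .h = 3, .w = 3, .c = 8};
 *   cmsis_nn_dims bias_dims = {.c = 8};
 *   cmsis_nn_dims output_dims = {.n = 1, .h = 16, .w = 16, .c = 8};
 *
 *   cmsis_nn_dw_conv_params dw_params = {.input_offset = 128,
 *                                        .output_offset = 0,
 *                                        .ch_mult = 1,
 *                                        .stride = {.w = 1, .h = 1},
 *                                        .padding = {.w = 1, .h = 1},
 *                                        .dilation = {.w = 1, .h = 1},
 *                                        .activation = {.min = -128, .max = 127}};
 *   // mult_per_ch/shift_per_ch: per-channel requantization arrays of length 8
 *   cmsis_nn_per_channel_quant_params quant = {.multiplier = mult_per_ch,
 *                                              .shift = shift_per_ch};
 *
 *   // Query the scratch size first; the wrapper picks the kernel internally
 *   const int32_t buf_size = arm_depthwise_conv_wrapper_s8_get_buffer_size(
 *       &dw_params, &input_dims, &filter_dims, &output_dims);
 *   cmsis_nn_context ctx = {.buf = scratch, .size = buf_size}; // caller-provided
 *
 *   arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8(
 *       &ctx, &dw_params, &quant, &input_dims, input, &filter_dims, weights,
 *       &bias_dims, bias, &output_dims, output);
 */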
927 
928 /**
929  * @brief Wrapper function to pick the right optimized s4 depthwise convolution function
930  *
931  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
932  *                                 definition file to see if an additional buffer is required.
933  *                                 Optional function {API}_get_buffer_size() provides the buffer
934  *                                 size if required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
936  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
937  *                                 dw_conv_params->dilation is not used.
938  *                                 Range of dw_conv_params->input_offset : [-127, 128]
939  *                                 Range of dw_conv_params->output_offset : [-128, 127]
940  * @param[in]      quant_params    Per-channel quantization info.
941  *                                 It contains the multiplier and shift values to be applied to each
942  *                                 output channel
943  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
944  *                                 Batch argument N is not used and assumed to be 1.
945  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
946  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      filter_data     Filter data pointer. Data type: int8_t packed 4-bit weights, e.g. four sequential
 *                                 weights [0x1, 0x2, 0x3, 0x4] packed as [0x21, 0x43].
949  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
950  * @param[in]      bias_data       Bias data pointer. Data type: int32
951  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
952  * @param[in, out] output_data     Output data pointer. Data type: int8
953  * @return     The function returns
954  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
955  *
956  * @details
957  *    - Supported framework: TensorFlow Lite
958  */
959 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s4(const cmsis_nn_context *ctx,
960                                                   const cmsis_nn_dw_conv_params *dw_conv_params,
961                                                   const cmsis_nn_per_channel_quant_params *quant_params,
962                                                   const cmsis_nn_dims *input_dims,
963                                                   const int8_t *input_data,
964                                                   const cmsis_nn_dims *filter_dims,
965                                                   const int8_t *filter_data,
966                                                   const cmsis_nn_dims *bias_dims,
967                                                   const int32_t *bias_data,
968                                                   const cmsis_nn_dims *output_dims,
969                                                   int8_t *output_data);
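
/*
 * The packed s4 weight layout described above stores two consecutive 4-bit
 * weights per int8_t, low nibble first, so [0x1, 0x2, 0x3, 0x4] becomes
 * [0x21, 0x43]. A minimal packing helper (illustrative sketch, not part of
 * this API):
 *
 *   static int8_t pack_two_s4(int8_t first, int8_t second)
 *   {
 *       // first lands in the low nibble, second in the high nibble
 *       return (int8_t)(((second & 0x0F) << 4) | (first & 0x0F));
 *   }
 */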
970 
971 /**
972  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
973  *
974  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
975  *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
977  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
978  *                                 Batch argument N is not used and assumed to be 1.
979  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
980  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
981  * @return                         Size of additional memory required for optimizations in bytes.
982  *
983  */
984 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
985                                                       const cmsis_nn_dims *input_dims,
986                                                       const cmsis_nn_dims *filter_dims,
987                                                       const cmsis_nn_dims *output_dims);
988 
989 /**
990  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for processors with DSP extension.
991  *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
992  *
993  * @note       Intended for compilation on Host. If compiling for an Arm target, use
994  *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
995  *
996  */
997 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
998                                                           const cmsis_nn_dims *input_dims,
999                                                           const cmsis_nn_dims *filter_dims,
1000                                                           const cmsis_nn_dims *output_dims);
1001 
1002 /**
1003  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for Arm(R) Helium Architecture case.
1004  *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
1005  *
1006  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1007  *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
1008  *
1009  */
1010 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
1011                                                           const cmsis_nn_dims *input_dims,
1012                                                           const cmsis_nn_dims *filter_dims,
1013                                                           const cmsis_nn_dims *output_dims);
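
/*
 * The _dsp/_mve variants above exist so that code compiled on Host (e.g. a
 * memory-planning tool) can size buffers for a different target. A selection
 * sketch, with hypothetical capability flags for the deployment target:
 *
 *   int32_t size;
 *   if (target_has_mve)       // e.g. Cortex-M55/M85 (Helium)
 *       size = arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(
 *           &dw_params, &input_dims, &filter_dims, &output_dims);
 *   else if (target_has_dsp)  // e.g. Cortex-M4/M7 (DSP extension)
 *       size = arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(
 *           &dw_params, &input_dims, &filter_dims, &output_dims);
 *   else                      // scalar target; also the right call on-target
 *       size = arm_depthwise_conv_wrapper_s8_get_buffer_size(
 *           &dw_params, &input_dims, &filter_dims, &output_dims);
 */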
1014 
1015 /**
1016  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4()
1017  *
1018  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1019  *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
1021  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1022  *                                 Batch argument N is not used and assumed to be 1.
1023  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1024  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1025  * @return                         Size of additional memory required for optimizations in bytes.
1026  *
1027  */
1028 int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
1029                                                       const cmsis_nn_dims *input_dims,
1030                                                       const cmsis_nn_dims *filter_dims,
1031                                                       const cmsis_nn_dims *output_dims);
1032 
1033 /**
1034  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for processors with DSP extension.
1035  *        Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
1036  *
1037  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1038  *             arm_depthwise_conv_wrapper_s4_get_buffer_size().
1039  *
1040  */
1041 int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
1042                                                           const cmsis_nn_dims *input_dims,
1043                                                           const cmsis_nn_dims *filter_dims,
1044                                                           const cmsis_nn_dims *output_dims);
1045 
1046 /**
1047  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for Arm(R) Helium Architecture case.
1048  *        Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
1049  *
1050  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1051  *             arm_depthwise_conv_wrapper_s4_get_buffer_size().
1052  *
1053  */
1054 int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
1055                                                           const cmsis_nn_dims *input_dims,
1056                                                           const cmsis_nn_dims *filter_dims,
1057                                                           const cmsis_nn_dims *output_dims);
1058 
1059 /**
1060  * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
1061  *
1062  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1063  *                                 definition file to see if an additional buffer is required.
1064  *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if an additional buffer is required.
1066  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
1067  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1068  *                                 dw_conv_params->dilation is not used.
1069  *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
1071  * @param[in]      quant_params    Per-channel quantization info.
1072  *                                 It contains the multiplier and shift values to be applied to each
1073  *                                 output channel
1074  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1075  *                                 Batch argument N is not used.
1076  * @param[in]      input_data      Input (activation) data pointer. Data type: int8
1077  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1078  * @param[in]      filter_data     Filter data pointer. Data type: int8
1079  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1080  * @param[in]      bias_data       Bias data pointer. Data type: int32
1081  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
1082  * @param[in, out] output_data     Output data pointer. Data type: int8
1083  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1084  *
1085  * @details
1086  *    - Supported framework: TensorFlow Lite
1087  */
1088 arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
1089                                           const cmsis_nn_dw_conv_params *dw_conv_params,
1090                                           const cmsis_nn_per_channel_quant_params *quant_params,
1091                                           const cmsis_nn_dims *input_dims,
1092                                           const int8_t *input_data,
1093                                           const cmsis_nn_dims *filter_dims,
1094                                           const int8_t *filter_data,
1095                                           const cmsis_nn_dims *bias_dims,
1096                                           const int32_t *bias_data,
1097                                           const cmsis_nn_dims *output_dims,
1098                                           int8_t *output_data);
1099 
1100 /**
1101  * @brief Basic s4 depthwise convolution function that doesn't have any constraints on the input dimensions.
1102  *
1103  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1104  *                                 definition file to see if an additional buffer is required.
1105  *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if an additional buffer is required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
1108  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1109  *                                 dw_conv_params->dilation is not used.
1110  *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
1112  * @param[in]      quant_params    Per-channel quantization info.
1113  *                                 It contains the multiplier and shift values to be applied to each
1114  *                                 output channel
1115  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1116  *                                 Batch argument N is not used.
1117  * @param[in]      input           Input (activation) data pointer. Data type: int8
1118  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      kernel          Filter data pointer. Data type: int8_t packed 4-bit weights, e.g. four sequential
 *                                 weights [0x1, 0x2, 0x3, 0x4] packed as [0x21, 0x43].
1121  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1122  * @param[in]      bias            Bias data pointer. Data type: int32
1123  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
1124  * @param[in, out] output          Output data pointer. Data type: int8
1125  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1126  *
1127  * @details
1128  *    - Supported framework: TensorFlow Lite
1129  */
1130 arm_cmsis_nn_status arm_depthwise_conv_s4(const cmsis_nn_context *ctx,
1131                                           const cmsis_nn_dw_conv_params *dw_conv_params,
1132                                           const cmsis_nn_per_channel_quant_params *quant_params,
1133                                           const cmsis_nn_dims *input_dims,
1134                                           const int8_t *input,
1135                                           const cmsis_nn_dims *filter_dims,
1136                                           const int8_t *kernel,
1137                                           const cmsis_nn_dims *bias_dims,
1138                                           const int32_t *bias,
1139                                           const cmsis_nn_dims *output_dims,
1140                                           int8_t *output);
1141 
1142 /**
1143  * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
1144  *
1145  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1146  *                                 definition file to see if an additional buffer is required.
1147  *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if an additional buffer is required.
1150  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
1151  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->input_offset  : Not used
 *                                 dw_conv_params->output_offset : Not used
1154  * @param[in]      quant_params    Per-channel quantization info.
1155  *                                 It contains the multiplier and shift values to be applied to each
1156  *                                 output channel
1157  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1158  *                                 Batch argument N is not used.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int16
1160  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1161  * @param[in]      filter_data     Filter data pointer. Data type: int8
1162  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1163  * @param[in]      bias_data       Bias data pointer. Data type: int64
1164  * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
1165  * @param[in, out] output_data     Output data pointer. Data type: int16
1166  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1167  *
1168  * @details
1169  *    - Supported framework: TensorFlow Lite
1170  */
1171 arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
1172                                            const cmsis_nn_dw_conv_params *dw_conv_params,
1173                                            const cmsis_nn_per_channel_quant_params *quant_params,
1174                                            const cmsis_nn_dims *input_dims,
1175                                            const int16_t *input_data,
1176                                            const cmsis_nn_dims *filter_dims,
1177                                            const int8_t *filter_data,
1178                                            const cmsis_nn_dims *bias_dims,
1179                                            const int64_t *bias_data,
1180                                            const cmsis_nn_dims *output_dims,
1181                                            int16_t *output_data);
1182 
1183 /**
1184  * @brief Wrapper function to pick the right optimized s16 depthwise convolution function
1185  *
1186  * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
1187  *                                 definition file to see if an additional buffer is required.
1188  *                                 Optional function {API}_get_buffer_size() provides the buffer
1189  *                                 size if required.
1190  *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
1191  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
1192  *                                 dw_conv_params->dilation is not used.
 *                                 dw_conv_params->input_offset  : Not used
 *                                 dw_conv_params->output_offset : Not used
1195  * @param[in]      quant_params    Per-channel quantization info.
1196  *                                 It contains the multiplier and shift values to be applied to each
1197  *                                 output channel
1198  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1199  *                                 Batch argument N is not used and assumed to be 1.
1200  * @param[in]      input_data      Input (activation) data pointer. Data type: int16
1201  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1202  * @param[in]      filter_data     Filter data pointer. Data type: int8
1203  * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
1204  * @param[in]      bias_data       Bias data pointer. Data type: int64
1205  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1206  * @param[in, out] output_data     Output data pointer. Data type: int16
1207  * @return     The function returns
1208  *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
1209  *
1210  * @details
1211  *    - Supported framework: TensorFlow Lite
 *    - Picks one of the following functions
1213  *        -# arm_depthwise_conv_s16()
1214  *        -# arm_depthwise_conv_fast_s16()  - Cortex-M CPUs with DSP extension only
1215  */
1216 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx,
1217                                                    const cmsis_nn_dw_conv_params *dw_conv_params,
1218                                                    const cmsis_nn_per_channel_quant_params *quant_params,
1219                                                    const cmsis_nn_dims *input_dims,
1220                                                    const int16_t *input_data,
1221                                                    const cmsis_nn_dims *filter_dims,
1222                                                    const int8_t *filter_data,
1223                                                    const cmsis_nn_dims *bias_dims,
1224                                                    const int64_t *bias_data,
1225                                                    const cmsis_nn_dims *output_dims,
1226                                                    int16_t *output_data);
1227 
1228 /**
1229  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16()
1230  *
1231  * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->input_offset  : Not used
 *                                 dw_conv_params->output_offset : Not used
1234  * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
1235  *                                 Batch argument N is not used and assumed to be 1.
1236  * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
1237  * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
1238  * @return                         Size of additional memory required for optimizations in bytes.
1239  *
1240  */
1241 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
1242                                                        const cmsis_nn_dims *input_dims,
1243                                                        const cmsis_nn_dims *filter_dims,
1244                                                        const cmsis_nn_dims *output_dims);
1245 
1246 /**
1247  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for processors with DSP extension.
1248  *        Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
1249  *
1250  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1251  *             arm_depthwise_conv_wrapper_s16_get_buffer_size().
1252  *
1253  */
1254 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
1255                                                            const cmsis_nn_dims *input_dims,
1256                                                            const cmsis_nn_dims *filter_dims,
1257                                                            const cmsis_nn_dims *output_dims);
1258 
1259 /**
1260  * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for Arm(R) Helium Architecture
1261  * case. Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details.
1262  *
1263  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1264  *             arm_depthwise_conv_wrapper_s16_get_buffer_size().
1265  *
1266  */
1267 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
1268                                                            const cmsis_nn_dims *input_dims,
1269                                                            const cmsis_nn_dims *filter_dims,
1270                                                            const cmsis_nn_dims *output_dims);
1271 
1272 /**
1273  * @brief Optimized s16 depthwise convolution function with constraint that in_channel equals out_channel.
 *        Refer to arm_depthwise_conv_s16() for function argument details.
1275  *
1276  * @return     The function returns one of the following
 *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - ctx->buf == NULL and
1278  *                                                      arm_depthwise_conv_fast_s16_get_buffer_size() > 0 or
1279  *                                                      input channel != output channel or
1280  *                                                      ch_mult != 1
1281  *
1282  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1283  *
1284  * @details
1285  *    - Supported framework: TensorFlow Lite
 *    - The following constraints on the arguments apply
 *        -# Number of input channels equals number of output channels or ch_mult equals 1
 *    - Recommended when number of channels is 4 or greater.
1289  *
1290  */
1291 arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
1292                                                 const cmsis_nn_dw_conv_params *dw_conv_params,
1293                                                 const cmsis_nn_per_channel_quant_params *quant_params,
1294                                                 const cmsis_nn_dims *input_dims,
1295                                                 const int16_t *input_data,
1296                                                 const cmsis_nn_dims *filter_dims,
1297                                                 const int8_t *filter_data,
1298                                                 const cmsis_nn_dims *bias_dims,
1299                                                 const int64_t *bias_data,
1300                                                 const cmsis_nn_dims *output_dims,
1301                                                 int16_t *output_data);
1302 
1303 /**
1304  * @brief Get the required buffer size for optimized s16 depthwise convolution
1305  * function with constraint that in_channel equals out_channel.
1306  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1307  *                               Batch argument N is not used.
1308  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
1309  * @return          The function returns required buffer size in bytes
1310  *
1311  */
1312 int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1313 
1314 /**
1315  * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
 *        the input arguments (documented below). Refer to arm_depthwise_conv_s8() for function
1317  *        argument details.
1318  *
1319  * @return     The function returns one of the following
1320  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - Unsupported dimension of tensors
1321  *                                                    - Unsupported pad size along the x axis
1322  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1323  *
1324  * @details
1325  *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# Number of input channels equals number of output channels
1328  *      -# Filter height and width equals 3
1329  *      -# Padding along x is either 0 or 1.
1330  *
1331  */
1332 arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
1333                                               const cmsis_nn_dw_conv_params *dw_conv_params,
1334                                               const cmsis_nn_per_channel_quant_params *quant_params,
1335                                               const cmsis_nn_dims *input_dims,
1336                                               const int8_t *input_data,
1337                                               const cmsis_nn_dims *filter_dims,
1338                                               const int8_t *filter_data,
1339                                               const cmsis_nn_dims *bias_dims,
1340                                               const int32_t *bias_data,
1341                                               const cmsis_nn_dims *output_dims,
1342                                               int8_t *output_data);
1343 
1344 /**
1345  * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
 *        Refer to arm_depthwise_conv_s8() for function argument details.
1347  *
1348  * @return     The function returns one of the following
1349  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or
1350  *                                                      ch_mult != 1
1351  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1352  *
 * @note       If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
 *             for the following if MVE optimizations (Arm Helium Technology) are used.
1355  *               - Output shift
1356  *               - Output multiplier
1357  *               - Output bias
1358  *               - kernel
1359  * @details
1360  *    - Supported framework: TensorFlow Lite
 *    - The following constraints on the arguments apply
 *        -# Number of input channels equals number of output channels or ch_mult equals 1
 *    - Recommended when number of channels is 4 or greater.
1364  *
1365  */
1366 arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
1367                                               const cmsis_nn_dw_conv_params *dw_conv_params,
1368                                               const cmsis_nn_per_channel_quant_params *quant_params,
1369                                               const cmsis_nn_dims *input_dims,
1370                                               const int8_t *input_data,
1371                                               const cmsis_nn_dims *filter_dims,
1372                                               const int8_t *filter_data,
1373                                               const cmsis_nn_dims *bias_dims,
1374                                               const int32_t *bias_data,
1375                                               const cmsis_nn_dims *output_dims,
1376                                               int8_t *output_data);
1377 
1378 /**
1379  * @brief Optimized s4 depthwise convolution function with constraint that in_channel equals out_channel.
 *        Refer to arm_depthwise_conv_s4() for function argument details.
1381  *
1382  * @return     The function returns one of the following
1383  *                <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or
1384  *                                                      ch_mult != 1
1385  *                <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1386  *
 * @note       If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
 *             for the following if MVE optimizations (Arm Helium Technology) are used.
1389  *               - Output shift
1390  *               - Output multiplier
1391  *               - Output bias
1392  *               - kernel
1393  * @details
1394  *    - Supported framework: TensorFlow Lite
 *    - The following constraints on the arguments apply
 *        -# Number of input channels equals number of output channels or ch_mult equals 1
 *    - Recommended when number of channels is 4 or greater.
1398  *
1399  */
1400 arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
1401                                               const cmsis_nn_dw_conv_params *dw_conv_params,
1402                                               const cmsis_nn_per_channel_quant_params *quant_params,
1403                                               const cmsis_nn_dims *input_dims,
1404                                               const int8_t *input_data,
1405                                               const cmsis_nn_dims *filter_dims,
1406                                               const int8_t *filter_data,
1407                                               const cmsis_nn_dims *bias_dims,
1408                                               const int32_t *bias_data,
1409                                               const cmsis_nn_dims *output_dims,
1410                                               int8_t *output_data);
1411 
1412 /**
1413  * @brief Get the required buffer size for optimized s8 depthwise convolution
1414  * function with constraint that in_channel equals out_channel.
1415  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1416  *                               Batch argument N is not used.
1417  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
1418  * @return          The function returns required buffer size in bytes
1419  *
1420  */
1421 int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1422 
1423 /**
1424  * @brief Get the required buffer size for optimized s4 depthwise convolution
1425  * function with constraint that in_channel equals out_channel.
1426  * @param[in]       input_dims   Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
1427  *                               Batch argument N is not used.
1428  * @param[in]       filter_dims  Filter tensor dimensions. Format: [1, H, W, C_OUT]
1429  * @return          The function returns required buffer size in bytes
1430  *
1431  */
1432 int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
1433 
1434 /**
1435  * @defgroup FC Fully-connected Layer Functions
1436  *
1437  * Collection of fully-connected and matrix multiplication functions.
1438  *
1439  * Fully-connected layer is basically a matrix-vector multiplication
1440  * with bias. The matrix is the weights and the input/output vectors
1441  * are the activation values. Supported {weight, activation} precisions
 * include {8-bit, 8-bit} and {8-bit, 16-bit}.
1443  *
1444  *
1445  */
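
/*
 * Reference sketch of the fully-connected arithmetic described above for the
 * s8 case (scalar and unoptimized; requantize() stands in for the
 * multiplier/shift rounding performed by the kernels, and all names are
 * illustrative):
 *
 *   for (int32_t out = 0; out < c_out; out++) {
 *       int32_t acc = bias ? bias[out] : 0;
 *       for (int32_t i = 0; i < accum_depth; i++) {  // accum_depth = H*W*C_IN
 *           acc += weights[out * accum_depth + i] * (input[i] + input_offset);
 *       }
 *       acc = requantize(acc, out_mult, out_shift) + output_offset;
 *       if (acc < -128) acc = -128;  // clamp to the int8/activation range
 *       if (acc > 127)  acc = 127;
 *       output[out] = (int8_t)acc;
 *   }
 */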
1446 
1447 /**
1448  * @brief Basic s4 Fully Connected function.
1449  *
1450  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1451  *                               definition file to see if an additional buffer is required.
1452  *                               Optional function {API}_get_buffer_size() provides the buffer
1453  *                               size if an additional buffer is required.
 *                               The caller is expected to clear the buffer, if applicable, for security reasons.
1455  * @param[in]      fc_params     Fully Connected layer parameters.
1456  *                               Range of fc_params->input_offset  : [-127, 128]
1457  *                               fc_params->filter_offset : 0
1458  *                               Range of fc_params->output_offset : [-128, 127]
1459  * @param[in]      quant_params  Per-tensor quantization info.
1460  *                               It contains the multiplier and shift values to be applied to the output tensor.
1461  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1462  *                               Input dimension is taken as Nx(H * W * C_IN)
1463  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
1464  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1465  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1466  *                               C : output depth and equals C_OUT in output_dims
1467  *                               H & W : Not used
 * @param[in]      filter_data   Filter data pointer. Data type: int8_t packed 4-bit weights, e.g. four sequential
 *                               weights [0x1, 0x2, 0x3, 0x4] packed as [0x21, 0x43].
1470  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1471  *                               N, H, W : Not used
1472  * @param[in]      bias_data     Bias data pointer. Data type: int32
1473  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1474  *                               N : Batches
1475  *                               C_OUT : Output depth
1476  *                               H & W : Not used.
1477  * @param[in, out] output_data    Output data pointer. Data type: int8
1478  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1479  *
1480  * @details
1481  *    - Supported framework: TensorFlow Lite
1482  */
1483 arm_cmsis_nn_status arm_fully_connected_s4(const cmsis_nn_context *ctx,
1484                                            const cmsis_nn_fc_params *fc_params,
1485                                            const cmsis_nn_per_tensor_quant_params *quant_params,
1486                                            const cmsis_nn_dims *input_dims,
1487                                            const int8_t *input_data,
1488                                            const cmsis_nn_dims *filter_dims,
1489                                            const int8_t *filter_data,
1490                                            const cmsis_nn_dims *bias_dims,
1491                                            const int32_t *bias_data,
1492                                            const cmsis_nn_dims *output_dims,
1493                                            int8_t *output_data);
1494 
1495 /**
1496  * @brief Basic s8 Fully Connected function.
1497  *
1498  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1499  *                               definition file to see if an additional buffer is required.
1500  *                               Optional function {API}_get_buffer_size() provides the buffer
1501  *                               size if an additional buffer is required.
1502  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
1503  * @param[in]      fc_params     Fully Connected layer parameters.
1504  *                               Range of fc_params->input_offset  : [-127, 128]
1505  *                               fc_params->filter_offset : 0
1506  *                               Range of fc_params->output_offset : [-128, 127]
1507  * @param[in]      quant_params  Per-tensor quantization info.
1508  *                               It contains the multiplier and shift values to be applied to the output tensor.
1509  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1510  *                               Input dimension is taken as Nx(H * W * C_IN)
1511  * @param[in]      input_data    Input (activation) data pointer. Data type: int8
1512  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1513  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1514  *                               C : output depth and equals C_OUT in output_dims
1515  *                               H & W : Not used
1516  * @param[in]      filter_data   Filter data pointer. Data type: int8
1517  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1518  *                               N, H, W : Not used
1519  * @param[in]      bias_data     Bias data pointer. Data type: int32
1520  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1521  *                               N : Batches
1522  *                               C_OUT : Output depth
1523  *                               H & W : Not used.
1524  * @param[in, out] output_data    Output data pointer. Data type: int8
1525  *
1526  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
1528  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
1529  *
1530  * @details
1531  *    - Supported framework: TensorFlow Lite
1532  */
1533 arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
1534                                            const cmsis_nn_fc_params *fc_params,
1535                                            const cmsis_nn_per_tensor_quant_params *quant_params,
1536                                            const cmsis_nn_dims *input_dims,
1537                                            const int8_t *input_data,
1538                                            const cmsis_nn_dims *filter_dims,
1539                                            const int8_t *filter_data,
1540                                            const cmsis_nn_dims *bias_dims,
1541                                            const int32_t *bias_data,
1542                                            const cmsis_nn_dims *output_dims,
1543                                            int8_t *output_data);
1544 
1545 /**
1546  * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s32 bias_data.
1547  * @param[in, out]      vector_sum_buf              Buffer for vector sums
1548  * @param[in]           vector_cols                 Number of vector columns
1549  * @param[in]           vector_rows                 Number of vector rows
 * @param[in]           vector_data                 Vector of weights data
1551  * @param[in]           lhs_offset                  Constant multiplied with each sum
1552  * @param[in]           bias_data                   Vector of bias data, added to each sum.
1553  * @return              The function returns
1554  *                         <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1555  */
1556 arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf,
1557                                       const int32_t vector_cols,
1558                                       const int32_t vector_rows,
1559                                       const int8_t *vector_data,
1560                                       const int32_t lhs_offset,
1561                                       const int32_t *bias_data);
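
/*
 * Scalar reference of the computation described above (illustrative only;
 * the optimized kernels compute the same result with vectorized code):
 *
 *   for (int32_t r = 0; r < vector_rows; r++) {
 *       int32_t row_sum = 0;
 *       for (int32_t c = 0; c < vector_cols; c++) {
 *           row_sum += vector_data[r * vector_cols + c];
 *       }
 *       vector_sum_buf[r] = lhs_offset * row_sum +
 *                           (bias_data ? bias_data[r] : 0);
 *   }
 */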
1562 
1563 /**
1564  * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s64 bias_data.
1565  * @param[in, out]      vector_sum_buf              Buffer for vector sums
1566  * @param[in]           vector_cols                 Number of vector columns
1567  * @param[in]           vector_rows                 Number of vector rows
 * @param[in]           vector_data                 Vector of weights data
1569  * @param[in]           lhs_offset                  Constant multiplied with each sum
1570  * @param[in]           bias_data                   Vector of bias data, added to each sum.
1571  * @return              The function returns
1572  *                         <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1573  */
1574 arm_cmsis_nn_status arm_vector_sum_s8_s64(int64_t *vector_sum_buf,
1575                                           const int32_t vector_cols,
1576                                           const int32_t vector_rows,
1577                                           const int8_t *vector_data,
1578                                           const int32_t lhs_offset,
1579                                           const int64_t *bias_data);
1580 
1581 /**
1582  * @brief Get size of additional buffer required by arm_fully_connected_s8().
1583  *        See also arm_vector_sum_s8, which is required if buffer size is > 0.
1584  * @param[in]      filter_dims             dimension of filter
1585  * @return         The function returns    required buffer size in bytes
1586  *
1587  */
1588 int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
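
/*
 * A sketch tying the cross-reference above together (illustrative only: that
 * the scratch buffer holds per-row kernel sums produced by arm_vector_sum_s8()
 * with lhs_offset = fc_params.input_offset is an assumption based on the note
 * above, and `scratch` is a caller-provided buffer of at least `size` bytes):
 *
 *   const int32_t size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
 *   cmsis_nn_context ctx = {.buf = NULL, .size = 0};
 *   if (size > 0) {
 *       ctx.buf = scratch;
 *       ctx.size = size;
 *       // filter_dims format is [N, C]: N = accumulation depth, C = C_OUT
 *       arm_vector_sum_s8((int32_t *)ctx.buf, filter_dims.n, filter_dims.c,
 *                         filter_data, fc_params.input_offset, NULL);
 *   }
 *   arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims,
 *                          input_data, &filter_dims, filter_data, &bias_dims,
 *                          bias_data, &output_dims, output_data);
 */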
1589 
1590 /**
1591  * @brief Get size of additional buffer required by arm_fully_connected_s8() for processors with DSP extension.
1592  *        Refer to arm_fully_connected_s8_get_buffer_size() for function argument details.
1593  *
1594  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1595  *             arm_fully_connected_s8_get_buffer_size().
1596  *
1597  */
1598 int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
1599 
1600 /**
1601  * @brief Get size of additional buffer required by arm_fully_connected_s8() for Arm(R) Helium Architecture case.
1602  *        Refer to arm_fully_connected_s8_get_buffer_size() for function argument details.
1603  *
1604  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1605  *             arm_fully_connected_s8_get_buffer_size().
1606  *
1607  */
1608 int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
1609 
1610 /**
1611  * @brief Basic s16 Fully Connected function.
1612  *
1613  * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
1614  *                               definition file to see if an additional buffer is required.
1615  *                               Optional function {API}_get_buffer_size() provides the buffer
1616  *                               size if an additional buffer is required.
1617  *                               The caller is expected to clear the buffer, if applicable, for security reasons.
1618  * @param[in]      fc_params     Fully Connected layer parameters.
1619  *                               fc_params->input_offset  : 0
1620  *                               fc_params->filter_offset : 0
1621  *                               fc_params->output_offset : 0
1622  * @param[in]      quant_params  Per-tensor quantization info.
1623  *                               It contains the multiplier and shift values to be applied to the output tensor.
1624  * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
1625  *                               Input dimension is taken as Nx(H * W * C_IN)
1626  * @param[in]      input_data    Input (activation) data pointer. Data type: int16
1627  * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
1628  *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
1629  *                               C : output depth and equals C_OUT in output_dims
1630  *                               H & W : Not used
1631  * @param[in]      filter_data   Filter data pointer. Data type: int8
1632  * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
1633  *                               N, H, W : Not used
1634  * @param[in]      bias_data     Bias data pointer. Data type: int64
1635  * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
1636  *                               N : Batches
1637  *                               C_OUT : Output depth
1638  *                               H & W : Not used.
1639  * @param[in, out] output_data    Output data pointer. Data type: int16
1640  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1641  *
1642  * @details
1643  *    - Supported framework: TensorFlow Lite
1644  */
1645 arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx,
1646                                             const cmsis_nn_fc_params *fc_params,
1647                                             const cmsis_nn_per_tensor_quant_params *quant_params,
1648                                             const cmsis_nn_dims *input_dims,
1649                                             const int16_t *input_data,
1650                                             const cmsis_nn_dims *filter_dims,
1651                                             const int8_t *filter_data,
1652                                             const cmsis_nn_dims *bias_dims,
1653                                             const int64_t *bias_data,
1654                                             const cmsis_nn_dims *output_dims,
1655                                             int16_t *output_data);
1656 
1657 /**
1658  * @brief Get size of additional buffer required by arm_fully_connected_s16().
1659  * @param[in]      filter_dims             dimension of filter
1660  * @return         The function returns    required buffer size in bytes
1661  *
1662  */
1663 int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims);
1664 
1665 /**
1666  * @brief Get size of additional buffer required by arm_fully_connected_s16() for processors with DSP extension.
1667  *        Refer to arm_fully_connected_s16_get_buffer_size() for function argument details.
1668  *
1669  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1670  *             arm_fully_connected_s16_get_buffer_size().
1671  *
1672  */
1673 int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
1674 
1675 /**
1676  * @brief Get size of additional buffer required by arm_fully_connected_s16() for Arm(R) Helium Architecture case.
1677  *        Refer to arm_fully_connected_s16_get_buffer_size() for function argument details.
1678  *
1679  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1680  *             arm_fully_connected_s16_get_buffer_size().
1681  *
1682  */
1683 int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
1684 
1685 /**
1686  * @defgroup groupElementwise Elementwise Functions
1687  *
1688  * Elementwise add and multiplication functions.
1689  *
1690  */
1691 
1692 /**
1693  * @brief s8 elementwise add of two vectors
1694  * @param[in]       input_1_vect        pointer to input vector 1
1695  * @param[in]       input_2_vect        pointer to input vector 2
1696  * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
1697  * @param[in]       input_1_mult        multiplier for input 1
1698  * @param[in]       input_1_shift       shift for input 1
1699  * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
1700  * @param[in]       input_2_mult        multiplier for input 2
1701  * @param[in]       input_2_shift       shift for input 2
1702  * @param[in]       left_shift          input left shift
1703  * @param[in,out]   output              pointer to output vector
1704  * @param[in]       out_offset          output offset.  Range: -128 to 127
1705  * @param[in]       out_mult            output multiplier
1706  * @param[in]       out_shift           output shift
1707  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -128
1708  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 127
1709  * @param[in]       block_size          number of samples
1710  * @return          The function returns    ARM_CMSIS_NN_SUCCESS
1711  */
1712 arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect,
1713                                            const int8_t *input_2_vect,
1714                                            const int32_t input_1_offset,
1715                                            const int32_t input_1_mult,
1716                                            const int32_t input_1_shift,
1717                                            const int32_t input_2_offset,
1718                                            const int32_t input_2_mult,
1719                                            const int32_t input_2_shift,
1720                                            const int32_t left_shift,
1721                                            int8_t *output,
1722                                            const int32_t out_offset,
1723                                            const int32_t out_mult,
1724                                            const int32_t out_shift,
1725                                            const int32_t out_activation_min,
1726                                            const int32_t out_activation_max,
1727                                            const int32_t block_size);
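
/*
 * Usage sketch (illustrative; the offsets follow the ranges documented above,
 * and every multiplier/shift value is a hypothetical placeholder that a
 * framework would normally derive from the tensor scales):
 *
 *   arm_cmsis_nn_status status =
 *       arm_elementwise_add_s8(input_1, input_2,
 *                              128, in1_mult, in1_shift,   // input 1 offset/requant
 *                              128, in2_mult, in2_shift,   // input 2 offset/requant
 *                              20,                         // shared input left shift
 *                              output,
 *                              -128, out_mult, out_shift,  // output offset/requant
 *                              -128, 127,                  // activation clamp
 *                              block_size);
 */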
1728 
1729 /**
1730  * @brief s16 elementwise add of two vectors
1731  * @param[in]       input_1_vect        pointer to input vector 1
1732  * @param[in]       input_2_vect        pointer to input vector 2
1733  * @param[in]       input_1_offset      offset for input 1. Not used.
1734  * @param[in]       input_1_mult        multiplier for input 1
1735  * @param[in]       input_1_shift       shift for input 1
1736  * @param[in]       input_2_offset      offset for input 2. Not used.
1737  * @param[in]       input_2_mult        multiplier for input 2
1738  * @param[in]       input_2_shift       shift for input 2
1739  * @param[in]       left_shift          input left shift
1740  * @param[in,out]   output              pointer to output vector
1741  * @param[in]       out_offset          output offset. Not used.
1742  * @param[in]       out_mult            output multiplier
1743  * @param[in]       out_shift           output shift
1744  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -32768
1745  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 32767
1746  * @param[in]       block_size          number of samples
 * @return          The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1748  */
1749 arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect,
1750                                             const int16_t *input_2_vect,
1751                                             const int32_t input_1_offset,
1752                                             const int32_t input_1_mult,
1753                                             const int32_t input_1_shift,
1754                                             const int32_t input_2_offset,
1755                                             const int32_t input_2_mult,
1756                                             const int32_t input_2_shift,
1757                                             const int32_t left_shift,
1758                                             int16_t *output,
1759                                             const int32_t out_offset,
1760                                             const int32_t out_mult,
1761                                             const int32_t out_shift,
1762                                             const int32_t out_activation_min,
1763                                             const int32_t out_activation_max,
1764                                             const int32_t block_size);
1765 
1766 /**
1767  * @brief s8 elementwise multiplication
1768  * @param[in]       input_1_vect        pointer to input vector 1
1769  * @param[in]       input_2_vect        pointer to input vector 2
1770  * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
1771  * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
1772  * @param[in,out]   output              pointer to output vector
1773  * @param[in]       out_offset          output offset. Range: -128 to 127
1774  * @param[in]       out_mult            output multiplier
1775  * @param[in]       out_shift           output shift
1776  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -128
1777  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 127
1778  * @param[in]       block_size          number of samples
 * @return          The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1780  *
1781  * @details   Supported framework: TensorFlow Lite micro
1782  */
1783 arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
1784                                            const int8_t *input_2_vect,
1785                                            const int32_t input_1_offset,
1786                                            const int32_t input_2_offset,
1787                                            int8_t *output,
1788                                            const int32_t out_offset,
1789                                            const int32_t out_mult,
1790                                            const int32_t out_shift,
1791                                            const int32_t out_activation_min,
1792                                            const int32_t out_activation_max,
1793                                            const int32_t block_size);
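
/*
 * For reference, the per-element computation performed by
 * arm_elementwise_mul_s8() follows the usual TensorFlow Lite Micro integer
 * scheme (a sketch, not the actual implementation):
 *
 *     int32_t acc = (in1 + input_1_offset) * (in2 + input_2_offset);
 *     int32_t val = requantize(acc, out_mult, out_shift) + out_offset;
 *     out = (int8_t)CLAMP(val, out_activation_min, out_activation_max);
 *
 * where requantize() denotes the saturating rounding doubling high
 * multiply followed by a rounding right shift, and CLAMP() is a
 * hypothetical min/max helper.
 */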
1794 
1795 /**
1796  * @brief s16 elementwise multiplication
1797  * @param[in]       input_1_vect        pointer to input vector 1
1798  * @param[in]       input_2_vect        pointer to input vector 2
1799  * @param[in]       input_1_offset      offset for input 1. Not used.
1800  * @param[in]       input_2_offset      offset for input 2. Not used.
1801  * @param[in,out]   output              pointer to output vector
1802  * @param[in]       out_offset          output offset. Not used.
1803  * @param[in]       out_mult            output multiplier
1804  * @param[in]       out_shift           output shift
1805  * @param[in]       out_activation_min  minimum value to clamp output to. Min: -32768
1806  * @param[in]       out_activation_max  maximum value to clamp output to. Max: 32767
1807  * @param[in]       block_size          number of samples
 * @return          The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
1809  *
1810  * @details   Supported framework: TensorFlow Lite micro
1811  */
1812 arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
1813                                             const int16_t *input_2_vect,
1814                                             const int32_t input_1_offset,
1815                                             const int32_t input_2_offset,
1816                                             int16_t *output,
1817                                             const int32_t out_offset,
1818                                             const int32_t out_mult,
1819                                             const int32_t out_shift,
1820                                             const int32_t out_activation_min,
1821                                             const int32_t out_activation_max,
1822                                             const int32_t block_size);
1823 
1824 /**
1825  * @defgroup Acti Activation Functions
1826  *
1827  * Perform activation layers, including ReLU (Rectified Linear Unit),
1828  * sigmoid and tanh
1829  *
1830  */
1831 
1832 /**
 * @brief Q7 ReLU function
1834  * @param[in,out]   data        pointer to input
1835  * @param[in]       size        number of elements
1836  */
1837 void arm_relu_q7(int8_t *data, uint16_t size);
1838 
1839 /**
1840  * @brief s8 ReLU6 function
1841  * @param[in,out]   data        pointer to input
1842  * @param[in]       size        number of elements
1843  */
1844 void arm_relu6_s8(int8_t *data, uint16_t size);
1845 
1846 /**
 * @brief Q15 ReLU function
1848  * @param[in,out]   data        pointer to input
1849  * @param[in]       size        number of elements
1850  */
1851 void arm_relu_q15(int16_t *data, uint16_t size);
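
/*
 * The ReLU functions above modify the data in place. A minimal sketch:
 *
 *     int8_t activations[16];          // filled by a preceding layer
 *     arm_relu_q7(activations, 16);    // negative values are now zero
 *
 * arm_relu6_s8() follows the same in-place pattern but clamps the data
 * to [0, 6] instead of [0, 127].
 */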
1852 
1853 /**
1854  * @brief s16 neural network activation function using direct table look-up
1855  * @param[in]       input       pointer to input data
1856  * @param[out]      output      pointer to output
1857  * @param[in]       size        number of elements
1858  * @param[in]       left_shift  bit-width of the integer part, assumed to be smaller than 3.
1859  * @param[in]       type        type of activation functions
 * @return                      The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
1863  * @details Supported framework: TensorFlow Lite for Microcontrollers.
 * This activation function is bit-exact with the corresponding TFLM tanh and sigmoid activation
 * functions.
1866  */
1867 arm_cmsis_nn_status arm_nn_activation_s16(const int16_t *input,
1868                                           int16_t *output,
1869                                           const int32_t size,
1870                                           const int32_t left_shift,
1871                                           const arm_nn_activation_type type);
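
/*
 * Usage sketch for arm_nn_activation_s16(), assuming the
 * arm_nn_activation_type enumerators ARM_SIGMOID and ARM_TANH declared in
 * arm_nn_types.h; the left_shift value is illustrative.
 *
 *     int16_t in[8];   // filled elsewhere
 *     int16_t out[8];
 *
 *     arm_cmsis_nn_status status = arm_nn_activation_s16(in, out, 8, 2, ARM_TANH);
 */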
1872 
1873 /**
1874  * @defgroup Pooling Pooling Functions
1875  *
1876  * Perform max and average pooling operations
1877  *
1878  */
1879 
1880 /**
1881  * @brief s8 average pooling function.
1882  *
1883  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
1884  *                              definition file to see if an additional buffer is required.
1885  *                              Optional function {API}_get_buffer_size() provides the buffer
1886  *                              size if an additional buffer is required.
1887  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
1888  * @param[in]      pool_params  Pooling parameters
1889  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
1890  * @param[in]      input_data   Input (activation) data pointer. Data type: int8
1891  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
 *                              Arguments N and C are not used.
1893  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
1894  *                              Argument N is not used.
1895  *                              C_OUT equals C_IN.
 * @param[in, out] output_data  Output data pointer. Data type: int8
1897  *
1898  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
1900  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
1901  *
1902  * @details
1903  *    - Supported Framework: TensorFlow Lite
1904  *
1905  */
1906 arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx,
1907                                    const cmsis_nn_pool_params *pool_params,
1908                                    const cmsis_nn_dims *input_dims,
1909                                    const int8_t *input_data,
1910                                    const cmsis_nn_dims *filter_dims,
1911                                    const cmsis_nn_dims *output_dims,
1912                                    int8_t *output_data);
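
/*
 * Usage sketch for arm_avgpool_s8(): 2x2 pooling with stride 2 over an
 * 8x8x4 feature map. Struct field names are those declared in
 * arm_nn_types.h; the scratch buffer is only needed on some targets, so
 * its size is queried first (the 512-byte upper bound is hypothetical).
 *
 *     int8_t input_data[8 * 8 * 4];   // filled by a preceding layer
 *     int8_t output_data[4 * 4 * 4];
 *     static int8_t scratch[512];
 *
 *     const cmsis_nn_dims input_dims = {.n = 1, .h = 8, .w = 8, .c = 4};
 *     const cmsis_nn_dims filter_dims = {.h = 2, .w = 2};
 *     const cmsis_nn_dims output_dims = {.n = 1, .h = 4, .w = 4, .c = 4};
 *
 *     const cmsis_nn_pool_params pool_params = {.stride = {.w = 2, .h = 2},
 *                                               .padding = {.w = 0, .h = 0},
 *                                               .activation = {.min = -128, .max = 127}};
 *
 *     cmsis_nn_context ctx;
 *     ctx.size = arm_avgpool_s8_get_buffer_size(output_dims.w, input_dims.c);
 *     ctx.buf = (ctx.size > 0) ? scratch : NULL;
 *
 *     arm_cmsis_nn_status status = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data,
 *                                                 &filter_dims, &output_dims, output_data);
 */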
1913 
1914 /**
1915  * @brief Get the required buffer size for S8 average pooling function
 * @param[in]       dim_dst_width         output tensor width
1917  * @param[in]       ch_src                number of input tensor channels
1918  * @return          The function returns required buffer size in bytes
1919  *
1920  */
1921 int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
1922 
1923 /**
1924  * @brief Get the required buffer size for S8 average pooling function for processors with DSP extension.
1925  *        Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
1926  *
1927  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1928  *             arm_avgpool_s8_get_buffer_size().
1929  *
1930  */
1931 int32_t arm_avgpool_s8_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
1932 
1933 /**
1934  * @brief Get the required buffer size for S8 average pooling function for Arm(R) Helium Architecture case.
1935  *        Refer to arm_avgpool_s8_get_buffer_size() for function argument details.
1936  *
1937  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1938  *             arm_avgpool_s8_get_buffer_size().
1939  *
1940  */
1941 int32_t arm_avgpool_s8_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
1942 
1943 /**
1944  * @brief s16 average pooling function.
1945  *
1946  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
1947  *                              definition file to see if an additional buffer is required.
1948  *                              Optional function {API}_get_buffer_size() provides the buffer
1949  *                              size if an additional buffer is required.
1950  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
1951  * @param[in]      pool_params  Pooling parameters
1952  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
1953  * @param[in]      input_data   Input (activation) data pointer. Data type: int16
1954  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
 *                              Arguments N and C are not used.
1956  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
1957  *                              Argument N is not used.
1958  *                              C_OUT equals C_IN.
1959  * @param[in, out] output_data  Output data pointer. Data type: int16
1960  *
1961  * @return                        The function returns
1962  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
1963  *                                    <code>ARM_CMSIS_NN_ARG_ERROR</code> - In case of invalid arguments
1964  *
1965  * @details
1966  *    - Supported Framework: TensorFlow Lite
1967  *
1968  */
1969 arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx,
1970                                     const cmsis_nn_pool_params *pool_params,
1971                                     const cmsis_nn_dims *input_dims,
1972                                     const int16_t *input_data,
1973                                     const cmsis_nn_dims *filter_dims,
1974                                     const cmsis_nn_dims *output_dims,
1975                                     int16_t *output_data);
1976 
1977 /**
1978  * @brief Get the required buffer size for S16 average pooling function
 * @param[in]       dim_dst_width         output tensor width
1980  * @param[in]       ch_src                number of input tensor channels
1981  * @return          The function returns required buffer size in bytes
1982  *
1983  */
1984 int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src);
1985 
1986 /**
1987  * @brief Get the required buffer size for S16 average pooling function for processors with DSP extension.
1988  *        Refer to arm_avgpool_s16_get_buffer_size() for function argument details.
1989  *
1990  * @note       Intended for compilation on Host. If compiling for an Arm target, use
1991  *             arm_avgpool_s16_get_buffer_size().
1992  *
1993  */
1994 int32_t arm_avgpool_s16_get_buffer_size_dsp(const int dim_dst_width, const int ch_src);
1995 
1996 /**
1997  * @brief Get the required buffer size for S16 average pooling function for Arm(R) Helium Architecture case.
1998  *        Refer to arm_avgpool_s16_get_buffer_size() for function argument details.
1999  *
2000  * @note       Intended for compilation on Host. If compiling for an Arm target, use
2001  *             arm_avgpool_s16_get_buffer_size().
2002  *
2003  */
2004 int32_t arm_avgpool_s16_get_buffer_size_mve(const int dim_dst_width, const int ch_src);
2005 
2006 /**
2007  * @brief s8 max pooling function.
2008  *
2009  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
2010  *                              definition file to see if an additional buffer is required.
2011  *                              Optional function {API}_get_buffer_size() provides the buffer
2012  *                              size if an additional buffer is required.
2013  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
2014  * @param[in]      pool_params  Pooling parameters
2015  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
2016  * @param[in]      input_data   Input (activation) data pointer. The input tensor must not
2017  *                              overlap with the output tensor. Data type: int8
2018  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
 *                              Arguments N and C are not used.
2020  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
2021  *                              Argument N is not used.
2022  *                              C_OUT equals C_IN.
2023  * @param[in, out] output_data    Output data pointer. Data type: int8
2024  *
2025  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
2027  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
2028  *
2029  * @details
2030  *    - Supported Framework: TensorFlow Lite
2031  *
2032  */
2033 arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx,
2034                                     const cmsis_nn_pool_params *pool_params,
2035                                     const cmsis_nn_dims *input_dims,
2036                                     const int8_t *input_data,
2037                                     const cmsis_nn_dims *filter_dims,
2038                                     const cmsis_nn_dims *output_dims,
2039                                     int8_t *output_data);
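
/*
 * The call pattern mirrors arm_avgpool_s8() above (reusing the dims and
 * pool_params declarations from that sketch). No scratch buffer is assumed
 * here; check the function definition file for your target before relying
 * on this:
 *
 *     cmsis_nn_context ctx = {.buf = NULL, .size = 0};
 *     arm_cmsis_nn_status status = arm_max_pool_s8(&ctx, &pool_params, &input_dims, input_data,
 *                                                  &filter_dims, &output_dims, output_data);
 */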
2040 
2041 /**
2042  * @brief s16 max pooling function.
2043  *
2044  * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
2045  *                              definition file to see if an additional buffer is required.
2046  *                              Optional function {API}_get_buffer_size() provides the buffer
2047  *                              size if an additional buffer is required.
2048  *                              The caller is expected to clear the buffer, if applicable, for security reasons.
2049  * @param[in]      pool_params  Pooling parameters
2050  * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
2051  * @param[in]      src          Input (activation) data pointer. The input tensor must not
2052  *                              overlap with the output tensor. Data type: int16
2053  * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
 *                              Arguments N and C are not used.
2055  * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
2056  *                              Argument N is not used.
2057  *                              C_OUT equals C_IN.
2058  * @param[in, out] dst          Output data pointer. Data type: int16
2059  *
2060  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
2062  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
2063  *
2064  * @details
2065  *    - Supported Framework: TensorFlow Lite
2066  *
2067  */
2068 arm_cmsis_nn_status arm_max_pool_s16(const cmsis_nn_context *ctx,
2069                                      const cmsis_nn_pool_params *pool_params,
2070                                      const cmsis_nn_dims *input_dims,
2071                                      const int16_t *src,
2072                                      const cmsis_nn_dims *filter_dims,
2073                                      const cmsis_nn_dims *output_dims,
2074                                      int16_t *dst);
2075 
2076 /**
2077  * @defgroup Softmax Softmax Functions
2078  *
2079  *
2080  */
2081 
2082 /**
2083  * @brief S8 softmax function
2084  * @param[in]  input     Pointer to the input tensor
2085  * @param[in]  num_rows  Number of rows in the input tensor
2086  * @param[in]  row_size  Number of elements in each input row
2087  * @param[in]  mult      Input quantization multiplier
2088  * @param[in]  shift     Input quantization shift within the range [0, 31]
2089  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
2090  *                       the quantized exponential operation can be performed
2091  * @param[out] output    Pointer to the output tensor
2092  *
2093  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2094  *
2095  */
2096 void arm_softmax_s8(const int8_t *input,
2097                     const int32_t num_rows,
2098                     const int32_t row_size,
2099                     const int32_t mult,
2100                     const int32_t shift,
2101                     const int32_t diff_min,
2102                     int8_t *output);
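
/*
 * Usage sketch for arm_softmax_s8() on two rows of ten logits each. The
 * values of mult, shift and diff_min are placeholders; they are normally
 * computed from the input scale by the calling framework (e.g. TensorFlow
 * Lite Micro's softmax parameter calculation).
 *
 *     int8_t logits[2 * 10];   // input, filled elsewhere
 *     int8_t probs[2 * 10];
 *
 *     arm_softmax_s8(logits, 2, 10,
 *                    1717986918,  // mult (placeholder)
 *                    22,          // shift (placeholder), range [0, 31]
 *                    -248,        // diff_min (placeholder)
 *                    probs);
 */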
2103 
2104 /**
2105  * @brief S8 to s16 softmax function
2106  * @param[in]  input     Pointer to the input tensor
2107  * @param[in]  num_rows  Number of rows in the input tensor
2108  * @param[in]  row_size  Number of elements in each input row
2109  * @param[in]  mult      Input quantization multiplier
2110  * @param[in]  shift     Input quantization shift within the range [0, 31]
2111  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
2112  *                       the quantized exponential operation can be performed
2113  * @param[out] output    Pointer to the output tensor
2114  *
2115  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2116  *
2117  */
2118 void arm_softmax_s8_s16(const int8_t *input,
2119                         const int32_t num_rows,
2120                         const int32_t row_size,
2121                         const int32_t mult,
2122                         const int32_t shift,
2123                         const int32_t diff_min,
2124                         int16_t *output);
2125 
2126 /**
2127  * @brief S16 softmax function
2128  * @param[in]  input           Pointer to the input tensor
2129  * @param[in]  num_rows        Number of rows in the input tensor
2130  * @param[in]  row_size        Number of elements in each input row
2131  * @param[in]  mult            Input quantization multiplier
2132  * @param[in]  shift           Input quantization shift within the range [0, 31]
 * @param[in]  softmax_params  Softmax s16 layer parameters with two pointers to LUTs specified below.
 *                             For indexing, the 9 high bits are used and the remaining 7 bits for interpolation.
 *                             That means 512 entries for the 9-bit indexing plus 1 extra for interpolation, i.e. 513
 *                             values for each LUT.
 *                             - Lookup table for exp(x), where x is uniformly distributed in [-10.0, 0.0]
 *                             - Lookup table for 1 / (1 + x), where x is uniformly distributed in [0.0, 1.0]
2139  * @param[out] output          Pointer to the output tensor
2140  * @return                        The function returns
 *                                    <code>ARM_CMSIS_NN_ARG_ERROR</code> - Argument error check failed
2142  *                                    <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
2143  *
2144  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2145  *
2146  */
2147 arm_cmsis_nn_status arm_softmax_s16(const int16_t *input,
2148                                     const int32_t num_rows,
2149                                     const int32_t row_size,
2150                                     const int32_t mult,
2151                                     const int32_t shift,
2152                                     const cmsis_nn_softmax_lut_s16 *softmax_params,
2153                                     int16_t *output);
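
/*
 * Usage sketch for arm_softmax_s16(), assuming the field names exp_lut and
 * one_by_one_lut of cmsis_nn_softmax_lut_s16 from arm_nn_types.h. Both
 * tables hold 513 int16 entries generated offline as described above; the
 * mult and shift values are placeholders.
 *
 *     extern const int16_t exp_lut[513];         // exp(x), x in [-10.0, 0.0]
 *     extern const int16_t one_by_one_lut[513];  // 1 / (1 + x), x in [0.0, 1.0]
 *
 *     const cmsis_nn_softmax_lut_s16 softmax_params = {.exp_lut = exp_lut,
 *                                                      .one_by_one_lut = one_by_one_lut};
 *     int16_t in[2 * 10];   // filled elsewhere
 *     int16_t out[2 * 10];
 *
 *     arm_cmsis_nn_status status = arm_softmax_s16(in, 2, 10, 1717986918, 22, &softmax_params, out);
 */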
2154 
2155 /**
2156  * @brief U8 softmax function
2157  * @param[in]  input     Pointer to the input tensor
2158  * @param[in]  num_rows  Number of rows in the input tensor
2159  * @param[in]  row_size  Number of elements in each input row
2160  * @param[in]  mult      Input quantization multiplier
2161  * @param[in]  shift     Input quantization shift within the range [0, 31]
2162  * @param[in]  diff_min  Minimum difference with max in row. Used to check if
2163  *                       the quantized exponential operation can be performed
2164  * @param[out] output    Pointer to the output tensor
2165  *
2166  * @note Supported framework: TensorFlow Lite micro (bit-accurate)
2167  *
2168  */
2170 void arm_softmax_u8(const uint8_t *input,
2171                     const int32_t num_rows,
2172                     const int32_t row_size,
2173                     const int32_t mult,
2174                     const int32_t shift,
2175                     const int32_t diff_min,
2176                     uint8_t *output);
2177 
2178 /**
2179  * @defgroup Reshape Reshape Functions
2180  *
2181  */
2182 
2183 /**
2184  * @brief Reshape a s8 vector into another with different shape
2185  * @param[in]  input      points to the s8 input vector
2186  * @param[out] output     points to the s8 output vector
2187  * @param[in]  total_size total size of the input and output vectors in bytes
2188  *
2189  * @note The output is expected to be in a memory area that does not overlap with the input's
2190  *
2191  */
2192 void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
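
/*
 * Because tensors are stored contiguously, a reshape is a plain byte copy;
 * only the interpretation of the dimensions changes. A minimal sketch:
 *
 *     int8_t in[2 * 3 * 4];   // viewed as a [2, 3, 4] tensor
 *     int8_t out[6 * 4];      // the same 24 values, viewed as [6, 4]
 *
 *     arm_reshape_s8(in, out, sizeof(in));
 */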
2193 
2194 /**
2195  * @defgroup Concatenation Concatenation Functions
2196  *
2197  */
2198 
2199 /**
2200  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
 *        This function should be called once per input tensor. The argument offset_x
 *        selects the position in the output tensor at which the input tensor is stored,
 *        as shown in the compilable example after the declaration below
 *
 *        e.g.    uint32_t offset_x = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x);
 *                    offset_x += input_x[i];
 *                }
2210  *
2211  *        This function assumes that the output tensor has:
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
2215  *
2216  *        Unless specified otherwise, arguments are mandatory.
2217  *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors,
 *      because it does not involve any arithmetic operation
2220  *
2221  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
2222  * @param[in]  input_x  Width of input tensor
2223  * @param[in]  input_y  Height of input tensor
2224  * @param[in]  input_z  Channels in input tensor
2225  * @param[in]  input_w  Batch size in input tensor
2226  * @param[out] output   Pointer to output tensor. Expected to be at least
2227  *                          (input_x * input_y * input_z * input_w) + offset_x
2228  *                      bytes.
2229  * @param[in]  output_x Width of output tensor
 * @param[in]  offset_x The offset (in number of elements) on the X axis at which the input tensor is written.
 *                      It is the user's responsibility to provide the correct value
2232  *
2233  * <b> Input constraints</b>
 * offset_x must be less than output_x
2235  *
2236  */
2237 void arm_concatenation_s8_x(const int8_t *input,
2238                             const uint16_t input_x,
2239                             const uint16_t input_y,
2240                             const uint16_t input_z,
2241                             const uint16_t input_w,
2242                             int8_t *output,
2243                             const uint16_t output_x,
2244                             const uint32_t offset_x);
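
/*
 * A compilable version of the loop sketched in the description above,
 * concatenating two hypothetical tensors of widths 3 and 5 (and identical
 * height 4, channels 2, batch size 1) along X into an output of width 8:
 *
 *     extern const int8_t tensor_a[3 * 4 * 2 * 1];  // hypothetical inputs
 *     extern const int8_t tensor_b[5 * 4 * 2 * 1];
 *
 *     const int8_t *inputs[2] = {tensor_a, tensor_b};
 *     const uint16_t widths[2] = {3, 5};
 *     int8_t output[8 * 4 * 2 * 1];
 *     uint32_t offset_x = 0;
 *
 *     for (int i = 0; i < 2; ++i)
 *     {
 *         arm_concatenation_s8_x(inputs[i], widths[i], 4, 2, 1, output, 8, offset_x);
 *         offset_x += widths[i];
 *     }
 */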
2245 
2246 /**
2247  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
 *        This function should be called once per input tensor. The argument offset_y
 *        selects the position in the output tensor at which the input tensor is stored
 *
 *        e.g.    uint32_t offset_y = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y);
 *                    offset_y += input_y[i];
 *                }
2257  *
2258  *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
2262  *
2263  *        Unless specified otherwise, arguments are mandatory.
2264  *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors,
 *       because it does not involve any arithmetic operation
2267  *
2268  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with the output tensor.
2269  * @param[in]  input_x  Width of input tensor
2270  * @param[in]  input_y  Height of input tensor
2271  * @param[in]  input_z  Channels in input tensor
2272  * @param[in]  input_w  Batch size in input tensor
2273  * @param[out] output   Pointer to output tensor. Expected to be at least
2274  *                          (input_z * input_w * input_x * input_y) + offset_y
2275  *                      bytes.
2276  * @param[in]  output_y Height of output tensor
 * @param[in]  offset_y The offset (in number of elements) on the Y axis at which the input tensor is written.
 *                      It is the user's responsibility to provide the correct value
2279  *
2280  * <b> Input constraints</b>
 * offset_y must be less than output_y
2282  *
2283  */
2284 void arm_concatenation_s8_y(const int8_t *input,
2285                             const uint16_t input_x,
2286                             const uint16_t input_y,
2287                             const uint16_t input_z,
2288                             const uint16_t input_w,
2289                             int8_t *output,
2290                             const uint16_t output_y,
2291                             const uint32_t offset_y);
2292 
2293 /**
2294  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
 *        This function should be called once per input tensor. The argument offset_z
 *        selects the position in the output tensor at which the input tensor is stored
 *
 *        e.g.    uint32_t offset_z = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z);
 *                    offset_z += input_z[i];
 *                }
2304  *
2305  *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same batch size as the input tensor
2309  *
2310  *        Unless specified otherwise, arguments are mandatory.
2311  *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors,
 *       because it does not involve any arithmetic operation
2314  *
2315  * @param[in]  input    Pointer to input tensor. Input tensor must not overlap with output tensor.
2316  * @param[in]  input_x  Width of input tensor
2317  * @param[in]  input_y  Height of input tensor
2318  * @param[in]  input_z  Channels in input tensor
2319  * @param[in]  input_w  Batch size in input tensor
2320  * @param[out] output   Pointer to output tensor. Expected to be at least
2321  *                          (input_x * input_y * input_z * input_w) + offset_z
2322  *                      bytes.
2323  * @param[in]  output_z Channels in output tensor
 * @param[in]  offset_z The offset (in number of elements) on the Z axis at which the input tensor is written.
 *                      It is the user's responsibility to provide the correct value
2326  *
2327  * <b> Input constraints</b>
 * offset_z must be less than output_z
2329  *
2330  */
2331 void arm_concatenation_s8_z(const int8_t *input,
2332                             const uint16_t input_x,
2333                             const uint16_t input_y,
2334                             const uint16_t input_z,
2335                             const uint16_t input_w,
2336                             int8_t *output,
2337                             const uint16_t output_z,
2338                             const uint32_t offset_z);
2339 
2340 /**
2341  * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
 *        This function should be called once per input tensor. The argument offset_w
 *        selects the position in the output tensor at which the input tensor is stored
 *
 *        e.g.    uint32_t offset_w = 0;
 *                for (i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w);
 *                    offset_w += input_w[i];
 *                }
2351  *
2352  *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
2356  *
2357  *        Unless specified otherwise, arguments are mandatory.
2358  *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors,
 *       because it does not involve any arithmetic operation
2361  *
2362  * @param[in]  input    Pointer to input tensor
2363  * @param[in]  input_x  Width of input tensor
2364  * @param[in]  input_y  Height of input tensor
2365  * @param[in]  input_z  Channels in input tensor
2366  * @param[in]  input_w  Batch size in input tensor
2367  * @param[out] output   Pointer to output tensor. Expected to be at least
2368  *                          input_x * input_y * input_z * input_w
2369  *                      bytes.
 * @param[in]  offset_w The offset (in number of elements) on the W axis at which the input tensor is written.
 *                      It is the user's responsibility to provide the correct value
2372  *
2373  */
2374 void arm_concatenation_s8_w(const int8_t *input,
2375                             const uint16_t input_x,
2376                             const uint16_t input_y,
2377                             const uint16_t input_z,
2378                             const uint16_t input_w,
2379                             int8_t *output,
2380                             const uint32_t offset_w);
2381 /**
2382  * @defgroup SVDF SVDF Functions
2383  *
2384  */
2385 
2386 /**
2387  * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
2388  *
2389  * @param[in, out] ctx                Function context (e.g. temporary buffer). Check the function
2390  *                                    definition file to see if an additional buffer is required.
 *                                    Optional function arm_svdf_s8_get_buffer_size() provides the buffer
2392  *                                    size if an additional buffer is required.
2393  *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
2394  * @param[in]   input_ctx             Temporary scratch buffer
2395  *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
2396  * @param[in]   output_ctx            Temporary output scratch buffer
2397  *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
2398  * @param[in]   svdf_params           SVDF Parameters
2399  *                                    Range of svdf_params->input_offset  : [-128, 127]
2400  *                                    Range of svdf_params->output_offset  : [-128, 127]
2401  * @param[in]   input_quant_params    Input quantization parameters
2402  * @param[in]   output_quant_params   Output quantization parameters
2403  * @param[in]   input_dims            Input tensor dimensions
2404  * @param[in]   input_data            Pointer to input tensor
2405  * @param[in]   state_dims            State tensor dimensions
2406  * @param[in]   state_data            Pointer to state tensor
2407  * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
2408  * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
2409  * @param[in]   weights_time_dims     Weights (time) tensor dimensions
2410  * @param[in]   weights_time_data     Pointer to the weights (time) tensor
2411  * @param[in]   bias_dims             Bias tensor dimensions
2412  * @param[in]   bias_data             Pointer to bias tensor
2413  * @param[in]   output_dims           Output tensor dimensions
2414  * @param[out]  output_data           Pointer to the output tensor
2415  *
2416  * @return     The function returns either
 *                  <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
2418  *                  <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
2419  *
2420  * @details
2421  *    1. Supported framework: TensorFlow Lite micro
2422  */
2423 arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx,
2424                                 const cmsis_nn_context *input_ctx,
2425                                 const cmsis_nn_context *output_ctx,
2426                                 const cmsis_nn_svdf_params *svdf_params,
2427                                 const cmsis_nn_per_tensor_quant_params *input_quant_params,
2428                                 const cmsis_nn_per_tensor_quant_params *output_quant_params,
2429                                 const cmsis_nn_dims *input_dims,
2430                                 const int8_t *input_data,
2431                                 const cmsis_nn_dims *state_dims,
2432                                 int8_t *state_data,
2433                                 const cmsis_nn_dims *weights_feature_dims,
2434                                 const int8_t *weights_feature_data,
2435                                 const cmsis_nn_dims *weights_time_dims,
2436                                 const int8_t *weights_time_data,
2437                                 const cmsis_nn_dims *bias_dims,
2438                                 const int32_t *bias_data,
2439                                 const cmsis_nn_dims *output_dims,
2440                                 int8_t *output_data);
2441 
2442 /**
2443  * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
2444  *
2445  * @param[in]   input_ctx             Temporary scratch buffer
2446  *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
2447  * @param[in]   output_ctx            Temporary output scratch buffer
2448  *                                    The caller is expected to clear the buffer, if applicable, for security reasons.
2449  * @param[in]   svdf_params           SVDF Parameters
2450  *                                    Range of svdf_params->input_offset  : [-128, 127]
2451  *                                    Range of svdf_params->output_offset  : [-128, 127]
2452  * @param[in]   input_quant_params    Input quantization parameters
2453  * @param[in]   output_quant_params   Output quantization parameters
2454  * @param[in]   input_dims            Input tensor dimensions
2455  * @param[in]   input_data            Pointer to input tensor
2456  * @param[in]   state_dims            State tensor dimensions
2457  * @param[in]   state_data            Pointer to state tensor
2458  * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
2459  * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
2460  * @param[in]   weights_time_dims     Weights (time) tensor dimensions
2461  * @param[in]   weights_time_data     Pointer to the weights (time) tensor
2462  * @param[in]   bias_dims             Bias tensor dimensions
2463  * @param[in]   bias_data             Pointer to bias tensor
2464  * @param[in]   output_dims           Output tensor dimensions
2465  * @param[out]  output_data           Pointer to the output tensor
2466  *
2467  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
2468  *
2469  * @details
2470  *    1. Supported framework: TensorFlow Lite micro
2471  */
2472 arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
2473                                           const cmsis_nn_context *output_ctx,
2474                                           const cmsis_nn_svdf_params *svdf_params,
2475                                           const cmsis_nn_per_tensor_quant_params *input_quant_params,
2476                                           const cmsis_nn_per_tensor_quant_params *output_quant_params,
2477                                           const cmsis_nn_dims *input_dims,
2478                                           const int8_t *input_data,
2479                                           const cmsis_nn_dims *state_dims,
2480                                           int16_t *state_data,
2481                                           const cmsis_nn_dims *weights_feature_dims,
2482                                           const int8_t *weights_feature_data,
2483                                           const cmsis_nn_dims *weights_time_dims,
2484                                           const int16_t *weights_time_data,
2485                                           const cmsis_nn_dims *bias_dims,
2486                                           const int32_t *bias_data,
2487                                           const cmsis_nn_dims *output_dims,
2488                                           int8_t *output_data);
2489 
2490 /**
2491  * @brief Get size of additional buffer required by arm_svdf_s8().
2492  * @param[in]      filter_dims             dimension of filter
2493  * @return         The function returns    required buffer size in bytes
2494  *
2495  */
2496 int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
2497 
2498 /**
2499  * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension.
2500  *        Refer to arm_svdf_s8_get_buffer_size() for function argument details.
2501  *
2502  * @note       Intended for compilation on Host. If compiling for an Arm target, use
2503  *             arm_svdf_s8_get_buffer_size().
2504  *
2505  */
2506 int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
2507 
2508 /**
2509  * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case.
2510  *        Refer to arm_svdf_s8_get_buffer_size() for function argument details.
2511  *
2512  * @note       Intended for compilation on Host. If compiling for an Arm target, use
2513  *             arm_svdf_s8_get_buffer_size().
2514  *
2515  */
2516 int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
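
/*
 * Sketch of how the _dsp/_mve buffer-size variants are intended to be used
 * from host-side code, e.g. an offline memory planner. The capability
 * flags are hypothetical and assumed to be known out of band; per the
 * notes above, on-target code should simply call
 * arm_svdf_s8_get_buffer_size(), which returns the size appropriate for
 * the compilation target.
 *
 *     extern const cmsis_nn_dims filter_dims;       // SVDF filter dimensions
 *     int target_has_mve = 0, target_has_dsp = 0;   // hypothetical capability flags
 *
 *     int32_t buf_size;
 *     if (target_has_mve)
 *         buf_size = arm_svdf_s8_get_buffer_size_mve(&filter_dims);
 *     else if (target_has_dsp)
 *         buf_size = arm_svdf_s8_get_buffer_size_dsp(&filter_dims);
 *     else
 *         buf_size = arm_svdf_s8_get_buffer_size(&filter_dims);
 */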
2517 
2518 /**
2519  * @defgroup LSTM LSTM Layer Functions
2520  *
2521  */
2522 
2523 /**
 * @brief LSTM unidirectional function with 8-bit input and output, 16-bit gate output and 32-bit bias.
2525  *
2526  * @param[in]   input                      Pointer to input data
2527  * @param[out]  output                     Pointer to output data
 * @param[in]   params                     Struct containing all information about the LSTM operator, see arm_nn_types.
 * @param[in]   buffers                    Struct containing pointers to all temporary scratch buffers needed for the
 *                                         LSTM operator, see arm_nn_types.
 *
2533  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
2534  *
2535  * @details
2536  *    1. Supported framework: TensorFlow Lite Micro
2537  *
2538  */
2539 arm_cmsis_nn_status arm_lstm_unidirectional_s8(const int8_t *input,
2540                                                int8_t *output,
2541                                                const cmsis_nn_lstm_params *params,
2542                                                cmsis_nn_lstm_context *buffers);
2543 
2544 /**
 * @brief LSTM unidirectional function with 16-bit input and output, 16-bit gate output and 64-bit bias.
2546  *
2547  * @param[in]   input                      Pointer to input data
2548  * @param[out]  output                     Pointer to output data
 * @param[in]   params                     Struct containing all information about the LSTM operator, see arm_nn_types.
 * @param[in]   buffers                    Struct containing pointers to all temporary scratch buffers needed for the
 *                                         LSTM operator, see arm_nn_types.
 *
2554  * @return     The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
2555  *
2556  * @details
2557  *    1. Supported framework: TensorFlow Lite Micro
2558  *
2559  */
2560 arm_cmsis_nn_status arm_lstm_unidirectional_s16(const int16_t *input,
2561                                                 int16_t *output,
2562                                                 const cmsis_nn_lstm_params *params,
2563                                                 cmsis_nn_lstm_context *buffers);
2564 
2565 #ifdef __cplusplus
2566 }
2567 #endif
2568 
2569 #endif /* ARM_NNFUNCTIONS_H */
2570