1 /* 2 * SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates <open-source-office@arm.com> 3 * 4 * SPDX-License-Identifier: Apache-2.0 5 * 6 * Licensed under the Apache License, Version 2.0 (the License); you may 7 * not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 /* ---------------------------------------------------------------------- 20 * Project: CMSIS NN Library 21 * Title: arm_nnfunctions.h 22 * Description: Public header file for CMSIS NN Library 23 * 24 * $Date: 04 November 2024 25 * $Revision: V.18.0.0 26 * 27 * Target : Arm(R) M-Profile Architecture 28 * -------------------------------------------------------------------- */ 29 30 /** 31 * @defgroup Public Public 32 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support 33 * TensorFlow Lite framework. 34 */ 35 36 #ifndef ARM_NNFUNCTIONS_H 37 #define ARM_NNFUNCTIONS_H 38 39 #include "arm_nn_math_types.h" 40 #include "arm_nn_types.h" 41 42 #define USE_INTRINSIC 43 44 #ifdef __cplusplus 45 extern "C" { 46 #endif 47 48 /** 49 * @defgroup NNConv Convolution Functions 50 * 51 * Collection of convolution, depthwise convolution functions and their variants. 52 * 53 * The convolution is implemented in 2 steps: im2col and General Matrix Multiplication(GEMM) 54 * 55 * im2col is a process of converting each patch of image data into 56 * a column. After im2col, the convolution is computed as matrix-matrix 57 * multiplication. 
58 * 59 * To reduce the memory footprint, the im2col is performed partially. 60 * Each iteration, only a few columns (i.e., patches) are generated followed 61 * by GEMM. 62 * 63 */ 64 65 /** 66 * @brief s4 convolution layer wrapper function with the main purpose to call the optimal kernel available in 67 * cmsis-nn to perform the convolution. 68 * 69 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 70 * arm_convolve_wrapper_s4_get_buffer_size will return the buffer_size if required. 71 * The caller is expected to clear the buffer, if applicable, for security reasons. 72 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 73 * Range of conv_params->input_offset : [-127, 128] 74 * Range of conv_params->output_offset : [-128, 127] 75 * @param[in] quant_params Per-channel quantization info. 76 * It contains the multiplier and shift values to be applied to each output channel 77 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 78 * @param[in] input_data Input (activation) data pointer. Data type: int8 79 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 80 * spatial filter dimensions 81 * @param[in] filter_data Filter data pointer. Data type: int8 packed with 2x int4 82 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 83 * @param[in] bias_data Bias data pointer. Data type: int32 84 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 85 * @param[out] output_data Output data pointer. Data type: int8 86 * 87 * @return The function returns either 88 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 89 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 
90 * 91 */ 92 arm_cmsis_nn_status arm_convolve_wrapper_s4(const cmsis_nn_context *ctx, 93 const cmsis_nn_conv_params *conv_params, 94 const cmsis_nn_per_channel_quant_params *quant_params, 95 const cmsis_nn_dims *input_dims, 96 const int8_t *input_data, 97 const cmsis_nn_dims *filter_dims, 98 const int8_t *filter_data, 99 const cmsis_nn_dims *bias_dims, 100 const int32_t *bias_data, 101 const cmsis_nn_dims *output_dims, 102 int8_t *output_data); 103 104 /** 105 * @brief Get the required buffer size for arm_convolve_wrapper_s4 106 * 107 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 108 * Range of conv_params->input_offset : [-127, 128] 109 * Range of conv_params->output_offset : [-128, 127] 110 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN] 111 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial 112 * filter dimensions 113 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 114 * 115 * @return The function returns required buffer size(bytes) 116 * 117 */ 118 int32_t arm_convolve_wrapper_s4_get_buffer_size(const cmsis_nn_conv_params *conv_params, 119 const cmsis_nn_dims *input_dims, 120 const cmsis_nn_dims *filter_dims, 121 const cmsis_nn_dims *output_dims); 122 123 /** 124 * @brief Get the required buffer size for arm_convolve_wrapper_s4 for Arm(R) Helium Architecture case. 125 * Refer to arm_convolve_wrapper_s4_get_buffer_size() for function argument details. 126 * 127 * @note Intended for compilation on Host. If compiling for an Arm target, use 128 * arm_convolve_wrapper_s4_get_buffer_size(). Currently this operator does not have an 129 * mve implementation, so dsp will be used. 
130 * 131 */ 132 int32_t arm_convolve_wrapper_s4_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, 133 const cmsis_nn_dims *input_dims, 134 const cmsis_nn_dims *filter_dims, 135 const cmsis_nn_dims *output_dims); 136 137 /** 138 * @brief Get the required buffer size for arm_convolve_wrapper_s4 for processors with DSP extension. 139 * Refer to arm_convolve_wrapper_s4_get_buffer_size() for function argument details. 140 * 141 * @note Intended for compilation on Host. If compiling for an Arm target, use 142 * arm_convolve_wrapper_s4_get_buffer_size(). 143 * 144 */ 145 int32_t arm_convolve_wrapper_s4_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params, 146 const cmsis_nn_dims *input_dims, 147 const cmsis_nn_dims *filter_dims, 148 const cmsis_nn_dims *output_dims); 149 150 /** 151 * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in 152 * cmsis-nn to perform the convolution. 153 * 154 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 155 * arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required. 156 * The caller is expected to clear the buffer, if applicable, for security reasons. 157 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 158 * Range of conv_params->input_offset : [-127, 128] 159 * Range of conv_params->output_offset : [-128, 127] 160 * @param[in] quant_params Per-channel quantization info. 161 * It contains the multiplier and shift values to be applied to each output channel 162 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 163 * @param[in] input_data Input (activation) data pointer. Data type: int8 164 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 165 * spatial filter dimensions 166 * @param[in] filter_data Filter data pointer. 
Data type: int8 167 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 168 * @param[in] bias_data Bias data pointer. Data type: int32 169 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 170 * @param[out] output_data Output data pointer. Data type: int8 171 * 172 * @return The function returns either 173 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 174 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 175 * 176 */ 177 arm_cmsis_nn_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, 178 const cmsis_nn_conv_params *conv_params, 179 const cmsis_nn_per_channel_quant_params *quant_params, 180 const cmsis_nn_dims *input_dims, 181 const int8_t *input_data, 182 const cmsis_nn_dims *filter_dims, 183 const int8_t *filter_data, 184 const cmsis_nn_dims *bias_dims, 185 const int32_t *bias_data, 186 const cmsis_nn_dims *output_dims, 187 int8_t *output_data); 188 189 /** 190 * @brief Get the required buffer size for arm_convolve_wrapper_s8 191 * 192 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 193 * Range of conv_params->input_offset : [-127, 128] 194 * Range of conv_params->output_offset : [-128, 127] 195 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN] 196 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial 197 * filter dimensions 198 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 199 * 200 * @return The function returns required buffer size(bytes) 201 * 202 */ 203 int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, 204 const cmsis_nn_dims *input_dims, 205 const cmsis_nn_dims *filter_dims, 206 const cmsis_nn_dims *output_dims); 207 208 /** 209 * @brief Get the required buffer size for arm_convolve_wrapper_s8 for Arm(R) Helium Architecture case. 
210 * Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details. 211 * 212 * @note Intended for compilation on Host. If compiling for an Arm target, use 213 * arm_convolve_wrapper_s8_get_buffer_size(). 214 * 215 */ 216 int32_t arm_convolve_wrapper_s8_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, 217 const cmsis_nn_dims *input_dims, 218 const cmsis_nn_dims *filter_dims, 219 const cmsis_nn_dims *output_dims); 220 221 /** 222 * @brief Get the required buffer size for arm_convolve_wrapper_s8 for processors with DSP extension. 223 * Refer to arm_convolve_wrapper_s8_get_buffer_size() for function argument details. 224 * 225 * @note Intended for compilation on Host. If compiling for an Arm target, use 226 * arm_convolve_wrapper_s8_get_buffer_size(). 227 * 228 */ 229 int32_t arm_convolve_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params, 230 const cmsis_nn_dims *input_dims, 231 const cmsis_nn_dims *filter_dims, 232 const cmsis_nn_dims *output_dims); 233 234 /** 235 * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in 236 * cmsis-nn to perform the convolution. 237 * 238 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 239 * arm_convolve_wrapper_s16_get_buffer_size will return the buffer_size if required. 240 * The caller is expected to clear the buffer, if applicable, for security reasons. 241 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 242 * conv_params->input_offset : Not used 243 * conv_params->output_offset : Not used 244 * @param[in] quant_params Per-channel quantization info. 245 * It contains the multiplier and shift values to be applied to each output channel 246 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 247 * @param[in] input_data Input (activation) data pointer. 
Data type: int16 248 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 249 * spatial filter dimensions 250 * @param[in] filter_data Filter data pointer. Data type: int8 251 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 252 * @param[in] bias_data Struct with optional bias data pointer. Bias data type can be int64 or int32 depending 253 * flag in struct. 254 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 255 * @param[out] output_data Output data pointer. Data type: int16 256 * 257 * @return The function returns either 258 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 259 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 260 * 261 */ 262 arm_cmsis_nn_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, 263 const cmsis_nn_conv_params *conv_params, 264 const cmsis_nn_per_channel_quant_params *quant_params, 265 const cmsis_nn_dims *input_dims, 266 const int16_t *input_data, 267 const cmsis_nn_dims *filter_dims, 268 const int8_t *filter_data, 269 const cmsis_nn_dims *bias_dims, 270 const cmsis_nn_bias_data *bias_data, 271 const cmsis_nn_dims *output_dims, 272 int16_t *output_data); 273 274 /** 275 * @brief Get the required buffer size for arm_convolve_wrapper_s16. 276 * 277 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 278 * conv_params->input_offset : Not used 279 * conv_params->output_offset : Not used 280 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN] 281 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial 282 * filter dimensions 283 * @param[in] output_dims Output tensor dimensions. 
Format: [N, H, W, C_OUT] 284 * 285 * @return The function returns required buffer size(bytes) 286 * 287 */ 288 int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, 289 const cmsis_nn_dims *input_dims, 290 const cmsis_nn_dims *filter_dims, 291 const cmsis_nn_dims *output_dims); 292 293 /** 294 * @brief Get the required buffer size for arm_convolve_wrapper_s16 for processors with DSP extension. 295 * Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details. 296 * 297 * @note Intended for compilation on Host. If compiling for an Arm target, use 298 * arm_convolve_wrapper_s16_get_buffer_size(). 299 * 300 */ 301 int32_t arm_convolve_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_conv_params *conv_params, 302 const cmsis_nn_dims *input_dims, 303 const cmsis_nn_dims *filter_dims, 304 const cmsis_nn_dims *output_dims); 305 306 /** 307 * @brief Get the required buffer size for arm_convolve_wrapper_s16 for Arm(R) Helium Architecture case. 308 * Refer to arm_convolve_wrapper_s16_get_buffer_size() for function argument details. 309 * 310 * @note Intended for compilation on Host. If compiling for an Arm target, use 311 * arm_convolve_wrapper_s16_get_buffer_size(). 312 * 313 */ 314 int32_t arm_convolve_wrapper_s16_get_buffer_size_mve(const cmsis_nn_conv_params *conv_params, 315 const cmsis_nn_dims *input_dims, 316 const cmsis_nn_dims *filter_dims, 317 const cmsis_nn_dims *output_dims); 318 319 /** 320 * @brief Basic s4 convolution function 321 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 322 * arm_convolve_s4_get_buffer_size will return the buffer_size if required. 323 * The caller is expected to clear the buffer, if applicable, for security reasons. 324 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 
325 * Range of conv_params->input_offset : [-127, 128] 326 * Range of conv_params->output_offset : [-128, 127] 327 * @param[in] quant_params Per-channel quantization info. 328 * It contains the multiplier and shift values to be applied to each output channel 329 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 330 * @param[in] input_data Input (activation) data pointer. Data type: int8 331 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 332 * spatial filter dimensions 333 * @param[in] filter_data Packed Filter data pointer. Data type: int8 packed with 2x int4 334 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 335 * @param[in] bias_data Optional bias data pointer. Data type: int32 336 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 337 * @param[out] output_data Output data pointer. Data type: int8 338 339 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 340 * 341 * @details 342 * 1. Supported framework: TensorFlow Lite micro 343 * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. 344 * 345 */ 346 arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx, 347 const cmsis_nn_conv_params *conv_params, 348 const cmsis_nn_per_channel_quant_params *quant_params, 349 const cmsis_nn_dims *input_dims, 350 const int8_t *input_data, 351 const cmsis_nn_dims *filter_dims, 352 const int8_t *filter_data, 353 const cmsis_nn_dims *bias_dims, 354 const int32_t *bias_data, 355 const cmsis_nn_dims *output_dims, 356 int8_t *output_data); 357 358 /** 359 * @brief Basic s4 convolution function with a requirement of even number of kernels. 360 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 361 * arm_convolve_s4_get_buffer_size will return the buffer_size if required. 
362 * The caller is expected to clear the buffer, if applicable, for security reasons. 363 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 364 * Range of conv_params->input_offset : [-127, 128] 365 * Range of conv_params->output_offset : [-128, 127] 366 * @param[in] quant_params Per-channel quantization info. 367 * It contains the multiplier and shift values to be applied to each output channel 368 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 369 * @param[in] input_data Input (activation) data pointer. Data type: int8 370 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 371 * spatial filter dimensions. Note the product must be even. 372 * @param[in] filter_data Packed Filter data pointer. Data type: int8 packed with 2x int4 373 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 374 * @param[in] bias_data Optional bias data pointer. Data type: int32 375 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 376 * @param[out] output_data Output data pointer. Data type: int8 377 * 378 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or 379 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if incorrect arguments or 380 * <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if not for MVE 381 * 382 * @details 383 * 1. Supported framework: TensorFlow Lite micro 384 * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. 
385 * 386 */ 387 arm_cmsis_nn_status arm_convolve_even_s4(const cmsis_nn_context *ctx, 388 const cmsis_nn_conv_params *conv_params, 389 const cmsis_nn_per_channel_quant_params *quant_params, 390 const cmsis_nn_dims *input_dims, 391 const int8_t *input_data, 392 const cmsis_nn_dims *filter_dims, 393 const int8_t *filter_data, 394 const cmsis_nn_dims *bias_dims, 395 const int32_t *bias_data, 396 const cmsis_nn_dims *output_dims, 397 int8_t *output_data); 398 399 /** 400 * @brief Basic s8 convolution function 401 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 402 * arm_convolve_s8_get_buffer_size will return the buffer_size if required. 403 * The caller is expected to clear the buffer, if applicable, for security reasons. 404 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 405 * Range of conv_params->input_offset : [-127, 128] 406 * Range of conv_params->output_offset : [-128, 127] 407 * @param[in] quant_params Per-channel quantization info. 408 * It contains the multiplier and shift values to be applied to each output channel 409 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 410 * @param[in] input_data Input (activation) data pointer. Data type: int8 411 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, CK] where HK, WK and CK are the 412 * spatial filter dimensions. CK != C_IN is used for grouped convolution, in which 413 * case the required conditions are C_IN = N * CK and C_OUT = N * M for N groups of 414 * size M. 415 * @param[in] filter_data Filter data pointer. Data type: int8 416 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 417 * @param[in] bias_data Optional bias data pointer. Data type: int32 418 * @param[in] upscale_dims Inserts zeroes to upscale the input in h/w dimensions if set to 2. This is used for 419 * transposed convolution. 
420 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 421 * @param[out] output_data Output data pointer. Data type: int8 422 * 423 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or 424 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if incorrect arguments or 425 * <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> 426 * 427 * @details 428 * 1. Supported framework: TensorFlow Lite micro 429 * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. 430 * 431 */ 432 arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, 433 const cmsis_nn_conv_params *conv_params, 434 const cmsis_nn_per_channel_quant_params *quant_params, 435 const cmsis_nn_dims *input_dims, 436 const int8_t *input_data, 437 const cmsis_nn_dims *filter_dims, 438 const int8_t *filter_data, 439 const cmsis_nn_dims *bias_dims, 440 const int32_t *bias_data, 441 const cmsis_nn_dims *upscale_dims, 442 const cmsis_nn_dims *output_dims, 443 int8_t *output_data); 444 445 /** 446 * @brief Get the required buffer size for s4 convolution function 447 * 448 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 449 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK 450 * are the spatial filter dimensions 451 * @return The function returns required buffer size(bytes) 452 * 453 */ 454 int32_t arm_convolve_s4_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 455 456 /** 457 * @brief Get the required buffer size for s8 convolution function 458 * 459 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 460 * @param[in] filter_dims Filter tensor dimensions. 
Format: [C_OUT, HK, WK, C_IN] where HK and WK 461 * are the spatial filter dimensions 462 * @return The function returns required buffer size(bytes) 463 * 464 */ 465 int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 466 467 /** 468 * @brief Wrapper to select optimal transposed convolution algorithm depending on parameters. 469 * @param[in, out] ctx Function context that contains the additional buffer if required by the 470 * function. 471 * arm_transpose_conv_s8_get_buffer_size will return the buffer_size if required. 472 * The caller is expected to clear the buffer, if applicable, for security 473 * reasons. 474 * @param[in, out] output_ctx Temporary scratch buffer. 475 * The required size is: output width * output height * output channel * 4 476 * The caller is expected to clear the buffer, if applicable, for security 477 * reasons. 478 * @param[in] transpose_conv_params Convolution parameters (e.g. strides, dilations, pads,...). 479 * Range of transpose_conv_params->input_offset : [-127, 128] 480 * Range of transpose_conv_params->output_offset : [-128, 127] 481 * @param[in] quant_params Per-channel quantization info. 482 * It contains the multiplier and shift values to be applied to each out channel. 483 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 484 * @param[in] input_data Input (activation) data pointer. Data type: int8 485 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 486 * spatial filter dimensions 487 * @param[in] filter_data Filter data pointer. Data type: int8 488 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 489 * @param[in] bias_data Optional bias data pointer. Data type: int32 490 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 491 * @param[out] output_data Output data pointer. 
Data type: int8 492 493 * @return The function returns either 494 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 495 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 496 * 497 * @details 498 * 1. Supported framework: TensorFlow Lite micro 499 * 2. Additional memory is required for optimization. Refer to arguments 'ctx' and 'output_ctx' for details. 500 * 501 */ 502 arm_cmsis_nn_status arm_transpose_conv_wrapper_s8(const cmsis_nn_context *ctx, 503 const cmsis_nn_context *output_ctx, 504 const cmsis_nn_transpose_conv_params *transpose_conv_params, 505 const cmsis_nn_per_channel_quant_params *quant_params, 506 const cmsis_nn_dims *input_dims, 507 const int8_t *input_data, 508 const cmsis_nn_dims *filter_dims, 509 const int8_t *filter_data, 510 const cmsis_nn_dims *bias_dims, 511 const int32_t *bias_data, 512 const cmsis_nn_dims *output_dims, 513 int8_t *output_data); 514 515 /** 516 * @brief Basic s8 transpose convolution function 517 * @param[in, out] ctx Function context that contains the additional buffer if required by the 518 * function. 519 * arm_transpose_conv_s8_get_buffer_size will return the buffer_size if required. 520 * The caller is expected to clear the buffer, if applicable, for security 521 * reasons. 522 * @param[in, out] output_ctx Temporary scratch buffer. 523 * The required size is: output width * output height * output channel * 4 524 * The caller is expected to clear the buffer, if applicable, for security 525 * reasons. 526 * @param[in] transpose_conv_params Convolution parameters (e.g. strides, dilations, pads,...). 527 * Range of transpose_conv_params->input_offset : [-127, 128] 528 * Range of transpose_conv_params->output_offset : [-128, 127] 529 * @param[in] quant_params Per-channel quantization info. 530 * It contains the multiplier and shift values to be applied to each out channel. 531 * @param[in] input_dims Input (activation) tensor dimensions. 
Format: [N, H, W, C_IN] 532 * @param[in] input_data Input (activation) data pointer. Data type: int8 533 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 534 * spatial filter dimensions 535 * @param[in] filter_data Filter data pointer. Data type: int8 536 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 537 * @param[in] bias_data Optional bias data pointer. Data type: int32 538 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 539 * @param[out] output_data Output data pointer. Data type: int8 540 541 * @return The function returns either 542 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 543 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 544 * 545 * @details 546 * 1. Supported framework: TensorFlow Lite micro 547 * 2. Additional memory is required for optimization. Refer to arguments 'ctx' and 'output_ctx' for details. 548 * 549 */ 550 arm_cmsis_nn_status arm_transpose_conv_s8(const cmsis_nn_context *ctx, 551 const cmsis_nn_context *output_ctx, 552 const cmsis_nn_transpose_conv_params *transpose_conv_params, 553 const cmsis_nn_per_channel_quant_params *quant_params, 554 const cmsis_nn_dims *input_dims, 555 const int8_t *input_data, 556 const cmsis_nn_dims *filter_dims, 557 const int8_t *filter_data, 558 const cmsis_nn_dims *bias_dims, 559 const int32_t *bias_data, 560 const cmsis_nn_dims *output_dims, 561 int8_t *output_data); 562 563 /** 564 * @brief Get the required buffer size for ctx in s8 transpose conv function 565 * 566 * @param[in] transposed_conv_params Transposed convolution parameters 567 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 568 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK 569 * are the spatial filter dimensions 570 * @param[in] out_dims Output tensor dimensions. 
Format: [N, H, W, C_OUT] 571 * @return The function returns required buffer size(bytes) 572 * 573 */ 574 int32_t arm_transpose_conv_s8_get_buffer_size(const cmsis_nn_transpose_conv_params *transposed_conv_params, 575 const cmsis_nn_dims *input_dims, 576 const cmsis_nn_dims *filter_dims, 577 const cmsis_nn_dims *out_dims); 578 579 /** 580 * @brief Get the required buffer size for output_ctx in s8 transpose conv function 581 * 582 * @param[in] transposed_conv_params Transposed convolution parameters 583 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 584 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK 585 * are the spatial filter dimensions 586 * @return The function returns required buffer size(bytes) 587 * 588 */ 589 int32_t arm_transpose_conv_s8_get_reverse_conv_buffer_size(const cmsis_nn_transpose_conv_params *transposed_conv_params, 590 const cmsis_nn_dims *input_dims, 591 const cmsis_nn_dims *filter_dims); 592 593 /** 594 * @brief Get size of additional buffer required by arm_transpose_conv_s8() for processors with DSP extension. 595 * Refer to arm_transpose_conv_s8_get_buffer_size() for function argument details. 596 * 597 * @note Intended for compilation on Host. If compiling for an Arm target, use 598 * arm_transpose_conv_s8_get_buffer_size(). 599 * 600 */ 601 int32_t arm_transpose_conv_s8_get_buffer_size_dsp(const cmsis_nn_dims *input_dims, 602 const cmsis_nn_dims *filter_dims, 603 const cmsis_nn_dims *out_dims); 604 605 /** 606 * @brief Get size of additional buffer required by arm_transpose_conv_s8() for Arm(R) Helium Architecture case. 607 * Refer to arm_transpose_conv_s8_get_buffer_size() for function argument details. 608 * 609 * @note Intended for compilation on Host. If compiling for an Arm target, use 610 * arm_transpose_conv_s8_get_buffer_size(). 
611 * 612 */ 613 int32_t arm_transpose_conv_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dims, 614 const cmsis_nn_dims *filter_dims, 615 const cmsis_nn_dims *out_dims); 616 617 /** 618 * @brief Basic s16 convolution function 619 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 620 * arm_convolve_s16_get_buffer_size will return the buffer_size if required. 621 * The caller is expected to clear the buffer, if applicable, for security reasons. 622 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 623 * conv_params->input_offset : Not used 624 * conv_params->output_offset : Not used 625 * @param[in] quant_params Per-channel quantization info. 626 * It contains the multiplier and shift values to be applied to each output channel 627 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 628 * @param[in] input_data Input (activation) data pointer. Data type: int16 629 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the 630 * spatial filter dimensions 631 * @param[in] filter_data Filter data pointer. Data type: int8 632 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 633 * @param[in] bias_data Struct with optional bias data pointer. Bias data type can be int64 or int32 depending 634 * flag in struct. 635 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 636 * @param[out] output_data Output data pointer. Data type: int16 637 * 638 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful or 639 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if incorrect arguments or 640 * <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> 641 * 642 * @details 643 * 1. Supported framework: TensorFlow Lite micro 644 * 2. Additional memory is required for optimization. Refer to argument 'ctx' for details. 
645 * 646 */ 647 arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx, 648 const cmsis_nn_conv_params *conv_params, 649 const cmsis_nn_per_channel_quant_params *quant_params, 650 const cmsis_nn_dims *input_dims, 651 const int16_t *input_data, 652 const cmsis_nn_dims *filter_dims, 653 const int8_t *filter_data, 654 const cmsis_nn_dims *bias_dims, 655 const cmsis_nn_bias_data *bias_data, 656 const cmsis_nn_dims *output_dims, 657 int16_t *output_data); 658 659 /** 660 * @brief Get the required buffer size for s16 convolution function 661 * 662 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 663 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK 664 * are the spatial filter dimensions 665 * @return The function returns required buffer size(bytes) 666 * 667 */ 668 int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 669 670 /** 671 * @brief Fast s4 version for 1x1 convolution (non-square shape) 672 * 673 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 674 * arm_convolve_1x1_s4_fast_get_buffer_size will return the buffer_size if required. 675 * The caller is expected to clear the buffer, if applicable, for security reasons. 676 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 677 * Range of conv_params->input_offset : [-127, 128] 678 * Range of conv_params->output_offset : [-128, 127] 679 * @param[in] quant_params Per-channel quantization info. 680 * It contains the multiplier and shift values to be applied to each output channel 681 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 682 * @param[in] input_data Input (activation) data pointer. Data type: int8 683 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN] 684 * @param[in] filter_data Filter data pointer. 
Data type: int8 packed with 2x int4 685 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 686 * @param[in] bias_data Optional bias data pointer. Data type: int32 687 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 688 * @param[out] output_data Output data pointer. Data type: int8 689 * 690 * @return The function returns either 691 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 692 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 693 * 694 * @details 695 * - Supported framework : TensorFlow Lite Micro 696 * - The following constrains on the arguments apply 697 * -# conv_params->padding.w = conv_params->padding.h = 0 698 * -# conv_params->stride.w = conv_params->stride.h = 1 699 * 700 */ 701 arm_cmsis_nn_status arm_convolve_1x1_s4_fast(const cmsis_nn_context *ctx, 702 const cmsis_nn_conv_params *conv_params, 703 const cmsis_nn_per_channel_quant_params *quant_params, 704 const cmsis_nn_dims *input_dims, 705 const int8_t *input_data, 706 const cmsis_nn_dims *filter_dims, 707 const int8_t *filter_data, 708 const cmsis_nn_dims *bias_dims, 709 const int32_t *bias_data, 710 const cmsis_nn_dims *output_dims, 711 int8_t *output_data); 712 713 /** 714 * @brief s4 version for 1x1 convolution with support for non-unity stride values 715 * 716 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 717 * None is required by this function. 718 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 719 * Range of conv_params->input_offset : [-127, 128] 720 * Range of conv_params->output_offset : [-128, 127] 721 * @param[in] quant_params Per-channel quantization info. 722 * It contains the multiplier and shift values to be applied to each output channel 723 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 724 * @param[in] input_data Input (activation) data pointer. 
 *                               Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
 * @param[in]      filter_data   Filter data pointer. Data type: int8 packed with 2x int4
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 *
 * @return     The function returns either
 *             <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
 *             <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# conv_params->padding.w = conv_params->padding.h = 0
 *
 */
arm_cmsis_nn_status arm_convolve_1x1_s4(const cmsis_nn_context *ctx,
                                        const cmsis_nn_conv_params *conv_params,
                                        const cmsis_nn_per_channel_quant_params *quant_params,
                                        const cmsis_nn_dims *input_dims,
                                        const int8_t *input_data,
                                        const cmsis_nn_dims *filter_dims,
                                        const int8_t *filter_data,
                                        const cmsis_nn_dims *bias_dims,
                                        const int32_t *bias_data,
                                        const cmsis_nn_dims *output_dims,
                                        int8_t *output_data);

/**
 * @brief Fast s8 version for 1x1 convolution (non-square shape)
 *
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required.
 *                               The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 *
 * @return     The function returns either
 *             <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
 *             <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# conv_params->padding.w = conv_params->padding.h = 0
 *      -# conv_params->stride.w = conv_params->stride.h = 1
 *
 */
arm_cmsis_nn_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
                                             const cmsis_nn_conv_params *conv_params,
                                             const cmsis_nn_per_channel_quant_params *quant_params,
                                             const cmsis_nn_dims *input_dims,
                                             const int8_t *input_data,
                                             const cmsis_nn_dims *filter_dims,
                                             const int8_t *filter_data,
                                             const cmsis_nn_dims *bias_dims,
                                             const int32_t *bias_data,
                                             const cmsis_nn_dims *output_dims,
                                             int8_t *output_data);

/**
 * @brief Get the required buffer size for arm_convolve_1x1_s4_fast
 *
 * @param[in]       input_dims            Input (activation) dimensions
 * @return          The function returns the required buffer size in bytes
 *
 */
int32_t arm_convolve_1x1_s4_fast_get_buffer_size(const cmsis_nn_dims *input_dims);

/**
 * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
 *
 *
 * @param[in]       input_dims            Input (activation) dimensions
 * @return          The function returns the required buffer size in bytes
 *
 */
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);

/**
 * @brief s8 version for 1x1 convolution with support for non-unity stride values
 *
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               None is required by this function.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 *
 * @return     The function returns either
 *             <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
 *             <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# conv_params->padding.w = conv_params->padding.h = 0
 *
 */
arm_cmsis_nn_status arm_convolve_1x1_s8(const cmsis_nn_context *ctx,
                                        const cmsis_nn_conv_params *conv_params,
                                        const cmsis_nn_per_channel_quant_params *quant_params,
                                        const cmsis_nn_dims *input_dims,
                                        const int8_t *input_data,
                                        const cmsis_nn_dims *filter_dims,
                                        const int8_t *filter_data,
                                        const cmsis_nn_dims *bias_dims,
                                        const int32_t *bias_data,
                                        const cmsis_nn_dims *output_dims,
                                        int8_t *output_data);

/**
 * @brief 1xn convolution
 *
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
 *                               The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
 *                               spatial filter dimension
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer.
 *                               Data type: int8
 *
 * @return     The function returns either
 *             <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
 *             <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# input_dims->n equals 1
 *      -# output_dims->w is a multiple of 4
 *      -# Explicit constraints (since it is for 1xN convolution)
 *      -## input_dims->h equals 1
 *      -## output_dims->h equals 1
 *      -## filter_dims->h equals 1
 * @todo  Remove constraint on output_dims->w to make the function generic.
 *
 */
arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
                                          const cmsis_nn_conv_params *conv_params,
                                          const cmsis_nn_per_channel_quant_params *quant_params,
                                          const cmsis_nn_dims *input_dims,
                                          const int8_t *input_data,
                                          const cmsis_nn_dims *filter_dims,
                                          const int8_t *filter_data,
                                          const cmsis_nn_dims *bias_dims,
                                          const int32_t *bias_data,
                                          const cmsis_nn_dims *output_dims,
                                          int8_t *output_data);

/**
 * @brief 1xn convolution for s4 weights
 *
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_1_x_n_s4_get_buffer_size will return the buffer_size if required
 *                               The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer.
 *                               Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
 *                               spatial filter dimension
 * @param[in]      filter_data   Filter data pointer. Data type: int8 as packed int4
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 *
 * @return     The function returns either
 *             <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
 *             <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# stride.w * input_dims->c is a multiple of 4
 *      -# Explicit constraints (since it is for 1xN convolution)
 *      -## input_dims->h equals 1
 *      -## output_dims->h equals 1
 *      -## filter_dims->h equals 1
 * @todo  Remove constraint on output_dims->w to make the function generic.
 *
 */
arm_cmsis_nn_status arm_convolve_1_x_n_s4(const cmsis_nn_context *ctx,
                                          const cmsis_nn_conv_params *conv_params,
                                          const cmsis_nn_per_channel_quant_params *quant_params,
                                          const cmsis_nn_dims *input_dims,
                                          const int8_t *input_data,
                                          const cmsis_nn_dims *filter_dims,
                                          const int8_t *filter_data,
                                          const cmsis_nn_dims *bias_dims,
                                          const int32_t *bias_data,
                                          const cmsis_nn_dims *output_dims,
                                          int8_t *output_data);

/**
 * @brief Get the required additional buffer size for 1xn convolution
 *
 * @param[in]    conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                             Range of conv_params->input_offset  : [-127, 128]
 *                             Range of conv_params->output_offset : [-128, 127]
 * @param[in]    input_dims    Input (activation) tensor dimensions.
 *                             Format: [N, H, W, C_IN]
 * @param[in]    filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
 *                             horizontal spatial filter dimension
 * @param[in]    output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 *
 * @return       The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
                                              const cmsis_nn_dims *input_dims,
                                              const cmsis_nn_dims *filter_dims,
                                              const cmsis_nn_dims *output_dims);

/**
 * @brief Get the required additional buffer size for 1xn convolution
 *
 * @param[in]    conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                             Range of conv_params->input_offset  : [-127, 128]
 *                             Range of conv_params->output_offset : [-128, 127]
 * @param[in]    input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]    filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
 *                             horizontal spatial filter dimension
 * @param[in]    output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 *
 * @return       The function returns required buffer size(bytes)
 *
 */
int32_t arm_convolve_1_x_n_s4_get_buffer_size(const cmsis_nn_conv_params *conv_params,
                                              const cmsis_nn_dims *input_dims,
                                              const cmsis_nn_dims *filter_dims,
                                              const cmsis_nn_dims *output_dims);

/**
 * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->dilation is not used.
 *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                                 Batch argument N is not used and assumed to be 1.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      filter_data     Filter data pointer. Data type: int8
 * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data       Bias data pointer. Data type: int32
 * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in, out] output_data     Output data pointer. Data type: int8
 * @return The function returns
 *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
 *
 * @details
 *    - Supported framework: TensorFlow Lite
 *    - Picks one of the following functions
 *        -# arm_depthwise_conv_s8()
 *        -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
 *        -# arm_depthwise_conv_s8_opt()
 *    - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
 *      boundary.
 */
arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
                                                  const cmsis_nn_dw_conv_params *dw_conv_params,
                                                  const cmsis_nn_per_channel_quant_params *quant_params,
                                                  const cmsis_nn_dims *input_dims,
                                                  const int8_t *input_data,
                                                  const cmsis_nn_dims *filter_dims,
                                                  const int8_t *filter_data,
                                                  const cmsis_nn_dims *bias_dims,
                                                  const int32_t *bias_data,
                                                  const cmsis_nn_dims *output_dims,
                                                  int8_t *output_data);

/**
 * @brief Wrapper function to pick the right optimized s4 depthwise convolution function
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->dilation is not used.
 *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                                 Batch argument N is not used and assumed to be 1.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      filter_data     Filter data pointer. Data type: int8_t packed 4-bit weights, e.g four sequential
 *                                 weights [0x1, 0x2, 0x3, 0x4]  packed as [0x21, 0x43].
 * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data       Bias data pointer.
 *                                 Data type: int32
 * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in, out] output_data     Output data pointer. Data type: int8
 * @return The function returns
 *                <code>ARM_CMSIS_NN_SUCCESS</code>   -  Successful completion.
 *
 * @details
 *    - Supported framework: TensorFlow Lite
 */
arm_cmsis_nn_status arm_depthwise_conv_wrapper_s4(const cmsis_nn_context *ctx,
                                                  const cmsis_nn_dw_conv_params *dw_conv_params,
                                                  const cmsis_nn_per_channel_quant_params *quant_params,
                                                  const cmsis_nn_dims *input_dims,
                                                  const int8_t *input_data,
                                                  const cmsis_nn_dims *filter_dims,
                                                  const int8_t *filter_data,
                                                  const cmsis_nn_dims *bias_dims,
                                                  const int32_t *bias_data,
                                                  const cmsis_nn_dims *output_dims,
                                                  int8_t *output_data);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
 *
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                                 Batch argument N is not used and assumed to be 1.
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
 * @return                         Size of additional memory required for optimizations in bytes.
 *
 */
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                      const cmsis_nn_dims *input_dims,
                                                      const cmsis_nn_dims *filter_dims,
                                                      const cmsis_nn_dims *output_dims);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for processors with DSP extension.
 *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
 *
 * @note       Intended for compilation on Host. If compiling for an Arm target, use
 *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
 *
 */
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                          const cmsis_nn_dims *input_dims,
                                                          const cmsis_nn_dims *filter_dims,
                                                          const cmsis_nn_dims *output_dims);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() for Arm(R) Helium Architecture case.
 *        Refer to arm_depthwise_conv_wrapper_s8_get_buffer_size() for function argument details.
 *
 * @note       Intended for compilation on Host. If compiling for an Arm target, use
 *             arm_depthwise_conv_wrapper_s8_get_buffer_size().
 *
 */
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                          const cmsis_nn_dims *input_dims,
                                                          const cmsis_nn_dims *filter_dims,
                                                          const cmsis_nn_dims *output_dims);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4()
 *
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                                 Batch argument N is not used and assumed to be 1.
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
 * @return                         Size of additional memory required for optimizations in bytes.
 *
 */
int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                      const cmsis_nn_dims *input_dims,
                                                      const cmsis_nn_dims *filter_dims,
                                                      const cmsis_nn_dims *output_dims);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for processors with DSP extension.
 *        Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
 *
 * @note       Intended for compilation on Host. If compiling for an Arm target, use
 *             arm_depthwise_conv_wrapper_s4_get_buffer_size().
 *
 */
int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                          const cmsis_nn_dims *input_dims,
                                                          const cmsis_nn_dims *filter_dims,
                                                          const cmsis_nn_dims *output_dims);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for Arm(R) Helium Architecture case.
 *        Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
 *
 * @note       Intended for compilation on Host. If compiling for an Arm target, use
 *             arm_depthwise_conv_wrapper_s4_get_buffer_size().
 *
 */
int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                          const cmsis_nn_dims *input_dims,
                                                          const cmsis_nn_dims *filter_dims,
                                                          const cmsis_nn_dims *output_dims);

/**
 * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if an additional buffer is required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->dilation is not used.
 *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 *                                 Batch argument N is not used.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      filter_data     Filter data pointer. Data type: int8
 * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data       Bias data pointer. Data type: int32
 * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[in, out] output_data     Output data pointer. Data type: int8
 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    - Supported framework: TensorFlow Lite
 */
arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
                                          const cmsis_nn_dw_conv_params *dw_conv_params,
                                          const cmsis_nn_per_channel_quant_params *quant_params,
                                          const cmsis_nn_dims *input_dims,
                                          const int8_t *input_data,
                                          const cmsis_nn_dims *filter_dims,
                                          const int8_t *filter_data,
                                          const cmsis_nn_dims *bias_dims,
                                          const int32_t *bias_data,
                                          const cmsis_nn_dims *output_dims,
                                          int8_t *output_data);

/**
 * @brief Basic s4 depthwise convolution function that doesn't have any constraints on the input dimensions.
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if an additional buffer is required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->dilation is not used.
 *                                 Range of dw_conv_params->input_offset : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 *                                 Batch argument N is not used.
 * @param[in]      input           Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      kernel          Filter data pointer. Data type: int8_t packed 4-bit weights, e.g four sequential
 *                                 weights [0x1, 0x2, 0x3, 0x4]  packed as [0x21, 0x43].
 * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias            Bias data pointer. Data type: int32
 * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[in, out] output          Output data pointer.
 *                                 Data type: int8
 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    - Supported framework: TensorFlow Lite
 */
arm_cmsis_nn_status arm_depthwise_conv_s4(const cmsis_nn_context *ctx,
                                          const cmsis_nn_dw_conv_params *dw_conv_params,
                                          const cmsis_nn_per_channel_quant_params *quant_params,
                                          const cmsis_nn_dims *input_dims,
                                          const int8_t *input,
                                          const cmsis_nn_dims *filter_dims,
                                          const int8_t *kernel,
                                          const cmsis_nn_dims *bias_dims,
                                          const int32_t *bias,
                                          const cmsis_nn_dims *output_dims,
                                          int8_t *output);

/**
 * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if an additional buffer is required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 conv_params->input_offset  : Not used
 *                                 conv_params->output_offset : Not used
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 *                                 Batch argument N is not used.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int16
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      filter_data     Filter data pointer. Data type: int8
 * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data       Bias data pointer.
 *                                 Data type: int64
 * @param[in]      output_dims     Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[in, out] output_data     Output data pointer. Data type: int16
 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
 *
 * @details
 *    - Supported framework: TensorFlow Lite
 */
arm_cmsis_nn_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
                                           const cmsis_nn_dw_conv_params *dw_conv_params,
                                           const cmsis_nn_per_channel_quant_params *quant_params,
                                           const cmsis_nn_dims *input_dims,
                                           const int16_t *input_data,
                                           const cmsis_nn_dims *filter_dims,
                                           const int8_t *filter_data,
                                           const cmsis_nn_dims *bias_dims,
                                           const int64_t *bias_data,
                                           const cmsis_nn_dims *output_dims,
                                           int16_t *output_data);

/**
 * @brief Wrapper function to pick the right optimized s16 depthwise convolution function
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if required.
 *                                 The caller is expected to clear the buffer, if applicable, for security reasons.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->dilation is not used.
 *                                 Range of dw_conv_params->input_offset : Not used
 *                                 Range of dw_conv_params->output_offset : Not used
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                                 Batch argument N is not used and assumed to be 1.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int16
 * @param[in]      filter_dims     Filter tensor dimensions.
Format: [1, H, W, C_OUT] 1311 * @param[in] filter_data Filter data pointer. Data type: int8 1312 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 1313 * @param[in] bias_data Bias data pointer. Data type: int64 1314 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT] 1315 * @param[in, out] output_data Output data pointer. Data type: int16 1316 * @return The function returns 1317 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful completion. 1318 * 1319 * @details 1320 * - Supported framework: TensorFlow Lite 1321 * - Picks one of the the following functions 1322 * -# arm_depthwise_conv_s16() 1323 * -# arm_depthwise_conv_fast_s16() - Cortex-M CPUs with DSP extension only 1324 */ 1325 arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx, 1326 const cmsis_nn_dw_conv_params *dw_conv_params, 1327 const cmsis_nn_per_channel_quant_params *quant_params, 1328 const cmsis_nn_dims *input_dims, 1329 const int16_t *input_data, 1330 const cmsis_nn_dims *filter_dims, 1331 const int8_t *filter_data, 1332 const cmsis_nn_dims *bias_dims, 1333 const int64_t *bias_data, 1334 const cmsis_nn_dims *output_dims, 1335 int16_t *output_data); 1336 1337 /** 1338 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() 1339 * 1340 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) 1341 * Range of dw_conv_params->input_offset : Not used 1342 * Range of dw_conv_params->input_offset : Not used 1343 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 1344 * Batch argument N is not used and assumed to be 1. 1345 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 1346 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT] 1347 * @return Size of additional memory required for optimizations in bytes. 
1348 * 1349 */ 1350 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, 1351 const cmsis_nn_dims *input_dims, 1352 const cmsis_nn_dims *filter_dims, 1353 const cmsis_nn_dims *output_dims); 1354 1355 /** 1356 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for processors with DSP extension. 1357 * Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details. 1358 * 1359 * @note Intended for compilation on Host. If compiling for an Arm target, use 1360 * arm_depthwise_conv_wrapper_s16_get_buffer_size(). 1361 * 1362 */ 1363 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params, 1364 const cmsis_nn_dims *input_dims, 1365 const cmsis_nn_dims *filter_dims, 1366 const cmsis_nn_dims *output_dims); 1367 1368 /** 1369 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s16() for Arm(R) Helium Architecture 1370 * case. Refer to arm_depthwise_conv_wrapper_s16_get_buffer_size() for function argument details. 1371 * 1372 * @note Intended for compilation on Host. If compiling for an Arm target, use 1373 * arm_depthwise_conv_wrapper_s16_get_buffer_size(). 1374 * 1375 */ 1376 int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params, 1377 const cmsis_nn_dims *input_dims, 1378 const cmsis_nn_dims *filter_dims, 1379 const cmsis_nn_dims *output_dims); 1380 1381 /** 1382 * @brief Optimized s16 depthwise convolution function with constraint that in_channel equals out_channel. 1383 * Refer arm_depthwise_conv_s16() for function argument details. 
1384 * 1385 * @return The function returns one of the following 1386 * <code>ARM_CMSIS_NN_ARG_ERROR</code> - ctx->buf == NULL and 1387 * arm_depthwise_conv_fast_s16_get_buffer_size() > 0 or 1388 * input channel != output channel or 1389 * ch_mult != 1 1390 * 1391 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 1392 * 1393 * @details 1394 * - Supported framework: TensorFlow Lite 1395 * - The following constraints on the arguments apply 1396 * -# Number of input channel equals number of output channels or ch_mult equals 1 1397 * - Recommended when number of channels is 4 or greater. 1398 * 1399 */ 1400 arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx, 1401 const cmsis_nn_dw_conv_params *dw_conv_params, 1402 const cmsis_nn_per_channel_quant_params *quant_params, 1403 const cmsis_nn_dims *input_dims, 1404 const int16_t *input_data, 1405 const cmsis_nn_dims *filter_dims, 1406 const int8_t *filter_data, 1407 const cmsis_nn_dims *bias_dims, 1408 const int64_t *bias_data, 1409 const cmsis_nn_dims *output_dims, 1410 int16_t *output_data); 1411 1412 /** 1413 * @brief Get the required buffer size for optimized s16 depthwise convolution 1414 * function with constraint that in_channel equals out_channel. 1415 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN] 1416 * Batch argument N is not used. 1417 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 1418 * @return The function returns required buffer size in bytes 1419 * 1420 */ 1421 int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 1422 1423 /** 1424 * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on 1425 * the input arguments(documented below). Refer arm_depthwise_conv_s8() for function 1426 * argument details. 
1427 * 1428 * @return The function returns one of the following 1429 * <code>ARM_CMSIS_NN_ARG_ERROR</code> - Unsupported dimension of tensors 1430 * - Unsupported pad size along the x axis 1431 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 1432 * 1433 * @details 1434 * - Supported framework : TensorFlow Lite Micro 1435 * - The following constraints on the arguments apply 1436 * -# Number of input channel equals number of output channels 1437 * -# Filter height and width equals 3 1438 * -# Padding along x is either 0 or 1. 1439 * 1440 */ 1441 arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, 1442 const cmsis_nn_dw_conv_params *dw_conv_params, 1443 const cmsis_nn_per_channel_quant_params *quant_params, 1444 const cmsis_nn_dims *input_dims, 1445 const int8_t *input_data, 1446 const cmsis_nn_dims *filter_dims, 1447 const int8_t *filter_data, 1448 const cmsis_nn_dims *bias_dims, 1449 const int32_t *bias_data, 1450 const cmsis_nn_dims *output_dims, 1451 int8_t *output_data); 1452 1453 /** 1454 * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. 1455 * Refer arm_depthwise_conv_s8() for function argument details. 1456 * 1457 * @return The function returns one of the following 1458 * <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or 1459 * ch_mult != 1 1460 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 1461 * 1462 * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read out 1463 * for the following if MVE optimizations(Arm Helium Technology) are used. 1464 * - Output shift 1465 * - Output multiplier 1466 * - Output bias 1467 * - kernel 1468 * @details 1469 * - Supported framework: TensorFlow Lite 1470 * - The following constraints on the arguments apply 1471 * -# Number of input channel equals number of output channels or ch_mult equals 1 1472 * - Recommended when number of channels is 4 or greater. 
1473 * 1474 */ 1475 arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, 1476 const cmsis_nn_dw_conv_params *dw_conv_params, 1477 const cmsis_nn_per_channel_quant_params *quant_params, 1478 const cmsis_nn_dims *input_dims, 1479 const int8_t *input_data, 1480 const cmsis_nn_dims *filter_dims, 1481 const int8_t *filter_data, 1482 const cmsis_nn_dims *bias_dims, 1483 const int32_t *bias_data, 1484 const cmsis_nn_dims *output_dims, 1485 int8_t *output_data); 1486 1487 /** 1488 * @brief Optimized s4 depthwise convolution function with constraint that in_channel equals out_channel. 1489 * Refer arm_depthwise_conv_s4() for function argument details. 1490 * 1491 * @return The function returns one of the following 1492 * <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or 1493 * ch_mult != 1 1494 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 1495 * 1496 * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read out 1497 * for the following if MVE optimizations(Arm Helium Technology) are used. 1498 * - Output shift 1499 * - Output multiplier 1500 * - Output bias 1501 * - kernel 1502 * @details 1503 * - Supported framework: TensorFlow Lite 1504 * - The following constraints on the arguments apply 1505 * -# Number of input channel equals number of output channels or ch_mult equals 1 1506 * - Recommended when number of channels is 4 or greater. 
1507 * 1508 */ 1509 arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx, 1510 const cmsis_nn_dw_conv_params *dw_conv_params, 1511 const cmsis_nn_per_channel_quant_params *quant_params, 1512 const cmsis_nn_dims *input_dims, 1513 const int8_t *input_data, 1514 const cmsis_nn_dims *filter_dims, 1515 const int8_t *filter_data, 1516 const cmsis_nn_dims *bias_dims, 1517 const int32_t *bias_data, 1518 const cmsis_nn_dims *output_dims, 1519 int8_t *output_data); 1520 1521 /** 1522 * @brief Get the required buffer size for optimized s8 depthwise convolution 1523 * function with constraint that in_channel equals out_channel. 1524 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN] 1525 * Batch argument N is not used. 1526 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 1527 * @return The function returns required buffer size in bytes 1528 * 1529 */ 1530 int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 1531 1532 /** 1533 * @brief Get the required buffer size for optimized s4 depthwise convolution 1534 * function with constraint that in_channel equals out_channel. 1535 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN] 1536 * Batch argument N is not used. 1537 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 1538 * @return The function returns required buffer size in bytes 1539 * 1540 */ 1541 int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 1542 1543 /** 1544 * @defgroup FC Fully-connected Layer Functions 1545 * 1546 * Collection of fully-connected and matrix multiplication functions. 1547 * 1548 * Fully-connected layer is basically a matrix-vector multiplication 1549 * with bias. The matrix is the weights and the input/output vectors 1550 * are the activation values. 
Supported {weight, activation} precisions 1551 * include {8-bit, 8-bit} and {8-bit, 16-bit} 1552 * 1553 * 1554 */ 1555 1556 /** 1557 * @brief Basic s4 Fully Connected function. 1558 * 1559 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1560 * definition file to see if an additional buffer is required. 1561 * Optional function {API}_get_buffer_size() provides the buffer 1562 * size if an additional buffer is required. 1563 * The caller is expected to clear the buffer ,if applicable, for security reasons. 1564 * @param[in] fc_params Fully Connected layer parameters. 1565 * Range of fc_params->input_offset : [-127, 128] 1566 * fc_params->filter_offset : 0 1567 * Range of fc_params->output_offset : [-128, 127] 1568 * @param[in] quant_params Per-tensor quantization info. 1569 * It contains the multiplier and shift value to be applied to the output tensor. 1570 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 1571 * Input dimension is taken as Nx(H * W * C_IN) 1572 * @param[in] input_data Input (activation) data pointer. Data type: int8 1573 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C] 1574 * N : accumulation depth and equals (H * W * C_IN) from input_dims 1575 * C : output depth and equals C_OUT in output_dims 1576 * H & W : Not used 1577 * @param[in] filter_data Filter data pointer. Data type: int8_t packed 4-bit weights, e.g four sequential 1578 * weights [0x1, 0x2, 0x3, 0x4] packed as [0x21, 0x43]. 1579 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 1580 * N, H, W : Not used 1581 * @param[in] bias_data Bias data pointer. Data type: int32 1582 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] 1583 * N : Batches 1584 * C_OUT : Output depth 1585 * H & W : Not used. 1586 * @param[in, out] output_data Output data pointer. 
Data type: int8 1587 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 1588 * 1589 * @details 1590 * - Supported framework: TensorFlow Lite 1591 */ 1592 arm_cmsis_nn_status arm_fully_connected_s4(const cmsis_nn_context *ctx, 1593 const cmsis_nn_fc_params *fc_params, 1594 const cmsis_nn_per_tensor_quant_params *quant_params, 1595 const cmsis_nn_dims *input_dims, 1596 const int8_t *input_data, 1597 const cmsis_nn_dims *filter_dims, 1598 const int8_t *filter_data, 1599 const cmsis_nn_dims *bias_dims, 1600 const int32_t *bias_data, 1601 const cmsis_nn_dims *output_dims, 1602 int8_t *output_data); 1603 1604 /** 1605 * @brief Basic s8 Fully Connected function. 1606 * 1607 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1608 * definition file to see if an additional buffer is required. 1609 * Optional function {API}_get_buffer_size() provides the buffer 1610 * size if an additional buffer is required. 1611 * The caller is expected to clear the buffer, if applicable, for security reasons. 1612 * @param[in] fc_params Fully Connected layer parameters. 1613 * Range of fc_params->input_offset : [-127, 128] 1614 * fc_params->filter_offset : 0 1615 * Range of fc_params->output_offset : [-128, 127] 1616 * @param[in] quant_params Per-tensor quantization info. 1617 * It contains the multiplier and shift value to be applied to the output tensor. 1618 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 1619 * Input dimension is taken as Nx(H * W * C_IN) 1620 * @param[in] input_data Input (activation) data pointer. Data type: int8 1621 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C] 1622 * N : accumulation depth and equals (H * W * C_IN) from input_dims 1623 * C : output depth and equals C_OUT in output_dims 1624 * H & W : Not used 1625 * @param[in] filter_data Filter data pointer. Data type: int8 1626 * @param[in] bias_dims Bias tensor dimensions. 
Format: [C_OUT] 1627 * N, H, W : Not used 1628 * @param[in] bias_data Bias data pointer. Data type: int32 1629 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] 1630 * N : Batches 1631 * C_OUT : Output depth 1632 * H & W : Not used. 1633 * @param[in, out] output_data Output data pointer. Data type: int8 1634 * 1635 * @return The function returns either 1636 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 1637 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 1638 * 1639 * @details 1640 * - Supported framework: TensorFlow Lite 1641 */ 1642 arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, 1643 const cmsis_nn_fc_params *fc_params, 1644 const cmsis_nn_per_tensor_quant_params *quant_params, 1645 const cmsis_nn_dims *input_dims, 1646 const int8_t *input_data, 1647 const cmsis_nn_dims *filter_dims, 1648 const int8_t *filter_data, 1649 const cmsis_nn_dims *bias_dims, 1650 const int32_t *bias_data, 1651 const cmsis_nn_dims *output_dims, 1652 int8_t *output_data); 1653 1654 /** 1655 * @brief Basic s8 Fully Connected function using per channel quantization. 1656 * 1657 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1658 * definition file to see if an additional buffer is required. 1659 * Optional function {API}_get_buffer_size() provides the buffer 1660 * size if an additional buffer is required. 1661 * The caller is expected to clear the buffer, if applicable, for security reasons. 1662 * @param[in] fc_params Fully Connected layer parameters. 1663 * Range of fc_params->input_offset : [-127, 128] 1664 * fc_params->filter_offset : 0 1665 * Range of fc_params->output_offset : [-128, 127] 1666 * @param[in] quant_params Per-channel quantization info. 1667 * It contains the multiplier and shift values to be applied to each output channel 1668 * @param[in] input_dims Input (activation) tensor dimensions. 
Format: [N, H, W, C_IN] 1669 * Input dimension is taken as Nx(H * W * C_IN) 1670 * @param[in] input_data Input (activation) data pointer. Data type: int8 1671 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C] 1672 * N : accumulation depth and equals (H * W * C_IN) from input_dims 1673 * C : output depth and equals C_OUT in output_dims 1674 * H & W : Not used 1675 * @param[in] filter_data Filter data pointer. Data type: int8 1676 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 1677 * N, H, W : Not used 1678 * @param[in] bias_data Bias data pointer. Data type: int32 1679 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] 1680 * N : Batches 1681 * C_OUT : Output depth 1682 * H & W : Not used. 1683 * @param[in, out] output_data Output data pointer. Data type: int8 1684 * 1685 * @return The function returns either 1686 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 1687 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 1688 * 1689 * @details 1690 * - Supported framework: TensorFlow Lite 1691 */ 1692 arm_cmsis_nn_status arm_fully_connected_per_channel_s8(const cmsis_nn_context *ctx, 1693 const cmsis_nn_fc_params *fc_params, 1694 const cmsis_nn_per_channel_quant_params *quant_params, 1695 const cmsis_nn_dims *input_dims, 1696 const int8_t *input_data, 1697 const cmsis_nn_dims *filter_dims, 1698 const int8_t *filter_data, 1699 const cmsis_nn_dims *bias_dims, 1700 const int32_t *bias_data, 1701 const cmsis_nn_dims *output_dims, 1702 int8_t *output_data); 1703 1704 /** 1705 * @brief s8 Fully Connected layer wrapper function 1706 * 1707 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1708 * definition file to see if an additional buffer is required. 1709 * Optional function {API}_get_buffer_size() provides the buffer 1710 * size if an additional buffer is required. 
1711 * The caller is expected to clear the buffer, if applicable, for security reasons. 1712 * @param[in] fc_params Fully Connected layer parameters. 1713 * Range of fc_params->input_offset : [-127, 128] 1714 * fc_params->filter_offset : 0 1715 * Range of fc_params->output_offset : [-128, 127] 1716 * @param[in] quant_params Per-channel or per-tensor quantization info. Check struct definition for details. 1717 * It contains the multiplier and shift value(s) to be applied to each output channel 1718 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 1719 * Input dimension is taken as Nx(H * W * C_IN) 1720 * @param[in] input_data Input (activation) data pointer. Data type: int8 1721 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C] 1722 * N : accumulation depth and equals (H * W * C_IN) from input_dims 1723 * C : output depth and equals C_OUT in output_dims 1724 * H & W : Not used 1725 * @param[in] filter_data Filter data pointer. Data type: int8 1726 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 1727 * N, H, W : Not used 1728 * @param[in] bias_data Bias data pointer. Data type: int32 1729 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] 1730 * N : Batches 1731 * C_OUT : Output depth 1732 * H & W : Not used. 1733 * @param[in, out] output_data Output data pointer. Data type: int8 1734 * 1735 * @return The function returns either 1736 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 1737 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 
1738 * 1739 * @details 1740 * - Supported framework: TensorFlow Lite 1741 */ 1742 arm_cmsis_nn_status arm_fully_connected_wrapper_s8(const cmsis_nn_context *ctx, 1743 const cmsis_nn_fc_params *fc_params, 1744 const cmsis_nn_quant_params *quant_params, 1745 const cmsis_nn_dims *input_dims, 1746 const int8_t *input_data, 1747 const cmsis_nn_dims *filter_dims, 1748 const int8_t *filter_data, 1749 const cmsis_nn_dims *bias_dims, 1750 const int32_t *bias_data, 1751 const cmsis_nn_dims *output_dims, 1752 int8_t *output_data); 1753 1754 /** 1755 * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s32 bias_data. 1756 * @param[in, out] vector_sum_buf Buffer for vector sums 1757 * @param[in] vector_cols Number of vector columns 1758 * @param[in] vector_rows Number of vector rows 1759 * @param[in] vector_data Vector of weights data 1760 * @param[in] lhs_offset Constant multiplied with each sum 1761 * @param[in] rhs_offset Constant added to each vector element before sum 1762 * @param[in] bias_data Vector of bias data, added to each sum. 1763 * @return The function returns 1764 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 1765 */ 1766 arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf, 1767 const int32_t vector_cols, 1768 const int32_t vector_rows, 1769 const int8_t *vector_data, 1770 const int32_t lhs_offset, 1771 const int32_t rhs_offset, 1772 const int32_t *bias_data); 1773 1774 /** 1775 * @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s64 bias_data. 1776 * @param[in, out] vector_sum_buf Buffer for vector sums 1777 * @param[in] vector_cols Number of vector columns 1778 * @param[in] vector_rows Number of vector rows 1779 * @param[in] vector_data Vector of weights data 1780 * @param[in] lhs_offset Constant multiplied with each sum 1781 * @param[in] bias_data Vector of bias data, added to each sum. 
1782 * @return The function returns 1783 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 1784 */ 1785 arm_cmsis_nn_status arm_vector_sum_s8_s64(int64_t *vector_sum_buf, 1786 const int32_t vector_cols, 1787 const int32_t vector_rows, 1788 const int8_t *vector_data, 1789 const int32_t lhs_offset, 1790 const int64_t *bias_data); 1791 1792 /** 1793 * @brief Get size of additional buffer required by arm_fully_connected_s8(). 1794 * See also arm_vector_sum_s8, which is required if buffer size is > 0. 1795 * @param[in] filter_dims dimension of filter 1796 * @return The function returns required buffer size in bytes 1797 * 1798 */ 1799 int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); 1800 1801 /** 1802 * @brief Get size of additional buffer required by arm_fully_connected_s8() for processors with DSP extension. 1803 * Refer to arm_fully_connected_s8_get_buffer_size() for function argument details. 1804 * 1805 * @note Intended for compilation on Host. If compiling for an Arm target, use 1806 * arm_fully_connected_s8_get_buffer_size(). 1807 * 1808 */ 1809 int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); 1810 1811 /** 1812 * @brief Get size of additional buffer required by arm_fully_connected_s8() for Arm(R) Helium Architecture case. 1813 * Refer to arm_fully_connected_s8_get_buffer_size() for function argument details. 1814 * 1815 * @note Intended for compilation on Host. If compiling for an Arm target, use 1816 * arm_fully_connected_s8_get_buffer_size(). 1817 * 1818 */ 1819 int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); 1820 1821 /** 1822 * @brief Basic s16 Fully Connected function. 1823 * 1824 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1825 * definition file to see if an additional buffer is required. 
1826 * Optional function {API}_get_buffer_size() provides the buffer 1827 * size if an additional buffer is required. 1828 * The caller is expected to clear the buffer, if applicable, for security reasons. 1829 * @param[in] fc_params Fully Connected layer parameters. 1830 * fc_params->input_offset : 0 1831 * fc_params->filter_offset : 0 1832 * fc_params->output_offset : 0 1833 * @param[in] quant_params Per-tensor quantization info. 1834 * It contains the multiplier and shift value to be applied to the output tensor. 1835 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 1836 * Input dimension is taken as Nx(H * W * C_IN) 1837 * @param[in] input_data Input (activation) data pointer. Data type: int16 1838 * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C] 1839 * N : accumulation depth and equals (H * W * C_IN) from input_dims 1840 * C : output depth and equals C_OUT in output_dims 1841 * H & W : Not used 1842 * @param[in] filter_data Filter data pointer. Data type: int8 1843 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 1844 * N, H, W : Not used 1845 * @param[in] bias_data Bias data pointer. Data type: int64 1846 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] 1847 * N : Batches 1848 * C_OUT : Output depth 1849 * H & W : Not used. 1850 * @param[in, out] output_data Output data pointer. 
Data type: int16 1851 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 1852 * 1853 * @details 1854 * - Supported framework: TensorFlow Lite 1855 */ 1856 arm_cmsis_nn_status arm_fully_connected_s16(const cmsis_nn_context *ctx, 1857 const cmsis_nn_fc_params *fc_params, 1858 const cmsis_nn_per_tensor_quant_params *quant_params, 1859 const cmsis_nn_dims *input_dims, 1860 const int16_t *input_data, 1861 const cmsis_nn_dims *filter_dims, 1862 const int8_t *filter_data, 1863 const cmsis_nn_dims *bias_dims, 1864 const int64_t *bias_data, 1865 const cmsis_nn_dims *output_dims, 1866 int16_t *output_data); 1867 1868 /** 1869 * @brief Get size of additional buffer required by arm_fully_connected_s16(). 1870 * @param[in] filter_dims dimension of filter 1871 * @return The function returns required buffer size in bytes 1872 * 1873 */ 1874 int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims); 1875 1876 /** 1877 * @brief Get size of additional buffer required by arm_fully_connected_s16() for processors with DSP extension. 1878 * Refer to arm_fully_connected_s16_get_buffer_size() for function argument details. 1879 * 1880 * @note Intended for compilation on Host. If compiling for an Arm target, use 1881 * arm_fully_connected_s16_get_buffer_size(). 1882 * 1883 */ 1884 int32_t arm_fully_connected_s16_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); 1885 1886 /** 1887 * @brief Get size of additional buffer required by arm_fully_connected_s16() for Arm(R) Helium Architecture case. 1888 * Refer to arm_fully_connected_s16_get_buffer_size() for function argument details. 1889 * 1890 * @note Intended for compilation on Host. If compiling for an Arm target, use 1891 * arm_fully_connected_s16_get_buffer_size(). 
1892 * 1893 */ 1894 int32_t arm_fully_connected_s16_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); 1895 1896 /** 1897 * @defgroup groupElementwise Elementwise Functions 1898 * 1899 * Elementwise add and multiplication functions. 1900 * 1901 */ 1902 1903 /** 1904 * @brief s8 elementwise add of two vectors 1905 * @param[in] input_1_vect pointer to input vector 1 1906 * @param[in] input_2_vect pointer to input vector 2 1907 * @param[in] input_1_offset offset for input 1. Range: -127 to 128 1908 * @param[in] input_1_mult multiplier for input 1 1909 * @param[in] input_1_shift shift for input 1 1910 * @param[in] input_2_offset offset for input 2. Range: -127 to 128 1911 * @param[in] input_2_mult multiplier for input 2 1912 * @param[in] input_2_shift shift for input 2 1913 * @param[in] left_shift input left shift 1914 * @param[in,out] output pointer to output vector 1915 * @param[in] out_offset output offset. Range: -128 to 127 1916 * @param[in] out_mult output multiplier 1917 * @param[in] out_shift output shift 1918 * @param[in] out_activation_min minimum value to clamp output to. Min: -128 1919 * @param[in] out_activation_max maximum value to clamp output to. 
Max: 127 1920 * @param[in] block_size number of samples 1921 * @return The function returns ARM_CMSIS_NN_SUCCESS 1922 */ 1923 arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, 1924 const int8_t *input_2_vect, 1925 const int32_t input_1_offset, 1926 const int32_t input_1_mult, 1927 const int32_t input_1_shift, 1928 const int32_t input_2_offset, 1929 const int32_t input_2_mult, 1930 const int32_t input_2_shift, 1931 const int32_t left_shift, 1932 int8_t *output, 1933 const int32_t out_offset, 1934 const int32_t out_mult, 1935 const int32_t out_shift, 1936 const int32_t out_activation_min, 1937 const int32_t out_activation_max, 1938 const int32_t block_size); 1939 1940 /** 1941 * @brief s16 elementwise add of two vectors 1942 * @param[in] input_1_vect pointer to input vector 1 1943 * @param[in] input_2_vect pointer to input vector 2 1944 * @param[in] input_1_offset offset for input 1. Not used. 1945 * @param[in] input_1_mult multiplier for input 1 1946 * @param[in] input_1_shift shift for input 1 1947 * @param[in] input_2_offset offset for input 2. Not used. 1948 * @param[in] input_2_mult multiplier for input 2 1949 * @param[in] input_2_shift shift for input 2 1950 * @param[in] left_shift input left shift 1951 * @param[in,out] output pointer to output vector 1952 * @param[in] out_offset output offset. Not used. 1953 * @param[in] out_mult output multiplier 1954 * @param[in] out_shift output shift 1955 * @param[in] out_activation_min minimum value to clamp output to. Min: -32768 1956 * @param[in] out_activation_max maximum value to clamp output to. 
Max: 32767 1957 * @param[in] block_size number of samples 1958 * @return The function returns ARM_CMSIS_NN_SUCCESS 1959 */ 1960 arm_cmsis_nn_status arm_elementwise_add_s16(const int16_t *input_1_vect, 1961 const int16_t *input_2_vect, 1962 const int32_t input_1_offset, 1963 const int32_t input_1_mult, 1964 const int32_t input_1_shift, 1965 const int32_t input_2_offset, 1966 const int32_t input_2_mult, 1967 const int32_t input_2_shift, 1968 const int32_t left_shift, 1969 int16_t *output, 1970 const int32_t out_offset, 1971 const int32_t out_mult, 1972 const int32_t out_shift, 1973 const int32_t out_activation_min, 1974 const int32_t out_activation_max, 1975 const int32_t block_size); 1976 1977 /** 1978 * @brief s8 elementwise multiplication 1979 * @param[in] input_1_vect pointer to input vector 1 1980 * @param[in] input_2_vect pointer to input vector 2 1981 * @param[in] input_1_offset offset for input 1. Range: -127 to 128 1982 * @param[in] input_2_offset offset for input 2. Range: -127 to 128 1983 * @param[in,out] output pointer to output vector 1984 * @param[in] out_offset output offset. Range: -128 to 127 1985 * @param[in] out_mult output multiplier 1986 * @param[in] out_shift output shift 1987 * @param[in] out_activation_min minimum value to clamp output to. Min: -128 1988 * @param[in] out_activation_max maximum value to clamp output to. 
Max: 127 1989 * @param[in] block_size number of samples 1990 * @return The function returns ARM_CMSIS_NN_SUCCESS 1991 * 1992 * @details Supported framework: TensorFlow Lite micro 1993 */ 1994 arm_cmsis_nn_status arm_elementwise_mul_s8(const int8_t *input_1_vect, 1995 const int8_t *input_2_vect, 1996 const int32_t input_1_offset, 1997 const int32_t input_2_offset, 1998 int8_t *output, 1999 const int32_t out_offset, 2000 const int32_t out_mult, 2001 const int32_t out_shift, 2002 const int32_t out_activation_min, 2003 const int32_t out_activation_max, 2004 const int32_t block_size); 2005 2006 /** 2007 * @brief s16 elementwise multiplication 2008 * @param[in] input_1_vect pointer to input vector 1 2009 * @param[in] input_2_vect pointer to input vector 2 2010 * @param[in] input_1_offset offset for input 1. Not used. 2011 * @param[in] input_2_offset offset for input 2. Not used. 2012 * @param[in,out] output pointer to output vector 2013 * @param[in] out_offset output offset. Not used. 2014 * @param[in] out_mult output multiplier 2015 * @param[in] out_shift output shift 2016 * @param[in] out_activation_min minimum value to clamp output to. Min: -32768 2017 * @param[in] out_activation_max maximum value to clamp output to. 
Max: 32767 2018 * @param[in] block_size number of samples 2019 * @return The function returns ARM_CMSIS_NN_SUCCESS 2020 * 2021 * @details Supported framework: TensorFlow Lite micro 2022 */ 2023 arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect, 2024 const int16_t *input_2_vect, 2025 const int32_t input_1_offset, 2026 const int32_t input_2_offset, 2027 int16_t *output, 2028 const int32_t out_offset, 2029 const int32_t out_mult, 2030 const int32_t out_shift, 2031 const int32_t out_activation_min, 2032 const int32_t out_activation_max, 2033 const int32_t block_size); 2034 2035 /** 2036 * @defgroup Acti Activation Functions 2037 * 2038 * Perform activation layers, including ReLU (Rectified Linear Unit), 2039 * sigmoid and tanh 2040 * 2041 */ 2042 2043 /** 2044 * @brief Q7 RELU function 2045 * @param[in,out] data pointer to input 2046 * @param[in] size number of elements 2047 */ 2048 void arm_relu_q7(int8_t *data, uint16_t size); 2049 2050 /** 2051 * @brief s8 ReLU6 function 2052 * @param[in,out] data pointer to input 2053 * @param[in] size number of elements 2054 */ 2055 void arm_relu6_s8(int8_t *data, uint16_t size); 2056 2057 /** 2058 * @brief Q15 RELU function 2059 * @param[in,out] data pointer to input 2060 * @param[in] size number of elements 2061 */ 2062 void arm_relu_q15(int16_t *data, uint16_t size); 2063 2064 /** 2065 * @brief s16 neural network activation function using direct table look-up 2066 * @param[in] input pointer to input data 2067 * @param[out] output pointer to output 2068 * @param[in] size number of elements 2069 * @param[in] left_shift bit-width of the integer part, assumed to be smaller than 3. 2070 * @param[in] type type of activation functions 2071 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2072 2073 * 2074 * @details Supported framework: TensorFlow Lite for Microcontrollers. 
2075 * This activation function must be bit precise congruent with the corresponding TFLM tanh and sigmoid activation 2076 * functions 2077 */ 2078 arm_cmsis_nn_status arm_nn_activation_s16(const int16_t *input, 2079 int16_t *output, 2080 const int32_t size, 2081 const int32_t left_shift, 2082 const arm_nn_activation_type type); 2083 2084 /** 2085 * @defgroup Pooling Pooling Functions 2086 * 2087 * Perform max and average pooling operations 2088 * 2089 */ 2090 2091 /** 2092 * @brief s8 average pooling function. 2093 * 2094 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 2095 * definition file to see if an additional buffer is required. 2096 * Optional function {API}_get_buffer_size() provides the buffer 2097 * size if an additional buffer is required. 2098 * The caller is expected to clear the buffer, if applicable, for security reasons. 2099 * @param[in] pool_params Pooling parameters 2100 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 2101 * @param[in] input_data Input (activation) data pointer. Data type: int8 2102 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] 2103 * Argument N and C are not used. 2104 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] 2105 * Argument N is not used. 2106 * C_OUT equals C_IN. 2107 * @param[in, out] output_data Output data pointer. Data type: int8 2108 * 2109 * @return The function returns either 2110 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 2111 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 
2112 * 2113 * @details 2114 * - Supported Framework: TensorFlow Lite 2115 * 2116 */ 2117 arm_cmsis_nn_status arm_avgpool_s8(const cmsis_nn_context *ctx, 2118 const cmsis_nn_pool_params *pool_params, 2119 const cmsis_nn_dims *input_dims, 2120 const int8_t *input_data, 2121 const cmsis_nn_dims *filter_dims, 2122 const cmsis_nn_dims *output_dims, 2123 int8_t *output_data); 2124 2125 /** 2126 * @brief Get the required buffer size for S8 average pooling function 2127 * @param[in] dim_dst_width output tensor dimension 2128 * @param[in] ch_src number of input tensor channels 2129 * @return The function returns required buffer size in bytes 2130 * 2131 */ 2132 int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src); 2133 2134 /** 2135 * @brief Get the required buffer size for S8 average pooling function for processors with DSP extension. 2136 * Refer to arm_avgpool_s8_get_buffer_size() for function argument details. 2137 * 2138 * @note Intended for compilation on Host. If compiling for an Arm target, use 2139 * arm_avgpool_s8_get_buffer_size(). 2140 * 2141 */ 2142 int32_t arm_avgpool_s8_get_buffer_size_dsp(const int dim_dst_width, const int ch_src); 2143 2144 /** 2145 * @brief Get the required buffer size for S8 average pooling function for Arm(R) Helium Architecture case. 2146 * Refer to arm_avgpool_s8_get_buffer_size() for function argument details. 2147 * 2148 * @note Intended for compilation on Host. If compiling for an Arm target, use 2149 * arm_avgpool_s8_get_buffer_size(). 2150 * 2151 */ 2152 int32_t arm_avgpool_s8_get_buffer_size_mve(const int dim_dst_width, const int ch_src); 2153 2154 /** 2155 * @brief s16 average pooling function. 2156 * 2157 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 2158 * definition file to see if an additional buffer is required. 2159 * Optional function {API}_get_buffer_size() provides the buffer 2160 * size if an additional buffer is required. 
2161 * The caller is expected to clear the buffer, if applicable, for security reasons. 2162 * @param[in] pool_params Pooling parameters 2163 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 2164 * @param[in] input_data Input (activation) data pointer. Data type: int16 2165 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] 2166 * Argument N and C are not used. 2167 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] 2168 * Argument N is not used. 2169 * C_OUT equals C_IN. 2170 * @param[in, out] output_data Output data pointer. Data type: int16 2171 * 2172 * @return The function returns 2173 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 2174 * <code>ARM_CMSIS_NN_ARG_ERROR</code> - In case of invalid arguments 2175 * 2176 * @details 2177 * - Supported Framework: TensorFlow Lite 2178 * 2179 */ 2180 arm_cmsis_nn_status arm_avgpool_s16(const cmsis_nn_context *ctx, 2181 const cmsis_nn_pool_params *pool_params, 2182 const cmsis_nn_dims *input_dims, 2183 const int16_t *input_data, 2184 const cmsis_nn_dims *filter_dims, 2185 const cmsis_nn_dims *output_dims, 2186 int16_t *output_data); 2187 2188 /** 2189 * @brief Get the required buffer size for S16 average pooling function 2190 * @param[in] dim_dst_width output tensor dimension 2191 * @param[in] ch_src number of input tensor channels 2192 * @return The function returns required buffer size in bytes 2193 * 2194 */ 2195 int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src); 2196 2197 /** 2198 * @brief Get the required buffer size for S16 average pooling function for processors with DSP extension. 2199 * Refer to arm_avgpool_s16_get_buffer_size() for function argument details. 2200 * 2201 * @note Intended for compilation on Host. If compiling for an Arm target, use 2202 * arm_avgpool_s16_get_buffer_size(). 
2203 * 2204 */ 2205 int32_t arm_avgpool_s16_get_buffer_size_dsp(const int dim_dst_width, const int ch_src); 2206 2207 /** 2208 * @brief Get the required buffer size for S16 average pooling function for Arm(R) Helium Architecture case. 2209 * Refer to arm_avgpool_s16_get_buffer_size() for function argument details. 2210 * 2211 * @note Intended for compilation on Host. If compiling for an Arm target, use 2212 * arm_avgpool_s16_get_buffer_size(). 2213 * 2214 */ 2215 int32_t arm_avgpool_s16_get_buffer_size_mve(const int dim_dst_width, const int ch_src); 2216 2217 /** 2218 * @brief s8 max pooling function. 2219 * 2220 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 2221 * definition file to see if an additional buffer is required. 2222 * Optional function {API}_get_buffer_size() provides the buffer 2223 * size if an additional buffer is required. 2224 * The caller is expected to clear the buffer, if applicable, for security reasons. 2225 * @param[in] pool_params Pooling parameters 2226 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 2227 * @param[in] input_data Input (activation) data pointer. The input tensor must not 2228 * overlap with the output tensor. Data type: int8 2229 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] 2230 * Argument N and C are not used. 2231 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] 2232 * Argument N is not used. 2233 * C_OUT equals C_IN. 2234 * @param[in, out] output_data Output data pointer. Data type: int8 2235 * 2236 * @return The function returns either 2237 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 2238 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 
2239 * 2240 * @details 2241 * - Supported Framework: TensorFlow Lite 2242 * 2243 */ 2244 arm_cmsis_nn_status arm_max_pool_s8(const cmsis_nn_context *ctx, 2245 const cmsis_nn_pool_params *pool_params, 2246 const cmsis_nn_dims *input_dims, 2247 const int8_t *input_data, 2248 const cmsis_nn_dims *filter_dims, 2249 const cmsis_nn_dims *output_dims, 2250 int8_t *output_data); 2251 2252 /** 2253 * @brief s16 max pooling function. 2254 * 2255 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 2256 * definition file to see if an additional buffer is required. 2257 * Optional function {API}_get_buffer_size() provides the buffer 2258 * size if an additional buffer is required. 2259 * The caller is expected to clear the buffer, if applicable, for security reasons. 2260 * @param[in] pool_params Pooling parameters 2261 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 2262 * @param[in] src Input (activation) data pointer. The input tensor must not 2263 * overlap with the output tensor. Data type: int16 2264 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] 2265 * Argument N and C are not used. 2266 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] 2267 * Argument N is not used. 2268 * C_OUT equals C_IN. 2269 * @param[in, out] dst Output data pointer. Data type: int16 2270 * 2271 * @return The function returns either 2272 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 2273 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 
2274 * 2275 * @details 2276 * - Supported Framework: TensorFlow Lite 2277 * 2278 */ 2279 arm_cmsis_nn_status arm_max_pool_s16(const cmsis_nn_context *ctx, 2280 const cmsis_nn_pool_params *pool_params, 2281 const cmsis_nn_dims *input_dims, 2282 const int16_t *src, 2283 const cmsis_nn_dims *filter_dims, 2284 const cmsis_nn_dims *output_dims, 2285 int16_t *dst); 2286 2287 /** 2288 * @defgroup Softmax Softmax Functions 2289 * 2290 * 2291 */ 2292 2293 /** 2294 * @brief S8 softmax function 2295 * @param[in] input Pointer to the input tensor 2296 * @param[in] num_rows Number of rows in the input tensor 2297 * @param[in] row_size Number of elements in each input row 2298 * @param[in] mult Input quantization multiplier 2299 * @param[in] shift Input quantization shift within the range [0, 31] 2300 * @param[in] diff_min Minimum difference with max in row. Used to check if 2301 * the quantized exponential operation can be performed 2302 * @param[out] output Pointer to the output tensor 2303 * 2304 * @note Supported framework: TensorFlow Lite micro (bit-accurate) 2305 * 2306 */ 2307 void arm_softmax_s8(const int8_t *input, 2308 const int32_t num_rows, 2309 const int32_t row_size, 2310 const int32_t mult, 2311 const int32_t shift, 2312 const int32_t diff_min, 2313 int8_t *output); 2314 2315 /** 2316 * @brief S8 to s16 softmax function 2317 * @param[in] input Pointer to the input tensor 2318 * @param[in] num_rows Number of rows in the input tensor 2319 * @param[in] row_size Number of elements in each input row 2320 * @param[in] mult Input quantization multiplier 2321 * @param[in] shift Input quantization shift within the range [0, 31] 2322 * @param[in] diff_min Minimum difference with max in row. 
Used to check if 2323 * the quantized exponential operation can be performed 2324 * @param[out] output Pointer to the output tensor 2325 * 2326 * @note Supported framework: TensorFlow Lite micro (bit-accurate) 2327 * 2328 */ 2329 void arm_softmax_s8_s16(const int8_t *input, 2330 const int32_t num_rows, 2331 const int32_t row_size, 2332 const int32_t mult, 2333 const int32_t shift, 2334 const int32_t diff_min, 2335 int16_t *output); 2336 2337 /** 2338 * @brief S16 softmax function 2339 * @param[in] input Pointer to the input tensor 2340 * @param[in] num_rows Number of rows in the input tensor 2341 * @param[in] row_size Number of elements in each input row 2342 * @param[in] mult Input quantization multiplier 2343 * @param[in] shift Input quantization shift within the range [0, 31] 2344 * @param[in] softmax_params Softmax s16 layer parameters with two pointers to LUTs specified below. 2345 * For indexing the high 9 bits are used and 7 remaining for interpolation. 2346 * That means 512 entries for the 9-bit indexing and 1 extra for interpolation, i.e. 513 2347 * values for each LUT. 
2348 * - Lookup table for exp(x), where x is uniformly distributed between [-10.0 , 0.0] 2349 * - Lookup table for 1 / (1 + x), where x is uniformly distributed between [0.0 , 1.0] 2350 * @param[out] output Pointer to the output tensor 2351 * @return The function returns 2352 * <code>ARM_CMSIS_NN_ARG_ERROR</code> Argument error check failed 2353 * <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation 2354 * 2355 * @note Supported framework: TensorFlow Lite micro (bit-accurate) 2356 * 2357 */ 2358 arm_cmsis_nn_status arm_softmax_s16(const int16_t *input, 2359 const int32_t num_rows, 2360 const int32_t row_size, 2361 const int32_t mult, 2362 const int32_t shift, 2363 const cmsis_nn_softmax_lut_s16 *softmax_params, 2364 int16_t *output); 2365 2366 /** 2367 * @brief U8 softmax function 2368 * @param[in] input Pointer to the input tensor 2369 * @param[in] num_rows Number of rows in the input tensor 2370 * @param[in] row_size Number of elements in each input row 2371 * @param[in] mult Input quantization multiplier 2372 * @param[in] shift Input quantization shift within the range [0, 31] 2373 * @param[in] diff_min Minimum difference with max in row. 
Used to check if 2374 * the quantized exponential operation can be performed 2375 * @param[out] output Pointer to the output tensor 2376 * 2377 * @note Supported framework: TensorFlow Lite micro (bit-accurate) 2378 * 2379 */ 2380 2381 void arm_softmax_u8(const uint8_t *input, 2382 const int32_t num_rows, 2383 const int32_t row_size, 2384 const int32_t mult, 2385 const int32_t shift, 2386 const int32_t diff_min, 2387 uint8_t *output); 2388 2389 /** 2390 * @defgroup Reshape Reshape Functions 2391 * 2392 */ 2393 2394 /** 2395 * @brief Reshape a s8 vector into another with different shape 2396 * @param[in] input points to the s8 input vector 2397 * @param[out] output points to the s8 output vector 2398 * @param[in] total_size total size of the input and output vectors in bytes 2399 * 2400 * @note The output is expected to be in a memory area that does not overlap with the input's 2401 * 2402 */ 2403 void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size); 2404 2405 /** 2406 * @defgroup Transpose Transpose Functions 2407 * 2408 */ 2409 2410 /** 2411 * @brief Basic transpose function 2412 * 2413 * @param[in] input_data Input (activation) data pointer. Data type: int8 2414 * @param[out] output_data Output data pointer. Data type: int8 2415 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 2416 * @param[in] output_dims Output tensor dimensions. Format may be arbitrary relative to input format. 2417 * The output dimension will depend on the permutation dimensions. 2418 * In other words the out dimensions are the result of applying the permutation 2419 * to the input dimensions. 2420 * @param[in] transpose_params Transpose parameters. Contains permutation dimensions. 2421 * 2422 * @return The function returns either 2423 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 2424 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 
2425 * 2426 */ 2427 arm_cmsis_nn_status arm_transpose_s8(const int8_t *input_data, 2428 int8_t *const output_data, 2429 const cmsis_nn_dims *const input_dims, 2430 const cmsis_nn_dims *const output_dims, 2431 const cmsis_nn_transpose_params *const transpose_params); 2432 2433 /** 2434 * @defgroup Concatenation Concatenation Functions 2435 * 2436 */ 2437 2438 /** 2439 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis 2440 * This function should be called for each input tensor to concatenate. The argument offset_x 2441 * will be used to store the input tensor in the correct position in the output tensor 2442 * 2443 * i.e. offset_x = 0 2444 * for(i = 0 i < num_input_tensors; ++i) 2445 * { 2446 * arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x) 2447 * offset_x += input_x[i] 2448 * } 2449 * 2450 * This function assumes that the output tensor has: 2451 * -# The same height of the input tensor 2452 * -# The same number of channels of the input tensor 2453 * -# The same batch size of the input tensor 2454 * 2455 * Unless specified otherwise, arguments are mandatory. 2456 * 2457 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it 2458 * does not involve any arithmetic operation 2459 * 2460 * @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor. 2461 * @param[in] input_x Width of input tensor 2462 * @param[in] input_y Height of input tensor 2463 * @param[in] input_z Channels in input tensor 2464 * @param[in] input_w Batch size in input tensor 2465 * @param[out] output Pointer to output tensor. Expected to be at least 2466 * (input_x * input_y * input_z * input_w) + offset_x 2467 * bytes. 
2468 * @param[in] output_x Width of output tensor 2469 * @param[in] offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor 2470 * It is user responsibility to provide the correct value 2471 * 2472 * <b> Input constraints</b> 2473 * offset_x is less than output_x 2474 * 2475 */ 2476 void arm_concatenation_s8_x(const int8_t *input, 2477 const uint16_t input_x, 2478 const uint16_t input_y, 2479 const uint16_t input_z, 2480 const uint16_t input_w, 2481 int8_t *output, 2482 const uint16_t output_x, 2483 const uint32_t offset_x); 2484 2485 /** 2486 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis 2487 * This function should be called for each input tensor to concatenate. The argument offset_y 2488 * will be used to store the input tensor in the correct position in the output tensor 2489 * 2490 * i.e. offset_y = 0 2491 * for(i = 0 i < num_input_tensors; ++i) 2492 * { 2493 * arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y) 2494 * offset_y += input_y[i] 2495 * } 2496 * 2497 * This function assumes that the output tensor has: 2498 * -# The same width of the input tensor 2499 * -# The same number of channels of the input tensor 2500 * -# The same batch size of the input tensor 2501 * 2502 * Unless specified otherwise, arguments are mandatory. 2503 * 2504 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it 2505 * does not involve any arithmetic operation 2506 * 2507 * @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor. 2508 * @param[in] input_x Width of input tensor 2509 * @param[in] input_y Height of input tensor 2510 * @param[in] input_z Channels in input tensor 2511 * @param[in] input_w Batch size in input tensor 2512 * @param[out] output Pointer to output tensor. Expected to be at least 2513 * (input_z * input_w * input_x * input_y) + offset_y 2514 * bytes. 
2515 * @param[in] output_y Height of output tensor 2516 * @param[in] offset_y The offset on the Y axis to start concatenating the input tensor 2517 * It is user responsibility to provide the correct value 2518 * 2519 * <b> Input constraints</b> 2520 * offset_y is less than output_y 2521 * 2522 */ 2523 void arm_concatenation_s8_y(const int8_t *input, 2524 const uint16_t input_x, 2525 const uint16_t input_y, 2526 const uint16_t input_z, 2527 const uint16_t input_w, 2528 int8_t *output, 2529 const uint16_t output_y, 2530 const uint32_t offset_y); 2531 2532 /** 2533 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis 2534 * This function should be called for each input tensor to concatenate. The argument offset_z 2535 * will be used to store the input tensor in the correct position in the output tensor 2536 * 2537 * i.e. offset_z = 0 2538 * for(i = 0 i < num_input_tensors; ++i) 2539 * { 2540 * arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z) 2541 * offset_z += input_z[i] 2542 * } 2543 * 2544 * This function assumes that the output tensor has: 2545 * -# The same width of the input tensor 2546 * -# The same height of the input tensor 2547 * -# The same batch size of the input tensor 2548 * 2549 * Unless specified otherwise, arguments are mandatory. 2550 * 2551 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it 2552 * does not involve any arithmetic operation 2553 * 2554 * @param[in] input Pointer to input tensor. Input tensor must not overlap with output tensor. 2555 * @param[in] input_x Width of input tensor 2556 * @param[in] input_y Height of input tensor 2557 * @param[in] input_z Channels in input tensor 2558 * @param[in] input_w Batch size in input tensor 2559 * @param[out] output Pointer to output tensor. Expected to be at least 2560 * (input_x * input_y * input_z * input_w) + offset_z 2561 * bytes. 
2562 * @param[in] output_z Channels in output tensor 2563 * @param[in] offset_z The offset on the Z axis to start concatenating the input tensor 2564 * It is user responsibility to provide the correct value 2565 * 2566 * <b> Input constraints</b> 2567 * offset_z is less than output_z 2568 * 2569 */ 2570 void arm_concatenation_s8_z(const int8_t *input, 2571 const uint16_t input_x, 2572 const uint16_t input_y, 2573 const uint16_t input_z, 2574 const uint16_t input_w, 2575 int8_t *output, 2576 const uint16_t output_z, 2577 const uint32_t offset_z); 2578 2579 /** 2580 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size) 2581 * This function should be called for each input tensor to concatenate. The argument offset_w 2582 * will be used to store the input tensor in the correct position in the output tensor 2583 * 2584 * i.e. offset_w = 0 2585 * for(i = 0 i < num_input_tensors; ++i) 2586 * { 2587 * arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w) 2588 * offset_w += input_w[i] 2589 * } 2590 * 2591 * This function assumes that the output tensor has: 2592 * -# The same width of the input tensor 2593 * -# The same height of the input tensor 2594 * -# The same number of channels of the input tensor 2595 * 2596 * Unless specified otherwise, arguments are mandatory. 2597 * 2598 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it 2599 * does not involve any arithmetic operation 2600 * 2601 * @param[in] input Pointer to input tensor 2602 * @param[in] input_x Width of input tensor 2603 * @param[in] input_y Height of input tensor 2604 * @param[in] input_z Channels in input tensor 2605 * @param[in] input_w Batch size in input tensor 2606 * @param[out] output Pointer to output tensor. Expected to be at least 2607 * input_x * input_y * input_z * input_w 2608 * bytes. 
2609 * @param[in] offset_w The offset on the W axis to start concatenating the input tensor 2610 * It is user responsibility to provide the correct value 2611 * 2612 */ 2613 void arm_concatenation_s8_w(const int8_t *input, 2614 const uint16_t input_x, 2615 const uint16_t input_y, 2616 const uint16_t input_z, 2617 const uint16_t input_w, 2618 int8_t *output, 2619 const uint32_t offset_w); 2620 /** 2621 * @defgroup SVDF SVDF Functions 2622 * 2623 */ 2624 2625 /** 2626 * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights 2627 * 2628 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 2629 * definition file to see if an additional buffer is required. 2630 * Optional function arm_svdf_s8_get_buffer_size() provides the buffer 2631 * size if an additional buffer is required. 2632 * The caller is expected to clear the buffer, if applicable, for security reasons. 2633 * @param[in] input_ctx Temporary scratch buffer 2634 * The caller is expected to clear the buffer, if applicable, for security reasons. 2635 * @param[in] output_ctx Temporary output scratch buffer 2636 * The caller is expected to clear the buffer, if applicable, for security reasons. 
2637 * @param[in] svdf_params SVDF Parameters 2638 * Range of svdf_params->input_offset : [-128, 127] 2639 * Range of svdf_params->output_offset : [-128, 127] 2640 * @param[in] input_quant_params Input quantization parameters 2641 * @param[in] output_quant_params Output quantization parameters 2642 * @param[in] input_dims Input tensor dimensions 2643 * @param[in] input_data Pointer to input tensor 2644 * @param[in] state_dims State tensor dimensions 2645 * @param[in] state_data Pointer to state tensor 2646 * @param[in] weights_feature_dims Weights (feature) tensor dimensions 2647 * @param[in] weights_feature_data Pointer to the weights (feature) tensor 2648 * @param[in] weights_time_dims Weights (time) tensor dimensions 2649 * @param[in] weights_time_data Pointer to the weights (time) tensor 2650 * @param[in] bias_dims Bias tensor dimensions 2651 * @param[in] bias_data Pointer to bias tensor 2652 * @param[in] output_dims Output tensor dimensions 2653 * @param[out] output_data Pointer to the output tensor 2654 * 2655 * @return The function returns either 2656 * <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or, 2657 * <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion. 2658 * 2659 * @details 2660 * 1. 
Supported framework: TensorFlow Lite micro 2661 */ 2662 arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx, 2663 const cmsis_nn_context *input_ctx, 2664 const cmsis_nn_context *output_ctx, 2665 const cmsis_nn_svdf_params *svdf_params, 2666 const cmsis_nn_per_tensor_quant_params *input_quant_params, 2667 const cmsis_nn_per_tensor_quant_params *output_quant_params, 2668 const cmsis_nn_dims *input_dims, 2669 const int8_t *input_data, 2670 const cmsis_nn_dims *state_dims, 2671 int8_t *state_data, 2672 const cmsis_nn_dims *weights_feature_dims, 2673 const int8_t *weights_feature_data, 2674 const cmsis_nn_dims *weights_time_dims, 2675 const int8_t *weights_time_data, 2676 const cmsis_nn_dims *bias_dims, 2677 const int32_t *bias_data, 2678 const cmsis_nn_dims *output_dims, 2679 int8_t *output_data); 2680 2681 /** 2682 * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights 2683 * 2684 * @param[in] input_ctx Temporary scratch buffer 2685 * The caller is expected to clear the buffer, if applicable, for security reasons. 2686 * @param[in] output_ctx Temporary output scratch buffer 2687 * The caller is expected to clear the buffer, if applicable, for security reasons. 
2688 * @param[in] svdf_params SVDF Parameters 2689 * Range of svdf_params->input_offset : [-128, 127] 2690 * Range of svdf_params->output_offset : [-128, 127] 2691 * @param[in] input_quant_params Input quantization parameters 2692 * @param[in] output_quant_params Output quantization parameters 2693 * @param[in] input_dims Input tensor dimensions 2694 * @param[in] input_data Pointer to input tensor 2695 * @param[in] state_dims State tensor dimensions 2696 * @param[in] state_data Pointer to state tensor 2697 * @param[in] weights_feature_dims Weights (feature) tensor dimensions 2698 * @param[in] weights_feature_data Pointer to the weights (feature) tensor 2699 * @param[in] weights_time_dims Weights (time) tensor dimensions 2700 * @param[in] weights_time_data Pointer to the weights (time) tensor 2701 * @param[in] bias_dims Bias tensor dimensions 2702 * @param[in] bias_data Pointer to bias tensor 2703 * @param[in] output_dims Output tensor dimensions 2704 * @param[out] output_data Pointer to the output tensor 2705 * 2706 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2707 * 2708 * @details 2709 * 1. 
Supported framework: TensorFlow Lite micro 2710 */ 2711 arm_cmsis_nn_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, 2712 const cmsis_nn_context *output_ctx, 2713 const cmsis_nn_svdf_params *svdf_params, 2714 const cmsis_nn_per_tensor_quant_params *input_quant_params, 2715 const cmsis_nn_per_tensor_quant_params *output_quant_params, 2716 const cmsis_nn_dims *input_dims, 2717 const int8_t *input_data, 2718 const cmsis_nn_dims *state_dims, 2719 int16_t *state_data, 2720 const cmsis_nn_dims *weights_feature_dims, 2721 const int8_t *weights_feature_data, 2722 const cmsis_nn_dims *weights_time_dims, 2723 const int16_t *weights_time_data, 2724 const cmsis_nn_dims *bias_dims, 2725 const int32_t *bias_data, 2726 const cmsis_nn_dims *output_dims, 2727 int8_t *output_data); 2728 2729 /** 2730 * @brief Get size of additional buffer required by arm_svdf_s8(). 2731 * @param[in] filter_dims dimension of filter 2732 * @return The function returns required buffer size in bytes 2733 * 2734 */ 2735 int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); 2736 2737 /** 2738 * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension. 2739 * Refer to arm_svdf_s8_get_buffer_size() for function argument details. 2740 * 2741 * @note Intended for compilation on Host. If compiling for an Arm target, use 2742 * arm_svdf_s8_get_buffer_size(). 2743 * 2744 */ 2745 int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); 2746 2747 /** 2748 * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case. 2749 * Refer to arm_svdf_s8_get_buffer_size() for function argument details. 2750 * 2751 * @note Intended for compilation on Host. If compiling for an Arm target, use 2752 * arm_svdf_s8_get_buffer_size(). 
2753 * 2754 */ 2755 int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); 2756 2757 /** 2758 * @defgroup LSTM LSTM Layer Functions 2759 * 2760 */ 2761 2762 /** 2763 * @brief LSTM unidirectional function with 8 bit input and output and 16 bit gate output, 32 bit bias. 2764 * 2765 * @param[in] input Pointer to input data 2766 * @param[out] output Pointer to output data 2767 * @param[in] params Struct containing all information about the lstm operator, see arm_nn_types. 2768 * @param[in] buffers Struct containing pointers to all temporary scratch buffers needed for the 2769 * lstm operator, see arm_nn_types. 2770 * 2771 * 2772 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2773 * 2774 * @details 2775 * 1. Supported framework: TensorFlow Lite Micro 2776 * 2777 */ 2778 arm_cmsis_nn_status arm_lstm_unidirectional_s8(const int8_t *input, 2779 int8_t *output, 2780 const cmsis_nn_lstm_params *params, 2781 cmsis_nn_lstm_context *buffers); 2782 2783 /** 2784 * @brief LSTM unidirectional function with 16 bit input and output and 16 bit gate output, 64 bit bias. 2785 * 2786 * @param[in] input Pointer to input data 2787 * @param[out] output Pointer to output data 2788 * @param[in] params Struct containing all information about the lstm operator, see arm_nn_types. 2789 * @param[in] buffers Struct containing pointers to all temporary scratch buffers needed for the 2790 * lstm operator, see arm_nn_types. 2791 * 2792 * 2793 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2794 * 2795 * @details 2796 * 1. Supported framework: TensorFlow Lite Micro 2797 * 2798 */ 2799 arm_cmsis_nn_status arm_lstm_unidirectional_s16(const int16_t *input, 2800 int16_t *output, 2801 const cmsis_nn_lstm_params *params, 2802 cmsis_nn_lstm_context *buffers); 2803 2804 /** 2805 * @brief Batch matmul function with 8 bit input and output. 
2806 * 2807 * @param[in] ctx Temporary scratch buffer 2808 * The caller is expected to clear the buffer, if applicable, for security reasons. 2809 * Optional function arm_fully_connected_s8_get_buffer_size() provides the buffer 2810 * size if an additional buffer is required. 2811 * @param[in] bmm_params Batch matmul Parameters 2812 * Adjoint flags are currently unused. 2813 * @param[in] quant_params Quantization parameters 2814 * @param[in] input_lhs_dims Input lhs tensor dimensions. 2815 * This should be NHWC where lhs C = rhs C 2816 * @param[in] input_lhs Pointer to input tensor 2817 * @param[in] input_rhs_dims Input rhs tensor dimensions. 2818 * This is expected to be transposed so 2819 * should be NHWC where lhs C = rhs C 2820 * @param[in] input_rhs Pointer to transposed input tensor 2821 * @param[in] output_dims Output tensor dimensions 2822 * @param[out] output Pointer to the output tensor 2823 * 2824 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2825 * 2826 * @details 2827 * 1. Supported framework: TensorFlow Lite Micro 2828 * 2. Performs row * row matrix multiplication with the RHS transposed. 2829 * 2830 */ 2831 arm_cmsis_nn_status arm_batch_matmul_s8(const cmsis_nn_context *ctx, 2832 const cmsis_nn_bmm_params *bmm_params, 2833 const cmsis_nn_per_tensor_quant_params *quant_params, 2834 const cmsis_nn_dims *input_lhs_dims, 2835 const int8_t *input_lhs, 2836 const cmsis_nn_dims *input_rhs_dims, 2837 const int8_t *input_rhs, 2838 const cmsis_nn_dims *output_dims, 2839 int8_t *output); 2840 2841 /** 2842 * @brief Batch matmul function with 16 bit input and output. 2843 * 2844 * @param[in] ctx Temporary scratch buffer 2845 * The caller is expected to clear the buffer, if applicable, for security reasons. 2846 * Optional function arm_fully_connected_s8_get_buffer_size() provides the buffer 2847 * size if an additional buffer is required. 2848 * @param[in] bmm_params Batch matmul Parameters 2849 * Adjoint flags are currently unused. 
2850 * @param[in] quant_params Quantization parameters 2851 * @param[in] input_lhs_dims Input lhs tensor dimensions. 2852 * This should be NHWC where LHS.C = RHS.C 2853 * @param[in] input_lhs Pointer to input tensor 2854 * @param[in] input_rhs_dims Input rhs tensor dimensions. 2855 * This is expected to be transposed so 2856 * should be NHWC where LHS.C = RHS.C 2857 * @param[in] input_rhs Pointer to transposed input tensor 2858 * @param[in] output_dims Output tensor dimensions 2859 * @param[out] output Pointer to the output tensor 2860 * 2861 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2862 * 2863 * @details 2864 * 1. Supported framework: TensorFlow Lite Micro 2865 * 2. Performs row * row matrix multiplication with the RHS transposed. 2866 * 2867 */ 2868 arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx, 2869 const cmsis_nn_bmm_params *bmm_params, 2870 const cmsis_nn_per_tensor_quant_params *quant_params, 2871 const cmsis_nn_dims *input_lhs_dims, 2872 const int16_t *input_lhs, 2873 const cmsis_nn_dims *input_rhs_dims, 2874 const int16_t *input_rhs, 2875 const cmsis_nn_dims *output_dims, 2876 int16_t *output); 2877 2878 /** 2879 * @defgroup Pad Pad Layer Functions 2880 * 2881 */ 2882 2883 /** 2884 * @brief Expands the size of the input by adding constant values before and after the data, in all dimensions.
2885 * 2886 * @param[in] input Pointer to input data 2887 * @param[out] output Pointer to output data 2888 * @param[in] pad_value Value to pad with 2889 * @param[in] input_size Input tensor dimensions 2890 * @param[in] pre_pad Padding to apply before data in each dimension 2891 * @param[in] post_pad Padding to apply after data in each dimension 2892 * 2893 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2894 * 2895 */ 2896 arm_cmsis_nn_status arm_pad_s8(const int8_t *input, 2897 int8_t *output, 2898 const int8_t pad_value, 2899 const cmsis_nn_dims *input_size, 2900 const cmsis_nn_dims *pre_pad, 2901 const cmsis_nn_dims *post_pad); 2902 2903 /** 2904 * @brief Elementwise binary minimum with 8 bit data. 2905 * 2906 * @param[in] ctx Temporary scratch buffer 2907 * The caller is expected to clear the buffer, if applicable, for security reasons. 2908 * @param[in] input_1_data Pointer to input1 tensor 2909 * @param[in] input_1_dims Input1 tensor dimensions 2910 * @param[in] input_2_data Pointer to input2 tensor 2911 * @param[in] input_2_dims Input2 tensor dimensions 2912 * @param[out] output_data Pointer to the output tensor 2913 * @param[in] output_dims Output tensor dimensions 2914 * 2915 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2916 * 2917 * @details 2918 * 1. Supported framework: TensorFlow Lite Micro 2919 * 2920 */ 2921 arm_cmsis_nn_status arm_minimum_s8(const cmsis_nn_context *ctx, 2922 const int8_t *input_1_data, 2923 const cmsis_nn_dims *input_1_dims, 2924 const int8_t *input_2_data, 2925 const cmsis_nn_dims *input_2_dims, 2926 int8_t *output_data, 2927 const cmsis_nn_dims *output_dims); 2928 2929 /** 2930 * @brief Elementwise binary maximum with 8 bit data. 2931 * 2932 * @param[in] ctx Temporary scratch buffer 2933 * The caller is expected to clear the buffer, if applicable, for security reasons.
2934 * @param[in] input_1_data Pointer to input1 tensor 2935 * @param[in] input_1_dims Input1 tensor dimensions 2936 * @param[in] input_2_data Pointer to input2 tensor 2937 * @param[in] input_2_dims Input2 tensor dimensions 2938 * @param[out] output_data Pointer to the output tensor 2939 * @param[in] output_dims Output tensor dimensions 2940 * 2941 * @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> 2942 * 2943 * @details 2944 * 1. Supported framework: TensorFlow Lite Micro 2945 * 2946 */ 2947 arm_cmsis_nn_status arm_maximum_s8(const cmsis_nn_context *ctx, 2948 const int8_t *input_1_data, 2949 const cmsis_nn_dims *input_1_dims, 2950 const int8_t *input_2_data, 2951 const cmsis_nn_dims *input_2_dims, 2952 int8_t *output_data, 2953 const cmsis_nn_dims *output_dims); 2954 2955 #ifdef __cplusplus 2956 } 2957 #endif 2958 2959 #endif /* ARM_NNFUNCTIONS_H */ 2960