/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nnfunctions.h
 * Description:  Public header file for CMSIS NN Library
 *
 * $Date:        19 March 2021
 * $Revision:    V.7.0.0
 *
 * Target Processor:  Cortex-M CPUs
 * -------------------------------------------------------------------- */

/**
   \mainpage CMSIS NN Software Library
   *
   * Introduction
   * ------------
   *
   * This user manual describes the CMSIS NN software library,
   * a collection of efficient neural network kernels developed to maximize the
   * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
   *
   * The library is divided into a number of functions each covering a specific category:
   * - Convolution Functions
   * - Activation Functions
   * - Fully-connected Layer Functions
   * - SVDF Layer Functions
   * - Pooling Functions
   * - Softmax Functions
   * - Basic math Functions
   *
   * The library has separate functions for operating on different weight and activation data
   * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
   * kernels is included in the function description.
   * The implementation details are also
   * described in this paper [1].
   *
   * Function Classification
   * --------
   * The functions can be classified into two segments
   * - Legacy functions supporting ARM's internal symmetric quantization(8 bits).
   * - Functions that support TensorFlow Lite framework with symmetric quantization(8 bits).
   *
   * The legacy functions can be identified with their suffix of _q7 or _q15, and no new development is done there.
   * The article in [2] describes in detail how to run a network using the legacy functions.
   *
   * The functions supporting TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
   * micro. The functions are bit exact to TensorFlow Lite. Refer to the TensorFlow's documentation in [3] on how to run
   * a TensorFlow Lite model using optimized CMSIS-NN kernels.
   *
   * Block Diagram
   * --------
   * \image html CMSIS-NN-OVERVIEW.PNG
   *
   * Examples
   * --------
   *
   * The library ships with a number of examples which demonstrate how to use the library functions.
   *
   * Pre-processor Macros
   * ------------
   *
   * Each library project has different pre-processor macros.
   *
   * - ARM_MATH_DSP:
   *
   * Define macro ARM_MATH_DSP, if the silicon supports DSP instructions(DSP extension).
   *
   * - ARM_MATH_MVEI:
   *
   * Define macro ARM_MATH_MVEI, if the silicon supports M-Profile Vector Extension.
   *
   * - ARM_MATH_AUTOVECTORIZE
   * Used in conjunction with ARM_MATH_MVEI to let the compiler auto vectorize for the functions that use inline
   * assembly. It does not affect functions that use C or intrinsics.
   * - ARM_MATH_BIG_ENDIAN:
   *
   * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the legacy
   * functions, i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the library builds
   * for little endian targets.
   *
   * - ARM_NN_TRUNCATE:
   *
   * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
   *
   *
   * Copyright Notice
   * ------------
   *
   * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
   *
   * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
   *
   * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
   *
   * https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
   * [3] https://www.tensorflow.org/lite/microcontrollers/library
   *
   * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
   */

/**
 * @defgroup groupNN Neural Network Functions
 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
 * TensorFlow Lite framework.
 */

/* NOTE(review): leading-underscore + uppercase names are reserved for the implementation (C standard 7.1.3);
 * the guard name is kept as-is here because external code may test for it. */
#ifndef _ARM_NNFUNCTIONS_H
#define _ARM_NNFUNCTIONS_H

#include "arm_math_types.h"
#include "arm_nn_types.h"

#define USE_INTRINSIC

//#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor instead of round-to-the-nearest int */

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief Enum for specifying activation function types
 *
 */
typedef enum
{
    ARM_SIGMOID = 0,
    /**< Sigmoid activation function */
    ARM_TANH = 1,
    /**< Tanh activation function */
} arm_nn_activation_type;

/**
 * @defgroup NNConv Convolution Functions
 *
 * Collection of convolution, depthwise convolution functions and their variants.
 *
 * The convolution is implemented in 2 steps: im2col and GEMM
 *
 * im2col is a process of converting each patch of image data into
 * a column.
 * After im2col, the convolution is computed as matrix-matrix
 * multiplication.
 *
 * To reduce the memory footprint, the im2col is performed partially.
 * Each iteration, only a few columns (i.e., patches) are generated and
 * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
 *
 */

/**
 * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
 *        cmsis-nn to perform the convolution.
 *
 * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required
 * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
 *                                Range of conv_params->input_offset  : [-127, 128]
 *                                Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params   Per-channel quantization info.
 *                                It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data     Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
 *                                spatial filter dimensions
 * @param[in]      filter_data    Filter data pointer. Data type: int8
 * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data      Bias data pointer. Data type: int32
 * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data    Output data pointer. Data type: int8
 *
 * @return     The function returns either
 *                 <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
 *                 <code>ARM_MATH_SUCCESS</code> on successful completion.
192 * 193 */ 194 arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, 195 const cmsis_nn_conv_params *conv_params, 196 const cmsis_nn_per_channel_quant_params *quant_params, 197 const cmsis_nn_dims *input_dims, 198 const q7_t *input_data, 199 const cmsis_nn_dims *filter_dims, 200 const q7_t *filter_data, 201 const cmsis_nn_dims *bias_dims, 202 const int32_t *bias_data, 203 const cmsis_nn_dims *output_dims, 204 q7_t *output_data); 205 206 /** 207 * @brief Get the required buffer size for arm_convolve_wrapper_s8 208 * 209 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 210 * Range of conv_params->input_offset : [-127, 128] 211 * Range of conv_params->output_offset : [-128, 127] 212 * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN] 213 * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial 214 * filter dimensions 215 * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] 216 * 217 * @return The function returns required buffer size(bytes) 218 * 219 */ 220 int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, 221 const cmsis_nn_dims *input_dims, 222 const cmsis_nn_dims *filter_dims, 223 const cmsis_nn_dims *output_dims); 224 225 /** 226 * @brief Basic s8 convolution function 227 * @param[in, out] ctx Function context that contains the additional buffer if required by the function. 228 arm_convolve_s8_get_buffer_size will return the buffer_size if required 229 * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). 230 * Range of conv_params->input_offset : [-127, 128] 231 * Range of conv_params->output_offset : [-128, 127] 232 * @param[in] quant_params Per-channel quantization info. 233 * It contains the multiplier and shift values to be applied to each output channel 234 * @param[in] input_dims Input (activation) tensor dimensions. 
Format: [N, H, W, C_IN]
 * @param[in]      input_data     Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
 *                                spatial filter dimensions
 * @param[in]      filter_data    Filter data pointer. Data type: int8
 * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data      Optional bias data pointer. Data type: int32
 * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data    Output data pointer. Data type: int8
 *
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
 *    3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
 *
 */
arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
                           const cmsis_nn_conv_params *conv_params,
                           const cmsis_nn_per_channel_quant_params *quant_params,
                           const cmsis_nn_dims *input_dims,
                           const q7_t *input_data,
                           const cmsis_nn_dims *filter_dims,
                           const q7_t *filter_data,
                           const cmsis_nn_dims *bias_dims,
                           const int32_t *bias_data,
                           const cmsis_nn_dims *output_dims,
                           q7_t *output_data);

/**
 * @brief Get the required buffer size for s8 convolution function
 *
 * @param[in]    input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]    filter_dims    Filter tensor dimensions.
Format: [C_OUT, HK, WK, C_IN] where HK and WK 269 * are the spatial filter dimensions 270 * @return The function returns required buffer size(bytes) 271 * 272 */ 273 int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 274 275 /** 276 * @brief Basic Q7 convolution function 277 * @param[in] Im_in pointer to input tensor 278 * @param[in] dim_im_in input tensor dimension 279 * @param[in] ch_im_in number of input tensor channels 280 * @param[in] wt pointer to kernel weights 281 * @param[in] ch_im_out number of filters, i.e., output tensor channels 282 * @param[in] dim_kernel filter kernel size 283 * @param[in] padding padding sizes 284 * @param[in] stride convolution stride 285 * @param[in] bias pointer to bias 286 * @param[in] bias_shift amount of left-shift for bias 287 * @param[in] out_shift amount of right-shift for output 288 * @param[in,out] Im_out pointer to output tensor 289 * @param[in] dim_im_out output tensor dimension 290 * @param[in,out] bufferA pointer to buffer space for input 291 * @param[in,out] bufferB pointer to buffer space for output 292 * @return The function returns <code>ARM_MATH_SUCCESS</code> 293 * 294 */ 295 arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in, 296 const uint16_t dim_im_in, 297 const uint16_t ch_im_in, 298 const q7_t *wt, 299 const uint16_t ch_im_out, 300 const uint16_t dim_kernel, 301 const uint16_t padding, 302 const uint16_t stride, 303 const q7_t *bias, 304 const uint16_t bias_shift, 305 const uint16_t out_shift, 306 q7_t *Im_out, 307 const uint16_t dim_im_out, 308 q15_t *bufferA, 309 q7_t *bufferB); 310 311 /** 312 * @brief Basic Q7 convolution function (non-square shape) 313 * @param[in] Im_in pointer to input tensor 314 * @param[in] dim_im_in_x input tensor dimension x 315 * @param[in] dim_im_in_y input tensor dimension y 316 * @param[in] ch_im_in number of input tensor channels 317 * @param[in] wt pointer to kernel weights 318 * @param[in] ch_im_out number of 
filters, i.e., output tensor channels 319 * @param[in] dim_kernel_x filter kernel size x 320 * @param[in] dim_kernel_y filter kernel size y 321 * @param[in] padding_x padding size x 322 * @param[in] padding_y padding size y 323 * @param[in] stride_x convolution stride x 324 * @param[in] stride_y convolution stride y 325 * @param[in] bias pointer to bias 326 * @param[in] bias_shift amount of left-shift for bias 327 * @param[in] out_shift amount of right-shift for output 328 * @param[in,out] Im_out pointer to output tensor 329 * @param[in] dim_im_out_x output tensor dimension x 330 * @param[in] dim_im_out_y output tensor dimension y 331 * @param[in,out] bufferA pointer to buffer space for input 332 * @param[in,out] bufferB pointer to buffer space for output 333 * @return The function returns <code>ARM_MATH_SUCCESS</code> 334 */ 335 arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in, 336 const uint16_t dim_im_in_x, 337 const uint16_t dim_im_in_y, 338 const uint16_t ch_im_in, 339 const q7_t *wt, 340 const uint16_t ch_im_out, 341 const uint16_t dim_kernel_x, 342 const uint16_t dim_kernel_y, 343 const uint16_t padding_x, 344 const uint16_t padding_y, 345 const uint16_t stride_x, 346 const uint16_t stride_y, 347 const q7_t *bias, 348 const uint16_t bias_shift, 349 const uint16_t out_shift, 350 q7_t *Im_out, 351 const uint16_t dim_im_out_x, 352 const uint16_t dim_im_out_y, 353 q15_t *bufferA, 354 q7_t *bufferB); 355 356 /** 357 * @brief Basic Q15 convolution function 358 * @param[in] Im_in pointer to input tensor 359 * @param[in] dim_im_in input tensor dimension 360 * @param[in] ch_im_in number of input tensor channels 361 * @param[in] wt pointer to kernel weights 362 * @param[in] ch_im_out number of filters, i.e., output tensor channels 363 * @param[in] dim_kernel filter kernel size 364 * @param[in] padding padding sizes 365 * @param[in] stride convolution stride 366 * @param[in] bias pointer to bias 367 * @param[in] bias_shift amount of left-shift for bias 
368 * @param[in] out_shift amount of right-shift for output 369 * @param[in,out] Im_out pointer to output tensor 370 * @param[in] dim_im_out output tensor dimension 371 * @param[in,out] bufferA pointer to buffer space for input 372 * @param[in,out] bufferB pointer to buffer space for output 373 * @return The function returns <code>ARM_MATH_SUCCESS</code> 374 * 375 */ 376 arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in, 377 const uint16_t dim_im_in, 378 const uint16_t ch_im_in, 379 const q15_t *wt, 380 const uint16_t ch_im_out, 381 const uint16_t dim_kernel, 382 const uint16_t padding, 383 const uint16_t stride, 384 const q15_t *bias, 385 const uint16_t bias_shift, 386 const uint16_t out_shift, 387 q15_t *Im_out, 388 const uint16_t dim_im_out, 389 q15_t *bufferA, 390 q7_t *bufferB); 391 392 /** 393 * @brief Fast Q7 convolution function 394 * @param[in] Im_in pointer to input tensor 395 * @param[in] dim_im_in input tensor dimension 396 * @param[in] ch_im_in number of input tensor channels 397 * @param[in] wt pointer to kernel weights 398 * @param[in] ch_im_out number of filters, i.e., output tensor channels 399 * @param[in] dim_kernel filter kernel size 400 * @param[in] padding padding sizes 401 * @param[in] stride convolution stride 402 * @param[in] bias pointer to bias 403 * @param[in] bias_shift amount of left-shift for bias 404 * @param[in] out_shift amount of right-shift for output 405 * @param[in,out] Im_out pointer to output tensor 406 * @param[in] dim_im_out output tensor dimension 407 * @param[in,out] bufferA pointer to buffer space for input 408 * @param[in,out] bufferB pointer to buffer space for output 409 * @return The function returns either 410 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 4
 *   ch_im_out is multiple of 2
 */
arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
                                    const uint16_t dim_im_in,
                                    const uint16_t ch_im_in,
                                    const q7_t *wt,
                                    const uint16_t ch_im_out,
                                    const uint16_t dim_kernel,
                                    const uint16_t padding,
                                    const uint16_t stride,
                                    const q7_t *bias,
                                    const uint16_t bias_shift,
                                    const uint16_t out_shift,
                                    q7_t *Im_out,
                                    const uint16_t dim_im_out,
                                    q15_t *bufferA,
                                    q7_t *bufferB);

/**
 * @brief Fast Q7 convolution function (non-square shape)
 * @param[in]       Im_in         pointer to input tensor
 * @param[in]       dim_im_in_x   input tensor dimension x
 * @param[in]       dim_im_in_y   input tensor dimension y
 * @param[in]       ch_im_in      number of input tensor channels
 * @param[in]       wt            pointer to kernel weights
 * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x  filter kernel size x
 * @param[in]       dim_kernel_y  filter kernel size y
 * @param[in]       padding_x     padding size x
 * @param[in]       padding_y     padding size y
 * @param[in]       stride_x      convolution stride x
 * @param[in]       stride_y      convolution stride y
 * @param[in]       bias          pointer to bias
 * @param[in]       bias_shift    amount of left-shift for bias
 * @param[in]       out_shift     amount of right-shift for output
 * @param[in,out]   Im_out        pointer to output tensor
 * @param[in]       dim_im_out_x  output tensor dimension x
 * @param[in]       dim_im_out_y  output tensor dimension y
 * @param[in,out]   bufferA       pointer to buffer space for input
 * @param[in,out]   bufferB       pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 4
 *   ch_im_out is multiple of 2
 */

arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                              const uint16_t dim_im_in_x,
                                              const uint16_t dim_im_in_y,
                                              const uint16_t ch_im_in,
                                              const q7_t *wt,
                                              const uint16_t ch_im_out,
                                              const uint16_t dim_kernel_x,
                                              const uint16_t dim_kernel_y,
                                              const uint16_t padding_x,
                                              const uint16_t padding_y,
                                              const uint16_t stride_x,
                                              const uint16_t stride_y,
                                              const q7_t *bias,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              q7_t *Im_out,
                                              const uint16_t dim_im_out_x,
                                              const uint16_t dim_im_out_y,
                                              q15_t *bufferA,
                                              q7_t *bufferB);

/**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
 * @param[in]       Im_in         pointer to input tensor
 * @param[in]       dim_im_in_x   input tensor dimension x
 * @param[in]       dim_im_in_y   input tensor dimension y
 * @param[in]       ch_im_in      number of input tensor channels
 * @param[in]       wt            pointer to kernel weights
 * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x  filter kernel size x
 * @param[in]       dim_kernel_y  filter kernel size y
 * @param[in]       padding_x     padding size x
 * @param[in]       padding_y     padding size y
 * @param[in]       stride_x      convolution stride x
 * @param[in]       stride_y      convolution stride y
 * @param[in]       bias          pointer to bias
 * @param[in]       bias_shift    amount of left-shift for bias
 * @param[in]       out_shift     amount of right-shift for output
 * @param[in,out]   Im_out        pointer to output tensor
 * @param[in]       dim_im_out_x  output tensor dimension x
 * @param[in]       dim_im_out_y  output tensor dimension y
 * @param[in,out]   bufferA       pointer to buffer space for input
 * @param[in,out]   bufferB       pointer to buffer space for output
 * @return     The function returns either
 *
<code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
 * <code>ARM_MATH_SUCCESS</code> on successful completion.
 *
 * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
 * and dim_kernel_y=1). It can be used for
 * second half of MobileNets after depthwise separable convolution.
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 4
 *   ch_im_out is multiple of 2
 */
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                                  const uint16_t dim_im_in_x,
                                                  const uint16_t dim_im_in_y,
                                                  const uint16_t ch_im_in,
                                                  const q7_t *wt,
                                                  const uint16_t ch_im_out,
                                                  const uint16_t dim_kernel_x,
                                                  const uint16_t dim_kernel_y,
                                                  const uint16_t padding_x,
                                                  const uint16_t padding_y,
                                                  const uint16_t stride_x,
                                                  const uint16_t stride_y,
                                                  const q7_t *bias,
                                                  const uint16_t bias_shift,
                                                  const uint16_t out_shift,
                                                  q7_t *Im_out,
                                                  const uint16_t dim_im_out_x,
                                                  const uint16_t dim_im_out_y,
                                                  q15_t *bufferA,
                                                  q7_t *bufferB);

/**
 * @brief Fast s8 version for 1x1 convolution (non-square shape)
 *
 * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required
 * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
 *                                Range of conv_params->input_offset  : [-127, 128]
 *                                Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params   Per-channel quantization info.
 *                                It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data     Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims    Filter tensor dimensions.
Format: [C_OUT, 1, 1, C_IN]
 * @param[in]      filter_data    Filter data pointer. Data type: int8
 * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data      Optional bias data pointer. Data type: int32
 * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data    Output data pointer. Data type: int8
 *
 * @return     The function returns either
 *                 <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
 *                 <code>ARM_MATH_SUCCESS</code> on successful completion.
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# input_dims->c is a multiple of 4
 *      -# conv_params->padding.w = conv_params->padding.h = 0
 *      -# conv_params->stride.w = conv_params->stride.h = 1
 *
 */
arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
                                    const cmsis_nn_conv_params *conv_params,
                                    const cmsis_nn_per_channel_quant_params *quant_params,
                                    const cmsis_nn_dims *input_dims,
                                    const q7_t *input_data,
                                    const cmsis_nn_dims *filter_dims,
                                    const q7_t *filter_data,
                                    const cmsis_nn_dims *bias_dims,
                                    const int32_t *bias_data,
                                    const cmsis_nn_dims *output_dims,
                                    q7_t *output_data);

/**
 * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
 *
 * @param[in]       input_dims            Input (activation) dimensions
 * @return          The function returns the required buffer size in bytes
 *
 */
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);

/**
 * @brief 1xn convolution
 *
 * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
 *                                arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
 * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
 *                                Range of conv_params->input_offset  : [-127, 128]
 *                                Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params   Per-channel quantization info.
 *                                It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data     Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
 *                                spatial filter dimension
 * @param[in]      filter_data    Filter data pointer. Data type: int8
 * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data      Optional bias data pointer. Data type: int32
 * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data    Output data pointer. Data type: int8
 *
 * @return     The function returns either
 *                 <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
 *                 <code>ARM_MATH_SUCCESS</code> on successful completion.
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *      -# input_dims->n equals 1
 *      -# output_dims->w is a multiple of 4
 *      -# Explicit constraints(since it is for 1xN convolution)
 *      -## input_dims->h equals 1
 *      -## output_dims->h equals 1
 *      -## filter_dims->h equals 1
 * @todo  Remove constraint on output_dims->w to make the function generic.
627 * 628 */ 629 arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, 630 const cmsis_nn_conv_params *conv_params, 631 const cmsis_nn_per_channel_quant_params *quant_params, 632 const cmsis_nn_dims *input_dims, 633 const q7_t *input_data, 634 const cmsis_nn_dims *filter_dims, 635 const q7_t *filter_data, 636 const cmsis_nn_dims *bias_dims, 637 const int32_t *bias_data, 638 const cmsis_nn_dims *output_dims, 639 q7_t *output_data); 640 641 /** 642 * @brief Get the required additional buffer size for 1xn convolution 643 * 644 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 645 * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the 646 * horizontal spatial filter dimension 647 * @return The function returns required buffer size(bytes) 648 * 649 */ 650 int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 651 652 /** 653 * @brief Q7 version of convolution for RGB image 654 * @param[in] Im_in pointer to input tensor 655 * @param[in] dim_im_in input tensor dimension 656 * @param[in] ch_im_in number of input tensor channels 657 * @param[in] wt pointer to kernel weights 658 * @param[in] ch_im_out number of filters, i.e., output tensor channels 659 * @param[in] dim_kernel filter kernel size 660 * @param[in] padding padding sizes 661 * @param[in] stride convolution stride 662 * @param[in] bias pointer to bias 663 * @param[in] bias_shift amount of left-shift for bias 664 * @param[in] out_shift amount of right-shift for output 665 * @param[in,out] Im_out pointer to output tensor 666 * @param[in] dim_im_out output tensor dimension 667 * @param[in,out] bufferA pointer to buffer space for input 668 * @param[in,out] bufferB pointer to buffer space for output 669 * @return The function returns either 670 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
671 * 672 * This kernel is written exclusively for convolution with ch_im_in 673 * equals 3. This applies on the first layer of CNNs which has input 674 * image with RGB format. 675 */ 676 677 arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in, 678 const uint16_t dim_im_in, 679 const uint16_t ch_im_in, 680 const q7_t *wt, 681 const uint16_t ch_im_out, 682 const uint16_t dim_kernel, 683 const uint16_t padding, 684 const uint16_t stride, 685 const q7_t *bias, 686 const uint16_t bias_shift, 687 const uint16_t out_shift, 688 q7_t *Im_out, 689 const uint16_t dim_im_out, 690 q15_t *bufferA, 691 q7_t *bufferB); 692 693 /** 694 * @brief Fast Q15 convolution function 695 * @param[in] Im_in pointer to input tensor 696 * @param[in] dim_im_in input tensor dimension 697 * @param[in] ch_im_in number of input tensor channels 698 * @param[in] wt pointer to kernel weights 699 * @param[in] ch_im_out number of filters, i.e., output tensor channels 700 * @param[in] dim_kernel filter kernel size 701 * @param[in] padding padding sizes 702 * @param[in] stride convolution stride 703 * @param[in] bias pointer to bias 704 * @param[in] bias_shift amount of left-shift for bias 705 * @param[in] out_shift amount of right-shift for output 706 * @param[in,out] Im_out pointer to output tensor 707 * @param[in] dim_im_out output tensor dimension 708 * @param[in,out] bufferA pointer to buffer space for input 709 * @param[in,out] bufferB pointer to buffer space for output 710 * @return The function returns either 711 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 2
 *   ch_im_out is multiple of 2
 */

arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
                                     const uint16_t dim_im_in,
                                     const uint16_t ch_im_in,
                                     const q15_t *wt,
                                     const uint16_t ch_im_out,
                                     const uint16_t dim_kernel,
                                     const uint16_t padding,
                                     const uint16_t stride,
                                     const q15_t *bias,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     q15_t *Im_out,
                                     const uint16_t dim_im_out,
                                     q15_t *bufferA,
                                     q7_t *bufferB);

/**
 * @brief Fast Q15 convolution function (non-square shape)
 * @param[in]       Im_in         pointer to input tensor
 * @param[in]       dim_im_in_x   input tensor dimension x
 * @param[in]       dim_im_in_y   input tensor dimension y
 * @param[in]       ch_im_in      number of input tensor channels
 * @param[in]       wt            pointer to kernel weights
 * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x  filter kernel size x
 * @param[in]       dim_kernel_y  filter kernel size y
 * @param[in]       padding_x     padding size x
 * @param[in]       padding_y     padding size y
 * @param[in]       stride_x      convolution stride x
 * @param[in]       stride_y      convolution stride y
 * @param[in]       bias          pointer to bias
 * @param[in]       bias_shift    amount of left-shift for bias
 * @param[in]       out_shift     amount of right-shift for output
 * @param[in,out]   Im_out        pointer to output tensor
 * @param[in]       dim_im_out_x  output tensor dimension x
 * @param[in]       dim_im_out_y  output tensor dimension y
 * @param[in,out]   bufferA       pointer to buffer space for input
 * @param[in,out]   bufferB       pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in is multiple of 2
 *
 * ch_im_out is multiple of 2
 *
 */

arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
                                               const uint16_t dim_im_in_x,
                                               const uint16_t dim_im_in_y,
                                               const uint16_t ch_im_in,
                                               const q15_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel_x,
                                               const uint16_t dim_kernel_y,
                                               const uint16_t padding_x,
                                               const uint16_t padding_y,
                                               const uint16_t stride_x,
                                               const uint16_t stride_y,
                                               const q15_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q15_t *Im_out,
                                               const uint16_t dim_im_out_x,
                                               const uint16_t dim_im_out_y,
                                               q15_t *bufferA,
                                               q7_t *bufferB);

/**
 * @brief Q7 depthwise separable convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
816 * 817 * This function is the version with full list of optimization tricks, but with 818 * some constraints: 819 * ch_im_in is multiple of 2 820 * ch_im_out is multiple of 2 821 */ 822 823 arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, 824 const uint16_t dim_im_in, 825 const uint16_t ch_im_in, 826 const q7_t *wt, 827 const uint16_t ch_im_out, 828 const uint16_t dim_kernel, 829 const uint16_t padding, 830 const uint16_t stride, 831 const q7_t *bias, 832 const uint16_t bias_shift, 833 const uint16_t out_shift, 834 q7_t *Im_out, 835 const uint16_t dim_im_out, 836 q15_t *bufferA, 837 q7_t *bufferB); 838 839 /** 840 * @brief Q7 depthwise separable convolution function (non-square shape) 841 * @param[in] Im_in pointer to input tensor 842 * @param[in] dim_im_in_x input tensor dimension x 843 * @param[in] dim_im_in_y input tensor dimension y 844 * @param[in] ch_im_in number of input tensor channels 845 * @param[in] wt pointer to kernel weights 846 * @param[in] ch_im_out number of filters, i.e., output tensor channels 847 * @param[in] dim_kernel_x filter kernel size x 848 * @param[in] dim_kernel_y filter kernel size y 849 * @param[in] padding_x padding sizes x 850 * @param[in] padding_y padding sizes y 851 * @param[in] stride_x convolution stride x 852 * @param[in] stride_y convolution stride y 853 * @param[in] bias pointer to bias 854 * @param[in] bias_shift amount of left-shift for bias 855 * @param[in] out_shift amount of right-shift for output 856 * @param[in,out] Im_out pointer to output tensor 857 * @param[in] dim_im_out_x output tensor dimension x 858 * @param[in] dim_im_out_y output tensor dimension y 859 * @param[in,out] bufferA pointer to buffer space for input 860 * @param[in,out] bufferB pointer to buffer space for output 861 * @return The function returns either 862 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
863 * 864 * This function is the version with full list of optimization tricks, but with 865 * some constraints: 866 * ch_im_in is multiple of 2 867 * ch_im_out is multiple of 2 868 */ 869 arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in, 870 const uint16_t dim_im_in_x, 871 const uint16_t dim_im_in_y, 872 const uint16_t ch_im_in, 873 const q7_t *wt, 874 const uint16_t ch_im_out, 875 const uint16_t dim_kernel_x, 876 const uint16_t dim_kernel_y, 877 const uint16_t padding_x, 878 const uint16_t padding_y, 879 const uint16_t stride_x, 880 const uint16_t stride_y, 881 const q7_t *bias, 882 const uint16_t bias_shift, 883 const uint16_t out_shift, 884 q7_t *Im_out, 885 const uint16_t dim_im_out_x, 886 const uint16_t dim_im_out_y, 887 q15_t *bufferA, 888 q7_t *bufferB); 889 890 /** 891 * @brief Wrapper function to pick the right optimized s8 depthwise convolution function 892 * 893 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 894 * definition file to see if an additional buffer is required. 895 * Optional function {API}_get_buffer_size() provides the buffer 896 * size if required. 897 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) 898 * dw_conv_params->dilation is not used. 899 * Range of dw_conv_params->input_offset : [-127, 128] 900 * Range of dw_conv_params->output_offset : [-128, 127] 901 * @param[in] quant_params Per-channel quantization info. 902 * It contains the multiplier and shift values to be applied to each 903 * output channel 904 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 905 * Batch argument N is not used and assumed to be 1. 906 * @param[in] input_data Input (activation) data pointer. Data type: int8 907 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 908 * @param[in] filter_data Filter data pointer. Data type: int8 909 * @param[in] bias_dims Bias tensor dimensions.
Format: [C_OUT] 910 * @param[in] bias_data Bias data pointer. Data type: int32 911 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT] 912 * @param[in, out] output_data Output data pointer. Data type: int8 913 * @return The function returns 914 * <code>ARM_MATH_SUCCESS</code> - Successful completion. 915 * 916 * @details 917 * - Supported framework: TensorFlow Lite 918 * - Picks one of the following functions 919 * -# arm_depthwise_conv_s8() 920 * -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only 921 * -# arm_depthwise_conv_s8_opt() 922 * - q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs. 923 * - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the 924 * boundary. 925 */ 926 arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, 927 const cmsis_nn_dw_conv_params *dw_conv_params, 928 const cmsis_nn_per_channel_quant_params *quant_params, 929 const cmsis_nn_dims *input_dims, 930 const q7_t *input_data, 931 const cmsis_nn_dims *filter_dims, 932 const q7_t *filter_data, 933 const cmsis_nn_dims *bias_dims, 934 const int32_t *bias_data, 935 const cmsis_nn_dims *output_dims, 936 q7_t *output_data); 937 938 /** 939 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() 940 * 941 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) 942 * dw_conv_params->dilation is not used. 943 * Range of dw_conv_params->input_offset : [-127, 128] 944 * Range of dw_conv_params->output_offset : [-128, 127] 945 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 946 * Batch argument N is not used and assumed to be 1. 947 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 948 * @param[in] output_dims Output tensor dimensions.
Format: [1, H, W, C_OUT] 949 * @return Size of additional memory required for optimizations in bytes. 950 * 951 */ 952 int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, 953 const cmsis_nn_dims *input_dims, 954 const cmsis_nn_dims *filter_dims, 955 const cmsis_nn_dims *output_dims); 956 957 /** 958 * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions. 959 * 960 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 961 * definition file to see if an additional buffer is required. 962 * Optional function {API}_get_buffer_size() provides the buffer 963 * size if an additional buffer is required. 964 * 965 * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) 966 * dw_conv_params->dilation is not used. 967 * Range of dw_conv_params->input_offset : [-127, 128] 968 * Range of dw_conv_params->output_offset : [-128, 127] 969 * @param[in] quant_params Per-channel quantization info. 970 * It contains the multiplier and shift values to be applied to each 971 * output channel 972 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN] 973 * Batch argument N is not used. 974 * @param[in] input_data Input (activation) data pointer. Data type: int8 975 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 976 * @param[in] filter_data Filter data pointer. Data type: int8 977 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 978 * @param[in] bias_data Bias data pointer. Data type: int32 979 * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT] 980 * @param[in, out] output_data Output data pointer.
Data type: int8 981 * @return The function returns <code>ARM_MATH_SUCCESS</code> 982 * 983 * @details 984 * - Supported framework: TensorFlow Lite 985 * - q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs. 986 */ 987 arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx, 988 const cmsis_nn_dw_conv_params *dw_conv_params, 989 const cmsis_nn_per_channel_quant_params *quant_params, 990 const cmsis_nn_dims *input_dims, 991 const q7_t *input_data, 992 const cmsis_nn_dims *filter_dims, 993 const q7_t *filter_data, 994 const cmsis_nn_dims *bias_dims, 995 const int32_t *bias_data, 996 const cmsis_nn_dims *output_dims, 997 q7_t *output_data); 998 999 /** 1000 * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on 1001 * the input arguments(documented below). Refer arm_depthwise_conv_s8() for function 1002 * argument details. 1003 * 1004 * @return The function returns one of the following 1005 * <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimension of tensors 1006 * <code>ARM_MATH_ARGUMENT_ERROR</code> - Unsupported pad size along the x axis 1007 * <code>ARM_MATH_SUCCESS</code> - Successful operation 1008 * 1009 * @details 1010 * - Supported framework : TensorFlow Lite Micro 1011 * - The following constraints on the arguments apply 1012 * -# Number of input channel equals number of output channels 1013 * -# Filter height and width equals 3 1014 * -# Padding along x is either 0 or 1.
1015 * 1016 */ 1017 arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, 1018 const cmsis_nn_dw_conv_params *dw_conv_params, 1019 const cmsis_nn_per_channel_quant_params *quant_params, 1020 const cmsis_nn_dims *input_dims, 1021 const q7_t *input_data, 1022 const cmsis_nn_dims *filter_dims, 1023 const q7_t *filter_data, 1024 const cmsis_nn_dims *bias_dims, 1025 const int32_t *bias_data, 1026 const cmsis_nn_dims *output_dims, 1027 q7_t *output_data); 1028 1029 /** 1030 * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. 1031 * Refer arm_depthwise_conv_s8() for function argument details. 1032 * 1033 * @return The function returns one of the following 1034 * <code>ARM_MATH_SIZE_MISMATCH</code> - input channel != output channel or 1035 * ch_mult != 1 1036 * <code>ARM_MATH_SUCCESS</code> - Successful operation 1037 * 1038 * @note If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read out 1039 * for the following if MVE optimizations(Arm Helium Technology) are used. 1040 * - Output shift 1041 * - Output multiplier 1042 * - Output bias 1043 * - kernel 1044 * @details 1045 * - Supported framework: TensorFlow Lite 1046 * - The following constraints on the arguments apply 1047 * -# Number of input channel equals number of output channels or ch_mult equals 1 1048 * - q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs. 1049 * - Recommended when number of channels is 4 or greater.
1050 * 1051 */ 1052 arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, 1053 const cmsis_nn_dw_conv_params *dw_conv_params, 1054 const cmsis_nn_per_channel_quant_params *quant_params, 1055 const cmsis_nn_dims *input_dims, 1056 const q7_t *input_data, 1057 const cmsis_nn_dims *filter_dims, 1058 const q7_t *filter_data, 1059 const cmsis_nn_dims *bias_dims, 1060 const int32_t *bias_data, 1061 const cmsis_nn_dims *output_dims, 1062 q7_t *output_data); 1063 1064 /** 1065 * @brief Get the required buffer size for optimized s8 depthwise convolution 1066 * function with constraint that in_channel equals out_channel. 1067 * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN] 1068 * Batch argument N is not used. 1069 * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] 1070 * @return The function returns required buffer size in bytes 1071 * 1072 */ 1073 int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); 1074 1075 /** 1076 * @defgroup FC Fully-connected Layer Functions 1077 * 1078 * Collection of fully-connected and matrix multiplication functions. 1079 * 1080 * Fully-connected layer is basically a matrix-vector multiplication 1081 * with bias. The matrix is the weights and the input/output vectors 1082 * are the activation values. Supported {weight, activation} precisions 1083 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}. 1084 * 1085 * Here we have two types of kernel functions. The basic function 1086 * implements the function using regular GEMV approach. The opt functions 1087 * operates with weights in interleaved formats. 
1088 * 1089 */ 1090 1091 /** 1092 *@brief Q7 basic fully-connected layer function 1093 *@param[in] pV pointer to input vector 1094 *@param[in] pM pointer to matrix weights 1095 *@param[in] dim_vec length of the vector 1096 *@param[in] num_of_rows number of rows in weight matrix 1097 *@param[in] bias_shift amount of left-shift for bias 1098 *@param[in] out_shift amount of right-shift for output 1099 *@param[in] bias pointer to bias 1100 *@param[in,out] pOut pointer to output vector 1101 *@param[in,out] vec_buffer pointer to buffer space for input 1102 *@return The function returns <code>ARM_MATH_SUCCESS</code> 1103 * 1104 */ 1105 1106 arm_status arm_fully_connected_q7(const q7_t *pV, 1107 const q7_t *pM, 1108 const uint16_t dim_vec, 1109 const uint16_t num_of_rows, 1110 const uint16_t bias_shift, 1111 const uint16_t out_shift, 1112 const q7_t *bias, 1113 q7_t *pOut, 1114 q15_t *vec_buffer); 1115 1116 /** 1117 * @brief Basic s8 Fully Connected function. 1118 * 1119 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1120 * definition file to see if an additional buffer is required. 1121 * Optional function {API}_get_buffer_size() provides the buffer 1122 * size if an additional buffer is required. 1123 * @param[in] fc_params Fully Connected layer parameters (e.g. strides, dilations, pads,...) 1124 * Range of fc_params->input_offset : [-127, 128] 1125 * fc_params->filter_offset : 0 1126 * Range of fc_params->output_offset : [-128, 127] 1127 * @param[in] quant_params Per-tensor quantization info. 1128 * It contains the multiplier and shift values to be applied to the output tensor. 1129 * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] 1130 * Input dimension is taken as Nx(H * W * C_IN) 1131 * @param[in] input_data Input (activation) data pointer. Data type: int8 1132 * @param[in] filter_dims Two dimensional filter dimensions. 
Format: [N, C] 1133 * N : accumulation depth and equals (H * W * C_IN) from input_dims 1134 * C : output depth and equals C_OUT in output_dims 1135 * H & W : Not used 1136 * @param[in] filter_data Filter data pointer. Data type: int8 1137 * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] 1138 * N, H, W : Not used 1139 * @param[in] bias_data Bias data pointer. Data type: int32 1140 * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] 1141 * N : Batches 1142 * C_OUT : Output depth 1143 * H & W : Not used. 1144 * @param[in, out] output_data Output data pointer. Data type: int8 1145 * @return The function returns <code>ARM_MATH_SUCCESS</code> 1146 * 1147 * @details 1148 * - Supported framework: TensorFlow Lite 1149 * - q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs. 1150 */ 1151 arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx, 1152 const cmsis_nn_fc_params *fc_params, 1153 const cmsis_nn_per_tensor_quant_params *quant_params, 1154 const cmsis_nn_dims *input_dims, 1155 const q7_t *input_data, 1156 const cmsis_nn_dims *filter_dims, 1157 const q7_t *filter_data, 1158 const cmsis_nn_dims *bias_dims, 1159 const int32_t *bias_data, 1160 const cmsis_nn_dims *output_dims, 1161 q7_t *output_data); 1162 1163 /** 1164 * @brief Get the required buffer size for S8 basic fully-connected and 1165 * matrix multiplication layer function for TF Lite 1166 * @param[in] filter_dims dimension of filter 1167 * @return The function returns required buffer size in bytes 1168 * 1169 */ 1170 int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); 1171 1172 /** 1173 * @brief Q7 opt fully-connected layer function 1174 * @param[in] pV pointer to input vector 1175 * @param[in] pM pointer to matrix weights 1176 * @param[in] dim_vec length of the vector 1177 * @param[in] num_of_rows number of rows in weight matrix 1178 * @param[in] bias_shift amount of left-shift for bias 1179 *
@param[in] out_shift amount of right-shift for output 1180 * @param[in] bias pointer to bias 1181 * @param[in,out] pOut pointer to output vector 1182 * @param[in,out] vec_buffer pointer to buffer space for input 1183 * @return The function returns <code>ARM_MATH_SUCCESS</code> 1184 * 1185 */ 1186 1187 arm_status arm_fully_connected_q7_opt(const q7_t *pV, 1188 const q7_t *pM, 1189 const uint16_t dim_vec, 1190 const uint16_t num_of_rows, 1191 const uint16_t bias_shift, 1192 const uint16_t out_shift, 1193 const q7_t *bias, 1194 q7_t *pOut, 1195 q15_t *vec_buffer); 1196 1197 /** 1198 * @brief Q15 basic fully-connected layer function 1199 * @param[in] pV pointer to input vector 1200 * @param[in] pM pointer to matrix weights 1201 * @param[in] dim_vec length of the vector 1202 * @param[in] num_of_rows number of rows in weight matrix 1203 * @param[in] bias_shift amount of left-shift for bias 1204 * @param[in] out_shift amount of right-shift for output 1205 * @param[in] bias pointer to bias 1206 * @param[in,out] pOut pointer to output vector 1207 * @param[in,out] vec_buffer pointer to buffer space for input 1208 * @return The function returns <code>ARM_MATH_SUCCESS</code> 1209 * 1210 */ 1211 1212 arm_status arm_fully_connected_q15(const q15_t *pV, 1213 const q15_t *pM, 1214 const uint16_t dim_vec, 1215 const uint16_t num_of_rows, 1216 const uint16_t bias_shift, 1217 const uint16_t out_shift, 1218 const q15_t *bias, 1219 q15_t *pOut, 1220 q15_t *vec_buffer); 1221 1222 /** 1223 * @brief Q15 opt fully-connected layer function 1224 * @param[in] pV pointer to input vector 1225 * @param[in] pM pointer to matrix weights 1226 * @param[in] dim_vec length of the vector 1227 * @param[in] num_of_rows number of rows in weight matrix 1228 * @param[in] bias_shift amount of left-shift for bias 1229 * @param[in] out_shift amount of right-shift for output 1230 * @param[in] bias pointer to bias 1231 * @param[in,out] pOut pointer to output vector 1232 * @param[in,out] vec_buffer pointer to 
buffer space for input 1233 * @return The function returns <code>ARM_MATH_SUCCESS</code> 1234 * 1235 */ 1236 1237 arm_status arm_fully_connected_q15_opt(const q15_t *pV, 1238 const q15_t *pM, 1239 const uint16_t dim_vec, 1240 const uint16_t num_of_rows, 1241 const uint16_t bias_shift, 1242 const uint16_t out_shift, 1243 const q15_t *bias, 1244 q15_t *pOut, 1245 q15_t *vec_buffer); 1246 1247 /** 1248 * @brief Mixed Q15-Q7 fully-connected layer function 1249 * @param[in] pV pointer to input vector 1250 * @param[in] pM pointer to matrix weights 1251 * @param[in] dim_vec length of the vector 1252 * @param[in] num_of_rows number of rows in weight matrix 1253 * @param[in] bias_shift amount of left-shift for bias 1254 * @param[in] out_shift amount of right-shift for output 1255 * @param[in] bias pointer to bias 1256 * @param[in,out] pOut pointer to output vector 1257 * @param[in,out] vec_buffer pointer to buffer space for input 1258 * @return The function returns <code>ARM_MATH_SUCCESS</code> 1259 * 1260 */ 1261 1262 arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV, 1263 const q7_t *pM, 1264 const uint16_t dim_vec, 1265 const uint16_t num_of_rows, 1266 const uint16_t bias_shift, 1267 const uint16_t out_shift, 1268 const q7_t *bias, 1269 q15_t *pOut, 1270 q15_t *vec_buffer); 1271 1272 /** 1273 * @brief Mixed Q15-Q7 opt fully-connected layer function 1274 * @param[in] pV pointer to input vector 1275 * @param[in] pM pointer to matrix weights 1276 * @param[in] dim_vec length of the vector 1277 * @param[in] num_of_rows number of rows in weight matrix 1278 * @param[in] bias_shift amount of left-shift for bias 1279 * @param[in] out_shift amount of right-shift for output 1280 * @param[in] bias pointer to bias 1281 * @param[in,out] pOut pointer to output vector 1282 * @param[in,out] vec_buffer pointer to buffer space for input 1283 * @return The function returns <code>ARM_MATH_SUCCESS</code> 1284 * 1285 */ 1286 1287 arm_status 
arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV, 1288 const q7_t *pM, 1289 const uint16_t dim_vec, 1290 const uint16_t num_of_rows, 1291 const uint16_t bias_shift, 1292 const uint16_t out_shift, 1293 const q7_t *bias, 1294 q15_t *pOut, 1295 q15_t *vec_buffer); 1296 1297 /** 1298 * @brief Matrix-Multiplication Kernels for Convolution 1299 * 1300 * These functions are used within convolution layer functions for 1301 * matrix multiplication. 1302 * 1303 * The implementation is similar to CMSIS-DSP arm_mat_mult functions 1304 * with one Q7 and one Q15 operands. The Q15 operand is the im2col 1305 * output which is always with 2 columns. 1306 * 1307 */ 1308 1309 /** 1310 * @brief Matrix-multiplication function for convolution 1311 * @param[in] pA pointer to operand A 1312 * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors 1313 * @param[in] ch_im_out numRow of A 1314 * @param[in] numCol_A numCol of A 1315 * @param[in] bias_shift amount of left-shift for bias 1316 * @param[in] out_shift amount of right-shift for output 1317 * @param[in] bias the bias 1318 * @param[in,out] pOut pointer to output 1319 * @return The function returns the incremented output pointer 1320 */ 1321 1322 q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA, 1323 const q15_t *pInBuffer, 1324 const uint16_t ch_im_out, 1325 const uint16_t numCol_A, 1326 const uint16_t bias_shift, 1327 const uint16_t out_shift, 1328 const q7_t *bias, 1329 q7_t *pOut); 1330 /** 1331 * @brief Matrix-multiplication function for convolution with per-channel requantization. 1332 * @param[in] input_a pointer to operand A 1333 * @param[in] input_b pointer to operand B, always consists of 2 vectors. 1334 * @param[in] output_ch number of rows of A 1335 * @param[in] out_shift pointer to per output channel requantization shift parameter. 1336 * @param[in] out_mult pointer to per output channel requantization multiplier parameter. 1337 * @param[in] out_offset output tensor offset.
1338 * @param[in] activation_min minimum value to clamp the output to. Range : int8 1339 * @param[in] activation_max maximum value to clamp the output to. Range : int8 1340 * @param[in] num_col_a number of columns of A 1341 * @param[in] output_bias per output channel bias. Range : int32 1342 * @param[in,out] out_0 pointer to output 1343 * @return The function returns one of the two 1344 * 1. The incremented output pointer for a successful operation or 1345 * 2. NULL if implementation is not available. 1346 * 1347 * @details This function does the matrix multiplication of weight matrix for all output channels 1348 * with 2 columns from im2col and produces two elements/output_channel. The outputs are 1349 * clamped in the range provided by activation min and max. 1350 * Supported framework: TensorFlow Lite micro. 1351 */ 1352 q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, 1353 const q15_t *input_b, 1354 const uint16_t output_ch, 1355 const int32_t *out_shift, 1356 const int32_t *out_mult, 1357 const int32_t out_offset, 1358 const int16_t activation_min, 1359 const int16_t activation_max, 1360 const uint16_t num_col_a, 1361 const int32_t *const output_bias, 1362 q7_t *out_0); 1363 1364 /** 1365 * @brief Matrix-multiplication of re-ordered input B with A. 1366 * 1367 * @details For arguments, refer arm_nn_mat_mult_kernel_s8_s16. The re-ordering is a consequence 1368 * of sign extension done by the SXTB16 command on input_b. The outputs are clamped in the range 1369 * provided by activation min and max. 
1370 * @details 1371 * - Supported framework : TensorFlow Lite Micro 1372 * - The following constraints on the arguments apply 1373 * -# num_col_a is a multiple of 4 1374 * -# output_ch is a multiple of 2 1375 * 1376 */ 1377 q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a, 1378 const q15_t *input_b, 1379 const uint16_t output_ch, 1380 const int32_t *out_shift, 1381 const int32_t *out_mult, 1382 const int32_t out_offset, 1383 const int16_t activation_min, 1384 const int16_t activation_max, 1385 const uint16_t num_col_a, 1386 const int32_t *const output_bias, 1387 q7_t *out_0); 1388 1389 /** 1390 *@brief Matrix-multiplication function for convolution with reordered columns 1391 *@param[in] pA pointer to operand A 1392 *@param[in] pInBuffer pointer to operand B, always consists of 2 vectors 1393 *@param[in] ch_im_out numRow of A 1394 *@param[in] numCol_A numCol of A 1395 *@param[in] bias_shift amount of left-shift for bias 1396 *@param[in] out_shift amount of right-shift for output 1397 *@param[in] bias the bias 1398 *@param[in,out] pOut pointer to output 1399 *@return The function returns the incremented output pointer 1400 * 1401 *@details This function assumes that data in pInBuffer are reordered 1402 */ 1403 q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, 1404 const q15_t *pInBuffer, 1405 const uint16_t ch_im_out, 1406 const uint16_t numCol_A, 1407 const uint16_t bias_shift, 1408 const uint16_t out_shift, 1409 const q7_t *bias, 1410 q7_t *pOut); 1411 1412 #ifdef __cplusplus 1413 } 1414 #endif 1415 1416 /* 1417 * Other functions 1418 * These layers are typically not timing critical 1419 * Basic implementation is supported here 1420 */ 1421 1422 #ifdef __cplusplus 1423 extern "C" { 1424 #endif 1425 1426 /** 1427 * @defgroup BasicMath Basic math functions 1428 * 1429 * Element wise add and multiplication functions.
1430 * 1431 */ 1432 1433 /** 1434 * @brief s8 element wise add of two vectors 1435 * @param[in] input_1_vect pointer to input vector 1 1436 * @param[in] input_2_vect pointer to input vector 2 1437 * @param[in] input_1_offset offset for input 1. Range: -127 to 128 1438 * @param[in] input_1_mult multiplier for input 1 1439 * @param[in] input_1_shift shift for input 1 1440 * @param[in] input_2_offset offset for input 2. Range: -127 to 128 1441 * @param[in] input_2_mult multiplier for input 2 1442 * @param[in] input_2_shift shift for input 2 1443 * @param[in] left_shift input left shift 1444 * @param[in,out] output pointer to output vector 1445 * @param[in] out_offset output offset 1446 * @param[in] out_mult output multiplier 1447 * @param[in] out_shift output shift 1448 * @param[in] out_activation_min minimum value to clamp output to 1449 * @param[in] out_activation_max maximum value to clamp output to 1450 * @param[in] block_size number of samples 1451 * @return The function returns ARM_MATH_SUCCESS 1452 */ 1453 arm_status arm_elementwise_add_s8(const int8_t *input_1_vect, 1454 const int8_t *input_2_vect, 1455 const int32_t input_1_offset, 1456 const int32_t input_1_mult, 1457 const int32_t input_1_shift, 1458 const int32_t input_2_offset, 1459 const int32_t input_2_mult, 1460 const int32_t input_2_shift, 1461 const int32_t left_shift, 1462 int8_t *output, 1463 const int32_t out_offset, 1464 const int32_t out_mult, 1465 const int32_t out_shift, 1466 const int32_t out_activation_min, 1467 const int32_t out_activation_max, 1468 const uint32_t block_size); 1469 1470 /** 1471 * @brief s8 element wise multiplication 1472 * @param[in] input_1_vect pointer to input vector 1 1473 * @param[in] input_2_vect pointer to input vector 2 1474 * @param[in] input_1_offset offset for input 1. Range: -127 to 128 1475 * @param[in] input_2_offset offset for input 2.
Range: -127 to 128 1476 * @param[in,out] output pointer to output vector 1477 * @param[in] out_offset output offset 1478 * @param[in] out_mult output multiplier 1479 * @param[in] out_shift output shift 1480 * @param[in] out_activation_min minimum value to clamp output to 1481 * @param[in] out_activation_max maximum value to clamp output to 1482 * @param[in] block_size number of samples 1483 * @return The function returns ARM_MATH_SUCCESS 1484 * 1485 * @details Supported framework: TensorFlow Lite micro 1486 */ 1487 arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect, 1488 const int8_t *input_2_vect, 1489 const int32_t input_1_offset, 1490 const int32_t input_2_offset, 1491 int8_t *output, 1492 const int32_t out_offset, 1493 const int32_t out_mult, 1494 const int32_t out_shift, 1495 const int32_t out_activation_min, 1496 const int32_t out_activation_max, 1497 const uint32_t block_size); 1498 /** 1499 * @defgroup Acti Activation Functions 1500 * 1501 * Perform activation layers, including ReLU (Rectified Linear Unit), 1502 * sigmoid and tanh 1503 * 1504 */ 1505 1506 /** 1507 * @brief Q7 RELU function 1508 * @param[in,out] data pointer to input 1509 * @param[in] size number of elements 1510 * @return none. 1511 */ 1512 1513 void arm_relu_q7(q7_t *data, uint16_t size); 1514 1515 /** 1516 * @brief s8 ReLU6 function 1517 * @param[in,out] data pointer to input 1518 * @param[in] size number of elements 1519 */ 1520 1521 void arm_relu6_s8(q7_t *data, uint16_t size); 1522 1523 /** 1524 * @brief Q15 RELU function 1525 * @param[in,out] data pointer to input 1526 * @param[in] size number of elements 1527 * @return none.
1528 */ 1529 1530 void arm_relu_q15(q15_t *data, uint16_t size); 1531 1532 /** 1533 * @brief Q7 neural network activation function using direct table look-up 1534 * @param[in,out] data pointer to input 1535 * @param[in] size number of elements 1536 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 1537 * @param[in] type type of activation functions 1538 * @return none. 1539 */ 1540 1541 void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type); 1542 1543 /** 1544 * @brief Q15 neural network activation function using direct table look-up 1545 * @param[in,out] data pointer to input 1546 * @param[in] size number of elements 1547 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 1548 * @param[in] type type of activation functions 1549 * @return none. 1550 * 1551 * @details 1552 * 1553 * This is the direct table look-up approach. 1554 * 1555 * Assume here the integer part of the fixed-point is <= 3. 1556 * More than 3 just not making much sense, makes no difference with 1557 * saturation followed by any of these activation functions. 1558 */ 1559 1560 void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type); 1561 1562 /** 1563 * @defgroup Pooling Pooling Functions 1564 * 1565 * Perform pooling functions, including max pooling and average pooling 1566 * 1567 */ 1568 1569 /** 1570 * @brief Q7 max pooling function 1571 * @param[in] Im_in pointer to input tensor 1572 * @param[in] dim_im_in input tensor dimension 1573 * @param[in] ch_im_in number of input tensor channels 1574 * @param[in] dim_kernel filter kernel size 1575 * @param[in] padding padding sizes 1576 * @param[in] stride convolution stride 1577 * @param[in] dim_im_out output tensor dimension 1578 * @param[in,out] bufferA pointer to buffer space for input 1579 * @param[in,out] Im_out pointer to output tensor 1580 * @return none. 
1581 * 1582 */ 1583 1584 void arm_maxpool_q7_HWC(q7_t *Im_in, 1585 const uint16_t dim_im_in, 1586 const uint16_t ch_im_in, 1587 const uint16_t dim_kernel, 1588 const uint16_t padding, 1589 const uint16_t stride, 1590 const uint16_t dim_im_out, 1591 q7_t *bufferA, 1592 q7_t *Im_out); 1593 1594 /** 1595 * @brief Q7 average pooling function 1596 * @param[in] Im_in pointer to input tensor 1597 * @param[in] dim_im_in input tensor dimension 1598 * @param[in] ch_im_in number of input tensor channels 1599 * @param[in] dim_kernel filter kernel size 1600 * @param[in] padding padding sizes 1601 * @param[in] stride convolution stride 1602 * @param[in] dim_im_out output tensor dimension 1603 * @param[in,out] bufferA pointer to buffer space for input 1604 * @param[in,out] Im_out pointer to output tensor 1605 * @return none. 1606 * 1607 */ 1608 1609 void arm_avepool_q7_HWC(q7_t *Im_in, 1610 const uint16_t dim_im_in, 1611 const uint16_t ch_im_in, 1612 const uint16_t dim_kernel, 1613 const uint16_t padding, 1614 const uint16_t stride, 1615 const uint16_t dim_im_out, 1616 q7_t *bufferA, 1617 q7_t *Im_out); 1618 1619 /** 1620 * @brief s8 average pooling function. 1621 * 1622 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1623 * definition file to see if an additional buffer is required. 1624 * Optional function {API}_get_buffer_size() provides the buffer 1625 * size if an additional buffer is required. 1626 * @param[in] pool_params Pooling parameters 1627 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 1628 * Argument 'N' is not used. 1629 * @param[in] input_data Input (activation) data pointer. Data type: int8 1630 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] 1631 * Argument N and C are not used. 1632 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] 1633 * Argument N is not used. 1634 * C_OUT equals C_IN. 1635 * @param[in, out] output_data Output data pointer. 
Data type: int8 1636 * @return The function returns 1637 * <code>ARM_MATH_SUCCESS</code> - Successful operation 1638 * 1639 * @details 1640 * - Supported Framework: TensorFlow Lite 1641 * 1642 */ 1643 arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, 1644 const cmsis_nn_pool_params *pool_params, 1645 const cmsis_nn_dims *input_dims, 1646 const q7_t *input_data, 1647 const cmsis_nn_dims *filter_dims, 1648 const cmsis_nn_dims *output_dims, 1649 q7_t *output_data); 1650 1651 /** 1652 * @brief Get the required buffer size for S8 average pooling function 1653 * @param[in] dim_dst_width output tensor dimension 1654 * @param[in] ch_src number of input tensor channels 1655 * @return The function returns required buffer size in bytes 1656 * 1657 */ 1658 int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src); 1659 1660 /** 1661 * @brief s8 max pooling function. 1662 * 1663 * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function 1664 * definition file to see if an additional buffer is required. 1665 * Optional function {API}_get_buffer_size() provides the buffer 1666 * size if an additional buffer is required. 1667 * @param[in] pool_params Pooling parameters 1668 * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] 1669 * Argument 'N' is not used. 1670 * @param[in] input_data Input (activation) data pointer. Data type: int8 1671 * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] 1672 * Argument N and C are not used. 1673 * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] 1674 * Argument N is not used. 1675 * C_OUT equals C_IN. 1676 * @param[in, out] output_data Output data pointer. 
Data type: int8 1677 * @return The function returns 1678 * <code>ARM_MATH_SUCCESS</code> - Successful operation 1679 * 1680 * @details 1681 * - Supported Framework: TensorFlow Lite 1682 * 1683 */ 1684 arm_status arm_max_pool_s8(const cmsis_nn_context *ctx, 1685 const cmsis_nn_pool_params *pool_params, 1686 const cmsis_nn_dims *input_dims, 1687 const q7_t *input_data, 1688 const cmsis_nn_dims *filter_dims, 1689 const cmsis_nn_dims *output_dims, 1690 q7_t *output_data); 1691 /** 1692 * @defgroup Softmax Softmax Functions 1693 * 1694 * EXP(2) based softmax functions. 1695 * 1696 */ 1697 1698 /** 1699 * @brief Q7 softmax function 1700 * @param[in] vec_in pointer to input vector 1701 * @param[in] dim_vec input vector dimension 1702 * @param[out] p_out pointer to output vector 1703 * 1704 * @note This function is an optimized version which is not bit-accurate with 1705 * TensorFlow Lite's kernel 1706 * 1707 */ 1708 1709 void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); 1710 1711 /** 1712 * @brief Q7 softmax function with batch parameter 1713 * @param[in] vec_in pointer to input vector 1714 * @param[in] nb_batches number of batches 1715 * @param[in] dim_vec input vector dimension 1716 * @param[out] p_out pointer to output vector 1717 * @return none. 1718 * 1719 * @note This function is an optimized version which is not bit-accurate with 1720 * TensorFlow Lite's kernel 1721 * 1722 */ 1723 1724 void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out); 1725 /** 1726 * @brief Q15 softmax function 1727 * @param[in] vec_in pointer to input vector 1728 * @param[in] dim_vec input vector dimension 1729 * @param[out] p_out pointer to output vector 1730 * @return none. 
1731 * 1732 * @note This function is an optimized version which is not bit-accurate with 1733 * TensorFlow Lite's kernel 1734 * 1735 */ 1736 1737 void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); 1738 1739 /** 1740 * @brief S8 softmax function 1741 * @param[in] input Pointer to the input tensor 1742 * @param[in] num_rows Number of rows in the input tensor 1743 * @param[in] row_size Number of elements in each input row 1744 * @param[in] mult Input quantization multiplier 1745 * @param[in] shift Input quantization shift within the range [0, 31] 1746 * @param[in] diff_min Minimum difference with max in row. Used to check if 1747 * the quantized exponential operation can be performed 1748 * @param[out] output Pointer to the output tensor 1749 * 1750 * @note Supported framework: TensorFlow Lite micro (bit-accurate) 1751 * 1752 */ 1753 1754 void arm_softmax_s8(const int8_t *input, 1755 const int32_t num_rows, 1756 const int32_t row_size, 1757 const int32_t mult, 1758 const int32_t shift, 1759 const int32_t diff_min, 1760 int8_t *output); 1761 1762 /** 1763 * @brief U8 softmax function 1764 * @param[in] input Pointer to the input tensor 1765 * @param[in] num_rows Number of rows in the input tensor 1766 * @param[in] row_size Number of elements in each input row 1767 * @param[in] mult Input quantization multiplier 1768 * @param[in] shift Input quantization shift within the range [0, 31] 1769 * @param[in] diff_min Minimum difference with max in row. 
 *                                   Used to check if
 *                                   the quantized exponential operation can be performed
 * @param[out]  output     Pointer to the output tensor
 *
 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
 *
 */

void arm_softmax_u8(const uint8_t *input,
                    const int32_t num_rows,
                    const int32_t row_size,
                    const int32_t mult,
                    const int32_t shift,
                    const int32_t diff_min,
                    uint8_t *output);

/**
 * @brief uint8 depthwise convolution function with asymmetric quantization
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @param[in]     input         Pointer to input tensor
 * @param[in]     input_x       Width of input tensor
 * @param[in]     input_y       Height of input tensor
 * @param[in]     input_ch      Channels in input tensor
 * @param[in]     kernel        Pointer to kernel weights
 * @param[in]     kernel_x      Width of kernel
 * @param[in]     kernel_y      Height of kernel
 * @param[in]     ch_mult       Number of channel multiplier
 * @param[in]     pad_x         Padding sizes x
 * @param[in]     pad_y         Padding sizes y
 * @param[in]     stride_x      stride along the width
 * @param[in]     stride_y      stride along the height
 * @param[in]     dilation_x    Dilation along width. Not used and intended for future enhancement.
 * @param[in]     dilation_y    Dilation along height. Not used and intended for future enhancement.
 * @param[in]     bias          Pointer to optional bias values. If no bias is
 *                              available, NULL is expected
 * @param[in]     input_offset  Input tensor zero offset
 * @param[in]     filter_offset Kernel tensor zero offset
 * @param[in]     output_offset Output tensor zero offset
 * @param[in,out] output        Pointer to output tensor
 * @param[in]     output_x      Width of output tensor
 * @param[in]     output_y      Height of output tensor
 * @param[in]     output_activation_min   Minimum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_activation_max   Maximum value to clamp the output to.
Range : {0, 255} 1813 * @param[in] out_shift Amount of right-shift for output 1814 * @param[in] out_mult Output multiplier for requantization 1815 * @return The function returns the following 1816 * <code>ARM_MATH_SUCCESS</code> - Successful operation 1817 * 1818 */ 1819 arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input, 1820 const uint16_t input_x, 1821 const uint16_t input_y, 1822 const uint16_t input_ch, 1823 const uint8_t *kernel, 1824 const uint16_t kernel_x, 1825 const uint16_t kernel_y, 1826 const int16_t ch_mult, 1827 const int16_t pad_x, 1828 const int16_t pad_y, 1829 const int16_t stride_x, 1830 const int16_t stride_y, 1831 const int16_t dilation_x, 1832 const int16_t dilation_y, 1833 const int32_t *bias, 1834 const int32_t input_offset, 1835 const int32_t filter_offset, 1836 const int32_t output_offset, 1837 uint8_t *output, 1838 const uint16_t output_x, 1839 const uint16_t output_y, 1840 const int32_t output_activation_min, 1841 const int32_t output_activation_max, 1842 const int32_t out_shift, 1843 const int32_t out_mult); 1844 1845 /** 1846 * @defgroup Reshape Reshape Functions 1847 * 1848 */ 1849 1850 /** 1851 * @brief Reshape a s8 vector into another with different shape 1852 * @param[in] input points to the s8 input vector 1853 * @param[out] output points to the s8 output vector 1854 * @param[in] total_size total size of the input and output vectors in bytes 1855 * 1856 * @note The output is expected to be in a memory area that does not overlap with the input's 1857 * 1858 */ 1859 void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size); 1860 1861 /** 1862 * @defgroup Concatenation Concatenation Functions 1863 * 1864 */ 1865 1866 /** 1867 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis 1868 * This function should be called for each input tensor to concatenate. 
 *        The argument offset_x
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e.    offset_x = 0
 *                for(i = 0; i < num_input_tensors; ++i)
 *                {
 *                    arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x)
 *                    offset_x += input_x[i]
 *                }
 *
 *        This function assumes that the output tensor has:
 *        -# The same height of the input tensor
 *        -# The same number of channels of the input tensor
 *        -# The same batch size of the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor
 * @param[in]  output_x Width of output tensor
 * @param[in]  offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_x is less than output_x
 *
 */
void arm_concatenation_s8_x(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_x,
                            const uint32_t offset_x);

/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
 *        This function should be called for each input tensor to concatenate. The argument offset_y
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e.
 *        offset_y = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y)
 *            offset_y += input_y[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width of the input tensor
 *        -# The same number of channels of the input tensor
 *        -# The same batch size of the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor
 * @param[in]  output_y Height of output tensor
 * @param[in]  offset_y The offset on the Y axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_y is less than output_y
 *
 */
void arm_concatenation_s8_y(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_y,
                            const uint32_t offset_y);

/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
 *        This function should be called for each input tensor to concatenate. The argument offset_z
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e.
 *        offset_z = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z)
 *            offset_z += input_z[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width of the input tensor
 *        -# The same height of the input tensor
 *        -# The same batch size of the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor
 * @param[in]  output_z Channels in output tensor
 * @param[in]  offset_z The offset on the Z axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_z is less than output_z
 *
 */
void arm_concatenation_s8_z(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_z,
                            const uint32_t offset_z);

/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
 *        This function should be called for each input tensor to concatenate. The argument offset_w
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e.
 *        offset_w = 0
 *        for(i = 0; i < num_input_tensors; ++i)
 *        {
 *            arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w)
 *            offset_w += input_w[i]
 *        }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width of the input tensor
 *        -# The same height of the input tensor
 *        -# The same number of channels of the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it
 *       does not involve any arithmetic operation
 *
 * @param[in]  input    Pointer to input tensor
 * @param[in]  input_x  Width of input tensor
 * @param[in]  input_y  Height of input tensor
 * @param[in]  input_z  Channels in input tensor
 * @param[in]  input_w  Batch size in input tensor
 * @param[out] output   Pointer to output tensor
 * @param[in]  offset_w The offset on the W axis to start concatenating the input tensor
 *                      It is the user's responsibility to provide the correct value
 *
 */
void arm_concatenation_s8_w(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint32_t offset_w);
/**
 * @defgroup SVDF SVDF Layer Functions
 *
 */

/**
 * @brief s8 SVDF function
 *
 * @param[in]   input_ctx             Temporary scratch buffer
 * @param[in]   output_ctx            Temporary output scratch buffer
 * @param[in]   svdf_params           SVDF Parameters
 *                                    Range of svdf_params->input_offset  : [-128, 127]
 *                                    Range of svdf_params->output_offset : [-128, 127]
 * @param[in]   input_quant_params    Input quantization parameters
 * @param[in]   output_quant_params   Output quantization parameters
 * @param[in]   input_dims            Input tensor dimensions
 * @param[in]   input_data            Pointer to input tensor
 * @param[in]   state_dims
 *                                    State tensor dimensions
 * @param[in]   state_data            Pointer to state tensor
 * @param[in]   weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]   weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]   weights_time_dims     Weights (time) tensor dimensions
 * @param[in]   weights_time_data     Pointer to the weights (time) tensor
 * @param[in]   bias_dims             Bias tensor dimensions
 * @param[in]   bias_data             Pointer to bias tensor
 * @param[in]   output_dims           Output tensor dimensions
 * @param[out]  output_data           Pointer to the output tensor
 *
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as the data type even though it is s8 data. It is done so to be consistent with existing APIs.
 *
 */
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
                       const cmsis_nn_context *output_ctx,
                       const cmsis_nn_svdf_params *svdf_params,
                       const cmsis_nn_per_tensor_quant_params *input_quant_params,
                       const cmsis_nn_per_tensor_quant_params *output_quant_params,
                       const cmsis_nn_dims *input_dims,
                       const q7_t *input_data,
                       const cmsis_nn_dims *state_dims,
                       q15_t *state_data,
                       const cmsis_nn_dims *weights_feature_dims,
                       const q7_t *weights_feature_data,
                       const cmsis_nn_dims *weights_time_dims,
                       const q15_t *weights_time_data,
                       const cmsis_nn_dims *bias_dims,
                       const q31_t *bias_data,
                       const cmsis_nn_dims *output_dims,
                       q7_t *output_data);

#ifdef __cplusplus
}
#endif

#endif