/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_svdf_s8.c
 * Description:  S8 basic SVDF layer function
 *
 * $Date:        15. April 2021
 * $Revision:    V.1.5.0
 *
 * Target Processor:  Cortex-M processors
 *
 * -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
 * @ingroup groupNN
 */

/**
 * @addtogroup SVDF
 * @{
 */

/*
 * S8 SVDF layer function for TensorFlow Lite
 *
 * Refer to header file for details.
 *
 * Processing steps, as implemented below:
 *   1. Shift the whole state buffer down by one q15 element.
 *   2. For every batch, call arm_nn_vec_mat_mult_t_svdf_s8() on the input
 *      vector and the feature weights; its destination starts at the last
 *      time slot of the batch's first feature row.
 *   3. For every (batch, feature) pair, accumulate the dot product of
 *      time_batches state values with time_batches time weights into the
 *      scratch buffer input_ctx->buf (input_batches * feature_batches values).
 *   4. Sum each group of `rank` consecutive results, add the optional bias and
 *      store into the scratch buffer output_ctx->buf
 *      (input_batches * unit_count values, unit_count = feature_batches / rank).
 *   5. Requantize, add the output offset, clamp to the output activation range
 *      and store as q7 in output_data.
 *
 * Returns ARM_MATH_SUCCESS on completion, the status of
 * arm_nn_vec_mat_mult_t_svdf_s8() if that call fails, or
 * ARM_MATH_ARGUMENT_ERROR when a scratch buffer is NULL or rank is not
 * positive (checked before any data is modified).
 */
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
                       const cmsis_nn_context *output_ctx,
                       const cmsis_nn_svdf_params *svdf_params,
                       const cmsis_nn_per_tensor_quant_params *input_quant_params,
                       const cmsis_nn_per_tensor_quant_params *output_quant_params,
                       const cmsis_nn_dims *input_dims,
                       const q7_t *input_data,
                       const cmsis_nn_dims *state_dims,
                       q15_t *state_data,
                       const cmsis_nn_dims *weights_feature_dims,
                       const q7_t *weights_feature_data,
                       const cmsis_nn_dims *weights_time_dims,
                       const q15_t *weights_time_data,
                       const cmsis_nn_dims *bias_dims,
                       const q31_t *bias_data,
                       const cmsis_nn_dims *output_dims,
                       q7_t *output_data)
{
    (void)bias_dims;
    (void)state_dims;
    (void)output_dims;

    const q31_t multiplier_in = input_quant_params->multiplier;
    const q31_t shift_in = input_quant_params->shift;
    const q31_t multiplier_out = output_quant_params->multiplier;
    const q31_t shift_2 = output_quant_params->shift;
    const int32_t zp_in = svdf_params->input_offset;
    const int32_t zp_out = svdf_params->output_offset;
    const int32_t in_activation_min = svdf_params->input_activation.min;
    const int32_t in_activation_max = svdf_params->input_activation.max;
    const int32_t out_activation_min = svdf_params->output_activation.min;
    const int32_t out_activation_max = svdf_params->output_activation.max;
    const int16_t rank = svdf_params->rank;

    const int32_t input_batches = input_dims->n;
    const int32_t input_height = input_dims->h;
    const int32_t feature_batches = weights_feature_dims->n;
    const int32_t time_batches = weights_time_dims->h;

    // rank is used as a divisor below; reject values that would divide by zero
    // (or make no sense) before any buffer is touched.
    if (rank <= 0)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }
    const int32_t unit_count = feature_batches / rank;

    // Both scratch buffers are written unconditionally further down.
    if (input_ctx->buf == NULL || output_ctx->buf == NULL)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }
    q31_t *buffer_a = (q31_t *)input_ctx->buf;
    q31_t *buffer_b = (q31_t *)output_ctx->buf;

    // Shift the state left by one element. Only (state_len - 1) elements exist
    // starting at state_data + 1, so copying state_len elements would read one
    // q15_t beyond the end of the state buffer.
    const int32_t state_len = input_batches * feature_batches * time_batches;
    if (state_len > 1)
    {
        memmove(state_data, state_data + 1, (size_t)(state_len - 1) * sizeof(*state_data));
    }

    for (int i_batch = 0; i_batch < input_batches; i_batch++)
    {
        // Destination: last time slot of the first feature row of this batch.
        // NOTE(review): time_batches is presumably the destination stride used
        // by the helper to reach the same slot of the following rows - confirm
        // against arm_nn_vec_mat_mult_t_svdf_s8().
        q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
        const q7_t *weight = weights_feature_data;
        const q7_t *input = input_data + i_batch * input_height;

        arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
                                                       weight,
                                                       res_ptr,
                                                       -zp_in,
                                                       0,
                                                       time_batches,
                                                       multiplier_in,
                                                       shift_in,
                                                       input_height,
                                                       feature_batches,
                                                       in_activation_min,
                                                       in_activation_max);

        if (res != ARM_MATH_SUCCESS)
        {
            return res;
        }
    }

    {
        q31_t *ptr_a = buffer_a;
        // v2 walks the state buffer linearly across all batches and features.
        const q15_t *v2 = state_data;
        for (int i_batch = 0; i_batch < input_batches; i_batch++)
        {
            // The time weights are re-read from the start for every batch.
            const q15_t *v1 = weights_time_data;

            for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
            {
                int32_t sum = 0;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
                int j = 0;
                int32_t block_count = time_batches >> 1;
                // Two q15 pairs per iteration via dual multiply-accumulate.
                for (int i = 0; i < block_count; i++)
                {
                    j += 2;
                    q31_t r1 = arm_nn_read_q15x2_ia(&v1);
                    q31_t r2 = arm_nn_read_q15x2_ia(&v2);

                    sum = __SMLAD(r1, r2, sum);
                }

                // Process the remaining data
                for (; j < time_batches; j++)
                {
                    sum += *v1 * *v2;
                    v1++;
                    v2++;
                }
#else
                for (int j = 0; j < time_batches; j++)
                {
                    sum += *v1 * *v2;
                    v1++;
                    v2++;
                }
#endif

                *ptr_a = sum;
                ptr_a++;
            }
        }
    }

    if (bias_data && unit_count == feature_batches)
    {
        // One feature entry per unit: nothing to reduce, just add the bias.
        for (int i = 0; i < input_batches; i++)
        {
            q31_t *output_temp = buffer_b + i * feature_batches;
            const q31_t *ptr_a = buffer_a + i * feature_batches;

            const int32_t *bi = bias_data;
            for (int j = 0; j < feature_batches; j++)
            {
                output_temp[j] = ptr_a[j] + bi[j];
            }
        }
    }
    else
    {
        // Reduce `rank` consecutive entries per unit, starting from the bias
        // when one is supplied and from zero otherwise.
        for (int i_batch = 0; i_batch < input_batches; i_batch++)
        {
            q31_t *output_data_temp = buffer_b + i_batch * unit_count;
            const q31_t *ptr_a = buffer_a + i_batch * feature_batches;

            for (int i = 0; i < unit_count; i++)
            {
                int32_t sum = bias_data ? bias_data[i] : 0;
                for (int j = 0; j < rank; j++)
                {
                    sum += *ptr_a;
                    ptr_a++;
                }
                output_data_temp[i] = sum;
            }
        }
    }

    const int32_t num_outputs = input_batches * unit_count;

#if defined(ARM_MATH_MVEI)
    int32_t num_elements = num_outputs;
    const int32_t loop_count = (num_elements + 3) / 4;
    // The clamp bounds do not change between iterations.
    const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
    const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);

    for (int i_op = 0; i_op < loop_count; i_op++)
    {
        // Tail predication: lanes beyond the remaining element count are masked.
        mve_pred16_t p = vctp32q((uint32_t)num_elements);
        int32x4_t op = vldrwq_z_s32(buffer_b, p);
        op = arm_requantize_mve(op, multiplier_out, shift_2);
        op = vaddq_n_s32(op, zp_out);
        op = vmaxq_s32(op, min_vec);
        op = vminq_s32(op, max_vec);
        vstrbq_p_s32(output_data, op, p);
        output_data += 4;
        buffer_b += 4;
        num_elements -= 4;
    }
#else
    for (int i = 0; i < num_outputs; i++)
    {
        output_data[i] = (q7_t)CLAMP(
            arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min);
    }
#endif

    return (ARM_MATH_SUCCESS);
}

/**
 * @} end of SVDF group
 */