1 /*
2  * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_elementwise_mul_s16
22  * Description:  Element wise multiplication
23  *
24  * $Date:        20 January 2023
25  * $Revision:    V.2.4.0
26  *
27  * Target :  Arm(R) M-Profile Architecture
28  *
29  * -------------------------------------------------------------------- */
30 
31 #include "arm_nnfunctions.h"
32 #include "arm_nnsupportfunctions.h"
33 
34 /**
35  *  @ingroup Public
36  */
37 
38 /**
39  * @addtogroup groupElementwise
40  * @{
41  */
42 
43 /**
44  * @brief s16 element wise multiplication of two vectors
45  *
46  * @note   Refer header file for details.
47  *
48  */
arm_elementwise_mul_s16(const int16_t * input_1_vect,const int16_t * input_2_vect,const int32_t input_1_offset,const int32_t input_2_offset,int16_t * output,const int32_t out_offset,const int32_t out_mult,const int32_t out_shift,const int32_t out_activation_min,const int32_t out_activation_max,const int32_t block_size)49 arm_cmsis_nn_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
50                                             const int16_t *input_2_vect,
51                                             const int32_t input_1_offset,
52                                             const int32_t input_2_offset,
53                                             int16_t *output,
54                                             const int32_t out_offset,
55                                             const int32_t out_mult,
56                                             const int32_t out_shift,
57                                             const int32_t out_activation_min,
58                                             const int32_t out_activation_max,
59                                             const int32_t block_size)
60 {
61     (void)input_1_offset;
62     (void)input_2_offset;
63     (void)out_offset;
64     int32_t loop_count;
65 
66 #if defined(ARM_MATH_MVEI)
67 
68     loop_count = block_size;
69 
70     while (loop_count > 0)
71     {
72         mve_pred16_t pred = vctp32q(loop_count);
73 
74         int32x4_t input_1 = vldrhq_z_s32(input_1_vect, pred);
75         int32x4_t input_2 = vldrhq_z_s32(input_2_vect, pred);
76 
77         int32x4_t res_0 = vmulq_s32(input_1, input_2);
78 
79         res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift));
80 
81         res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min));
82         res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max));
83 
84         vstrhq_p_s32(output, res_0, pred);
85         input_1_vect += 4;
86         input_2_vect += 4;
87 
88         output += 4;
89         loop_count -= 4;
90     }
91 
92 #else
93     int32_t input_1;
94     int32_t input_2;
95     int32_t mul_res;
96     int32_t two_halfword_1, two_halfword_2;
97     int16_t mul_1, mul_2;
98     loop_count = block_size / 2;
99 
100     while (loop_count > 0)
101     {
102         two_halfword_1 = arm_nn_read_q15x2_ia(&input_1_vect);
103         two_halfword_2 = arm_nn_read_q15x2_ia(&input_2_vect);
104 
105     #if defined(ARM_MATH_DSP)
106         mul_res = SMULBB(two_halfword_1, two_halfword_2);
107     #else
108         input_1 = (int16_t)(two_halfword_1 & 0xFFFF);
109         input_2 = (int16_t)(two_halfword_2 & 0xFFFF);
110         mul_res = input_1 * input_2;
111     #endif
112         mul_res = arm_nn_requantize(mul_res, out_mult, out_shift);
113         mul_res = MAX(mul_res, out_activation_min);
114         mul_res = MIN(mul_res, out_activation_max);
115         mul_1 = (int16_t)mul_res;
116 
117     #if defined(ARM_MATH_DSP)
118         mul_res = SMULTT(two_halfword_1, two_halfword_2);
119     #else
120         input_1 = (int16_t)(two_halfword_1 >> 16);
121         input_2 = (int16_t)(two_halfword_2 >> 16);
122         mul_res = input_1 * input_2;
123     #endif
124         mul_res = arm_nn_requantize(mul_res, out_mult, out_shift);
125         mul_res = MAX(mul_res, out_activation_min);
126         mul_res = MIN(mul_res, out_activation_max);
127         mul_2 = (int16_t)mul_res;
128 
129         arm_nn_write_q15x2_ia(&output, PACK_Q15x2_32x1(mul_1, mul_2));
130 
131         loop_count--;
132     }
133     loop_count = block_size & 0x1;
134 
135     while (loop_count > 0)
136     {
137         /* C = A * B */
138 
139         input_1 = *input_1_vect++;
140         input_2 = *input_2_vect++;
141 
142         mul_res = input_1 * input_2;
143         mul_res = arm_nn_requantize(mul_res, out_mult, out_shift);
144 
145         mul_res = MAX(mul_res, out_activation_min);
146         mul_res = MIN(mul_res, out_activation_max);
147 
148         *output++ = (int16_t)mul_res;
149 
150         /* Decrement loop counter */
151         loop_count--;
152     }
153 #endif // #if defined(ARM_MATH_MVEI)
154     return ARM_CMSIS_NN_SUCCESS;
155 }
156 
157 /**
158  * @} end of Doxygen group
159  */
160