1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_dot_prod_f32.c
4  * Description:  Floating-point dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/basic_math_functions.h"
30 
31 /**
32   @ingroup groupMath
33  */
34 
35 /**
36   @defgroup BasicDotProd Vector Dot Product
37 
38   Computes the dot product of two vectors.
39   The vectors are multiplied element-by-element and then summed.
40 
41   <pre>
42       sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
43   </pre>
44 
45   There are separate functions for floating-point, Q7, Q15, and Q31 data types.
46  */
47 
48 /**
49   @addtogroup BasicDotProd
50   @{
51  */
52 
53 /**
54   @brief         Dot product of floating-point vectors.
55   @param[in]     pSrcA      points to the first input vector.
56   @param[in]     pSrcB      points to the second input vector.
57   @param[in]     blockSize  number of samples in each vector.
58   @param[out]    result     output result returned here.
59   @return        none
60  */
61 
62 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
63 
64 #include "arm_helium_utils.h"
65 
66 
/*
 * Helium (MVE) implementation of the floating-point dot product.
 *
 * pSrcA, pSrcB : input vectors, each holding blockSize samples.
 * blockSize    : number of samples in each vector.
 * result       : receives sum(pSrcA[i] * pSrcB[i]) for i in [0, blockSize).
 *
 * Four partial sums are accumulated in the lanes of a vector register and
 * reduced to a scalar once at the end, so the summation order differs from a
 * strictly sequential scalar loop.
 */
void arm_dot_prod_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t    blockSize,
    float32_t * result)
{
    f32x4_t vecA, vecB;
    f32x4_t vecSum;
    uint32_t blkCnt;
    float32_t sum = 0.0f;
    vecSum = vdupq_n_f32(0.0f);

    /* Main loop: consume 4 samples from each input per iteration */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        /*
         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
         * Multiply-accumulate 4 lanes into the running vector sum and
         * advance both source pointers.
         */
        vecA = vld1q(pSrcA);
        pSrcA += 4;

        vecB = vld1q(pSrcB);
        pSrcB += 4;

        vecSum = vfmaq(vecSum, vecA, vecB);
        /*
         * Decrement the loop counter
         */
        blkCnt --;
    }

    /* Tail: 1 to 3 remaining samples */
    blkCnt = blockSize & 3U;
    if (blkCnt > 0U)
    {
        /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */

        /* Predicate enabling only the first blkCnt 32-bit lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);

        /*
         * Use zeroing predicated loads so that memory beyond the last valid
         * sample is never accessed; a full-width load here would read up to
         * 3 floats past the end of each input buffer.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);

        /* Inactive lanes of vecSum are left untouched by the predicated FMA */
        vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
    }

    /* Horizontal reduction of the 4 partial sums */
    sum = vecAddAcrossF32Mve(vecSum);

    /* Store result in destination buffer */
    *result = sum;

}
119 
120 #else
121 
/*
 * Generic implementation of the floating-point dot product, with three
 * compile-time variants:
 *   - ARM_MATH_NEON       : 4 lanes accumulated with NEON, scalar tail loop.
 *   - ARM_MATH_LOOPUNROLL : scalar loop unrolled by 4, scalar tail loop.
 *   - otherwise           : plain scalar loop over all samples.
 *
 * pSrcA, pSrcB : input vectors, each holding blockSize samples.
 * blockSize    : number of samples in each vector.
 * result       : receives sum(pSrcA[i] * pSrcB[i]) for i in [0, blockSize).
 */
void arm_dot_prod_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        uint32_t blockSize,
        float32_t * result)
{
        uint32_t blkCnt;                               /* Loop counter */
        float32_t sum = 0.0f;                          /* Temporary return variable */

#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    f32x4_t vec1;
    f32x4_t vec2;
    f32x4_t accum = vdupq_n_f32(0);

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */

        /*
         * Load only the 4 samples consumed by this iteration. Loading ahead
         * of the loop (or at the end of each iteration for the next one)
         * would read 16 bytes beyond the data actually processed, which is
         * out of bounds when blockSize < 4 or blockSize is a multiple of 4.
         */
        vec1 = vld1q_f32(pSrcA);
        vec2 = vld1q_f32(pSrcB);

        /* Accumulate the 4 lane-wise products */
        accum = vmlaq_f32(accum, vec1, vec2);

        /* Increment pointers */
        pSrcA += 4;
        pSrcB += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Horizontal reduction of the 4 partial sums */
#if defined(__aarch64__)
    sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
#else
    {
        /* Pairwise add low/high halves, then add the two remaining lanes */
        f32x2_t tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
    }
#endif

    /* Tail: remaining 0 to 3 samples are handled by the scalar loop below */
    blkCnt = blockSize & 0x3;

#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */

    /* Accumulate four consecutive products into the running sum. */
    sum += (*pSrcA++) * (*pSrcB++);

    sum += (*pSrcA++) * (*pSrcB++);

    sum += (*pSrcA++) * (*pSrcB++);

    sum += (*pSrcA++) * (*pSrcB++);

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */

  /* Scalar loop: processes the tail (or every sample in the plain variant) */
  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */

    /* Accumulate one product into the running sum. */
    sum += (*pSrcA++) * (*pSrcB++);

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Store result in destination buffer */
  *result = sum;
}
222 
223 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
224 /**
225   @} end of BasicDotProd group
226  */
227