1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_dot_prod_f16.c
4  * Description:  Floating-point dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/basic_math_functions_f16.h"
30 
31 /**
32   @ingroup groupMath
33  */
34 
35 
36 /**
37   @addtogroup BasicDotProd
38   @{
39  */
40 
41 /**
42   @brief         Dot product of floating-point vectors.
43   @param[in]     pSrcA      points to the first input vector.
44   @param[in]     pSrcB      points to the second input vector.
45   @param[in]     blockSize  number of samples in each vector.
46   @param[out]    result     output result returned here.
47  */
48 
49 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
50 
51 #include "arm_helium_utils.h"
52 
53 
/*
 * Helium (MVE) implementation of the half-precision dot product.
 *
 * Multiplies pSrcA[i] by pSrcB[i] for i in [0, blockSize) and accumulates
 * the products lane-wise in an 8-lane f16 vector; the lanes are then
 * reduced to one scalar that is written to *result.
 *
 * pSrcA, pSrcB : input vectors, read only, blockSize elements each.
 * blockSize    : number of elements in each input vector.
 * result       : receives the scalar dot product.
 */
void arm_dot_prod_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    uint32_t    blockSize,
    float16_t * result)
{
    f16x8_t vecA, vecB;
    f16x8_t vecSum;
    uint32_t blkCnt;
    float16_t sum = 0.0f;
    vecSum = vdupq_n_f16(0.0f);

    /* Main loop: consume 8 half-precision elements per iteration */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        /*
         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
         * Load 8 elements from each source, advance both source pointers,
         * and accumulate the lane-wise products into vecSum.
         */
        vecA = vld1q(pSrcA);
        pSrcA += 8;

        vecB = vld1q(pSrcB);
        pSrcB += 8;

        vecSum = vfmaq(vecSum, vecA, vecB);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 7 elements left over after the full 8-element groups */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 16-bit lanes */
        mve_pred16_t p0 = vctp16q(blkCnt);

        /*
         * Use zeroing predicated loads so that no memory beyond the last
         * valid element of either input is accessed; an unpredicated load
         * here would read up to 7 elements past the end of each buffer.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);

        /* Accumulate only the active lanes; the other lanes of vecSum are kept */
        vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
    }

    /*
     * Collapse the per-lane partial sums into a single value.
     * NOTE(review): vecAddAcrossF16Mve is defined outside this block
     * (presumably in arm_helium_utils.h) - confirm its summation order if
     * bit-exact results matter.
     */
    sum = vecAddAcrossF16Mve(vecSum);

    /* Store result in destination buffer */
    *result = sum;

}
106 
107 #else
108 #if defined(ARM_FLOAT16_SUPPORTED)
/*
 * Scalar implementation of the half-precision dot product.
 *
 * Walks both input vectors front to back, adding each product
 * pSrcA[i] * pSrcB[i] to a single _Float16 accumulator, and writes the
 * final accumulator value to *result.
 *
 * pSrcA, pSrcB : input vectors, read only, blockSize elements each.
 * blockSize    : number of elements in each input vector.
 * result       : receives the scalar dot product.
 */
void arm_dot_prod_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        uint32_t blockSize,
        float16_t * result)
{
  _Float16 acc = 0.0f;    /* Running sum of products */
  uint32_t remaining;     /* Iterations left in the current loop */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Unrolled stage: four products per pass, accumulated in element order.
   * The trailing loop below picks up the last 0 to 3 samples. */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
    acc += (_Float16)pSrcA[0] * (_Float16)pSrcB[0];
    acc += (_Float16)pSrcA[1] * (_Float16)pSrcB[1];
    acc += (_Float16)pSrcA[2] * (_Float16)pSrcB[2];
    acc += (_Float16)pSrcA[3] * (_Float16)pSrcB[3];

    /* Step both inputs past the four samples just consumed */
    pSrcA += 4;
    pSrcB += 4;
  }

  /* Samples not covered by the groups of four */
  remaining = blockSize & 0x3U;

#else

  /* No unrolling: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* One product per pass */
  for (; remaining > 0U; remaining--)
  {
    acc += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
  }

  /* Hand the accumulated value back to the caller */
  *result = acc;
}
167 #endif
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
169 /**
170   @} end of BasicDotProd group
171  */
172