1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_dot_prod_q31.c
4  * Description:  Q31 dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/basic_math_functions.h"
30 
31 /**
32   @ingroup groupMath
33  */
34 
35 /**
36   @addtogroup BasicDotProd
37   @{
38  */
39 
40 /**
41   @brief         Dot product of Q31 vectors.
42   @param[in]     pSrcA      points to the first input vector.
43   @param[in]     pSrcB      points to the second input vector.
44   @param[in]     blockSize  number of samples in each vector.
45   @param[out]    result     output result returned here.
46 
47   @par           Scaling and Overflow Behavior
48                    The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
49                    are truncated to 2.48 format by discarding the lower 14 bits.
50                    The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
51                    There are 15 guard bits in the accumulator and there is no risk of overflow as long as
52                    the length of the vectors is less than 2^16 elements.
53                    The return result is in 16.48 format.
54  */
55 
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
57 
58 #include "arm_helium_utils.h"
59 
arm_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t blockSize,q63_t * result)60 ARM_DSP_ATTRIBUTE void arm_dot_prod_q31(
61     const q31_t * pSrcA,
62     const q31_t * pSrcB,
63     uint32_t blockSize,
64     q63_t * result)
65 {
66     uint32_t  blkCnt;           /* loop counters */
67     q31x4_t vecA;
68     q31x4_t vecB;
69     q63_t     sum = 0LL;
70 
71     /* Compute 4 outputs at a time */
72     blkCnt = blockSize >> 2;
73     while (blkCnt > 0U)
74     {
75         /*
76          * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
77          * Calculate dot product and then store the result in a temporary buffer.
78          */
79         vecA = vld1q(pSrcA);
80         vecB = vld1q(pSrcB);
81         sum = vrmlaldavhaq(sum, vecA, vecB);
82         /*
83          * Decrement the blockSize loop counter
84          */
85         blkCnt--;
86         /*
87          * advance vector source and destination pointers
88          */
89         pSrcA += 4;
90         pSrcB += 4;
91     }
92     /*
93      * tail
94      */
95     blkCnt = blockSize & 3;
96     if (blkCnt > 0U)
97     {
98         mve_pred16_t p0 = vctp32q(blkCnt);
99         vecA = vld1q(pSrcA);
100         vecB = vld1q(pSrcB);
101         sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
102     }
103 
104     /*
105      * vrmlaldavhaq provides extra intermediate accumulator headroom.
106      * limiting the need of intermediate scaling
107      * Scalar variant uses 2.48 accu format by right shifting accumulators by 14.
108      * 16.48 output conversion is performed outside the loop by scaling accu. by 6
109      */
110     *result = asrl(sum, (14 - 8));
111 }
112 
113 #else
arm_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t blockSize,q63_t * result)114 ARM_DSP_ATTRIBUTE void arm_dot_prod_q31(
115   const q31_t * pSrcA,
116   const q31_t * pSrcB,
117         uint32_t blockSize,
118         q63_t * result)
119 {
120         uint32_t blkCnt;                               /* Loop counter */
121         q63_t sum = 0;                                 /* Temporary return variable */
122 
123 #if defined (ARM_MATH_LOOPUNROLL)
124 
125   /* Loop unrolling: Compute 4 outputs at a time */
126   blkCnt = blockSize >> 2U;
127 
128   while (blkCnt > 0U)
129   {
130     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
131 
132     /* Calculate dot product and store result in a temporary buffer. */
133     sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
134 
135     sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
136 
137     sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
138 
139     sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
140 
141     /* Decrement loop counter */
142     blkCnt--;
143   }
144 
145   /* Loop unrolling: Compute remaining outputs */
146   blkCnt = blockSize % 0x4U;
147 
148 #else
149 
150   /* Initialize blkCnt with number of samples */
151   blkCnt = blockSize;
152 
153 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
154 
155   while (blkCnt > 0U)
156   {
157     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
158 
159     /* Calculate dot product and store result in a temporary buffer. */
160     sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
161 
162     /* Decrement loop counter */
163     blkCnt--;
164   }
165 
166   /* Store result in destination buffer in 16.48 format */
167   *result = sum;
168 }
169 #endif /* defined(ARM_MATH_MVEI) */
170 
171 /**
172   @} end of BasicDotProd group
173  */
174