1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_dot_prod_q31.c
4 * Description: Q31 dot product
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/basic_math_functions.h"
30
31 /**
32 @ingroup groupMath
33 */
34
35 /**
36 @addtogroup BasicDotProd
37 @{
38 */
39
40 /**
41 @brief Dot product of Q31 vectors.
42 @param[in] pSrcA points to the first input vector.
43 @param[in] pSrcB points to the second input vector.
44 @param[in] blockSize number of samples in each vector.
45 @param[out] result output result returned here.
46
47 @par Scaling and Overflow Behavior
48 The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
49 are truncated to 2.48 format by discarding the lower 14 bits.
50 The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
51 There are 15 guard bits in the accumulator and there is no risk of overflow as long as
52 the length of the vectors is less than 2^16 elements.
53 The return result is in 16.48 format.
54 */
55
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
57
58 #include "arm_helium_utils.h"
59
arm_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t blockSize,q63_t * result)60 ARM_DSP_ATTRIBUTE void arm_dot_prod_q31(
61 const q31_t * pSrcA,
62 const q31_t * pSrcB,
63 uint32_t blockSize,
64 q63_t * result)
65 {
66 uint32_t blkCnt; /* loop counters */
67 q31x4_t vecA;
68 q31x4_t vecB;
69 q63_t sum = 0LL;
70
71 /* Compute 4 outputs at a time */
72 blkCnt = blockSize >> 2;
73 while (blkCnt > 0U)
74 {
75 /*
76 * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
77 * Calculate dot product and then store the result in a temporary buffer.
78 */
79 vecA = vld1q(pSrcA);
80 vecB = vld1q(pSrcB);
81 sum = vrmlaldavhaq(sum, vecA, vecB);
82 /*
83 * Decrement the blockSize loop counter
84 */
85 blkCnt--;
86 /*
87 * advance vector source and destination pointers
88 */
89 pSrcA += 4;
90 pSrcB += 4;
91 }
92 /*
93 * tail
94 */
95 blkCnt = blockSize & 3;
96 if (blkCnt > 0U)
97 {
98 mve_pred16_t p0 = vctp32q(blkCnt);
99 vecA = vld1q(pSrcA);
100 vecB = vld1q(pSrcB);
101 sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
102 }
103
104 /*
105 * vrmlaldavhaq provides extra intermediate accumulator headroom.
106 * limiting the need of intermediate scaling
107 * Scalar variant uses 2.48 accu format by right shifting accumulators by 14.
108 * 16.48 output conversion is performed outside the loop by scaling accu. by 6
109 */
110 *result = asrl(sum, (14 - 8));
111 }
112
113 #else
arm_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t blockSize,q63_t * result)114 ARM_DSP_ATTRIBUTE void arm_dot_prod_q31(
115 const q31_t * pSrcA,
116 const q31_t * pSrcB,
117 uint32_t blockSize,
118 q63_t * result)
119 {
120 uint32_t blkCnt; /* Loop counter */
121 q63_t sum = 0; /* Temporary return variable */
122
123 #if defined (ARM_MATH_LOOPUNROLL)
124
125 /* Loop unrolling: Compute 4 outputs at a time */
126 blkCnt = blockSize >> 2U;
127
128 while (blkCnt > 0U)
129 {
130 /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
131
132 /* Calculate dot product and store result in a temporary buffer. */
133 sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
134
135 sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
136
137 sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
138
139 sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
140
141 /* Decrement loop counter */
142 blkCnt--;
143 }
144
145 /* Loop unrolling: Compute remaining outputs */
146 blkCnt = blockSize % 0x4U;
147
148 #else
149
150 /* Initialize blkCnt with number of samples */
151 blkCnt = blockSize;
152
153 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
154
155 while (blkCnt > 0U)
156 {
157 /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
158
159 /* Calculate dot product and store result in a temporary buffer. */
160 sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
161
162 /* Decrement loop counter */
163 blkCnt--;
164 }
165
166 /* Store result in destination buffer in 16.48 format */
167 *result = sum;
168 }
169 #endif /* defined(ARM_MATH_MVEI) */
170
171 /**
172 @} end of BasicDotProd group
173 */
174