1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_accumulate_f32.c
4  * Description:  Sum value of a floating-point vector
5  *
6  * $Date:        14 July 2022
7  * $Revision:    V1.0.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions.h"
30 
31 /**
32  @ingroup groupStats
33  */
34 
35 
36 /**
37  @addtogroup Accumulation
38  @{
39  */
40 
41 /**
 @brief         Sum of the samples of a floating-point vector.
43  @param[in]     pSrc       points to the input vector.
44  @param[in]     blockSize  number of samples in input vector.
45  @param[out]    pResult    sum of values in input vector.
46  */
47 
48 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
49 
50 #include "arm_helium_utils.h"
51 
arm_accumulate_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)52 ARM_DSP_ATTRIBUTE void arm_accumulate_f32(
53                         const float32_t * pSrc,
54                         uint32_t blockSize,
55                         float32_t * pResult)
56 {
57     f32x4_t vecA;
58     f32x4_t vecSum;
59     uint32_t blkCnt;
60     float32_t sum = 0.0f;
61     vecSum = vdupq_n_f32(0.0f);
62 
63     /* Compute 4 outputs at a time */
64     blkCnt = blockSize >> 2U;
65     while (blkCnt > 0U)
66     {
67         /*
68          * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
69          * Calculate dot product and then store the result in a temporary buffer.
70          * and advance vector source and destination pointers
71          */
72         vecA = vld1q_f32(pSrc);
73         pSrc += 4;
74 
75         vecSum = vaddq_f32(vecSum, vecA);
76         /*
77          * Decrement the blockSize loop counter
78          */
79         blkCnt --;
80     }
81 
82 
83     blkCnt = blockSize & 3;
84     if (blkCnt > 0U)
85     {
86         /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
87 
88         mve_pred16_t p0 = vctp32q(blkCnt);
89         vecA = vld1q(pSrc);
90         vecSum = vaddq_m(vecSum,vecSum, vecA, p0);
91     }
92 
93     sum = vecAddAcrossF32Mve(vecSum);
94 
95     /* Store result in destination buffer */
96     *pResult = sum;
97 }
98 
99 #else
100 
101 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_accumulate_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)102 ARM_DSP_ATTRIBUTE void arm_accumulate_f32(
103                         const float32_t * pSrc,
104                         uint32_t blockSize,
105                         float32_t * pResult)
106 {
107   float32_t sum = 0.0f;                          /* Temporary result storage */
108   float32x4_t sumV = vdupq_n_f32(0.0f);                          /* Temporary result storage */
109   float32x2_t sumV2;
110 
111   uint32_t blkCnt;                               /* Loop counter */
112 
113   float32x4_t inV;
114 
115   blkCnt = blockSize >> 2U;
116 
117   /* Compute 4 outputs at a time.
118    ** a second loop below computes the remaining 1 to 3 samples. */
119   while (blkCnt > 0U)
120   {
121     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
122     inV = vld1q_f32(pSrc);
123     sumV = vaddq_f32(sumV, inV);
124 
125     pSrc += 4;
126     /* Decrement the loop counter */
127     blkCnt--;
128   }
129 
130   sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
131   sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
132 
133   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
134    ** No loop unrolling is used. */
135   blkCnt = blockSize & 3;
136 
137   while (blkCnt > 0U)
138   {
139     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
140     sum += *pSrc++;
141 
142     /* Decrement the loop counter */
143     blkCnt--;
144   }
145 
146   /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1])  */
147   /* Store the result to the destination */
148   *pResult = sum;
149 }
150 
151 #else
arm_accumulate_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)152 ARM_DSP_ATTRIBUTE void arm_accumulate_f32(
153                         const float32_t * pSrc,
154                         uint32_t blockSize,
155                         float32_t * pResult)
156 {
157   uint32_t blkCnt;                               /* Loop counter */
158   float32_t sum = 0.0f;                          /* Temporary result storage */
159 
160 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
161 
162   /* Loop unrolling: Compute 4 outputs at a time */
163   blkCnt = blockSize >> 2U;
164 
165   while (blkCnt > 0U)
166   {
167     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
168     sum += *pSrc++;
169 
170     sum += *pSrc++;
171 
172     sum += *pSrc++;
173 
174     sum += *pSrc++;
175 
176     /* Decrement the loop counter */
177     blkCnt--;
178   }
179 
180   /* Loop unrolling: Compute remaining outputs */
181   blkCnt = blockSize % 0x4U;
182 
183 #else
184 
185   /* Initialize blkCnt with number of samples */
186   blkCnt = blockSize;
187 
188 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
189 
190   while (blkCnt > 0U)
191   {
192     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
193     sum += *pSrc++;
194 
195     /* Decrement loop counter */
196     blkCnt--;
197   }
198 
199   /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1])  */
200   /* Store result to destination */
201   *pResult = sum ;
202 }
203 #endif /* #if defined(ARM_MATH_NEON) */
204 
205 #endif /* #if defined(ARM_MATH_MVEF) */
206 /**
207  @} end of Accumulation group
208  */
209