1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_power_f32.c
4 * Description: Sum of the squares of the elements of a floating-point vector
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions.h"
30
31 /**
32 @ingroup groupStats
33 */
34
35 /**
36 @defgroup power Power
37
38 Calculates the sum of the squares of the elements in the input vector.
  The following algorithm is used:
40
41 <pre>
42 Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
43 </pre>
44
45 There are separate functions for floating point, Q31, Q15, and Q7 data types.
46
  Because the result is not divided by the vector length, these functions
  in fact compute something closer to an energy than to a power.
49
50 */
51
52 /**
53 @addtogroup power
54 @{
55 */
56
57 /**
58 @brief Sum of the squares of the elements of a floating-point vector.
59 @param[in] pSrc points to the input vector
60 @param[in] blockSize number of samples in input vector
61 @param[out] pResult sum of the squares value returned here
62 */
63 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
64
65 #include "arm_helium_utils.h"
66
arm_power_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)67 ARM_DSP_ATTRIBUTE void arm_power_f32(
68 const float32_t * pSrc,
69 uint32_t blockSize,
70 float32_t * pResult)
71 {
72 uint32_t blkCnt; /* loop counters */
73 f32x4_t vecSrc;
74 f32x4_t sumVec = vdupq_n_f32(0.0f);
75 float32_t sum = 0.0f;
76 float32_t in;
77
78 /* Compute 4 outputs at a time */
79 blkCnt = blockSize >> 2U;
80 while (blkCnt > 0U)
81 {
82 vecSrc = vldrwq_f32(pSrc);
83 /*
84 * sum lanes
85 */
86 sumVec = vfmaq(sumVec, vecSrc, vecSrc);
87
88 blkCnt --;
89 pSrc += 4;
90 }
91 sum = vecAddAcrossF32Mve(sumVec);
92
93 /*
94 * tail
95 */
96 blkCnt = blockSize & 0x3;
97 while (blkCnt > 0U)
98 {
99 /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
100
101 /* Compute Power and store result in a temporary variable, sum. */
102 in = *pSrc++;
103 sum += in * in;
104
105 /* Decrement loop counter */
106 blkCnt--;
107 }
108
109 *pResult = sum;
110 }
111 #else
112 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_power_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)113 ARM_DSP_ATTRIBUTE void arm_power_f32(
114 const float32_t * pSrc,
115 uint32_t blockSize,
116 float32_t * pResult)
117 {
118 float32_t sum = 0.0f; /* accumulator */
119 float32_t in; /* Temporary variable to store input value */
120 uint32_t blkCnt; /* loop counter */
121
122 float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
123 float32x2_t sumV2;
124 float32x4_t inV;
125
126 blkCnt = blockSize >> 2U;
127
128 /* Compute 4 outputs at a time.
129 ** a second loop below computes the remaining 1 to 3 samples. */
130 while (blkCnt > 0U)
131 {
132 /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
133 /* Compute Power and then store the result in a temporary variable, sum. */
134 inV = vld1q_f32(pSrc);
135 sumV = vmlaq_f32(sumV, inV, inV);
136 pSrc += 4;
137
138 /* Decrement the loop counter */
139 blkCnt--;
140 }
141 sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
142 sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
143
144 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
145 ** No loop unrolling is used. */
146 blkCnt = blockSize % 0x4U;
147
148 while (blkCnt > 0U)
149 {
150 /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
151 /* compute power and then store the result in a temporary variable, sum. */
152 in = *pSrc++;
153 sum += in * in;
154
155 /* Decrement the loop counter */
156 blkCnt--;
157 }
158
159 /* Store the result to the destination */
160 *pResult = sum;
161 }
162 #else
arm_power_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)163 ARM_DSP_ATTRIBUTE void arm_power_f32(
164 const float32_t * pSrc,
165 uint32_t blockSize,
166 float32_t * pResult)
167 {
168 uint32_t blkCnt; /* Loop counter */
169 float32_t sum = 0.0f; /* Temporary result storage */
170 float32_t in; /* Temporary variable to store input value */
171
172 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
173
174 /* Loop unrolling: Compute 4 outputs at a time */
175 blkCnt = blockSize >> 2U;
176
177 while (blkCnt > 0U)
178 {
179 /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
180
181 /* Compute Power and store result in a temporary variable, sum. */
182 in = *pSrc++;
183 sum += in * in;
184
185 in = *pSrc++;
186 sum += in * in;
187
188 in = *pSrc++;
189 sum += in * in;
190
191 in = *pSrc++;
192 sum += in * in;
193
194 /* Decrement loop counter */
195 blkCnt--;
196 }
197
198 /* Loop unrolling: Compute remaining outputs */
199 blkCnt = blockSize % 0x4U;
200
201 #else
202
203 /* Initialize blkCnt with number of samples */
204 blkCnt = blockSize;
205
206 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
207
208 while (blkCnt > 0U)
209 {
210 /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
211
212 /* Compute Power and store result in a temporary variable, sum. */
213 in = *pSrc++;
214 sum += in * in;
215
216 /* Decrement loop counter */
217 blkCnt--;
218 }
219
220 /* Store result to destination */
221 *pResult = sum;
222 }
223 #endif /* #if defined(ARM_MATH_NEON) */
224 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
225
226 /**
227 @} end of power group
228 */
229