1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_power_f32.c
4  * Description:  Sum of the squares of the elements of a floating-point vector
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions.h"
30 
31 /**
32   @ingroup groupStats
33  */
34 
35 /**
36   @defgroup power Power
37 
38   Calculates the sum of the squares of the elements in the input vector.
  The following algorithm is used:
40 
41   <pre>
42       Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
43   </pre>
44 
45   There are separate functions for floating point, Q31, Q15, and Q7 data types.
46 
  Because the result is not divided by the vector length, these functions
  actually compute the energy of the signal rather than its power.
49 
50  */
51 
52 /**
53   @addtogroup power
54   @{
55  */
56 
57 /**
58   @brief         Sum of the squares of the elements of a floating-point vector.
59   @param[in]     pSrc       points to the input vector
60   @param[in]     blockSize  number of samples in input vector
61   @param[out]    pResult    sum of the squares value returned here
62  */
63 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
64 
65 #include "arm_helium_utils.h"
66 
arm_power_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)67 ARM_DSP_ATTRIBUTE void arm_power_f32(
68   const float32_t * pSrc,
69   uint32_t blockSize,
70   float32_t * pResult)
71 {
72     uint32_t        blkCnt;     /* loop counters */
73     f32x4_t         vecSrc;
74     f32x4_t         sumVec = vdupq_n_f32(0.0f);
75     float32_t       sum = 0.0f;
76     float32_t in;
77 
78     /* Compute 4 outputs at a time */
79     blkCnt = blockSize >> 2U;
80     while (blkCnt > 0U)
81     {
82         vecSrc = vldrwq_f32(pSrc);
83         /*
84          * sum lanes
85          */
86         sumVec = vfmaq(sumVec, vecSrc, vecSrc);
87 
88         blkCnt --;
89         pSrc += 4;
90     }
91     sum = vecAddAcrossF32Mve(sumVec);
92 
93     /*
94      * tail
95      */
96     blkCnt = blockSize & 0x3;
97     while (blkCnt > 0U)
98     {
99       /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
100 
101       /* Compute Power and store result in a temporary variable, sum. */
102       in = *pSrc++;
103       sum += in * in;
104 
105       /* Decrement loop counter */
106       blkCnt--;
107     }
108 
109     *pResult = sum;
110 }
111 #else
112 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_power_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)113 ARM_DSP_ATTRIBUTE void arm_power_f32(
114   const float32_t * pSrc,
115   uint32_t blockSize,
116   float32_t * pResult)
117 {
118   float32_t sum = 0.0f;                          /* accumulator */
119   float32_t in;                                  /* Temporary variable to store input value */
120   uint32_t blkCnt;                               /* loop counter */
121 
122   float32x4_t sumV = vdupq_n_f32(0.0f);                          /* Temporary result storage */
123   float32x2_t sumV2;
124   float32x4_t inV;
125 
126   blkCnt = blockSize >> 2U;
127 
128   /* Compute 4 outputs at a time.
129    ** a second loop below computes the remaining 1 to 3 samples. */
130   while (blkCnt > 0U)
131   {
132     /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
133     /* Compute Power and then store the result in a temporary variable, sum. */
134     inV = vld1q_f32(pSrc);
135     sumV = vmlaq_f32(sumV, inV, inV);
136     pSrc += 4;
137 
138     /* Decrement the loop counter */
139     blkCnt--;
140   }
141   sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
142   sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
143 
144   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
145    ** No loop unrolling is used. */
146   blkCnt = blockSize % 0x4U;
147 
148   while (blkCnt > 0U)
149   {
150     /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
151     /* compute power and then store the result in a temporary variable, sum. */
152     in = *pSrc++;
153     sum += in * in;
154 
155     /* Decrement the loop counter */
156     blkCnt--;
157   }
158 
159   /* Store the result to the destination */
160   *pResult = sum;
161 }
162 #else
arm_power_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)163 ARM_DSP_ATTRIBUTE void arm_power_f32(
164   const float32_t * pSrc,
165         uint32_t blockSize,
166         float32_t * pResult)
167 {
168         uint32_t blkCnt;                               /* Loop counter */
169         float32_t sum = 0.0f;                          /* Temporary result storage */
170         float32_t in;                                  /* Temporary variable to store input value */
171 
172 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
173 
174   /* Loop unrolling: Compute 4 outputs at a time */
175   blkCnt = blockSize >> 2U;
176 
177   while (blkCnt > 0U)
178   {
179     /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
180 
181     /* Compute Power and store result in a temporary variable, sum. */
182     in = *pSrc++;
183     sum += in * in;
184 
185     in = *pSrc++;
186     sum += in * in;
187 
188     in = *pSrc++;
189     sum += in * in;
190 
191     in = *pSrc++;
192     sum += in * in;
193 
194     /* Decrement loop counter */
195     blkCnt--;
196   }
197 
198   /* Loop unrolling: Compute remaining outputs */
199   blkCnt = blockSize % 0x4U;
200 
201 #else
202 
203   /* Initialize blkCnt with number of samples */
204   blkCnt = blockSize;
205 
206 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
207 
208   while (blkCnt > 0U)
209   {
210     /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
211 
212     /* Compute Power and store result in a temporary variable, sum. */
213     in = *pSrc++;
214     sum += in * in;
215 
216     /* Decrement loop counter */
217     blkCnt--;
218   }
219 
220   /* Store result to destination */
221   *pResult = sum;
222 }
223 #endif /* #if defined(ARM_MATH_NEON) */
224 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
225 
226 /**
227   @} end of power group
228  */
229