1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_var_f32.c
4  * Description:  Variance of the elements of a floating-point vector
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions.h"
30 
31 /**
32   @ingroup groupStats
33  */
34 
35 /**
36   @defgroup variance  Variance
37 
38   Calculates the variance of the elements in the input vector.
39   The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
40 
41   <pre>
42       Result = sum(element - meanOfElements)^2) / numElement - 1
43 
44       meanOfElements = ( pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
45   </pre>
46 
47   There are separate functions for floating point, Q31, and Q15 data types.
48  */
49 
50 /**
51   @addtogroup variance
52   @{
53  */
54 
55 /**
56   @brief         Variance of the elements of a floating-point vector.
57   @param[in]     pSrc       points to the input vector
58   @param[in]     blockSize  number of samples in input vector
59   @param[out]    pResult    variance value returned here
60  */
61 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
62 
63 #include "arm_helium_utils.h"
64 
arm_var_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)65 ARM_DSP_ATTRIBUTE void arm_var_f32(
66            const float32_t * pSrc,
67                  uint32_t blockSize,
68                  float32_t * pResult)
69 {
70     uint32_t         blkCnt;     /* loop counters */
71     f32x4_t         vecSrc;
72     f32x4_t         sumVec = vdupq_n_f32(0.0f);
73     float32_t       fMean;
74     float32_t sum = 0.0f;                          /* accumulator */
75     float32_t in;                                  /* Temporary variable to store input value */
76 
77     if (blockSize <= 1U) {
78         *pResult = 0;
79         return;
80     }
81 
82     arm_mean_f32(pSrc, blockSize, &fMean);
83 
84     /* Compute 4 outputs at a time */
85     blkCnt = blockSize >> 2U;
86     while (blkCnt > 0U)
87     {
88 
89         vecSrc = vldrwq_f32(pSrc);
90         /*
91          * sum lanes
92          */
93         vecSrc = vsubq(vecSrc, fMean);
94         sumVec = vfmaq(sumVec, vecSrc, vecSrc);
95 
96         blkCnt --;
97         pSrc += 4;
98     }
99 
100     sum = vecAddAcrossF32Mve(sumVec);
101 
102     /*
103      * tail
104      */
105     blkCnt = blockSize & 0x3;
106     while (blkCnt > 0U)
107     {
108        in = *pSrc++ - fMean;
109        sum += in * in;
110 
111        /* Decrement loop counter */
112        blkCnt--;
113     }
114 
115     /* Variance */
116     *pResult = sum / (float32_t) (blockSize - 1);
117 }
118 #else
119 #if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_var_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)120 ARM_DSP_ATTRIBUTE void arm_var_f32(
121            const float32_t * pSrc,
122                  uint32_t blockSize,
123                  float32_t * pResult)
124 {
125   float32_t mean;
126 
127   float32_t sum = 0.0f;                          /* accumulator */
128   float32_t in;                                  /* Temporary variable to store input value */
129   uint32_t blkCnt;                               /* loop counter */
130 
131   float32x4_t sumV = vdupq_n_f32(0.0f);                          /* Temporary result storage */
132   float32x2_t sumV2;
133   float32x4_t inV;
134   float32x4_t avg;
135 
136   arm_mean_f32(pSrc,blockSize,&mean);
137   avg = vdupq_n_f32(mean);
138 
139   blkCnt = blockSize >> 2U;
140 
141   /* Compute 4 outputs at a time.
142    ** a second loop below computes the remaining 1 to 3 samples. */
143   while (blkCnt > 0U)
144   {
145     /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
146     /* Compute Power and then store the result in a temporary variable, sum. */
147     inV = vld1q_f32(pSrc);
148     inV = vsubq_f32(inV, avg);
149     sumV = vmlaq_f32(sumV, inV, inV);
150     pSrc += 4;
151 
152     /* Decrement the loop counter */
153     blkCnt--;
154   }
155 
156   sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
157   sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
158 
159   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
160    ** No loop unrolling is used. */
161   blkCnt = blockSize % 0x4U;
162 
163   while (blkCnt > 0U)
164   {
165     /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
166     /* compute power and then store the result in a temporary variable, sum. */
167     in = *pSrc++;
168     in = in - mean;
169     sum += in * in;
170 
171     /* Decrement the loop counter */
172     blkCnt--;
173   }
174 
175   /* Variance */
176   *pResult = sum / (float32_t)(blockSize - 1.0f);
177 
178 }
179 
180 #else
arm_var_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)181 ARM_DSP_ATTRIBUTE void arm_var_f32(
182   const float32_t * pSrc,
183         uint32_t blockSize,
184         float32_t * pResult)
185 {
186         uint32_t blkCnt;                               /* Loop counter */
187         float32_t sum = 0.0f;                          /* Temporary result storage */
188         float32_t fSum = 0.0f;
189         float32_t fMean, fValue;
190   const float32_t * pInput = pSrc;
191 
192   if (blockSize <= 1U)
193   {
194     *pResult = 0;
195     return;
196   }
197 
198 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
199 
200   /* Loop unrolling: Compute 4 outputs at a time */
201   blkCnt = blockSize >> 2U;
202 
203   while (blkCnt > 0U)
204   {
205     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
206 
207     sum += *pInput++;
208     sum += *pInput++;
209     sum += *pInput++;
210     sum += *pInput++;
211 
212 
213     /* Decrement loop counter */
214     blkCnt--;
215   }
216 
217   /* Loop unrolling: Compute remaining outputs */
218   blkCnt = blockSize % 0x4U;
219 
220 #else
221 
222   /* Initialize blkCnt with number of samples */
223   blkCnt = blockSize;
224 
225 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
226 
227   while (blkCnt > 0U)
228   {
229     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
230 
231     sum += *pInput++;
232 
233     /* Decrement loop counter */
234     blkCnt--;
235   }
236 
237   /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
238   fMean = sum / (float32_t) blockSize;
239 
240   pInput = pSrc;
241 
242 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
243 
244   /* Loop unrolling: Compute 4 outputs at a time */
245   blkCnt = blockSize >> 2U;
246 
247   while (blkCnt > 0U)
248   {
249     fValue = *pInput++ - fMean;
250     fSum += fValue * fValue;
251 
252     fValue = *pInput++ - fMean;
253     fSum += fValue * fValue;
254 
255     fValue = *pInput++ - fMean;
256     fSum += fValue * fValue;
257 
258     fValue = *pInput++ - fMean;
259     fSum += fValue * fValue;
260 
261     /* Decrement loop counter */
262     blkCnt--;
263   }
264 
265   /* Loop unrolling: Compute remaining outputs */
266   blkCnt = blockSize % 0x4U;
267 
268 #else
269 
270   /* Initialize blkCnt with number of samples */
271   blkCnt = blockSize;
272 
273 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
274 
275   while (blkCnt > 0U)
276   {
277     fValue = *pInput++ - fMean;
278     fSum += fValue * fValue;
279 
280     /* Decrement loop counter */
281     blkCnt--;
282   }
283 
284   /* Variance */
285   *pResult = fSum / (float32_t)(blockSize - 1.0f);
286 }
287 #endif /* #if defined(ARM_MATH_NEON) */
288 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
289 
290 /**
291   @} end of variance group
292  */
293