1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_var_f16.c
4  * Description:  Variance of the elements of a floating-point vector
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions_f16.h"
30 
31 
32 #if defined(ARM_FLOAT16_SUPPORTED)
33 
34 
35 /**
36   @ingroup groupStats
37  */
38 
39 
40 /**
41   @addtogroup variance
42   @{
43  */
44 
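/**
  @brief         Variance of the elements of a half-precision floating-point vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     blockSize  number of samples in input vector
  @param[out]    pResult    variance value returned here

  @par           Details
                   The result is the sum of squared deviations from the mean
                   divided by (blockSize - 1). When blockSize is 0 or 1 the
                   result is 0.
 */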
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
53 #include "arm_helium_utils.h"
54 
55 
/*
 * Helium (MVE) implementation.
 *
 * The mean is obtained from arm_mean_f16(); the input is then walked eight
 * half-precision samples at a time, accumulating per-lane sums of squared
 * deviations from that mean. The lane accumulators are finally reduced with
 * vecAddAcrossF16Mve() (project helper, presumably a horizontal add across
 * the vector) and divided by (blockSize - 1).
 *
 * pSrc       input vector
 * blockSize  number of samples in the input vector
 * pResult    receives the variance; set to 0 when blockSize <= 1
 */
void arm_var_f16(
           const float16_t * pSrc,
                 uint32_t blockSize,
                 float16_t * pResult)
{
    f16x8_t         sqDevAcc = vdupq_n_f16(0.0f16);   /* per-lane sum of squared deviations */
    f16x8_t         lanes;                            /* current group of (up to) 8 samples */
    float16_t       mean;
    int32_t         remaining;                        /* samples still to be processed */

    /* Fewer than two samples: report 0 and avoid the (blockSize - 1) division. */
    if (blockSize <= 1U)
    {
        *pResult = 0;
        return;
    }

    arm_mean_f16(pSrc, blockSize, &mean);

    remaining = blockSize;
    do
    {
        /* Predicate derived from the remaining count; it masks the tail group. */
        mve_pred16_t    tailMask = vctp16q(remaining);

        /* Predicated (zeroing) load of the next group of samples. */
        lanes = vldrhq_z_f16((float16_t const *) pSrc, tailMask);

        /*
         * Deviation from the mean. Masked-off lanes are left uninitialized,
         * which is harmless because the accumulate below uses the same mask.
         */
        lanes = vsubq_m(vuninitializedq_f16(), lanes, mean, tailMask);

        /* sqDevAcc += deviation * deviation, under the same predicate. */
        sqDevAcc = vfmaq_m(sqDevAcc, lanes, lanes, tailMask);

        pSrc += 8;
        remaining -= 8;
    }
    while (remaining > 0);

    /* Variance: reduced sum of squared deviations over (blockSize - 1). */
    *pResult = (_Float16)vecAddAcrossF16Mve(sqDevAcc) / (_Float16) (blockSize - 1.0f16);
}
93 #else
94 
arm_var_f16(const float16_t * pSrc,uint32_t blockSize,float16_t * pResult)95 void arm_var_f16(
96   const float16_t * pSrc,
97         uint32_t blockSize,
98         float16_t * pResult)
99 {
100         uint32_t blkCnt;                               /* Loop counter */
101         _Float16 sum = 0.0f;                          /* Temporary result storage */
102         _Float16 fSum = 0.0f;
103         _Float16 fMean, fValue;
104   const float16_t * pInput = pSrc;
105 
106   if (blockSize <= 1U)
107   {
108     *pResult = 0;
109     return;
110   }
111 
112 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
113 
114   /* Loop unrolling: Compute 4 outputs at a time */
115   blkCnt = blockSize >> 2U;
116 
117   while (blkCnt > 0U)
118   {
119     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
120 
121     sum += (_Float16)*pInput++;
122     sum += (_Float16)*pInput++;
123     sum += (_Float16)*pInput++;
124     sum += (_Float16)*pInput++;
125 
126 
127     /* Decrement loop counter */
128     blkCnt--;
129   }
130 
131   /* Loop unrolling: Compute remaining outputs */
132   blkCnt = blockSize % 0x4U;
133 
134 #else
135 
136   /* Initialize blkCnt with number of samples */
137   blkCnt = blockSize;
138 
139 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
140 
141   while (blkCnt > 0U)
142   {
143     /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
144 
145     sum += (_Float16)*pInput++;
146 
147     /* Decrement loop counter */
148     blkCnt--;
149   }
150 
151   /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
152   fMean = (_Float16)sum / (_Float16) blockSize;
153 
154   pInput = pSrc;
155 
156 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
157 
158   /* Loop unrolling: Compute 4 outputs at a time */
159   blkCnt = blockSize >> 2U;
160 
161   while (blkCnt > 0U)
162   {
163     fValue = (_Float16)*pInput++ - (_Float16)fMean;
164     fSum += (_Float16)fValue * (_Float16)fValue;
165 
166     fValue = (_Float16)*pInput++ - (_Float16)fMean;
167     fSum += (_Float16)fValue * (_Float16)fValue;
168 
169     fValue = (_Float16)*pInput++ - (_Float16)fMean;
170     fSum += (_Float16)fValue * (_Float16)fValue;
171 
172     fValue = (_Float16)*pInput++ - (_Float16)fMean;
173     fSum += (_Float16)fValue * (_Float16)fValue;
174 
175     /* Decrement loop counter */
176     blkCnt--;
177   }
178 
179   /* Loop unrolling: Compute remaining outputs */
180   blkCnt = blockSize % 0x4U;
181 
182 #else
183 
184   /* Initialize blkCnt with number of samples */
185   blkCnt = blockSize;
186 
187 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
188 
189   while (blkCnt > 0U)
190   {
191     fValue = (_Float16)*pInput++ - (_Float16)fMean;
192     fSum += (_Float16)fValue * (_Float16)fValue;
193 
194     /* Decrement loop counter */
195     blkCnt--;
196   }
197 
198   /* Variance */
199   *pResult = (_Float16)fSum / ((_Float16)blockSize - 1.0f16);
200 }
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
202 
203 /**
204   @} end of variance group
205  */
206 
207 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
208 
209