1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mean_f32.c
4 * Description: Mean value of a floating-point vector
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions.h"
30
31 /**
32 @ingroup groupStats
33 */
34
35
36 /**
37 @addtogroup mean
38 @{
39 */
40
41 /**
42 @brief Mean value of a floating-point vector.
43 @param[in] pSrc points to the input vector.
44 @param[in] blockSize number of samples in input vector.
45 @param[out] pResult mean value returned here.
46 */
47 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
48
49 #include "arm_helium_utils.h"
50
arm_mean_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)51 ARM_DSP_ATTRIBUTE void arm_mean_f32(
52 const float32_t * pSrc,
53 uint32_t blockSize,
54 float32_t * pResult)
55 {
56 uint32_t blkCnt; /* loop counters */
57 f32x4_t vecSrc;
58 f32x4_t sumVec = vdupq_n_f32(0.0f);
59 float32_t sum = 0.0f;
60
61 /* Compute 4 outputs at a time */
62 blkCnt = blockSize >> 2U;
63 while (blkCnt > 0U)
64 {
65 vecSrc = vldrwq_f32(pSrc);
66 sumVec = vaddq_f32(sumVec, vecSrc);
67
68 blkCnt --;
69 pSrc += 4;
70 }
71
72 sum = vecAddAcrossF32Mve(sumVec);
73
74 /* Tail */
75 blkCnt = blockSize & 0x3;
76
77 while (blkCnt > 0U)
78 {
79 /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
80 sum += *pSrc++;
81
82 /* Decrement loop counter */
83 blkCnt--;
84 }
85
86 *pResult = sum / (float32_t) blockSize;
87 }
88
89
90 #else
91 #if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mean_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)92 ARM_DSP_ATTRIBUTE void arm_mean_f32(
93 const float32_t * pSrc,
94 uint32_t blockSize,
95 float32_t * pResult)
96 {
97 float32_t sum = 0.0f; /* Temporary result storage */
98 float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
99 float32x2_t sumV2;
100
101 uint32_t blkCnt; /* Loop counter */
102
103 float32x4_t inV;
104
105 blkCnt = blockSize >> 2U;
106
107 /* Compute 4 outputs at a time.
108 ** a second loop below computes the remaining 1 to 3 samples. */
109 while (blkCnt > 0U)
110 {
111 /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
112 inV = vld1q_f32(pSrc);
113 sumV = vaddq_f32(sumV, inV);
114
115 pSrc += 4;
116 /* Decrement the loop counter */
117 blkCnt--;
118 }
119
120 sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
121 sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
122
123 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
124 ** No loop unrolling is used. */
125 blkCnt = blockSize & 3;
126
127 while (blkCnt > 0U)
128 {
129 /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
130 sum += *pSrc++;
131
132 /* Decrement the loop counter */
133 blkCnt--;
134 }
135
136 /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
137 /* Store the result to the destination */
138 *pResult = sum / (float32_t) blockSize;
139 }
140 #else
arm_mean_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult)141 ARM_DSP_ATTRIBUTE void arm_mean_f32(
142 const float32_t * pSrc,
143 uint32_t blockSize,
144 float32_t * pResult)
145 {
146 uint32_t blkCnt; /* Loop counter */
147 float32_t sum = 0.0f; /* Temporary result storage */
148
149 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
150
151 /* Loop unrolling: Compute 4 outputs at a time */
152 blkCnt = blockSize >> 2U;
153
154 while (blkCnt > 0U)
155 {
156 /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
157 sum += *pSrc++;
158
159 sum += *pSrc++;
160
161 sum += *pSrc++;
162
163 sum += *pSrc++;
164
165 /* Decrement the loop counter */
166 blkCnt--;
167 }
168
169 /* Loop unrolling: Compute remaining outputs */
170 blkCnt = blockSize % 0x4U;
171
172 #else
173
174 /* Initialize blkCnt with number of samples */
175 blkCnt = blockSize;
176
177 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
178
179 while (blkCnt > 0U)
180 {
181 /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
182 sum += *pSrc++;
183
184 /* Decrement loop counter */
185 blkCnt--;
186 }
187
188 /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
189 /* Store result to destination */
190 *pResult = (sum / blockSize);
191 }
192 #endif /* defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE) */
193 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
194
195 /**
196 @} end of mean group
197 */
198