1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mse_f32.c
4 * Description: Floating point mean square error
5 *
6 * $Date: 05 April 2022
7 * $Revision: V1.10.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions.h"
30
31 /**
32 @ingroup groupStats
33 */
34
35 /**
36 @addtogroup MSE
37 @{
38 */
39
40 /**
41 @brief Mean square error between two floating point vectors.
42 @param[in] pSrcA points to the first input vector
43 @param[in] pSrcB points to the second input vector
44 @param[in] blockSize number of samples in input vector
45 @param[out] pResult mean square error
46 */
47
48 #if !defined(ARM_MATH_AUTOVECTORIZE)
49
50 #if defined(ARM_MATH_MVEF)
51 #include "arm_helium_utils.h"
52
arm_mse_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize,float32_t * pResult)53 ARM_DSP_ATTRIBUTE void arm_mse_f32(
54 const float32_t * pSrcA,
55 const float32_t * pSrcB,
56 uint32_t blockSize,
57 float32_t * pResult)
58
59 {
60 float32x4_t vecA, vecB;
61 float32x4_t vecSum;
62 uint32_t blkCnt;
63 float32_t sum = 0.0f;
64 vecSum = vdupq_n_f32(0.0f);
65
66 /* Compute 4 outputs at a time */
67 blkCnt = (blockSize) >> 2;
68 while (blkCnt > 0U)
69 {
70 vecA = vld1q(pSrcA);
71 pSrcA += 4;
72
73 vecB = vld1q(pSrcB);
74 pSrcB += 4;
75
76 vecA = vsubq(vecA, vecB);
77
78 vecSum = vfmaq(vecSum, vecA, vecA);
79 /*
80 * Decrement the blockSize loop counter
81 */
82 blkCnt --;
83 }
84
85
86 blkCnt = (blockSize) & 3;
87 if (blkCnt > 0U)
88 {
89 mve_pred16_t p0 = vctp32q(blkCnt);
90 vecA = vld1q(pSrcA);
91 vecB = vld1q(pSrcB);
92
93 vecA = vsubq(vecA, vecB);
94 vecSum = vfmaq_m(vecSum, vecA, vecA, p0);
95 }
96
97 sum = vecAddAcrossF32Mve(vecSum);
98
99 /* Store result in destination buffer */
100 *pResult = sum / blockSize;
101
102 }
103
104 #endif
105
106 #if defined(ARM_MATH_NEON)
arm_mse_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize,float32_t * pResult)107 ARM_DSP_ATTRIBUTE void arm_mse_f32(
108 const float32_t * pSrcA,
109 const float32_t * pSrcB,
110 uint32_t blockSize,
111 float32_t * pResult)
112
113 {
114 float32x4_t vecA, vecB;
115 float32x4_t vecSum;
116 uint32_t blkCnt;
117 float32_t inA, inB;
118 float32_t sum = 0.0f;
119 vecSum = vdupq_n_f32(0.0f);
120 #if !defined(__aarch64__)
121 f32x2_t tmp = vdup_n_f32(0.0f);
122 #endif
123
124 /* Compute 4 outputs at a time */
125 blkCnt = (blockSize) >> 2;
126 while (blkCnt > 0U)
127 {
128 vecA = vld1q_f32(pSrcA);
129 pSrcA += 4;
130
131 vecB = vld1q_f32(pSrcB);
132 pSrcB += 4;
133
134 vecA = vsubq_f32(vecA, vecB);
135
136 vecSum = vfmaq_f32(vecSum, vecA, vecA);
137 /*
138 * Decrement the blockSize loop counter
139 */
140 blkCnt --;
141 }
142
143 #if defined(__aarch64__)
144 sum = vpadds_f32(vpadd_f32(vget_low_f32(vecSum), vget_high_f32(vecSum)));
145 #else
146 tmp = vpadd_f32(vget_low_f32(vecSum), vget_high_f32(vecSum));
147 sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
148
149 #endif
150
151 blkCnt = (blockSize) & 3;
152 while (blkCnt > 0U)
153 {
154 /* Calculate dot product and store result in a temporary buffer. */
155 inA = *pSrcA++;
156 inB = *pSrcB++;
157 inA = inA - inB;
158 sum += inA * inA;
159
160 /* Decrement loop counter */
161 blkCnt--;
162 }
163
164 /* Store result in destination buffer */
165 *pResult = sum / blockSize;
166
167 }
168 #endif
169
170 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
171
172
173
174 #if (!defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON)) || defined(ARM_MATH_AUTOVECTORIZE)
175
176
arm_mse_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize,float32_t * pResult)177 ARM_DSP_ATTRIBUTE void arm_mse_f32(
178 const float32_t * pSrcA,
179 const float32_t * pSrcB,
180 uint32_t blockSize,
181 float32_t * pResult)
182
183 {
184 uint32_t blkCnt; /* Loop counter */
185 float32_t inA, inB;
186 float32_t sum = 0.0f; /* Temporary return variable */
187 #if defined (ARM_MATH_LOOPUNROLL)
188 /* Loop unrolling: Compute 4 outputs at a time */
189 blkCnt = (blockSize) >> 2;
190
191 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
192 ** a second loop below computes the remaining 1 to 3 samples. */
193 while (blkCnt > 0U)
194 {
195
196 inA = *pSrcA++;
197 inB = *pSrcB++;
198 inA = inA - inB;
199 sum += inA * inA;
200
201 inA = *pSrcA++;
202 inB = *pSrcB++;
203 inA = inA - inB;
204 sum += inA * inA;
205
206 inA = *pSrcA++;
207 inB = *pSrcB++;
208 inA = inA - inB;
209 sum += inA * inA;
210
211 inA = *pSrcA++;
212 inB = *pSrcB++;
213 inA = inA - inB;
214 sum += inA * inA;
215
216 /* Decrement loop counter */
217 blkCnt--;
218 }
219
220
221 /* Loop unrolling: Compute remaining outputs */
222 blkCnt = (blockSize) & 3;
223 #else
224 /* Initialize blkCnt with number of samples */
225 blkCnt = blockSize;
226 #endif
227 while (blkCnt > 0U)
228 {
229 inA = *pSrcA++;
230 inB = *pSrcB++;
231 inA = inA - inB;
232 sum += inA * inA;
233
234 /* Decrement loop counter */
235 blkCnt--;
236 }
237
238 /* Store result in destination buffer */
239 *pResult = sum / blockSize;
240 }
241
242 #endif /* end of test for vector instruction availability */
243
244 /**
245 @} end of MSE group
246 */
247