1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mse_f32.c
4  * Description:  Floating point mean square error
5  *
6  * $Date:        05 April 2022
7  * $Revision:    V1.10.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions.h"
30 
31 /**
32   @ingroup groupStats
33  */
34 
35 /**
36   @addtogroup MSE
37   @{
38  */
39 
40 /**
41   @brief         Mean square error between two floating point vectors.
42   @param[in]     pSrcA       points to the first input vector
43   @param[in]     pSrcB       points to the second input vector
44   @param[in]     blockSize   number of samples in input vector
45   @param[out]    pResult      mean square error
46  */
47 
48 #if !defined(ARM_MATH_AUTOVECTORIZE)
49 
50 #if defined(ARM_MATH_MVEF)
51 #include "arm_helium_utils.h"
52 
arm_mse_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize,float32_t * pResult)53 ARM_DSP_ATTRIBUTE void arm_mse_f32(
54     const float32_t * pSrcA,
55     const float32_t * pSrcB,
56     uint32_t    blockSize,
57     float32_t * pResult)
58 
59 {
60     float32x4_t vecA, vecB;
61     float32x4_t vecSum;
62     uint32_t blkCnt;
63     float32_t sum = 0.0f;
64     vecSum = vdupq_n_f32(0.0f);
65 
66     /* Compute 4 outputs at a time */
67     blkCnt = (blockSize) >> 2;
68     while (blkCnt > 0U)
69     {
70         vecA = vld1q(pSrcA);
71         pSrcA += 4;
72 
73         vecB = vld1q(pSrcB);
74         pSrcB += 4;
75 
76         vecA = vsubq(vecA, vecB);
77 
78         vecSum = vfmaq(vecSum, vecA, vecA);
79         /*
80          * Decrement the blockSize loop counter
81          */
82         blkCnt --;
83     }
84 
85 
86     blkCnt = (blockSize) & 3;
87     if (blkCnt > 0U)
88     {
89         mve_pred16_t p0 = vctp32q(blkCnt);
90         vecA = vld1q(pSrcA);
91         vecB = vld1q(pSrcB);
92 
93         vecA = vsubq(vecA, vecB);
94         vecSum = vfmaq_m(vecSum, vecA, vecA, p0);
95     }
96 
97     sum = vecAddAcrossF32Mve(vecSum);
98 
99     /* Store result in destination buffer */
100     *pResult = sum / blockSize;
101 
102 }
103 
104 #endif
105 
106 #if defined(ARM_MATH_NEON)
arm_mse_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize,float32_t * pResult)107 ARM_DSP_ATTRIBUTE void arm_mse_f32(
108     const float32_t * pSrcA,
109     const float32_t * pSrcB,
110     uint32_t    blockSize,
111     float32_t * pResult)
112 
113 {
114     float32x4_t vecA, vecB;
115     float32x4_t vecSum;
116     uint32_t blkCnt;
117     float32_t inA, inB;
118     float32_t sum = 0.0f;
119     vecSum = vdupq_n_f32(0.0f);
120 #if !defined(__aarch64__)
121     f32x2_t tmp = vdup_n_f32(0.0f);
122 #endif
123 
124     /* Compute 4 outputs at a time */
125     blkCnt = (blockSize) >> 2;
126     while (blkCnt > 0U)
127     {
128         vecA = vld1q_f32(pSrcA);
129         pSrcA += 4;
130 
131         vecB = vld1q_f32(pSrcB);
132         pSrcB += 4;
133 
134         vecA = vsubq_f32(vecA, vecB);
135 
136         vecSum = vfmaq_f32(vecSum, vecA, vecA);
137         /*
138          * Decrement the blockSize loop counter
139          */
140         blkCnt --;
141     }
142 
143 #if defined(__aarch64__)
144     sum = vpadds_f32(vpadd_f32(vget_low_f32(vecSum), vget_high_f32(vecSum)));
145 #else
146     tmp = vpadd_f32(vget_low_f32(vecSum), vget_high_f32(vecSum));
147     sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
148 
149 #endif
150 
151     blkCnt = (blockSize) & 3;
152     while (blkCnt > 0U)
153     {
154         /* Calculate dot product and store result in a temporary buffer. */
155         inA = *pSrcA++;
156         inB = *pSrcB++;
157         inA = inA - inB;
158         sum += inA * inA;
159 
160         /* Decrement loop counter */
161         blkCnt--;
162     }
163 
164     /* Store result in destination buffer */
165     *pResult = sum / blockSize;
166 
167 }
168 #endif
169 
170 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
171 
172 
173 
174 #if (!defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON)) || defined(ARM_MATH_AUTOVECTORIZE)
175 
176 
arm_mse_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize,float32_t * pResult)177 ARM_DSP_ATTRIBUTE void arm_mse_f32(
178     const float32_t * pSrcA,
179     const float32_t * pSrcB,
180     uint32_t    blockSize,
181     float32_t * pResult)
182 
183 {
184   uint32_t blkCnt;                               /* Loop counter */
185   float32_t inA, inB;
186   float32_t sum = 0.0f;                          /* Temporary return variable */
187 #if defined (ARM_MATH_LOOPUNROLL)
188   /* Loop unrolling: Compute 4 outputs at a time */
189   blkCnt = (blockSize) >> 2;
190 
191   /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
192    ** a second loop below computes the remaining 1 to 3 samples. */
193   while (blkCnt > 0U)
194   {
195 
196     inA = *pSrcA++;
197     inB = *pSrcB++;
198     inA = inA - inB;
199     sum += inA * inA;
200 
201     inA = *pSrcA++;
202     inB = *pSrcB++;
203     inA = inA - inB;
204     sum += inA * inA;
205 
206     inA = *pSrcA++;
207     inB = *pSrcB++;
208     inA = inA - inB;
209     sum += inA * inA;
210 
211     inA = *pSrcA++;
212     inB = *pSrcB++;
213     inA = inA - inB;
214     sum += inA * inA;
215 
216     /* Decrement loop counter */
217     blkCnt--;
218   }
219 
220 
221   /* Loop unrolling: Compute remaining outputs */
222   blkCnt = (blockSize) & 3;
223 #else
224   /* Initialize blkCnt with number of samples */
225   blkCnt = blockSize;
226 #endif
227   while (blkCnt > 0U)
228   {
229     inA = *pSrcA++;
230     inB = *pSrcB++;
231     inA = inA - inB;
232     sum += inA * inA;
233 
234     /* Decrement loop counter */
235     blkCnt--;
236   }
237 
238   /* Store result in destination buffer */
239   *pResult = sum / blockSize;
240 }
241 
242 #endif /* end of test for vector instruction availability */
243 
244 /**
245   @} end of MSE group
246  */
247