1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mse_f16.c
4 * Description: Half floating point mean square error
5 *
6 * $Date: 05 April 2022
7 * $Revision: V1.10.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions_f16.h"
30
31 /**
32 @ingroup groupStats
33 */
34
35 /**
36 @addtogroup MSE
37 @{
38 */
39
40 /**
  @brief         Mean square error between two half-precision floating-point vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in each input vector
  @param[out]    result      points to the location where the mean square error is stored
46 */
47
48 #if !defined(ARM_MATH_AUTOVECTORIZE)
49
50 #if defined(ARM_MATH_MVE_FLOAT16)
51 #include "arm_helium_utils.h"
52
arm_mse_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize,float16_t * result)53 ARM_DSP_ATTRIBUTE void arm_mse_f16(
54 const float16_t * pSrcA,
55 const float16_t * pSrcB,
56 uint32_t blockSize,
57 float16_t * result)
58
59 {
60 float16x8_t vecA, vecB;
61 float16x8_t vecSum;
62 uint32_t blkCnt;
63 _Float16 sum = 0.0f16;
64 vecSum = vdupq_n_f16(0.0f16);
65
66 blkCnt = (blockSize) >> 3;
67 while (blkCnt > 0U)
68 {
69 vecA = vld1q(pSrcA);
70 pSrcA += 8;
71
72 vecB = vld1q(pSrcB);
73 pSrcB += 8;
74
75 vecA = vsubq(vecA, vecB);
76
77 vecSum = vfmaq(vecSum, vecA, vecA);
78 /*
79 * Decrement the blockSize loop counter
80 */
81 blkCnt --;
82 }
83
84
85 blkCnt = (blockSize) & 7;
86 if (blkCnt > 0U)
87 {
88 mve_pred16_t p0 = vctp16q(blkCnt);
89 vecA = vld1q(pSrcA);
90 vecB = vld1q(pSrcB);
91
92 vecA = vsubq(vecA, vecB);
93 vecSum = vfmaq_m(vecSum, vecA, vecA, p0);
94 }
95
96 sum = vecAddAcrossF16Mve(vecSum);
97
98 /* Store result in destination buffer */
99 *result = (_Float16)sum / (_Float16)blockSize;
100
101 }
102
103 #endif
104
105
106 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
107
108
109 #if defined(ARM_FLOAT16_SUPPORTED)
110
111 #if (!defined(ARM_MATH_MVE_FLOAT16)) || defined(ARM_MATH_AUTOVECTORIZE)
112
113
114
arm_mse_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize,float16_t * result)115 ARM_DSP_ATTRIBUTE void arm_mse_f16(
116 const float16_t * pSrcA,
117 const float16_t * pSrcB,
118 uint32_t blockSize,
119 float16_t * result)
120
121 {
122 uint32_t blkCnt; /* Loop counter */
123 _Float16 inA, inB;
124 _Float16 sum = 0.0f16; /* Temporary return variable */
125 #if defined (ARM_MATH_LOOPUNROLL)
126 blkCnt = (blockSize) >> 3;
127
128
129 while (blkCnt > 0U)
130 {
131 inA = *pSrcA++;
132 inB = *pSrcB++;
133 inA = (_Float16)inA - (_Float16)inB;
134 sum += (_Float16)inA * (_Float16)inA;
135
136 inA = *pSrcA++;
137 inB = *pSrcB++;
138 inA = (_Float16)inA - (_Float16)inB;
139 sum += (_Float16)inA * (_Float16)inA;
140
141 inA = *pSrcA++;
142 inB = *pSrcB++;
143 inA = (_Float16)inA - (_Float16)inB;
144 sum += (_Float16)inA * (_Float16)inA;
145
146 inA = *pSrcA++;
147 inB = *pSrcB++;
148 inA = (_Float16)inA - (_Float16)inB;
149 sum += (_Float16)inA * (_Float16)inA;
150
151 inA = *pSrcA++;
152 inB = *pSrcB++;
153 inA = (_Float16)inA - (_Float16)inB;
154 sum += (_Float16)inA * (_Float16)inA;
155
156 inA = *pSrcA++;
157 inB = *pSrcB++;
158 inA = (_Float16)inA - (_Float16)inB;
159 sum += (_Float16)inA * (_Float16)inA;
160
161 inA = *pSrcA++;
162 inB = *pSrcB++;
163 inA = (_Float16)inA - (_Float16)inB;
164 sum += (_Float16)inA * (_Float16)inA;
165
166 inA = *pSrcA++;
167 inB = *pSrcB++;
168 inA = (_Float16)inA - (_Float16)inB;
169 sum += (_Float16)inA * (_Float16)inA;
170
171 /* Decrement loop counter */
172 blkCnt--;
173 }
174
175
176 /* Loop unrolling: Compute remaining outputs */
177 blkCnt = (blockSize) & 7;
178 #else
179 /* Initialize blkCnt with number of samples */
180 blkCnt = blockSize;
181 #endif
182 while (blkCnt > 0U)
183 {
184 inA = *pSrcA++;
185 inB = *pSrcB++;
186 inA = (_Float16)inA - (_Float16)inB;
187 sum += (_Float16)inA * (_Float16)inA;
188
189 /* Decrement loop counter */
190 blkCnt--;
191 }
192
193 /* Store result in destination buffer */
194 *result = (_Float16)sum / (_Float16)blockSize;
195 }
196
197 #endif /* end of test for vector instruction availability */
198
199 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
200 /**
201 @} end of MSE group
202 */
203