1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mse_f16.c
4  * Description:  Half floating point mean square error
5  *
6  * $Date:        05 April 2022
7  * $Revision:    V1.10.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions_f16.h"
30 
31 /**
32   @ingroup groupStats
33  */
34 
35 /**
36   @addtogroup MSE
37   @{
38  */
39 
40 /**
41   @brief         Mean square error between two half floating point vectors.
42   @param[in]     pSrcA       points to the first input vector
43   @param[in]     pSrcB       points to the second input vector
44   @param[in]     blockSize   number of samples in input vector
45   @param[out]    result      mean square error
46  */
47 
48 #if !defined(ARM_MATH_AUTOVECTORIZE)
49 
50 #if defined(ARM_MATH_MVE_FLOAT16)
51 #include "arm_helium_utils.h"
52 
arm_mse_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize,float16_t * result)53 ARM_DSP_ATTRIBUTE void arm_mse_f16(
54     const float16_t * pSrcA,
55     const float16_t * pSrcB,
56     uint32_t    blockSize,
57     float16_t * result)
58 
59 {
60     float16x8_t vecA, vecB;
61     float16x8_t vecSum;
62     uint32_t blkCnt;
63     _Float16 sum = 0.0f16;
64     vecSum = vdupq_n_f16(0.0f16);
65 
66     blkCnt = (blockSize) >> 3;
67     while (blkCnt > 0U)
68     {
69         vecA = vld1q(pSrcA);
70         pSrcA += 8;
71 
72         vecB = vld1q(pSrcB);
73         pSrcB += 8;
74 
75         vecA = vsubq(vecA, vecB);
76 
77         vecSum = vfmaq(vecSum, vecA, vecA);
78         /*
79          * Decrement the blockSize loop counter
80          */
81         blkCnt --;
82     }
83 
84 
85     blkCnt = (blockSize) & 7;
86     if (blkCnt > 0U)
87     {
88         mve_pred16_t p0 = vctp16q(blkCnt);
89         vecA = vld1q(pSrcA);
90         vecB = vld1q(pSrcB);
91 
92         vecA = vsubq(vecA, vecB);
93         vecSum = vfmaq_m(vecSum, vecA, vecA, p0);
94     }
95 
96     sum = vecAddAcrossF16Mve(vecSum);
97 
98     /* Store result in destination buffer */
99     *result = (_Float16)sum / (_Float16)blockSize;
100 
101 }
102 
103 #endif
104 
105 
106 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
107 
108 
109 #if defined(ARM_FLOAT16_SUPPORTED)
110 
111 #if (!defined(ARM_MATH_MVE_FLOAT16)) || defined(ARM_MATH_AUTOVECTORIZE)
112 
113 
114 
arm_mse_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize,float16_t * result)115 ARM_DSP_ATTRIBUTE void arm_mse_f16(
116     const float16_t * pSrcA,
117     const float16_t * pSrcB,
118     uint32_t    blockSize,
119     float16_t * result)
120 
121 {
122   uint32_t blkCnt;                               /* Loop counter */
123   _Float16 inA, inB;
124   _Float16 sum = 0.0f16;                          /* Temporary return variable */
125 #if defined (ARM_MATH_LOOPUNROLL)
126   blkCnt = (blockSize) >> 3;
127 
128 
129   while (blkCnt > 0U)
130   {
131     inA = *pSrcA++;
132     inB = *pSrcB++;
133     inA = (_Float16)inA - (_Float16)inB;
134     sum += (_Float16)inA * (_Float16)inA;
135 
136     inA = *pSrcA++;
137     inB = *pSrcB++;
138     inA = (_Float16)inA - (_Float16)inB;
139     sum += (_Float16)inA * (_Float16)inA;
140 
141     inA = *pSrcA++;
142     inB = *pSrcB++;
143     inA = (_Float16)inA - (_Float16)inB;
144     sum += (_Float16)inA * (_Float16)inA;
145 
146     inA = *pSrcA++;
147     inB = *pSrcB++;
148     inA = (_Float16)inA - (_Float16)inB;
149     sum += (_Float16)inA * (_Float16)inA;
150 
151     inA = *pSrcA++;
152     inB = *pSrcB++;
153     inA = (_Float16)inA - (_Float16)inB;
154     sum += (_Float16)inA * (_Float16)inA;
155 
156     inA = *pSrcA++;
157     inB = *pSrcB++;
158     inA = (_Float16)inA - (_Float16)inB;
159     sum += (_Float16)inA * (_Float16)inA;
160 
161     inA = *pSrcA++;
162     inB = *pSrcB++;
163     inA = (_Float16)inA - (_Float16)inB;
164     sum += (_Float16)inA * (_Float16)inA;
165 
166     inA = *pSrcA++;
167     inB = *pSrcB++;
168     inA = (_Float16)inA - (_Float16)inB;
169     sum += (_Float16)inA * (_Float16)inA;
170 
171     /* Decrement loop counter */
172     blkCnt--;
173   }
174 
175 
176   /* Loop unrolling: Compute remaining outputs */
177   blkCnt = (blockSize) & 7;
178 #else
179   /* Initialize blkCnt with number of samples */
180   blkCnt = blockSize;
181 #endif
182   while (blkCnt > 0U)
183   {
184     inA = *pSrcA++;
185     inB = *pSrcB++;
186     inA = (_Float16)inA - (_Float16)inB;
187     sum += (_Float16)inA * (_Float16)inA;
188 
189     /* Decrement loop counter */
190     blkCnt--;
191   }
192 
193   /* Store result in destination buffer */
194   *result = (_Float16)sum / (_Float16)blockSize;
195 }
196 
197 #endif /* end of test for vector instruction availability */
198 
199 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
200 /**
201   @} end of MSE group
202  */
203