1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mse_q31.c
4 * Description: Mean square error between two Q31 vectors
5 *
6 * $Date: 04 April 2022
7 * $Revision: V1.10.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions.h"
30
31 /**
32 @ingroup groupStats
33 */
34
35
36 /**
37 @addtogroup MSE
38 @{
39 */
40
41 /**
42 @brief Mean square error between two Q31 vectors.
43 @param[in] pSrcA points to the first input vector
44 @param[in] pSrcB points to the second input vector
45 @param[in] blockSize number of samples in input vector
46 @param[out] pResult mean square error
47 */
48 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * Helium (MVE) implementation.
 *
 * Processes four q31 samples per iteration. Both inputs are shifted right by
 * one bit before the saturating subtraction, the squared differences are
 * accumulated across lanes into a 64-bit sum, and the sum is finally divided
 * by blockSize and shifted right by 21 bits before being stored.
 *
 * When blockSize is 0 there is nothing to average: 0 is written to *pResult
 * instead of dividing by zero.
 */
void arm_mse_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t blockSize,
        q31_t * pResult)
{
    uint32_t  blkCnt;               /* Loop counter */
    q31x4_t   vecSrcA, vecSrcB;     /* Current group of four samples from each input */
    q63_t     sum = 0LL;            /* Accumulated squared differences */

    /* Guard the final division: an empty input yields 0 */
    if (blockSize == 0U)
    {
        *pResult = 0;
        return;
    }

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);

        /* Halve both operands before subtracting */
        vecSrcA = vshrq(vecSrcA, 1);
        vecSrcB = vshrq(vecSrcB, 1);

        vecSrcA = vqsubq(vecSrcA, vecSrcB);

        /* Square the differences and sum the lanes into the accumulator */
        sum = vrmlaldavhaq(sum, vecSrcA, vecSrcA);

        blkCnt--;
        pSrcA += 4;
        pSrcB += 4;
    }

    /*
     * Tail: 1 to 3 samples remain.
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 32-bit lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);

        /*
         * Predicated (zeroing) loads: only the remaining samples are read,
         * so no memory beyond the end of the input buffers is accessed.
         */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);

        vecSrcA = vshrq(vecSrcA, 1);
        vecSrcB = vshrq(vecSrcB, 1);

        vecSrcA = vqsubq(vecSrcA, vecSrcB);

        /* Only the lanes selected by p0 contribute to the sum */
        sum = vrmlaldavhaq_p(sum, vecSrcA, vecSrcA, p0);
    }

    /* Average over the number of samples and scale to the output format */
    *pResult = (q31_t) ((sum / blockSize) >> 21);
}
103 #else
/*
 * Scalar implementation.
 *
 * For every sample pair both inputs are shifted right by one bit, subtracted
 * with the saturating __QSUB, squared in 64-bit arithmetic and shifted right
 * by 14 bits before being added to the accumulator. The accumulated sum is
 * then divided by blockSize and shifted right by a further 15 bits.
 *
 * When blockSize is 0 there is nothing to average: 0 is written to *pResult
 * instead of dividing by zero.
 */
void arm_mse_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t blockSize,
        q31_t * pResult)
{
        uint32_t blkCnt;                               /* Loop counter */
        q63_t sum = 0;                                 /* Accumulated scaled squared differences */

        q31_t inA32, inB32;                            /* Halved input samples; inA32 then holds their difference */

  /* Guard the final division: an empty input yields 0 */
  if (blockSize == 0U)
  {
    *pResult = 0;
    return;
  }

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;

    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;

    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;

    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* Halve both operands, take the saturating difference, accumulate its scaled square */
    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Average over the number of samples and store the result in q31 format */
  *pResult = (q31_t) ((sum / blockSize) >> 15);
}
171 #endif /* defined(ARM_MATH_MVEI) */
172
173 /**
174 @} end of MSE group
175 */
176