1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_add_f32.c
4 * Description: Floating-point matrix addition
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/matrix_functions.h"
30
31 /**
32 @ingroup groupMatrix
33 */
34
35 /**
36 @defgroup MatrixAdd Matrix Addition
37
38 Adds two matrices.
39 \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
40
When <code>ARM_MATH_MATRIX_CHECK</code> is defined, the functions check that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
44 */
45
46 /**
47 @addtogroup MatrixAdd
48 @{
49 */
50
51
52 /**
53 @brief Floating-point matrix addition.
54 @param[in] pSrcA points to first input matrix structure
55 @param[in] pSrcB points to second input matrix structure
56 @param[out] pDst points to output matrix structure
57 @return execution status
58 - \ref ARM_MATH_SUCCESS : Operation successful
59 - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
60 */
61
62 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)63 arm_status arm_mat_add_f32(
64 const arm_matrix_instance_f32 * pSrcA,
65 const arm_matrix_instance_f32 * pSrcB,
66 arm_matrix_instance_f32 * pDst)
67 {
68 arm_status status;
69 uint32_t numSamples; /* total number of elements in the matrix */
70 float32_t *pDataA, *pDataB, *pDataDst;
71 f32x4_t vecA, vecB, vecDst = { 0 };
72 float32_t const *pSrcAVec;
73 float32_t const *pSrcBVec;
74 uint32_t blkCnt; /* loop counters */
75
76 pDataA = pSrcA->pData;
77 pDataB = pSrcB->pData;
78 pDataDst = pDst->pData;
79 pSrcAVec = (float32_t const *) pDataA;
80 pSrcBVec = (float32_t const *) pDataB;
81
82 #ifdef ARM_MATH_MATRIX_CHECK
83 /* Check for matrix mismatch condition */
84 if ((pSrcA->numRows != pSrcB->numRows) ||
85 (pSrcA->numCols != pSrcB->numCols) ||
86 (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
87 {
88 /* Set status as ARM_MATH_SIZE_MISMATCH */
89 status = ARM_MATH_SIZE_MISMATCH;
90 }
91 else
92 #endif
93 {
94 /*
95 * Total number of samples in the input matrix
96 */
97 numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
98 blkCnt = numSamples >> 2;
99 while (blkCnt > 0U)
100 {
101 /* C(m,n) = A(m,n) + B(m,n) */
102 /* Add and then store the results in the destination buffer. */
103 vecA = vld1q(pSrcAVec);
104 pSrcAVec += 4;
105 vecB = vld1q(pSrcBVec);
106 pSrcBVec += 4;
107 vecDst = vaddq(vecA, vecB);
108 vst1q(pDataDst, vecDst);
109 pDataDst += 4;
110 /*
111 * Decrement the blockSize loop counter
112 */
113 blkCnt--;
114 }
115 /*
116 * tail
117 */
118 blkCnt = numSamples & 3;
119 if (blkCnt > 0U)
120 {
121 mve_pred16_t p0 = vctp32q(blkCnt);
122 vecA = vld1q(pSrcAVec);
123 vecB = vld1q(pSrcBVec);
124 vecDst = vaddq_m(vecDst, vecA, vecB, p0);
125 vstrwq_p(pDataDst, vecDst, p0);
126 }
127 /* set status as ARM_MATH_SUCCESS */
128 status = ARM_MATH_SUCCESS;
129 }
130 return (status);
131 }
132 #else
133 #if defined(ARM_MATH_NEON)
/*

Neon version assumes the matrix is small enough.
So no blocking is used to take cache effects into account.
For big matrices, better libraries exist for Neon.

*/
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)141 arm_status arm_mat_add_f32(
142 const arm_matrix_instance_f32 * pSrcA,
143 const arm_matrix_instance_f32 * pSrcB,
144 arm_matrix_instance_f32 * pDst)
145 {
146 float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
147 float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
148 float32_t *pOut = pDst->pData; /* output data matrix pointer */
149
150
151 uint32_t numSamples; /* total number of elements in the matrix */
152 uint32_t blkCnt; /* loop counters */
153 arm_status status; /* status of matrix addition */
154
155 #ifdef ARM_MATH_MATRIX_CHECK
156 /* Check for matrix mismatch condition */
157 if ((pSrcA->numRows != pSrcB->numRows) ||
158 (pSrcA->numCols != pSrcB->numCols) ||
159 (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
160 {
161 /* Set status as ARM_MATH_SIZE_MISMATCH */
162 status = ARM_MATH_SIZE_MISMATCH;
163 }
164 else
165 #endif
166 {
167 float32x4_t vec1;
168 float32x4_t vec2;
169 float32x4_t res;
170
171 /* Total number of samples in the input matrix */
172 numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
173
174 blkCnt = numSamples >> 2U;
175
176 /* Compute 4 outputs at a time.
177 ** a second loop below computes the remaining 1 to 3 samples. */
178 while (blkCnt > 0U)
179 {
180 /* C(m,n) = A(m,n) + B(m,n) */
181 /* Add and then store the results in the destination buffer. */
182 vec1 = vld1q_f32(pIn1);
183 vec2 = vld1q_f32(pIn2);
184 res = vaddq_f32(vec1, vec2);
185 vst1q_f32(pOut, res);
186
187 /* update pointers to process next samples */
188 pIn1 += 4U;
189 pIn2 += 4U;
190 pOut += 4U;
191 /* Decrement the loop counter */
192 blkCnt--;
193 }
194
195 /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
196 ** No loop unrolling is used. */
197 blkCnt = numSamples % 0x4U;
198
199 while (blkCnt > 0U)
200 {
201 /* C(m,n) = A(m,n) + B(m,n) */
202 /* Add and then store the results in the destination buffer. */
203 *pOut++ = (*pIn1++) + (*pIn2++);
204
205 /* Decrement the loop counter */
206 blkCnt--;
207 }
208
209 /* set status as ARM_MATH_SUCCESS */
210 status = ARM_MATH_SUCCESS;
211 }
212
213 /* Return to application */
214 return (status);
215 }
216 #else
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)217 arm_status arm_mat_add_f32(
218 const arm_matrix_instance_f32 * pSrcA,
219 const arm_matrix_instance_f32 * pSrcB,
220 arm_matrix_instance_f32 * pDst)
221 {
222 float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
223 float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
224 float32_t *pOut = pDst->pData; /* output data matrix pointer */
225
226 uint32_t numSamples; /* total number of elements in the matrix */
227 uint32_t blkCnt; /* loop counters */
228 arm_status status; /* status of matrix addition */
229
230 #ifdef ARM_MATH_MATRIX_CHECK
231
232 /* Check for matrix mismatch condition */
233 if ((pSrcA->numRows != pSrcB->numRows) ||
234 (pSrcA->numCols != pSrcB->numCols) ||
235 (pSrcA->numRows != pDst->numRows) ||
236 (pSrcA->numCols != pDst->numCols) )
237 {
238 /* Set status as ARM_MATH_SIZE_MISMATCH */
239 status = ARM_MATH_SIZE_MISMATCH;
240 }
241 else
242
243 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
244
245 {
246 /* Total number of samples in input matrix */
247 numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
248
249 #if defined (ARM_MATH_LOOPUNROLL)
250
251 /* Loop unrolling: Compute 4 outputs at a time */
252 blkCnt = numSamples >> 2U;
253
254 while (blkCnt > 0U)
255 {
256 /* C(m,n) = A(m,n) + B(m,n) */
257
258 /* Add and store result in destination buffer. */
259 *pOut++ = *pInA++ + *pInB++;
260
261 *pOut++ = *pInA++ + *pInB++;
262
263 *pOut++ = *pInA++ + *pInB++;
264
265 *pOut++ = *pInA++ + *pInB++;
266
267 /* Decrement loop counter */
268 blkCnt--;
269 }
270
271 /* Loop unrolling: Compute remaining outputs */
272 blkCnt = numSamples % 0x4U;
273
274 #else
275
276 /* Initialize blkCnt with number of samples */
277 blkCnt = numSamples;
278
279 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
280
281 while (blkCnt > 0U)
282 {
283 /* C(m,n) = A(m,n) + B(m,n) */
284
285 /* Add and store result in destination buffer. */
286 *pOut++ = *pInA++ + *pInB++;
287
288 /* Decrement loop counter */
289 blkCnt--;
290 }
291
292 /* Set status as ARM_MATH_SUCCESS */
293 status = ARM_MATH_SUCCESS;
294 }
295
296 /* Return to application */
297 return (status);
298 }
299 #endif /* #if defined(ARM_MATH_NEON) */
300 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
301
302 /**
303 @} end of MatrixAdd group
304 */
305