1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_add_f32.c
4  * Description:  Floating-point matrix addition
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 
31 /**
32   @ingroup groupMatrix
33  */
34 
35 /**
36   @defgroup MatrixAdd Matrix Addition
37 
38   Adds two matrices.
39   \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
40 
  When <code>ARM_MATH_MATRIX_CHECK</code> is defined, the functions check that
  <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
  number of rows and columns, and return ARM_MATH_SIZE_MISMATCH otherwise.
44  */
45 
46 /**
47   @addtogroup MatrixAdd
48   @{
49  */
50 
51 
52 /**
53   @brief         Floating-point matrix addition.
54   @param[in]     pSrcA      points to first input matrix structure
55   @param[in]     pSrcB      points to second input matrix structure
56   @param[out]    pDst       points to output matrix structure
57   @return        execution status
58                    - \ref ARM_MATH_SUCCESS       : Operation successful
59                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
60  */
61 
62 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)63 arm_status arm_mat_add_f32(
64   const arm_matrix_instance_f32 * pSrcA,
65   const arm_matrix_instance_f32 * pSrcB,
66   arm_matrix_instance_f32 * pDst)
67 {
68     arm_status status;
69     uint32_t  numSamples;       /* total number of elements in the matrix  */
70     float32_t *pDataA, *pDataB, *pDataDst;
71     f32x4_t vecA, vecB, vecDst = { 0 };
72     float32_t const *pSrcAVec;
73     float32_t const *pSrcBVec;
74     uint32_t  blkCnt;           /* loop counters */
75 
76     pDataA = pSrcA->pData;
77     pDataB = pSrcB->pData;
78     pDataDst = pDst->pData;
79     pSrcAVec = (float32_t const *) pDataA;
80     pSrcBVec = (float32_t const *) pDataB;
81 
82 #ifdef ARM_MATH_MATRIX_CHECK
83   /* Check for matrix mismatch condition */
84   if ((pSrcA->numRows != pSrcB->numRows) ||
85      (pSrcA->numCols != pSrcB->numCols) ||
86      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
87   {
88     /* Set status as ARM_MATH_SIZE_MISMATCH */
89     status = ARM_MATH_SIZE_MISMATCH;
90   }
91   else
92 #endif
93  {
94     /*
95      * Total number of samples in the input matrix
96      */
97     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
98     blkCnt = numSamples >> 2;
99     while (blkCnt > 0U)
100     {
101         /* C(m,n) = A(m,n) + B(m,n) */
102         /* Add and then store the results in the destination buffer. */
103         vecA = vld1q(pSrcAVec);
104         pSrcAVec += 4;
105         vecB = vld1q(pSrcBVec);
106         pSrcBVec += 4;
107         vecDst = vaddq(vecA, vecB);
108         vst1q(pDataDst, vecDst);
109         pDataDst += 4;
110         /*
111          * Decrement the blockSize loop counter
112          */
113         blkCnt--;
114     }
115     /*
116      * tail
117      */
118     blkCnt = numSamples & 3;
119     if (blkCnt > 0U)
120     {
121         mve_pred16_t p0 = vctp32q(blkCnt);
122         vecA = vld1q(pSrcAVec);
123         vecB = vld1q(pSrcBVec);
124         vecDst = vaddq_m(vecDst, vecA, vecB, p0);
125         vstrwq_p(pDataDst, vecDst, p0);
126     }
127     /* set status as ARM_MATH_SUCCESS */
128     status = ARM_MATH_SUCCESS;
129   }
130   return (status);
131 }
132 #else
133 #if defined(ARM_MATH_NEON)
134 /*
135 
136 Neon version is assuming the matrix is small enough.
137 So no blocking is used for taking into account cache effects.
138 For big matrix, there exist better libraries for Neon.
139 
140 */
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)141 arm_status arm_mat_add_f32(
142   const arm_matrix_instance_f32 * pSrcA,
143   const arm_matrix_instance_f32 * pSrcB,
144   arm_matrix_instance_f32 * pDst)
145 {
146   float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A  */
147   float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B  */
148   float32_t *pOut = pDst->pData;                 /* output data matrix pointer   */
149 
150 
151   uint32_t numSamples;                           /* total number of elements in the matrix  */
152   uint32_t blkCnt;                               /* loop counters */
153   arm_status status;                             /* status of matrix addition */
154 
155 #ifdef ARM_MATH_MATRIX_CHECK
156   /* Check for matrix mismatch condition */
157   if ((pSrcA->numRows != pSrcB->numRows) ||
158      (pSrcA->numCols != pSrcB->numCols) ||
159      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
160   {
161     /* Set status as ARM_MATH_SIZE_MISMATCH */
162     status = ARM_MATH_SIZE_MISMATCH;
163   }
164   else
165 #endif
166   {
167     float32x4_t vec1;
168     float32x4_t vec2;
169     float32x4_t res;
170 
171     /* Total number of samples in the input matrix */
172     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
173 
174     blkCnt = numSamples >> 2U;
175 
176     /* Compute 4 outputs at a time.
177      ** a second loop below computes the remaining 1 to 3 samples. */
178     while (blkCnt > 0U)
179     {
180       /* C(m,n) = A(m,n) + B(m,n) */
181       /* Add and then store the results in the destination buffer. */
182       vec1 = vld1q_f32(pIn1);
183       vec2 = vld1q_f32(pIn2);
184       res = vaddq_f32(vec1, vec2);
185       vst1q_f32(pOut, res);
186 
187       /* update pointers to process next samples */
188       pIn1 += 4U;
189       pIn2 += 4U;
190       pOut += 4U;
191       /* Decrement the loop counter */
192       blkCnt--;
193     }
194 
195     /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
196      ** No loop unrolling is used. */
197     blkCnt = numSamples % 0x4U;
198 
199     while (blkCnt > 0U)
200     {
201       /* C(m,n) = A(m,n) + B(m,n) */
202       /* Add and then store the results in the destination buffer. */
203       *pOut++ = (*pIn1++) + (*pIn2++);
204 
205       /* Decrement the loop counter */
206       blkCnt--;
207     }
208 
209     /* set status as ARM_MATH_SUCCESS */
210     status = ARM_MATH_SUCCESS;
211   }
212 
213   /* Return to application */
214   return (status);
215 }
216 #else
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)217 arm_status arm_mat_add_f32(
218   const arm_matrix_instance_f32 * pSrcA,
219   const arm_matrix_instance_f32 * pSrcB,
220         arm_matrix_instance_f32 * pDst)
221 {
222   float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
223   float32_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
224   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
225 
226   uint32_t numSamples;                           /* total number of elements in the matrix */
227   uint32_t blkCnt;                               /* loop counters */
228   arm_status status;                             /* status of matrix addition */
229 
230 #ifdef ARM_MATH_MATRIX_CHECK
231 
232   /* Check for matrix mismatch condition */
233   if ((pSrcA->numRows != pSrcB->numRows) ||
234       (pSrcA->numCols != pSrcB->numCols) ||
235       (pSrcA->numRows != pDst->numRows)  ||
236       (pSrcA->numCols != pDst->numCols)    )
237   {
238     /* Set status as ARM_MATH_SIZE_MISMATCH */
239     status = ARM_MATH_SIZE_MISMATCH;
240   }
241   else
242 
243 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
244 
245   {
246     /* Total number of samples in input matrix */
247     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
248 
249 #if defined (ARM_MATH_LOOPUNROLL)
250 
251     /* Loop unrolling: Compute 4 outputs at a time */
252     blkCnt = numSamples >> 2U;
253 
254     while (blkCnt > 0U)
255     {
256       /* C(m,n) = A(m,n) + B(m,n) */
257 
258       /* Add and store result in destination buffer. */
259       *pOut++ = *pInA++ + *pInB++;
260 
261       *pOut++ = *pInA++ + *pInB++;
262 
263       *pOut++ = *pInA++ + *pInB++;
264 
265       *pOut++ = *pInA++ + *pInB++;
266 
267       /* Decrement loop counter */
268       blkCnt--;
269     }
270 
271     /* Loop unrolling: Compute remaining outputs */
272     blkCnt = numSamples % 0x4U;
273 
274 #else
275 
276     /* Initialize blkCnt with number of samples */
277     blkCnt = numSamples;
278 
279 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
280 
281     while (blkCnt > 0U)
282     {
283       /* C(m,n) = A(m,n) + B(m,n) */
284 
285       /* Add and store result in destination buffer. */
286       *pOut++ = *pInA++ + *pInB++;
287 
288       /* Decrement loop counter */
289       blkCnt--;
290     }
291 
292     /* Set status as ARM_MATH_SUCCESS */
293     status = ARM_MATH_SUCCESS;
294   }
295 
296   /* Return to application */
297   return (status);
298 }
299 #endif /* #if defined(ARM_MATH_NEON) */
300 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
301 
302 /**
303   @} end of MatrixAdd group
304  */
305