1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_sub_f32.c
4  * Description:  Floating-point matrix subtraction
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 
31 /**
32   @ingroup groupMatrix
33  */
34 
35 /**
36   @defgroup MatrixSub Matrix Subtraction
37 
38   Subtract two matrices.
  \image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
40 
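  When <code>ARM_MATH_MATRIX_CHECK</code> is defined, the functions check that
  <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
  number of rows and columns, and report <code>ARM_MATH_SIZE_MISMATCH</code>
  without touching the output if they do not. Without that macro no size
  check is performed.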
44  */
45 
46 /**
47   @addtogroup MatrixSub
48   @{
49  */
50 
51 /**
52   @brief         Floating-point matrix subtraction.
53   @param[in]     pSrcA      points to the first input matrix structure
54   @param[in]     pSrcB      points to the second input matrix structure
55   @param[out]    pDst       points to output matrix structure
56   @return        execution status
57                    - \ref ARM_MATH_SUCCESS       : Operation successful
58                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
59  */
60 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mat_sub_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)61 arm_status arm_mat_sub_f32(
62   const arm_matrix_instance_f32 * pSrcA,
63   const arm_matrix_instance_f32 * pSrcB,
64   arm_matrix_instance_f32 * pDst)
65 {
66     arm_status status;                             /* status of matrix subtraction */
67     uint32_t  numSamples;       /* total number of elements in the matrix  */
68     float32_t *pDataA, *pDataB, *pDataDst;
69     f32x4_t vecA, vecB, vecDst;
70     float32_t const *pSrcAVec;
71     float32_t const *pSrcBVec;
72     uint32_t  blkCnt;           /* loop counters */
73 
74     pDataA = pSrcA->pData;
75     pDataB = pSrcB->pData;
76     pDataDst = pDst->pData;
77     pSrcAVec = (float32_t const *) pDataA;
78     pSrcBVec = (float32_t const *) pDataB;
79 
80 #ifdef ARM_MATH_MATRIX_CHECK
81   /* Check for matrix mismatch condition */
82   if ((pSrcA->numRows != pSrcB->numRows) ||
83      (pSrcA->numCols != pSrcB->numCols) ||
84      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
85   {
86     /* Set status as ARM_MATH_SIZE_MISMATCH */
87     status = ARM_MATH_SIZE_MISMATCH;
88   }
89   else
90 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
91   {
92     /*
93      * Total number of samples in the input matrix
94      */
95     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
96     blkCnt = numSamples >> 2;
97     while (blkCnt > 0U)
98     {
99         /* C(m,n) = A(m,n) + B(m,n) */
100         /* sub and then store the results in the destination buffer. */
101         vecA = vld1q(pSrcAVec);
102         pSrcAVec += 4;
103         vecB = vld1q(pSrcBVec);
104         pSrcBVec += 4;
105         vecDst = vsubq(vecA, vecB);
106         vst1q(pDataDst, vecDst);
107         pDataDst += 4;
108         /*
109          * Decrement the blockSize loop counter
110          */
111         blkCnt--;
112     }
113     /*
114      * tail
115      * (will be merged thru tail predication)
116      */
117     blkCnt = numSamples & 3;
118     if (blkCnt > 0U)
119     {
120         mve_pred16_t p0 = vctp32q(blkCnt);
121         vecA = vld1q(pSrcAVec);
122         vecB = vld1q(pSrcBVec);
123         vecDst = vsubq_m(vecDst, vecA, vecB, p0);
124         vstrwq_p(pDataDst, vecDst, p0);
125     }
126     status = ARM_MATH_SUCCESS;
127   }
128 
129   /* Return to application */
130   return (status);
131 }
132 
133 #else
134 #if defined(ARM_MATH_NEON)
arm_mat_sub_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)135 arm_status arm_mat_sub_f32(
136   const arm_matrix_instance_f32 * pSrcA,
137   const arm_matrix_instance_f32 * pSrcB,
138   arm_matrix_instance_f32 * pDst)
139 {
140   float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A */
141   float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B */
142   float32_t *pOut = pDst->pData;                 /* output data matrix pointer  */
143 
144 
145   uint32_t numSamples;                           /* total number of elements in the matrix  */
146   uint32_t blkCnt;                               /* loop counters */
147   arm_status status;                             /* status of matrix subtraction */
148 
149 #ifdef ARM_MATH_MATRIX_CHECK
150   /* Check for matrix mismatch condition */
151   if ((pSrcA->numRows != pSrcB->numRows) ||
152      (pSrcA->numCols != pSrcB->numCols) ||
153      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
154   {
155     /* Set status as ARM_MATH_SIZE_MISMATCH */
156     status = ARM_MATH_SIZE_MISMATCH;
157   }
158   else
159 #endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
160   {
161     float32x4_t vec1;
162     float32x4_t vec2;
163     float32x4_t res;
164 
165     /* Total number of samples in the input matrix */
166     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
167 
168     blkCnt = numSamples >> 2U;
169 
170     /* Compute 4 outputs at a time.
171      ** a second loop below computes the remaining 1 to 3 samples. */
172     while (blkCnt > 0U)
173     {
174       /* C(m,n) = A(m,n) - B(m,n) */
175       /* Subtract and then store the results in the destination buffer. */
176       /* Read values from source A */
177       vec1 = vld1q_f32(pIn1);
178       vec2 = vld1q_f32(pIn2);
179       res = vsubq_f32(vec1, vec2);
180       vst1q_f32(pOut, res);
181 
182       /* Update pointers to process next samples */
183       pIn1 += 4U;
184       pIn2 += 4U;
185       pOut += 4U;
186 
187       /* Decrement the loop counter */
188       blkCnt--;
189     }
190 
191     /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
192      ** No loop unrolling is used. */
193     blkCnt = numSamples % 0x4U;
194 
195 
196     while (blkCnt > 0U)
197     {
198       /* C(m,n) = A(m,n) - B(m,n) */
199       /* Subtract and then store the results in the destination buffer. */
200       *pOut++ = (*pIn1++) - (*pIn2++);
201 
202       /* Decrement the loop counter */
203       blkCnt--;
204     }
205 
206     /* Set status as ARM_MATH_SUCCESS */
207     status = ARM_MATH_SUCCESS;
208   }
209 
210   /* Return to application */
211   return (status);
212 }
213 #else
arm_mat_sub_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)214 arm_status arm_mat_sub_f32(
215   const arm_matrix_instance_f32 * pSrcA,
216   const arm_matrix_instance_f32 * pSrcB,
217         arm_matrix_instance_f32 * pDst)
218 {
219   float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
220   float32_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
221   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
222 
223   uint32_t numSamples;                           /* total number of elements in the matrix */
224   uint32_t blkCnt;                               /* loop counters */
225   arm_status status;                             /* status of matrix subtraction */
226 
227 #ifdef ARM_MATH_MATRIX_CHECK
228 
229   /* Check for matrix mismatch condition */
230   if ((pSrcA->numRows != pSrcB->numRows) ||
231       (pSrcA->numCols != pSrcB->numCols) ||
232       (pSrcA->numRows != pDst->numRows)  ||
233       (pSrcA->numCols != pDst->numCols)    )
234   {
235     /* Set status as ARM_MATH_SIZE_MISMATCH */
236     status = ARM_MATH_SIZE_MISMATCH;
237   }
238   else
239 
240 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
241 
242   {
243     /* Total number of samples in input matrix */
244     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
245 
246 #if defined (ARM_MATH_LOOPUNROLL)
247 
248     /* Loop unrolling: Compute 4 outputs at a time */
249     blkCnt = numSamples >> 2U;
250 
251     while (blkCnt > 0U)
252     {
253       /* C(m,n) = A(m,n) - B(m,n) */
254 
255       /* Subtract and store result in destination buffer. */
256       *pOut++ = (*pInA++) - (*pInB++);
257       *pOut++ = (*pInA++) - (*pInB++);
258       *pOut++ = (*pInA++) - (*pInB++);
259       *pOut++ = (*pInA++) - (*pInB++);
260 
261       /* Decrement loop counter */
262       blkCnt--;
263     }
264 
265     /* Loop unrolling: Compute remaining outputs */
266     blkCnt = numSamples % 0x4U;
267 
268 #else
269 
270     /* Initialize blkCnt with number of samples */
271     blkCnt = numSamples;
272 
273 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
274 
275     while (blkCnt > 0U)
276     {
277       /* C(m,n) = A(m,n) - B(m,n) */
278 
279       /* Subtract and store result in destination buffer. */
280       *pOut++ = (*pInA++) - (*pInB++);
281 
282       /* Decrement loop counter */
283       blkCnt--;
284     }
285 
286     /* Set status as ARM_MATH_SUCCESS */
287     status = ARM_MATH_SUCCESS;
288   }
289 
290   /* Return to application */
291   return (status);
292 }
293 #endif /* #if defined(ARM_MATH_NEON) */
294 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
295 
296 /**
297   @} end of MatrixSub group
298  */
299