1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_add_f16.c
4  * Description:  Floating-point matrix addition
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 
34 /**
35   @ingroup groupMatrix
36  */
37 
38 
39 /**
40   @addtogroup MatrixAdd
41   @{
42  */
43 
44 
45 /**
  @brief         Half-precision floating-point matrix addition.
47   @param[in]     pSrcA      points to first input matrix structure
48   @param[in]     pSrcB      points to second input matrix structure
49   @param[out]    pDst       points to output matrix structure
50   @return        execution status
51                    - \ref ARM_MATH_SUCCESS       : Operation successful
52                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
53  */
54 
55 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
56 
arm_mat_add_f16(const arm_matrix_instance_f16 * pSrcA,const arm_matrix_instance_f16 * pSrcB,arm_matrix_instance_f16 * pDst)57 arm_status arm_mat_add_f16(
58   const arm_matrix_instance_f16 * pSrcA,
59   const arm_matrix_instance_f16 * pSrcB,
60   arm_matrix_instance_f16 * pDst)
61 {
62     arm_status status;
63     uint32_t  numSamples;       /* total number of elements in the matrix  */
64     float16_t *pDataA, *pDataB, *pDataDst;
65     f16x8_t vecA, vecB, vecDst;
66     float16_t const *pSrcAVec;
67     float16_t const *pSrcBVec;
68     uint32_t  blkCnt;           /* loop counters */
69 
70     pDataA = pSrcA->pData;
71     pDataB = pSrcB->pData;
72     pDataDst = pDst->pData;
73     pSrcAVec = (float16_t const *) pDataA;
74     pSrcBVec = (float16_t const *) pDataB;
75 
76 #ifdef ARM_MATH_MATRIX_CHECK
77   /* Check for matrix mismatch condition */
78   if ((pSrcA->numRows != pSrcB->numRows) ||
79      (pSrcA->numCols != pSrcB->numCols) ||
80      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
81   {
82     /* Set status as ARM_MATH_SIZE_MISMATCH */
83     status = ARM_MATH_SIZE_MISMATCH;
84   }
85   else
86 #endif
87  {
88     /*
89      * Total number of samples in the input matrix
90      */
91     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
92     blkCnt = numSamples >> 3;
93     while (blkCnt > 0U)
94     {
95         /* C(m,n) = A(m,n) + B(m,n) */
96         /* Add and then store the results in the destination buffer. */
97         vecA = vld1q(pSrcAVec);
98         pSrcAVec += 8;
99         vecB = vld1q(pSrcBVec);
100         pSrcBVec += 8;
101         vecDst = vaddq(vecA, vecB);
102         vst1q(pDataDst, vecDst);
103         pDataDst += 8;
104         /*
105          * Decrement the blockSize loop counter
106          */
107         blkCnt--;
108     }
109     /*
110      * tail
111      */
112     blkCnt = numSamples & 7;
113     if (blkCnt > 0U)
114     {
115         mve_pred16_t p0 = vctp16q(blkCnt);
116         vecA = vld1q(pSrcAVec);
117         vecB = vld1q(pSrcBVec);
118         vecDst = vaddq_m(vecDst, vecA, vecB, p0);
119         vstrhq_p(pDataDst, vecDst, p0);
120     }
121     /* set status as ARM_MATH_SUCCESS */
122     status = ARM_MATH_SUCCESS;
123   }
124   return (status);
125 }
126 #else
127 
arm_mat_add_f16(const arm_matrix_instance_f16 * pSrcA,const arm_matrix_instance_f16 * pSrcB,arm_matrix_instance_f16 * pDst)128 arm_status arm_mat_add_f16(
129   const arm_matrix_instance_f16 * pSrcA,
130   const arm_matrix_instance_f16 * pSrcB,
131         arm_matrix_instance_f16 * pDst)
132 {
133   float16_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
134   float16_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
135   float16_t *pOut = pDst->pData;                 /* output data matrix pointer */
136 
137   uint32_t numSamples;                           /* total number of elements in the matrix */
138   uint32_t blkCnt;                               /* loop counters */
139   arm_status status;                             /* status of matrix addition */
140 
141 #ifdef ARM_MATH_MATRIX_CHECK
142 
143   /* Check for matrix mismatch condition */
144   if ((pSrcA->numRows != pSrcB->numRows) ||
145       (pSrcA->numCols != pSrcB->numCols) ||
146       (pSrcA->numRows != pDst->numRows)  ||
147       (pSrcA->numCols != pDst->numCols)    )
148   {
149     /* Set status as ARM_MATH_SIZE_MISMATCH */
150     status = ARM_MATH_SIZE_MISMATCH;
151   }
152   else
153 
154 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
155 
156   {
157     /* Total number of samples in input matrix */
158     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
159 
160 #if defined (ARM_MATH_LOOPUNROLL)
161 
162     /* Loop unrolling: Compute 4 outputs at a time */
163     blkCnt = numSamples >> 2U;
164 
165     while (blkCnt > 0U)
166     {
167       /* C(m,n) = A(m,n) + B(m,n) */
168 
169       /* Add and store result in destination buffer. */
170       *pOut++ = *pInA++ + *pInB++;
171 
172       *pOut++ = *pInA++ + *pInB++;
173 
174       *pOut++ = *pInA++ + *pInB++;
175 
176       *pOut++ = *pInA++ + *pInB++;
177 
178       /* Decrement loop counter */
179       blkCnt--;
180     }
181 
182     /* Loop unrolling: Compute remaining outputs */
183     blkCnt = numSamples % 0x4U;
184 
185 #else
186 
187     /* Initialize blkCnt with number of samples */
188     blkCnt = numSamples;
189 
190 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
191 
192     while (blkCnt > 0U)
193     {
194       /* C(m,n) = A(m,n) + B(m,n) */
195 
196       /* Add and store result in destination buffer. */
197       *pOut++ = *pInA++ + *pInB++;
198 
199       /* Decrement loop counter */
200       blkCnt--;
201     }
202 
203     /* Set status as ARM_MATH_SUCCESS */
204     status = ARM_MATH_SUCCESS;
205   }
206 
207   /* Return to application */
208   return (status);
209 }
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
211 
212 /**
213   @} end of MatrixAdd group
214  */
215 
216 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
217 
218