1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_linear_predict_f16.c
4  * Description:  SVM Linear Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #include <limits.h>
34 #include <math.h>
35 
36 
37 /**
38  * @addtogroup linearsvm
39  * @{
40  */
41 
42 
43 /**
44  * @brief SVM linear prediction
45  * @param[in]    S          Pointer to an instance of the linear SVM structure.
46  * @param[in]    in         Pointer to input vector
47  * @param[out]   pResult    Decision value
48  *
49  */
50 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
51 
52 #include "arm_helium_utils.h"
53 
arm_svm_linear_predict_f16(const arm_svm_linear_instance_f16 * S,const float16_t * in,int32_t * pResult)54 ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f16(
55     const arm_svm_linear_instance_f16 *S,
56     const float16_t * in,
57     int32_t * pResult)
58 {
59         /* inlined Matrix x Vector function interleaved with dot prod */
60     uint32_t        numRows = S->nbOfSupportVectors;
61     uint32_t        numCols = S->vectorDimension;
62     const float16_t *pSupport = S->supportVectors;
63     const float16_t *pSrcA = pSupport;
64     const float16_t *pInA0;
65     const float16_t *pInA1;
66     uint32_t         row;
67     uint32_t         blkCnt;     /* loop counters */
68     const float16_t *pDualCoef = S->dualCoefficients;
69     _Float16       sum = S->intercept;
70     row = numRows;
71 
72     /*
73      * compute 4 rows in parrallel
74      */
75     while (row >= 4)
76     {
77         const float16_t *pInA2, *pInA3;
78         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
79         f16x8_t         vecIn, acc0, acc1, acc2, acc3;
80         float16_t const *pSrcVecPtr = in;
81 
82         /*
83          * Initialize the pointers to 4 consecutive MatrixA rows
84          */
85         pInA0 = pSrcA;
86         pInA1 = pInA0 + numCols;
87         pInA2 = pInA1 + numCols;
88         pInA3 = pInA2 + numCols;
89         /*
90          * Initialize the vector pointer
91          */
92         pInVec = pSrcVecPtr;
93         /*
94          * reset accumulators
95          */
96         acc0 = vdupq_n_f16(0.0f);
97         acc1 = vdupq_n_f16(0.0f);
98         acc2 = vdupq_n_f16(0.0f);
99         acc3 = vdupq_n_f16(0.0f);
100 
101         pSrcA0Vec = pInA0;
102         pSrcA1Vec = pInA1;
103         pSrcA2Vec = pInA2;
104         pSrcA3Vec = pInA3;
105 
106         blkCnt = numCols >> 3;
107         while (blkCnt > 0U) {
108             f16x8_t         vecA;
109 
110             vecIn = vld1q(pInVec);
111             pInVec += 8;
112             vecA = vld1q(pSrcA0Vec);
113             pSrcA0Vec += 8;
114             acc0 = vfmaq(acc0, vecIn, vecA);
115             vecA = vld1q(pSrcA1Vec);
116             pSrcA1Vec += 8;
117             acc1 = vfmaq(acc1, vecIn, vecA);
118             vecA = vld1q(pSrcA2Vec);
119             pSrcA2Vec += 8;
120             acc2 = vfmaq(acc2, vecIn, vecA);
121             vecA = vld1q(pSrcA3Vec);
122             pSrcA3Vec += 8;
123             acc3 = vfmaq(acc3, vecIn, vecA);
124 
125             blkCnt--;
126         }
127         /*
128          * tail
129          * (will be merged thru tail predication)
130          */
131         blkCnt = numCols & 7;
132         if (blkCnt > 0U) {
133             mve_pred16_t    p0 = vctp16q(blkCnt);
134             f16x8_t         vecA;
135 
136             vecIn = vldrhq_z_f16(pInVec, p0);
137             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
138             acc0 = vfmaq(acc0, vecIn, vecA);
139             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
140             acc1 = vfmaq(acc1, vecIn, vecA);
141             vecA = vldrhq_z_f16(pSrcA2Vec, p0);
142             acc2 = vfmaq(acc2, vecIn, vecA);
143             vecA = vldrhq_z_f16(pSrcA3Vec, p0);
144             acc3 = vfmaq(acc3, vecIn, vecA);
145         }
146         /*
147          * Sum the partial parts
148          */
149         acc0 = vmulq_n_f16(acc0,*pDualCoef++);
150         acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
151         acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
152         acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
153 
154         sum += (_Float16)vecAddAcrossF16Mve(acc0);
155 
156         pSrcA += numCols * 4;
157         /*
158          * Decrement the row loop counter
159          */
160         row -= 4;
161     }
162 
163     /*
164      * compute 2 rows in parallel
165      */
166     if (row >= 2) {
167         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
168         f16x8_t         vecIn, acc0, acc1;
169         float16_t const *pSrcVecPtr = in;
170 
171         /*
172          * Initialize the pointers to 2 consecutive MatrixA rows
173          */
174         pInA0 = pSrcA;
175         pInA1 = pInA0 + numCols;
176         /*
177          * Initialize the vector pointer
178          */
179         pInVec = pSrcVecPtr;
180         /*
181          * reset accumulators
182          */
183         acc0 = vdupq_n_f16(0.0f);
184         acc1 = vdupq_n_f16(0.0f);
185         pSrcA0Vec = pInA0;
186         pSrcA1Vec = pInA1;
187 
188         blkCnt = numCols >> 3;
189         while (blkCnt > 0U) {
190             f16x8_t         vecA;
191 
192             vecIn = vld1q(pInVec);
193             pInVec += 8;
194             vecA = vld1q(pSrcA0Vec);
195             pSrcA0Vec += 8;
196             acc0 = vfmaq(acc0, vecIn, vecA);
197             vecA = vld1q(pSrcA1Vec);
198             pSrcA1Vec += 8;
199             acc1 = vfmaq(acc1, vecIn, vecA);
200 
201             blkCnt--;
202         }
203         /*
204          * tail
205          * (will be merged thru tail predication)
206          */
207         blkCnt = numCols & 7;
208         if (blkCnt > 0U) {
209             mve_pred16_t    p0 = vctp16q(blkCnt);
210             f16x8_t         vecA;
211 
212             vecIn = vldrhq_z_f16(pInVec, p0);
213             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
214             acc0 = vfmaq(acc0, vecIn, vecA);
215             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
216             acc1 = vfmaq(acc1, vecIn, vecA);
217         }
218         /*
219          * Sum the partial parts
220          */
221         acc0 = vmulq_n_f16(acc0,*pDualCoef++);
222         acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
223 
224         sum += (_Float16)vecAddAcrossF16Mve(acc0);
225 
226         pSrcA += numCols * 2;
227         row -= 2;
228     }
229 
230     if (row >= 1) {
231         f16x8_t         vecIn, acc0;
232         float16_t const *pSrcA0Vec, *pInVec;
233         float16_t const *pSrcVecPtr = in;
234         /*
235          * Initialize the pointers to last MatrixA row
236          */
237         pInA0 = pSrcA;
238         /*
239          * Initialize the vector pointer
240          */
241         pInVec = pSrcVecPtr;
242         /*
243          * reset accumulators
244          */
245         acc0 = vdupq_n_f16(0.0f);
246 
247         pSrcA0Vec = pInA0;
248 
249         blkCnt = numCols >> 3;
250         while (blkCnt > 0U) {
251             f16x8_t         vecA;
252 
253             vecIn = vld1q(pInVec);
254             pInVec += 8;
255             vecA = vld1q(pSrcA0Vec);
256             pSrcA0Vec += 8;
257             acc0 = vfmaq(acc0, vecIn, vecA);
258 
259             blkCnt--;
260         }
261         /*
262          * tail
263          * (will be merged thru tail predication)
264          */
265         blkCnt = numCols & 7;
266         if (blkCnt > 0U) {
267             mve_pred16_t    p0 = vctp16q(blkCnt);
268             f16x8_t         vecA;
269 
270             vecIn = vldrhq_z_f16(pInVec, p0);
271             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
272             acc0 = vfmaq(acc0, vecIn, vecA);
273         }
274         /*
275          * Sum the partial parts
276          */
277         sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
278 
279     }
280 
281     *pResult = S->classes[STEP(sum)];
282 }
283 
284 #else
arm_svm_linear_predict_f16(const arm_svm_linear_instance_f16 * S,const float16_t * in,int32_t * pResult)285 ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f16(
286     const arm_svm_linear_instance_f16 *S,
287     const float16_t * in,
288     int32_t * pResult)
289 {
290     _Float16 sum=S->intercept;
291     _Float16 dot=0;
292     uint32_t i,j;
293     const float16_t *pSupport = S->supportVectors;
294 
295     for(i=0; i < S->nbOfSupportVectors; i++)
296     {
297         dot=0;
298         for(j=0; j < S->vectorDimension; j++)
299         {
300             dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
301         }
302         sum += (_Float16)S->dualCoefficients[i] * (_Float16)dot;
303     }
304     *pResult=S->classes[STEP(sum)];
305 }
306 #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
307 
308 /**
309  * @} end of linearsvm group
310  */
311 
312 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
313 
314