1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_sigmoid_predict_f16.c
4  * Description:  SVM Sigmoid Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #include <limits.h>
34 #include <math.h>
35 
36 /**
37  * @addtogroup sigmoidsvm
38  * @{
39  */
40 
41 
42 
43 /**
44  * @brief SVM sigmoid prediction
 * @param[in]    S        Pointer to an instance of the sigmoid SVM structure.
46  * @param[in]    in       Pointer to input vector
47  * @param[out]   pResult  Decision value
48  *
49  */
50 
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
53 #include "arm_helium_utils.h"
54 #include "arm_vec_math_f16.h"
55 
/*
 * Helium (MVE) implementation of the f16 sigmoid-kernel SVM prediction.
 *
 * The decision value is
 *     intercept + sum_i dualCoefficients[i] * tanh(gamma * <in, sv_i> + coef0)
 * where sv_i is the i-th row of S->supportVectors (vectorDimension values
 * per row). The predicted class written to *pResult is
 * S->classes[STEP(decision value)].
 *
 * S        instance holding support vectors, dual coefficients, gamma,
 *          coef0, intercept and the class table
 * in       input vector, S->vectorDimension elements
 * pResult  receives the selected entry of S->classes
 *
 * Rows are processed 4 at a time, then a remainder of 2 and/or 1 rows.
 * Each stage consumes as many dual coefficients as rows it handles, so
 * pDualCoef must be advanced by the same amount after every stage.
 */
void arm_svm_sigmoid_predict_f16(
    const arm_svm_sigmoid_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
        /* inlined Matrix x Vector function interleaved with dot prod */
    uint32_t        numRows = S->nbOfSupportVectors;
    uint32_t        numCols = S->vectorDimension;
    const float16_t *pSrcA = S->supportVectors;
    const float16_t *pInA0;
    const float16_t *pInA1;
    uint32_t         row;
    uint32_t         blkCnt;     /* loop counters */
    const float16_t *pDualCoef = S->dualCoefficients;
    _Float16       sum = S->intercept;
    f16x8_t         vSum = vdupq_n_f16(0.0f);

    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4) {
        const float16_t *pInA2, *pInA3;
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
        /* only lanes 0..3 carry valid kernel values in this stage */
        mve_pred16_t    pRows = vctp16q(4);

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = in;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        acc2 = vdupq_n_f16(0.0f);
        acc3 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* full 8-element column blocks */
        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t         vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 8;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 8;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t    p0 = vctp16q(blkCnt);
            f16x8_t         vecA;

            /* zeroing loads: inactive lanes contribute 0 to the dot product */
            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts: one dot product per lane, lanes 0..3
         */
        f16x8_t         vtmp = vuninitializedq_f16();
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);

        /*
         * vSum += dualCoef * tanh(gamma * dot + coef0) on the active lanes.
         * The coefficients are loaded with the same predicate so that no
         * element past the 4 consumed here is read.
         */
        vSum =
            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, pRows),
                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
                        pRows);

        pDualCoef += 4;

        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel
     */
    if (row >= 2) {
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f16x8_t         vecIn, acc0, acc1;
        /* only lanes 0..1 carry valid kernel values in this stage */
        mve_pred16_t    pRows = vctp16q(2);

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = in;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t         vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t    p0 = vctp16q(blkCnt);
            f16x8_t         vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        f16x8_t         vtmp = vuninitializedq_f16();
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);

        vSum =
            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, pRows),
                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
                        pRows);

        /*
         * Two coefficients were consumed: advance so that a remaining
         * single row below is weighted with its own coefficient.
         */
        pDualCoef += 2;
        pSrcA += numCols * 2;
        row -= 2;
    }

    /*
     * last remaining row, if any
     */
    if (row >= 1) {
        f16x8_t         vecIn, acc0;
        float16_t const *pSrcA0Vec, *pInVec;
        /* only lane 0 carries a valid kernel value in this stage */
        mve_pred16_t    pRows = vctp16q(1);

        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = in;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t         vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t    p0 = vctp16q(blkCnt);
            f16x8_t         vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        f16x8_t         vtmp = vuninitializedq_f16();
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);

        vSum =
            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, pRows),
                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
                        pRows);
    }

    /* fold the per-lane partial sums into the scalar decision value */
    sum += (_Float16)vecAddAcrossF16Mve(vSum);

    *pResult = S->classes[STEP(sum)];
}
301 
302 #else
/*
 * Portable scalar implementation of the f16 sigmoid-kernel SVM prediction.
 *
 * Accumulates
 *     intercept + sum_i dualCoefficients[i] * tanh(gamma * <in, sv_i> + coef0)
 * over all support vectors (rows of S->supportVectors, vectorDimension
 * values each) and stores S->classes[STEP(result)] in *pResult.
 */
void arm_svm_sigmoid_predict_f16(
    const arm_svm_sigmoid_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    const uint32_t   nbVectors = S->nbOfSupportVectors;
    const uint32_t   dimension = S->vectorDimension;
    const float16_t *pRow = S->supportVectors;   /* start of current support vector */
    _Float16         decision = S->intercept;
    uint32_t         row;

    for (row = 0U; row < nbVectors; row++)
    {
        _Float16 dotProd = 0.0f16;
        uint32_t col;

        /* dot product between the input and the current support vector */
        for (col = 0U; col < dimension; col++)
        {
            dotProd = (_Float16)dotProd + (_Float16)in[col] * (_Float16)pRow[col];
        }
        pRow += dimension;

        /* tanh is evaluated in single precision, then narrowed back to f16 */
        decision += (_Float16)S->dualCoefficients[row] * (_Float16)tanhf((float32_t)((_Float16)S->gamma * (_Float16)dotProd + (_Float16)S->coef0));
    }

    *pResult = S->classes[STEP(decision)];
}
324 
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
326 
327 /**
328  * @} end of sigmoidsvm group
329  */
330 
331 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
332 
333