1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_sigmoid_predict_f16.c
4  * Description:  SVM Sigmoid Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #include <limits.h>
34 #include <math.h>
35 
36 /**
37  * @addtogroup sigmoidsvm
38  * @{
39  */
40 
41 
42 
/**
 * @brief SVM sigmoid prediction
 * @param[in]    S        Pointer to an instance of the sigmoid SVM structure.
 * @param[in]    in       Pointer to input vector
 * @param[out]   pResult  Predicted class: the entry of S->classes selected by the decision value
 * @return none.
 *
 */
51 
52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
53 
54 #include "arm_helium_utils.h"
55 #include "arm_vec_math_f16.h"
56 
arm_svm_sigmoid_predict_f16(const arm_svm_sigmoid_instance_f16 * S,const float16_t * in,int32_t * pResult)57 void arm_svm_sigmoid_predict_f16(
58     const arm_svm_sigmoid_instance_f16 *S,
59     const float16_t * in,
60     int32_t * pResult)
61 {
62         /* inlined Matrix x Vector function interleaved with dot prod */
63     uint32_t        numRows = S->nbOfSupportVectors;
64     uint32_t        numCols = S->vectorDimension;
65     const float16_t *pSupport = S->supportVectors;
66     const float16_t *pSrcA = pSupport;
67     const float16_t *pInA0;
68     const float16_t *pInA1;
69     uint32_t         row;
70     uint32_t         blkCnt;     /* loop counters */
71     const float16_t *pDualCoef = S->dualCoefficients;
72     _Float16       sum = S->intercept;
73     f16x8_t         vSum = vdupq_n_f16(0.0f);
74 
75     row = numRows;
76 
77     /*
78      * compute 4 rows in parrallel
79      */
80     while (row >= 4) {
81         const float16_t *pInA2, *pInA3;
82         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
83         f16x8_t         vecIn, acc0, acc1, acc2, acc3;
84         float16_t const *pSrcVecPtr = in;
85 
86         /*
87          * Initialize the pointers to 4 consecutive MatrixA rows
88          */
89         pInA0 = pSrcA;
90         pInA1 = pInA0 + numCols;
91         pInA2 = pInA1 + numCols;
92         pInA3 = pInA2 + numCols;
93         /*
94          * Initialize the vector pointer
95          */
96         pInVec = pSrcVecPtr;
97         /*
98          * reset accumulators
99          */
100         acc0 = vdupq_n_f16(0.0f);
101         acc1 = vdupq_n_f16(0.0f);
102         acc2 = vdupq_n_f16(0.0f);
103         acc3 = vdupq_n_f16(0.0f);
104 
105         pSrcA0Vec = pInA0;
106         pSrcA1Vec = pInA1;
107         pSrcA2Vec = pInA2;
108         pSrcA3Vec = pInA3;
109 
110         blkCnt = numCols >> 3;
111         while (blkCnt > 0U) {
112             f16x8_t         vecA;
113 
114             vecIn = vld1q(pInVec);
115             pInVec += 8;
116             vecA = vld1q(pSrcA0Vec);
117             pSrcA0Vec += 8;
118             acc0 = vfmaq(acc0, vecIn, vecA);
119             vecA = vld1q(pSrcA1Vec);
120             pSrcA1Vec += 8;
121             acc1 = vfmaq(acc1, vecIn, vecA);
122             vecA = vld1q(pSrcA2Vec);
123             pSrcA2Vec += 8;
124             acc2 = vfmaq(acc2, vecIn, vecA);
125             vecA = vld1q(pSrcA3Vec);
126             pSrcA3Vec += 8;
127             acc3 = vfmaq(acc3, vecIn, vecA);
128 
129             blkCnt--;
130         }
131         /*
132          * tail
133          * (will be merged thru tail predication)
134          */
135         blkCnt = numCols & 7;
136         if (blkCnt > 0U) {
137             mve_pred16_t    p0 = vctp16q(blkCnt);
138             f16x8_t         vecA;
139 
140             vecIn = vldrhq_z_f16(pInVec, p0);
141             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
142             acc0 = vfmaq(acc0, vecIn, vecA);
143             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
144             acc1 = vfmaq(acc1, vecIn, vecA);
145             vecA = vldrhq_z_f16(pSrcA2Vec, p0);
146             acc2 = vfmaq(acc2, vecIn, vecA);
147             vecA = vldrhq_z_f16(pSrcA3Vec, p0);
148             acc3 = vfmaq(acc3, vecIn, vecA);
149         }
150         /*
151          * Sum the partial parts
152          */
153         f16x8_t         vtmp = vuninitializedq_f16();
154         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
155         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
156         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
157         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
158 
159         vSum =
160             vfmaq_m_f16(vSum, vld1q(pDualCoef),
161                       vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),vctp16q(4));
162 
163         pDualCoef += 4;
164 
165         pSrcA += numCols * 4;
166         /*
167          * Decrement the row loop counter
168          */
169         row -= 4;
170     }
171 
172     /*
173      * compute 2 rows in parrallel
174      */
175     if (row >= 2) {
176         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
177         f16x8_t         vecIn, acc0, acc1;
178         float16_t const *pSrcVecPtr = in;
179 
180         /*
181          * Initialize the pointers to 2 consecutive MatrixA rows
182          */
183         pInA0 = pSrcA;
184         pInA1 = pInA0 + numCols;
185         /*
186          * Initialize the vector pointer
187          */
188         pInVec = pSrcVecPtr;
189         /*
190          * reset accumulators
191          */
192         acc0 = vdupq_n_f16(0.0f);
193         acc1 = vdupq_n_f16(0.0f);
194         pSrcA0Vec = pInA0;
195         pSrcA1Vec = pInA1;
196 
197         blkCnt = numCols >> 3;
198         while (blkCnt > 0U) {
199             f16x8_t         vecA;
200 
201             vecIn = vld1q(pInVec);
202             pInVec += 8;
203             vecA = vld1q(pSrcA0Vec);
204             pSrcA0Vec += 8;
205             acc0 = vfmaq(acc0, vecIn, vecA);
206             vecA = vld1q(pSrcA1Vec);
207             pSrcA1Vec += 8;
208             acc1 = vfmaq(acc1, vecIn, vecA);
209 
210             blkCnt--;
211         }
212         /*
213          * tail
214          * (will be merged thru tail predication)
215          */
216         blkCnt = numCols & 7;
217         if (blkCnt > 0U) {
218             mve_pred16_t    p0 = vctp16q(blkCnt);
219             f16x8_t         vecA;
220 
221             vecIn = vldrhq_z_f16(pInVec, p0);
222             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
223             acc0 = vfmaq(acc0, vecIn, vecA);
224             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
225             acc1 = vfmaq(acc1, vecIn, vecA);
226         }
227         /*
228          * Sum the partial parts
229          */
230         f16x8_t         vtmp = vuninitializedq_f16();
231         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
232         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
233 
234         vSum =
235             vfmaq_m_f16(vSum, vld1q(pDualCoef),
236                         vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
237                         vctp16q(2));
238 
239         pSrcA += numCols * 2;
240         row -= 2;
241     }
242 
243     if (row >= 1) {
244         f16x8_t         vecIn, acc0;
245         float16_t const *pSrcA0Vec, *pInVec;
246         float16_t const *pSrcVecPtr = in;
247         /*
248          * Initialize the pointers to last MatrixA row
249          */
250         pInA0 = pSrcA;
251         /*
252          * Initialize the vector pointer
253          */
254         pInVec = pSrcVecPtr;
255         /*
256          * reset accumulators
257          */
258         acc0 = vdupq_n_f16(0.0f);
259 
260         pSrcA0Vec = pInA0;
261 
262         blkCnt = numCols >> 3;
263         while (blkCnt > 0U) {
264             f16x8_t         vecA;
265 
266             vecIn = vld1q(pInVec);
267             pInVec += 8;
268             vecA = vld1q(pSrcA0Vec);
269             pSrcA0Vec += 8;
270             acc0 = vfmaq(acc0, vecIn, vecA);
271 
272             blkCnt--;
273         }
274         /*
275          * tail
276          * (will be merged thru tail predication)
277          */
278         blkCnt = numCols & 7;
279         if (blkCnt > 0U) {
280             mve_pred16_t    p0 = vctp16q(blkCnt);
281             f16x8_t         vecA;
282 
283             vecIn = vldrhq_z_f16(pInVec, p0);
284             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
285             acc0 = vfmaq(acc0, vecIn, vecA);
286         }
287         /*
288          * Sum the partial parts
289          */
290         f16x8_t         vtmp = vuninitializedq_f16();
291         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
292 
293         vSum =
294             vfmaq_m_f16(vSum, vld1q(pDualCoef),
295                         vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
296                         vctp16q(1));
297     }
298     sum += (_Float16)vecAddAcrossF16Mve(vSum);
299 
300     *pResult = S->classes[STEP(sum)];
301 }
302 
303 #else
arm_svm_sigmoid_predict_f16(const arm_svm_sigmoid_instance_f16 * S,const float16_t * in,int32_t * pResult)304 void arm_svm_sigmoid_predict_f16(
305     const arm_svm_sigmoid_instance_f16 *S,
306     const float16_t * in,
307     int32_t * pResult)
308 {
309     _Float16 sum=S->intercept;
310     _Float16 dot=0.0f16;
311     uint32_t i,j;
312     const float16_t *pSupport = S->supportVectors;
313 
314     for(i=0; i < S->nbOfSupportVectors; i++)
315     {
316         dot=0.0f16;
317         for(j=0; j < S->vectorDimension; j++)
318         {
319             dot = (_Float16)dot + (_Float16)in[j] * (_Float16)*pSupport++;
320         }
321         sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((float32_t)((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0));
322     }
323     *pResult=S->classes[STEP(sum)];
324 }
325 
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
327 
328 /**
329  * @} end of sigmoidsvm group
330  */
331 
332 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
333 
334