1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_sigmoid_predict_f32.c
4  * Description:  SVM Sigmoid Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 /**
34  * @addtogroup sigmoidsvm
35  * @{
36  */
37 
38 
39 
40 /**
41  * @brief SVM sigmoid prediction
42  * @param[in]    S        Pointer to an instance of the rbf SVM structure.
43  * @param[in]    in       Pointer to input vector
44  * @param[out]   pResult  Decision value
45  *
46  */
47 
48 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
49 
50 #include "arm_helium_utils.h"
51 #include "arm_vec_math.h"
52 
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)53 ARM_DSP_ATTRIBUTE void arm_svm_sigmoid_predict_f32(
54     const arm_svm_sigmoid_instance_f32 *S,
55     const float32_t * in,
56     int32_t * pResult)
57 {
58         /* inlined Matrix x Vector function interleaved with dot prod */
59     uint32_t        numRows = S->nbOfSupportVectors;
60     uint32_t        numCols = S->vectorDimension;
61     const float32_t *pSupport = S->supportVectors;
62     const float32_t *pSrcA = pSupport;
63     const float32_t *pInA0;
64     const float32_t *pInA1;
65     uint32_t         row;
66     uint32_t         blkCnt;     /* loop counters */
67     const float32_t *pDualCoef = S->dualCoefficients;
68     float32_t       sum = S->intercept;
69     f32x4_t         vSum = vdupq_n_f32(0.0f);
70 
71     row = numRows;
72 
73     /*
74      * compute 4 rows in parrallel
75      */
76     while (row >= 4) {
77         const float32_t *pInA2, *pInA3;
78         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
79         f32x4_t         vecIn, acc0, acc1, acc2, acc3;
80         float32_t const *pSrcVecPtr = in;
81 
82         /*
83          * Initialize the pointers to 4 consecutive MatrixA rows
84          */
85         pInA0 = pSrcA;
86         pInA1 = pInA0 + numCols;
87         pInA2 = pInA1 + numCols;
88         pInA3 = pInA2 + numCols;
89         /*
90          * Initialize the vector pointer
91          */
92         pInVec = pSrcVecPtr;
93         /*
94          * reset accumulators
95          */
96         acc0 = vdupq_n_f32(0.0f);
97         acc1 = vdupq_n_f32(0.0f);
98         acc2 = vdupq_n_f32(0.0f);
99         acc3 = vdupq_n_f32(0.0f);
100 
101         pSrcA0Vec = pInA0;
102         pSrcA1Vec = pInA1;
103         pSrcA2Vec = pInA2;
104         pSrcA3Vec = pInA3;
105 
106         blkCnt = numCols >> 2;
107         while (blkCnt > 0U) {
108             f32x4_t         vecA;
109 
110             vecIn = vld1q(pInVec);
111             pInVec += 4;
112             vecA = vld1q(pSrcA0Vec);
113             pSrcA0Vec += 4;
114             acc0 = vfmaq(acc0, vecIn, vecA);
115             vecA = vld1q(pSrcA1Vec);
116             pSrcA1Vec += 4;
117             acc1 = vfmaq(acc1, vecIn, vecA);
118             vecA = vld1q(pSrcA2Vec);
119             pSrcA2Vec += 4;
120             acc2 = vfmaq(acc2, vecIn, vecA);
121             vecA = vld1q(pSrcA3Vec);
122             pSrcA3Vec += 4;
123             acc3 = vfmaq(acc3, vecIn, vecA);
124 
125             blkCnt--;
126         }
127         /*
128          * tail
129          * (will be merged thru tail predication)
130          */
131         blkCnt = numCols & 3;
132         if (blkCnt > 0U) {
133             mve_pred16_t    p0 = vctp32q(blkCnt);
134             f32x4_t         vecA;
135 
136             vecIn = vldrwq_z_f32(pInVec, p0);
137             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
138             acc0 = vfmaq(acc0, vecIn, vecA);
139             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
140             acc1 = vfmaq(acc1, vecIn, vecA);
141             vecA = vldrwq_z_f32(pSrcA2Vec, p0);
142             acc2 = vfmaq(acc2, vecIn, vecA);
143             vecA = vldrwq_z_f32(pSrcA3Vec, p0);
144             acc3 = vfmaq(acc3, vecIn, vecA);
145         }
146         /*
147          * Sum the partial parts
148          */
149         f32x4_t         vtmp = vuninitializedq_f32();
150         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
151         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
152         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
153         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
154 
155         vSum =
156             vfmaq_f32(vSum, vld1q(pDualCoef),
157                       vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));
158 
159         pDualCoef += 4;
160 
161         pSrcA += numCols * 4;
162         /*
163          * Decrement the row loop counter
164          */
165         row -= 4;
166     }
167 
168     /*
169      * compute 2 rows in parrallel
170      */
171     if (row >= 2) {
172         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
173         f32x4_t         vecIn, acc0, acc1;
174         float32_t const *pSrcVecPtr = in;
175 
176         /*
177          * Initialize the pointers to 2 consecutive MatrixA rows
178          */
179         pInA0 = pSrcA;
180         pInA1 = pInA0 + numCols;
181         /*
182          * Initialize the vector pointer
183          */
184         pInVec = pSrcVecPtr;
185         /*
186          * reset accumulators
187          */
188         acc0 = vdupq_n_f32(0.0f);
189         acc1 = vdupq_n_f32(0.0f);
190         pSrcA0Vec = pInA0;
191         pSrcA1Vec = pInA1;
192 
193         blkCnt = numCols >> 2;
194         while (blkCnt > 0U) {
195             f32x4_t         vecA;
196 
197             vecIn = vld1q(pInVec);
198             pInVec += 4;
199             vecA = vld1q(pSrcA0Vec);
200             pSrcA0Vec += 4;
201             acc0 = vfmaq(acc0, vecIn, vecA);
202             vecA = vld1q(pSrcA1Vec);
203             pSrcA1Vec += 4;
204             acc1 = vfmaq(acc1, vecIn, vecA);
205 
206             blkCnt--;
207         }
208         /*
209          * tail
210          * (will be merged thru tail predication)
211          */
212         blkCnt = numCols & 3;
213         if (blkCnt > 0U) {
214             mve_pred16_t    p0 = vctp32q(blkCnt);
215             f32x4_t         vecA;
216 
217             vecIn = vldrwq_z_f32(pInVec, p0);
218             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
219             acc0 = vfmaq(acc0, vecIn, vecA);
220             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
221             acc1 = vfmaq(acc1, vecIn, vecA);
222         }
223         /*
224          * Sum the partial parts
225          */
226         f32x4_t         vtmp = vuninitializedq_f32();
227         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
228         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
229 
230         vSum =
231             vfmaq_m_f32(vSum, vld1q(pDualCoef),
232                         vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
233                         vctp32q(2));
234 
235         pSrcA += numCols * 2;
236         row -= 2;
237     }
238 
239     if (row >= 1) {
240         f32x4_t         vecIn, acc0;
241         float32_t const *pSrcA0Vec, *pInVec;
242         float32_t const *pSrcVecPtr = in;
243         /*
244          * Initialize the pointers to last MatrixA row
245          */
246         pInA0 = pSrcA;
247         /*
248          * Initialize the vector pointer
249          */
250         pInVec = pSrcVecPtr;
251         /*
252          * reset accumulators
253          */
254         acc0 = vdupq_n_f32(0.0f);
255 
256         pSrcA0Vec = pInA0;
257 
258         blkCnt = numCols >> 2;
259         while (blkCnt > 0U) {
260             f32x4_t         vecA;
261 
262             vecIn = vld1q(pInVec);
263             pInVec += 4;
264             vecA = vld1q(pSrcA0Vec);
265             pSrcA0Vec += 4;
266             acc0 = vfmaq(acc0, vecIn, vecA);
267 
268             blkCnt--;
269         }
270         /*
271          * tail
272          * (will be merged thru tail predication)
273          */
274         blkCnt = numCols & 3;
275         if (blkCnt > 0U) {
276             mve_pred16_t    p0 = vctp32q(blkCnt);
277             f32x4_t         vecA;
278 
279             vecIn = vldrwq_z_f32(pInVec, p0);
280             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
281             acc0 = vfmaq(acc0, vecIn, vecA);
282         }
283         /*
284          * Sum the partial parts
285          */
286         f32x4_t         vtmp = vuninitializedq_f32();
287         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
288 
289         vSum =
290             vfmaq_m_f32(vSum, vld1q(pDualCoef),
291                         vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
292                         vctp32q(1));
293     }
294     sum += vecAddAcrossF32Mve(vSum);
295 
296     *pResult = S->classes[STEP(sum)];
297 }
298 
299 #else
300 #if defined(ARM_MATH_NEON)
301 #include "NEMath.h"
302 
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)303 ARM_DSP_ATTRIBUTE void arm_svm_sigmoid_predict_f32(
304     const arm_svm_sigmoid_instance_f32 *S,
305     const float32_t * in,
306     int32_t * pResult)
307 {
308     float32_t sum = S->intercept;
309 
310     float32_t dot;
311     float32x4_t dotV;
312 
313     float32x4_t accuma,accumb,accumc,accumd,accum;
314     float32x2_t accum2;
315     float32x4_t vec1;
316     float32x4_t coef0 = vdupq_n_f32(S->coef0);
317 
318     float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
319 
320     uint32_t blkCnt;
321     uint32_t vectorBlkCnt;
322 
323     const float32_t *pIn = in;
324 
325     const float32_t *pSupport = S->supportVectors;
326 
327     const float32_t *pSupporta = S->supportVectors;
328     const float32_t *pSupportb;
329     const float32_t *pSupportc;
330     const float32_t *pSupportd;
331 
332     pSupportb = pSupporta + S->vectorDimension;
333     pSupportc = pSupportb + S->vectorDimension;
334     pSupportd = pSupportc + S->vectorDimension;
335 
336     const float32_t *pDualCoefs = S->dualCoefficients;
337 
338     vectorBlkCnt = S->nbOfSupportVectors >> 2;
339     while (vectorBlkCnt > 0U)
340     {
341         accuma = vdupq_n_f32(0);
342         accumb = vdupq_n_f32(0);
343         accumc = vdupq_n_f32(0);
344         accumd = vdupq_n_f32(0);
345 
346         pIn = in;
347 
348         blkCnt = S->vectorDimension >> 2;
349         while (blkCnt > 0U)
350         {
351 
352             vec1 = vld1q_f32(pIn);
353             vec2a = vld1q_f32(pSupporta);
354             vec2b = vld1q_f32(pSupportb);
355             vec2c = vld1q_f32(pSupportc);
356             vec2d = vld1q_f32(pSupportd);
357 
358             pIn += 4;
359             pSupporta += 4;
360             pSupportb += 4;
361             pSupportc += 4;
362             pSupportd += 4;
363 
364             accuma = vmlaq_f32(accuma, vec1,vec2a);
365             accumb = vmlaq_f32(accumb, vec1,vec2b);
366             accumc = vmlaq_f32(accumc, vec1,vec2c);
367             accumd = vmlaq_f32(accumd, vec1,vec2d);
368 
369             blkCnt -- ;
370         }
371         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
372         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
373 
374         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
375         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
376 
377         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
378         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
379 
380         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
381         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
382 
383 
384         blkCnt = S->vectorDimension & 3;
385         while (blkCnt > 0U)
386         {
387             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
388             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
389             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
390             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
391 
392             pIn++;
393 
394             blkCnt -- ;
395         }
396 
397         vec1 = vld1q_f32(pDualCoefs);
398         pDualCoefs += 4;
399 
400         // To vectorize later
401         dotV = vmulq_n_f32(dotV, S->gamma);
402         dotV = vaddq_f32(dotV, coef0);
403 
404         dotV = vtanhq_f32(dotV);
405 
406         accum = vmulq_f32(vec1,dotV);
407         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
408         sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
409 
410         pSupporta += 3*S->vectorDimension;
411         pSupportb += 3*S->vectorDimension;
412         pSupportc += 3*S->vectorDimension;
413         pSupportd += 3*S->vectorDimension;
414 
415         vectorBlkCnt -- ;
416     }
417 
418     pSupport = pSupporta;
419     vectorBlkCnt = S->nbOfSupportVectors & 3;
420 
421     while (vectorBlkCnt > 0U)
422     {
423         accum = vdupq_n_f32(0);
424         dot = 0.0f;
425         pIn = in;
426 
427         blkCnt = S->vectorDimension >> 2;
428         while (blkCnt > 0U)
429         {
430 
431             vec1 = vld1q_f32(pIn);
432             vec2 = vld1q_f32(pSupport);
433             pIn += 4;
434             pSupport += 4;
435 
436             accum = vmlaq_f32(accum, vec1,vec2);
437 
438             blkCnt -- ;
439         }
440         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
441         dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
442 
443 
444         blkCnt = S->vectorDimension & 3;
445         while (blkCnt > 0U)
446         {
447             dot = dot + *pIn++ * *pSupport++;
448 
449             blkCnt -- ;
450         }
451 
452         sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
453         vectorBlkCnt -- ;
454     }
455 
456     *pResult=S->classes[STEP(sum)];
457 }
458 #else
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)459 ARM_DSP_ATTRIBUTE void arm_svm_sigmoid_predict_f32(
460     const arm_svm_sigmoid_instance_f32 *S,
461     const float32_t * in,
462     int32_t * pResult)
463 {
464     float32_t sum=S->intercept;
465     float32_t dot=0;
466     uint32_t i,j;
467     const float32_t *pSupport = S->supportVectors;
468 
469     for(i=0; i < S->nbOfSupportVectors; i++)
470     {
471         dot=0;
472         for(j=0; j < S->vectorDimension; j++)
473         {
474             dot = dot + in[j]* *pSupport++;
475         }
476         sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
477     }
478     *pResult=S->classes[STEP(sum)];
479 }
480 
481 #endif
482 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
483 
484 /**
485  * @} end of sigmoidsvm group
486  */
487