1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_polynomial_predict_f32.c
4  * Description:  SVM Polynomial Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
34 #include "arm_vec_math.h"
35 #endif
36 
37 /**
38  * @addtogroup polysvm
39  * @{
40  */
41 
42 
43 /**
44  * @brief SVM polynomial prediction
45  * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
46  * @param[in]    in         Pointer to input vector
47  * @param[out]   pResult    Decision value
48  *
49  */
50 
51 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
53 #include "arm_helium_utils.h"
54 #include "arm_vec_math.h"
55 
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)56 void arm_svm_polynomial_predict_f32(
57     const arm_svm_polynomial_instance_f32 *S,
58     const float32_t * in,
59     int32_t * pResult)
60 {
61         /* inlined Matrix x Vector function interleaved with dot prod */
62     uint32_t        numRows = S->nbOfSupportVectors;
63     uint32_t        numCols = S->vectorDimension;
64     const float32_t *pSupport = S->supportVectors;
65     const float32_t *pSrcA = pSupport;
66     const float32_t *pInA0;
67     const float32_t *pInA1;
68     uint32_t         row;
69     uint32_t         blkCnt;     /* loop counters */
70     const float32_t *pDualCoef = S->dualCoefficients;
71     float32_t       sum = S->intercept;
72     f32x4_t         vSum = vdupq_n_f32(0.0f);
73 
74     row = numRows;
75 
76     /*
77      * compute 4 rows in parrallel
78      */
79     while (row >= 4) {
80         const float32_t *pInA2, *pInA3;
81         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
82         f32x4_t         vecIn, acc0, acc1, acc2, acc3;
83         float32_t const *pSrcVecPtr = in;
84 
85         /*
86          * Initialize the pointers to 4 consecutive MatrixA rows
87          */
88         pInA0 = pSrcA;
89         pInA1 = pInA0 + numCols;
90         pInA2 = pInA1 + numCols;
91         pInA3 = pInA2 + numCols;
92         /*
93          * Initialize the vector pointer
94          */
95         pInVec = pSrcVecPtr;
96         /*
97          * reset accumulators
98          */
99         acc0 = vdupq_n_f32(0.0f);
100         acc1 = vdupq_n_f32(0.0f);
101         acc2 = vdupq_n_f32(0.0f);
102         acc3 = vdupq_n_f32(0.0f);
103 
104         pSrcA0Vec = pInA0;
105         pSrcA1Vec = pInA1;
106         pSrcA2Vec = pInA2;
107         pSrcA3Vec = pInA3;
108 
109         blkCnt = numCols >> 2;
110         while (blkCnt > 0U) {
111             f32x4_t         vecA;
112 
113             vecIn = vld1q(pInVec);
114             pInVec += 4;
115             vecA = vld1q(pSrcA0Vec);
116             pSrcA0Vec += 4;
117             acc0 = vfmaq(acc0, vecIn, vecA);
118             vecA = vld1q(pSrcA1Vec);
119             pSrcA1Vec += 4;
120             acc1 = vfmaq(acc1, vecIn, vecA);
121             vecA = vld1q(pSrcA2Vec);
122             pSrcA2Vec += 4;
123             acc2 = vfmaq(acc2, vecIn, vecA);
124             vecA = vld1q(pSrcA3Vec);
125             pSrcA3Vec += 4;
126             acc3 = vfmaq(acc3, vecIn, vecA);
127 
128             blkCnt--;
129         }
130         /*
131          * tail
132          * (will be merged thru tail predication)
133          */
134         blkCnt = numCols & 3;
135         if (blkCnt > 0U) {
136             mve_pred16_t    p0 = vctp32q(blkCnt);
137             f32x4_t         vecA;
138 
139             vecIn = vldrwq_z_f32(pInVec, p0);
140             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
141             acc0 = vfmaq(acc0, vecIn, vecA);
142             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
143             acc1 = vfmaq(acc1, vecIn, vecA);
144             vecA = vldrwq_z_f32(pSrcA2Vec, p0);
145             acc2 = vfmaq(acc2, vecIn, vecA);
146             vecA = vldrwq_z_f32(pSrcA3Vec, p0);
147             acc3 = vfmaq(acc3, vecIn, vecA);
148         }
149         /*
150          * Sum the partial parts
151          */
152         f32x4_t         vtmp = vuninitializedq_f32();
153         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
154         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
155         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
156         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
157 
158         vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
159                              arm_vec_exponent_f32
160                              (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
161 
162         pDualCoef += 4;
163 
164         pSrcA += numCols * 4;
165         /*
166          * Decrement the row loop counter
167          */
168         row -= 4;
169     }
170 
171     /*
172      * compute 2 rows in parrallel
173      */
174     if (row >= 2) {
175         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
176         f32x4_t         vecIn, acc0, acc1;
177         float32_t const *pSrcVecPtr = in;
178 
179         /*
180          * Initialize the pointers to 2 consecutive MatrixA rows
181          */
182         pInA0 = pSrcA;
183         pInA1 = pInA0 + numCols;
184         /*
185          * Initialize the vector pointer
186          */
187         pInVec = pSrcVecPtr;
188         /*
189          * reset accumulators
190          */
191         acc0 = vdupq_n_f32(0.0f);
192         acc1 = vdupq_n_f32(0.0f);
193         pSrcA0Vec = pInA0;
194         pSrcA1Vec = pInA1;
195 
196         blkCnt = numCols >> 2;
197         while (blkCnt > 0U) {
198             f32x4_t         vecA;
199 
200             vecIn = vld1q(pInVec);
201             pInVec += 4;
202             vecA = vld1q(pSrcA0Vec);
203             pSrcA0Vec += 4;
204             acc0 = vfmaq(acc0, vecIn, vecA);
205             vecA = vld1q(pSrcA1Vec);
206             pSrcA1Vec += 4;
207             acc1 = vfmaq(acc1, vecIn, vecA);
208 
209             blkCnt--;
210         }
211         /*
212          * tail
213          * (will be merged thru tail predication)
214          */
215         blkCnt = numCols & 3;
216         if (blkCnt > 0U) {
217             mve_pred16_t    p0 = vctp32q(blkCnt);
218             f32x4_t         vecA;
219 
220             vecIn = vldrwq_z_f32(pInVec, p0);
221             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
222             acc0 = vfmaq(acc0, vecIn, vecA);
223             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
224             acc1 = vfmaq(acc1, vecIn, vecA);
225         }
226         /*
227          * Sum the partial parts
228          */
229         f32x4_t         vtmp = vuninitializedq_f32();
230         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
231         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
232 
233         vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
234                              arm_vec_exponent_f32
235                              (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
236                              vctp32q(2));
237 
238         pDualCoef += 2;
239         pSrcA += numCols * 2;
240         row -= 2;
241     }
242 
243     if (row >= 1) {
244         f32x4_t         vecIn, acc0;
245         float32_t const *pSrcA0Vec, *pInVec;
246         float32_t const *pSrcVecPtr = in;
247         /*
248          * Initialize the pointers to last MatrixA row
249          */
250         pInA0 = pSrcA;
251         /*
252          * Initialize the vector pointer
253          */
254         pInVec = pSrcVecPtr;
255         /*
256          * reset accumulators
257          */
258         acc0 = vdupq_n_f32(0.0f);
259 
260         pSrcA0Vec = pInA0;
261 
262         blkCnt = numCols >> 2;
263         while (blkCnt > 0U) {
264             f32x4_t         vecA;
265 
266             vecIn = vld1q(pInVec);
267             pInVec += 4;
268             vecA = vld1q(pSrcA0Vec);
269             pSrcA0Vec += 4;
270             acc0 = vfmaq(acc0, vecIn, vecA);
271 
272             blkCnt--;
273         }
274         /*
275          * tail
276          * (will be merged thru tail predication)
277          */
278         blkCnt = numCols & 3;
279         if (blkCnt > 0U) {
280             mve_pred16_t    p0 = vctp32q(blkCnt);
281             f32x4_t         vecA;
282 
283             vecIn = vldrwq_z_f32(pInVec, p0);
284             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
285             acc0 = vfmaq(acc0, vecIn, vecA);
286         }
287         /*
288          * Sum the partial parts
289          */
290         f32x4_t         vtmp = vuninitializedq_f32();
291         vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
292         vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
293                              arm_vec_exponent_f32
294                              (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
295                              vctp32q(1));
296     }
297     sum += vecAddAcrossF32Mve(vSum);
298 
299 
300     *pResult = S->classes[STEP(sum)];
301 }
302 
303 #else
304 #if defined(ARM_MATH_NEON)
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)305 void arm_svm_polynomial_predict_f32(
306     const arm_svm_polynomial_instance_f32 *S,
307     const float32_t * in,
308     int32_t * pResult)
309 {
310     float32_t sum = S->intercept;
311 
312     float32_t dot;
313     float32x4_t dotV;
314 
315     float32x4_t accuma,accumb,accumc,accumd,accum;
316     float32x2_t accum2;
317     float32x4_t vec1;
318     float32x4_t coef0 = vdupq_n_f32(S->coef0);
319 
320     float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
321 
322     uint32_t blkCnt;
323     uint32_t vectorBlkCnt;
324 
325     const float32_t *pIn = in;
326 
327     const float32_t *pSupport = S->supportVectors;
328 
329     const float32_t *pSupporta = S->supportVectors;
330     const float32_t *pSupportb;
331     const float32_t *pSupportc;
332     const float32_t *pSupportd;
333 
334     pSupportb = pSupporta + S->vectorDimension;
335     pSupportc = pSupportb + S->vectorDimension;
336     pSupportd = pSupportc + S->vectorDimension;
337 
338     const float32_t *pDualCoefs = S->dualCoefficients;
339 
340     vectorBlkCnt = S->nbOfSupportVectors >> 2;
341     while (vectorBlkCnt > 0U)
342     {
343         accuma = vdupq_n_f32(0);
344         accumb = vdupq_n_f32(0);
345         accumc = vdupq_n_f32(0);
346         accumd = vdupq_n_f32(0);
347 
348         pIn = in;
349 
350         blkCnt = S->vectorDimension >> 2;
351         while (blkCnt > 0U)
352         {
353 
354             vec1 = vld1q_f32(pIn);
355             vec2a = vld1q_f32(pSupporta);
356             vec2b = vld1q_f32(pSupportb);
357             vec2c = vld1q_f32(pSupportc);
358             vec2d = vld1q_f32(pSupportd);
359 
360             pIn += 4;
361             pSupporta += 4;
362             pSupportb += 4;
363             pSupportc += 4;
364             pSupportd += 4;
365 
366             accuma = vmlaq_f32(accuma, vec1,vec2a);
367             accumb = vmlaq_f32(accumb, vec1,vec2b);
368             accumc = vmlaq_f32(accumc, vec1,vec2c);
369             accumd = vmlaq_f32(accumd, vec1,vec2d);
370 
371             blkCnt -- ;
372         }
373         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
374         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
375 
376         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
377         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
378 
379         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
380         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
381 
382         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
383         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
384 
385 
386         blkCnt = S->vectorDimension & 3;
387         while (blkCnt > 0U)
388         {
389             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
390             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
391             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
392             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
393 
394             pIn++;
395 
396             blkCnt -- ;
397         }
398 
399         vec1 = vld1q_f32(pDualCoefs);
400         pDualCoefs += 4;
401 
402         // To vectorize later
403         dotV = vmulq_n_f32(dotV, S->gamma);
404         dotV = vaddq_f32(dotV, coef0);
405 
406         dotV = arm_vec_exponent_f32(dotV,S->degree);
407 
408         accum = vmulq_f32(vec1,dotV);
409         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
410         sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
411 
412         pSupporta += 3*S->vectorDimension;
413         pSupportb += 3*S->vectorDimension;
414         pSupportc += 3*S->vectorDimension;
415         pSupportd += 3*S->vectorDimension;
416 
417         vectorBlkCnt -- ;
418     }
419 
420     pSupport = pSupporta;
421     vectorBlkCnt = S->nbOfSupportVectors & 3;
422 
423     while (vectorBlkCnt > 0U)
424     {
425         accum = vdupq_n_f32(0);
426         dot = 0.0f;
427         pIn = in;
428 
429         blkCnt = S->vectorDimension >> 2;
430         while (blkCnt > 0U)
431         {
432 
433             vec1 = vld1q_f32(pIn);
434             vec2 = vld1q_f32(pSupport);
435             pIn += 4;
436             pSupport += 4;
437 
438             accum = vmlaq_f32(accum, vec1,vec2);
439 
440             blkCnt -- ;
441         }
442         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
443         dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
444 
445 
446         blkCnt = S->vectorDimension & 3;
447         while (blkCnt > 0U)
448         {
449             dot = dot + *pIn++ * *pSupport++;
450 
451             blkCnt -- ;
452         }
453 
454         sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
455         vectorBlkCnt -- ;
456     }
457 
458     *pResult=S->classes[STEP(sum)];
459 }
460 #else
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)461 void arm_svm_polynomial_predict_f32(
462     const arm_svm_polynomial_instance_f32 *S,
463     const float32_t * in,
464     int32_t * pResult)
465 {
466     float32_t sum=S->intercept;
467     float32_t dot=0;
468     uint32_t i,j;
469     const float32_t *pSupport = S->supportVectors;
470 
471     for(i=0; i < S->nbOfSupportVectors; i++)
472     {
473         dot=0;
474         for(j=0; j < S->vectorDimension; j++)
475         {
476             dot = dot + in[j]* *pSupport++;
477         }
478         sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
479     }
480 
481     *pResult=S->classes[STEP(sum)];
482 }
483 #endif
484 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
485 
486 
487 /**
488  * @} end of polysvm group
489  */
490