1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_polynomial_predict_f32.c
4 * Description: SVM Polynomial Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32
33 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
34 #include "arm_vec_math.h"
35 #endif
36
37 /**
38 * @addtogroup polysvm
39 * @{
40 */
41
42
43 /**
44 * @brief SVM polynomial prediction
45 * @param[in] S Pointer to an instance of the polynomial SVM structure.
46 * @param[in] in Pointer to input vector
47 * @param[out] pResult Decision value
48 *
49 */
50
51 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
52
53 #include "arm_helium_utils.h"
54 #include "arm_vec_math.h"
55
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)56 void arm_svm_polynomial_predict_f32(
57 const arm_svm_polynomial_instance_f32 *S,
58 const float32_t * in,
59 int32_t * pResult)
60 {
61 /* inlined Matrix x Vector function interleaved with dot prod */
62 uint32_t numRows = S->nbOfSupportVectors;
63 uint32_t numCols = S->vectorDimension;
64 const float32_t *pSupport = S->supportVectors;
65 const float32_t *pSrcA = pSupport;
66 const float32_t *pInA0;
67 const float32_t *pInA1;
68 uint32_t row;
69 uint32_t blkCnt; /* loop counters */
70 const float32_t *pDualCoef = S->dualCoefficients;
71 float32_t sum = S->intercept;
72 f32x4_t vSum = vdupq_n_f32(0.0f);
73
74 row = numRows;
75
76 /*
77 * compute 4 rows in parrallel
78 */
79 while (row >= 4) {
80 const float32_t *pInA2, *pInA3;
81 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
82 f32x4_t vecIn, acc0, acc1, acc2, acc3;
83 float32_t const *pSrcVecPtr = in;
84
85 /*
86 * Initialize the pointers to 4 consecutive MatrixA rows
87 */
88 pInA0 = pSrcA;
89 pInA1 = pInA0 + numCols;
90 pInA2 = pInA1 + numCols;
91 pInA3 = pInA2 + numCols;
92 /*
93 * Initialize the vector pointer
94 */
95 pInVec = pSrcVecPtr;
96 /*
97 * reset accumulators
98 */
99 acc0 = vdupq_n_f32(0.0f);
100 acc1 = vdupq_n_f32(0.0f);
101 acc2 = vdupq_n_f32(0.0f);
102 acc3 = vdupq_n_f32(0.0f);
103
104 pSrcA0Vec = pInA0;
105 pSrcA1Vec = pInA1;
106 pSrcA2Vec = pInA2;
107 pSrcA3Vec = pInA3;
108
109 blkCnt = numCols >> 2;
110 while (blkCnt > 0U) {
111 f32x4_t vecA;
112
113 vecIn = vld1q(pInVec);
114 pInVec += 4;
115 vecA = vld1q(pSrcA0Vec);
116 pSrcA0Vec += 4;
117 acc0 = vfmaq(acc0, vecIn, vecA);
118 vecA = vld1q(pSrcA1Vec);
119 pSrcA1Vec += 4;
120 acc1 = vfmaq(acc1, vecIn, vecA);
121 vecA = vld1q(pSrcA2Vec);
122 pSrcA2Vec += 4;
123 acc2 = vfmaq(acc2, vecIn, vecA);
124 vecA = vld1q(pSrcA3Vec);
125 pSrcA3Vec += 4;
126 acc3 = vfmaq(acc3, vecIn, vecA);
127
128 blkCnt--;
129 }
130 /*
131 * tail
132 * (will be merged thru tail predication)
133 */
134 blkCnt = numCols & 3;
135 if (blkCnt > 0U) {
136 mve_pred16_t p0 = vctp32q(blkCnt);
137 f32x4_t vecA;
138
139 vecIn = vldrwq_z_f32(pInVec, p0);
140 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
141 acc0 = vfmaq(acc0, vecIn, vecA);
142 vecA = vldrwq_z_f32(pSrcA1Vec, p0);
143 acc1 = vfmaq(acc1, vecIn, vecA);
144 vecA = vldrwq_z_f32(pSrcA2Vec, p0);
145 acc2 = vfmaq(acc2, vecIn, vecA);
146 vecA = vldrwq_z_f32(pSrcA3Vec, p0);
147 acc3 = vfmaq(acc3, vecIn, vecA);
148 }
149 /*
150 * Sum the partial parts
151 */
152 f32x4_t vtmp = vuninitializedq_f32();
153 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
154 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
155 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
156 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
157
158 vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
159 arm_vec_exponent_f32
160 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
161
162 pDualCoef += 4;
163
164 pSrcA += numCols * 4;
165 /*
166 * Decrement the row loop counter
167 */
168 row -= 4;
169 }
170
171 /*
172 * compute 2 rows in parrallel
173 */
174 if (row >= 2) {
175 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
176 f32x4_t vecIn, acc0, acc1;
177 float32_t const *pSrcVecPtr = in;
178
179 /*
180 * Initialize the pointers to 2 consecutive MatrixA rows
181 */
182 pInA0 = pSrcA;
183 pInA1 = pInA0 + numCols;
184 /*
185 * Initialize the vector pointer
186 */
187 pInVec = pSrcVecPtr;
188 /*
189 * reset accumulators
190 */
191 acc0 = vdupq_n_f32(0.0f);
192 acc1 = vdupq_n_f32(0.0f);
193 pSrcA0Vec = pInA0;
194 pSrcA1Vec = pInA1;
195
196 blkCnt = numCols >> 2;
197 while (blkCnt > 0U) {
198 f32x4_t vecA;
199
200 vecIn = vld1q(pInVec);
201 pInVec += 4;
202 vecA = vld1q(pSrcA0Vec);
203 pSrcA0Vec += 4;
204 acc0 = vfmaq(acc0, vecIn, vecA);
205 vecA = vld1q(pSrcA1Vec);
206 pSrcA1Vec += 4;
207 acc1 = vfmaq(acc1, vecIn, vecA);
208
209 blkCnt--;
210 }
211 /*
212 * tail
213 * (will be merged thru tail predication)
214 */
215 blkCnt = numCols & 3;
216 if (blkCnt > 0U) {
217 mve_pred16_t p0 = vctp32q(blkCnt);
218 f32x4_t vecA;
219
220 vecIn = vldrwq_z_f32(pInVec, p0);
221 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
222 acc0 = vfmaq(acc0, vecIn, vecA);
223 vecA = vldrwq_z_f32(pSrcA1Vec, p0);
224 acc1 = vfmaq(acc1, vecIn, vecA);
225 }
226 /*
227 * Sum the partial parts
228 */
229 f32x4_t vtmp = vuninitializedq_f32();
230 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
231 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
232
233 vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
234 arm_vec_exponent_f32
235 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
236 vctp32q(2));
237
238 pDualCoef += 2;
239 pSrcA += numCols * 2;
240 row -= 2;
241 }
242
243 if (row >= 1) {
244 f32x4_t vecIn, acc0;
245 float32_t const *pSrcA0Vec, *pInVec;
246 float32_t const *pSrcVecPtr = in;
247 /*
248 * Initialize the pointers to last MatrixA row
249 */
250 pInA0 = pSrcA;
251 /*
252 * Initialize the vector pointer
253 */
254 pInVec = pSrcVecPtr;
255 /*
256 * reset accumulators
257 */
258 acc0 = vdupq_n_f32(0.0f);
259
260 pSrcA0Vec = pInA0;
261
262 blkCnt = numCols >> 2;
263 while (blkCnt > 0U) {
264 f32x4_t vecA;
265
266 vecIn = vld1q(pInVec);
267 pInVec += 4;
268 vecA = vld1q(pSrcA0Vec);
269 pSrcA0Vec += 4;
270 acc0 = vfmaq(acc0, vecIn, vecA);
271
272 blkCnt--;
273 }
274 /*
275 * tail
276 * (will be merged thru tail predication)
277 */
278 blkCnt = numCols & 3;
279 if (blkCnt > 0U) {
280 mve_pred16_t p0 = vctp32q(blkCnt);
281 f32x4_t vecA;
282
283 vecIn = vldrwq_z_f32(pInVec, p0);
284 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
285 acc0 = vfmaq(acc0, vecIn, vecA);
286 }
287 /*
288 * Sum the partial parts
289 */
290 f32x4_t vtmp = vuninitializedq_f32();
291 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
292 vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
293 arm_vec_exponent_f32
294 (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
295 vctp32q(1));
296 }
297 sum += vecAddAcrossF32Mve(vSum);
298
299
300 *pResult = S->classes[STEP(sum)];
301 }
302
303 #else
304 #if defined(ARM_MATH_NEON)
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)305 void arm_svm_polynomial_predict_f32(
306 const arm_svm_polynomial_instance_f32 *S,
307 const float32_t * in,
308 int32_t * pResult)
309 {
310 float32_t sum = S->intercept;
311
312 float32_t dot;
313 float32x4_t dotV;
314
315 float32x4_t accuma,accumb,accumc,accumd,accum;
316 float32x2_t accum2;
317 float32x4_t vec1;
318 float32x4_t coef0 = vdupq_n_f32(S->coef0);
319
320 float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
321
322 uint32_t blkCnt;
323 uint32_t vectorBlkCnt;
324
325 const float32_t *pIn = in;
326
327 const float32_t *pSupport = S->supportVectors;
328
329 const float32_t *pSupporta = S->supportVectors;
330 const float32_t *pSupportb;
331 const float32_t *pSupportc;
332 const float32_t *pSupportd;
333
334 pSupportb = pSupporta + S->vectorDimension;
335 pSupportc = pSupportb + S->vectorDimension;
336 pSupportd = pSupportc + S->vectorDimension;
337
338 const float32_t *pDualCoefs = S->dualCoefficients;
339
340 vectorBlkCnt = S->nbOfSupportVectors >> 2;
341 while (vectorBlkCnt > 0U)
342 {
343 accuma = vdupq_n_f32(0);
344 accumb = vdupq_n_f32(0);
345 accumc = vdupq_n_f32(0);
346 accumd = vdupq_n_f32(0);
347
348 pIn = in;
349
350 blkCnt = S->vectorDimension >> 2;
351 while (blkCnt > 0U)
352 {
353
354 vec1 = vld1q_f32(pIn);
355 vec2a = vld1q_f32(pSupporta);
356 vec2b = vld1q_f32(pSupportb);
357 vec2c = vld1q_f32(pSupportc);
358 vec2d = vld1q_f32(pSupportd);
359
360 pIn += 4;
361 pSupporta += 4;
362 pSupportb += 4;
363 pSupportc += 4;
364 pSupportd += 4;
365
366 accuma = vmlaq_f32(accuma, vec1,vec2a);
367 accumb = vmlaq_f32(accumb, vec1,vec2b);
368 accumc = vmlaq_f32(accumc, vec1,vec2c);
369 accumd = vmlaq_f32(accumd, vec1,vec2d);
370
371 blkCnt -- ;
372 }
373 accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
374 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
375
376 accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
377 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
378
379 accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
380 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
381
382 accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
383 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
384
385
386 blkCnt = S->vectorDimension & 3;
387 while (blkCnt > 0U)
388 {
389 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
390 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
391 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
392 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
393
394 pIn++;
395
396 blkCnt -- ;
397 }
398
399 vec1 = vld1q_f32(pDualCoefs);
400 pDualCoefs += 4;
401
402 // To vectorize later
403 dotV = vmulq_n_f32(dotV, S->gamma);
404 dotV = vaddq_f32(dotV, coef0);
405
406 dotV = arm_vec_exponent_f32(dotV,S->degree);
407
408 accum = vmulq_f32(vec1,dotV);
409 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
410 sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
411
412 pSupporta += 3*S->vectorDimension;
413 pSupportb += 3*S->vectorDimension;
414 pSupportc += 3*S->vectorDimension;
415 pSupportd += 3*S->vectorDimension;
416
417 vectorBlkCnt -- ;
418 }
419
420 pSupport = pSupporta;
421 vectorBlkCnt = S->nbOfSupportVectors & 3;
422
423 while (vectorBlkCnt > 0U)
424 {
425 accum = vdupq_n_f32(0);
426 dot = 0.0f;
427 pIn = in;
428
429 blkCnt = S->vectorDimension >> 2;
430 while (blkCnt > 0U)
431 {
432
433 vec1 = vld1q_f32(pIn);
434 vec2 = vld1q_f32(pSupport);
435 pIn += 4;
436 pSupport += 4;
437
438 accum = vmlaq_f32(accum, vec1,vec2);
439
440 blkCnt -- ;
441 }
442 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
443 dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
444
445
446 blkCnt = S->vectorDimension & 3;
447 while (blkCnt > 0U)
448 {
449 dot = dot + *pIn++ * *pSupport++;
450
451 blkCnt -- ;
452 }
453
454 sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
455 vectorBlkCnt -- ;
456 }
457
458 *pResult=S->classes[STEP(sum)];
459 }
460 #else
arm_svm_polynomial_predict_f32(const arm_svm_polynomial_instance_f32 * S,const float32_t * in,int32_t * pResult)461 void arm_svm_polynomial_predict_f32(
462 const arm_svm_polynomial_instance_f32 *S,
463 const float32_t * in,
464 int32_t * pResult)
465 {
466 float32_t sum=S->intercept;
467 float32_t dot=0;
468 uint32_t i,j;
469 const float32_t *pSupport = S->supportVectors;
470
471 for(i=0; i < S->nbOfSupportVectors; i++)
472 {
473 dot=0;
474 for(j=0; j < S->vectorDimension; j++)
475 {
476 dot = dot + in[j]* *pSupport++;
477 }
478 sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
479 }
480
481 *pResult=S->classes[STEP(sum)];
482 }
483 #endif
484 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
485
486
487 /**
488 * @} end of polysvm group
489 */
490