1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_sigmoid_predict_f32.c
4 * Description: SVM Sigmoid Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32
33 /**
34 * @addtogroup sigmoidsvm
35 * @{
36 */
37
38
39
40 /**
41 * @brief SVM sigmoid prediction
42 * @param[in] S Pointer to an instance of the rbf SVM structure.
43 * @param[in] in Pointer to input vector
44 * @param[out] pResult Decision value
45 *
46 */
47
48 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
49
50 #include "arm_helium_utils.h"
51 #include "arm_vec_math.h"
52
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)53 ARM_DSP_ATTRIBUTE void arm_svm_sigmoid_predict_f32(
54 const arm_svm_sigmoid_instance_f32 *S,
55 const float32_t * in,
56 int32_t * pResult)
57 {
58 /* inlined Matrix x Vector function interleaved with dot prod */
59 uint32_t numRows = S->nbOfSupportVectors;
60 uint32_t numCols = S->vectorDimension;
61 const float32_t *pSupport = S->supportVectors;
62 const float32_t *pSrcA = pSupport;
63 const float32_t *pInA0;
64 const float32_t *pInA1;
65 uint32_t row;
66 uint32_t blkCnt; /* loop counters */
67 const float32_t *pDualCoef = S->dualCoefficients;
68 float32_t sum = S->intercept;
69 f32x4_t vSum = vdupq_n_f32(0.0f);
70
71 row = numRows;
72
73 /*
74 * compute 4 rows in parrallel
75 */
76 while (row >= 4) {
77 const float32_t *pInA2, *pInA3;
78 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
79 f32x4_t vecIn, acc0, acc1, acc2, acc3;
80 float32_t const *pSrcVecPtr = in;
81
82 /*
83 * Initialize the pointers to 4 consecutive MatrixA rows
84 */
85 pInA0 = pSrcA;
86 pInA1 = pInA0 + numCols;
87 pInA2 = pInA1 + numCols;
88 pInA3 = pInA2 + numCols;
89 /*
90 * Initialize the vector pointer
91 */
92 pInVec = pSrcVecPtr;
93 /*
94 * reset accumulators
95 */
96 acc0 = vdupq_n_f32(0.0f);
97 acc1 = vdupq_n_f32(0.0f);
98 acc2 = vdupq_n_f32(0.0f);
99 acc3 = vdupq_n_f32(0.0f);
100
101 pSrcA0Vec = pInA0;
102 pSrcA1Vec = pInA1;
103 pSrcA2Vec = pInA2;
104 pSrcA3Vec = pInA3;
105
106 blkCnt = numCols >> 2;
107 while (blkCnt > 0U) {
108 f32x4_t vecA;
109
110 vecIn = vld1q(pInVec);
111 pInVec += 4;
112 vecA = vld1q(pSrcA0Vec);
113 pSrcA0Vec += 4;
114 acc0 = vfmaq(acc0, vecIn, vecA);
115 vecA = vld1q(pSrcA1Vec);
116 pSrcA1Vec += 4;
117 acc1 = vfmaq(acc1, vecIn, vecA);
118 vecA = vld1q(pSrcA2Vec);
119 pSrcA2Vec += 4;
120 acc2 = vfmaq(acc2, vecIn, vecA);
121 vecA = vld1q(pSrcA3Vec);
122 pSrcA3Vec += 4;
123 acc3 = vfmaq(acc3, vecIn, vecA);
124
125 blkCnt--;
126 }
127 /*
128 * tail
129 * (will be merged thru tail predication)
130 */
131 blkCnt = numCols & 3;
132 if (blkCnt > 0U) {
133 mve_pred16_t p0 = vctp32q(blkCnt);
134 f32x4_t vecA;
135
136 vecIn = vldrwq_z_f32(pInVec, p0);
137 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
138 acc0 = vfmaq(acc0, vecIn, vecA);
139 vecA = vldrwq_z_f32(pSrcA1Vec, p0);
140 acc1 = vfmaq(acc1, vecIn, vecA);
141 vecA = vldrwq_z_f32(pSrcA2Vec, p0);
142 acc2 = vfmaq(acc2, vecIn, vecA);
143 vecA = vldrwq_z_f32(pSrcA3Vec, p0);
144 acc3 = vfmaq(acc3, vecIn, vecA);
145 }
146 /*
147 * Sum the partial parts
148 */
149 f32x4_t vtmp = vuninitializedq_f32();
150 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
151 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
152 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
153 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
154
155 vSum =
156 vfmaq_f32(vSum, vld1q(pDualCoef),
157 vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));
158
159 pDualCoef += 4;
160
161 pSrcA += numCols * 4;
162 /*
163 * Decrement the row loop counter
164 */
165 row -= 4;
166 }
167
168 /*
169 * compute 2 rows in parrallel
170 */
171 if (row >= 2) {
172 float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
173 f32x4_t vecIn, acc0, acc1;
174 float32_t const *pSrcVecPtr = in;
175
176 /*
177 * Initialize the pointers to 2 consecutive MatrixA rows
178 */
179 pInA0 = pSrcA;
180 pInA1 = pInA0 + numCols;
181 /*
182 * Initialize the vector pointer
183 */
184 pInVec = pSrcVecPtr;
185 /*
186 * reset accumulators
187 */
188 acc0 = vdupq_n_f32(0.0f);
189 acc1 = vdupq_n_f32(0.0f);
190 pSrcA0Vec = pInA0;
191 pSrcA1Vec = pInA1;
192
193 blkCnt = numCols >> 2;
194 while (blkCnt > 0U) {
195 f32x4_t vecA;
196
197 vecIn = vld1q(pInVec);
198 pInVec += 4;
199 vecA = vld1q(pSrcA0Vec);
200 pSrcA0Vec += 4;
201 acc0 = vfmaq(acc0, vecIn, vecA);
202 vecA = vld1q(pSrcA1Vec);
203 pSrcA1Vec += 4;
204 acc1 = vfmaq(acc1, vecIn, vecA);
205
206 blkCnt--;
207 }
208 /*
209 * tail
210 * (will be merged thru tail predication)
211 */
212 blkCnt = numCols & 3;
213 if (blkCnt > 0U) {
214 mve_pred16_t p0 = vctp32q(blkCnt);
215 f32x4_t vecA;
216
217 vecIn = vldrwq_z_f32(pInVec, p0);
218 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
219 acc0 = vfmaq(acc0, vecIn, vecA);
220 vecA = vldrwq_z_f32(pSrcA1Vec, p0);
221 acc1 = vfmaq(acc1, vecIn, vecA);
222 }
223 /*
224 * Sum the partial parts
225 */
226 f32x4_t vtmp = vuninitializedq_f32();
227 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
228 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
229
230 vSum =
231 vfmaq_m_f32(vSum, vld1q(pDualCoef),
232 vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
233 vctp32q(2));
234
235 pSrcA += numCols * 2;
236 row -= 2;
237 }
238
239 if (row >= 1) {
240 f32x4_t vecIn, acc0;
241 float32_t const *pSrcA0Vec, *pInVec;
242 float32_t const *pSrcVecPtr = in;
243 /*
244 * Initialize the pointers to last MatrixA row
245 */
246 pInA0 = pSrcA;
247 /*
248 * Initialize the vector pointer
249 */
250 pInVec = pSrcVecPtr;
251 /*
252 * reset accumulators
253 */
254 acc0 = vdupq_n_f32(0.0f);
255
256 pSrcA0Vec = pInA0;
257
258 blkCnt = numCols >> 2;
259 while (blkCnt > 0U) {
260 f32x4_t vecA;
261
262 vecIn = vld1q(pInVec);
263 pInVec += 4;
264 vecA = vld1q(pSrcA0Vec);
265 pSrcA0Vec += 4;
266 acc0 = vfmaq(acc0, vecIn, vecA);
267
268 blkCnt--;
269 }
270 /*
271 * tail
272 * (will be merged thru tail predication)
273 */
274 blkCnt = numCols & 3;
275 if (blkCnt > 0U) {
276 mve_pred16_t p0 = vctp32q(blkCnt);
277 f32x4_t vecA;
278
279 vecIn = vldrwq_z_f32(pInVec, p0);
280 vecA = vldrwq_z_f32(pSrcA0Vec, p0);
281 acc0 = vfmaq(acc0, vecIn, vecA);
282 }
283 /*
284 * Sum the partial parts
285 */
286 f32x4_t vtmp = vuninitializedq_f32();
287 vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
288
289 vSum =
290 vfmaq_m_f32(vSum, vld1q(pDualCoef),
291 vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
292 vctp32q(1));
293 }
294 sum += vecAddAcrossF32Mve(vSum);
295
296 *pResult = S->classes[STEP(sum)];
297 }
298
299 #else
300 #if defined(ARM_MATH_NEON)
301 #include "NEMath.h"
302
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)303 ARM_DSP_ATTRIBUTE void arm_svm_sigmoid_predict_f32(
304 const arm_svm_sigmoid_instance_f32 *S,
305 const float32_t * in,
306 int32_t * pResult)
307 {
308 float32_t sum = S->intercept;
309
310 float32_t dot;
311 float32x4_t dotV;
312
313 float32x4_t accuma,accumb,accumc,accumd,accum;
314 float32x2_t accum2;
315 float32x4_t vec1;
316 float32x4_t coef0 = vdupq_n_f32(S->coef0);
317
318 float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
319
320 uint32_t blkCnt;
321 uint32_t vectorBlkCnt;
322
323 const float32_t *pIn = in;
324
325 const float32_t *pSupport = S->supportVectors;
326
327 const float32_t *pSupporta = S->supportVectors;
328 const float32_t *pSupportb;
329 const float32_t *pSupportc;
330 const float32_t *pSupportd;
331
332 pSupportb = pSupporta + S->vectorDimension;
333 pSupportc = pSupportb + S->vectorDimension;
334 pSupportd = pSupportc + S->vectorDimension;
335
336 const float32_t *pDualCoefs = S->dualCoefficients;
337
338 vectorBlkCnt = S->nbOfSupportVectors >> 2;
339 while (vectorBlkCnt > 0U)
340 {
341 accuma = vdupq_n_f32(0);
342 accumb = vdupq_n_f32(0);
343 accumc = vdupq_n_f32(0);
344 accumd = vdupq_n_f32(0);
345
346 pIn = in;
347
348 blkCnt = S->vectorDimension >> 2;
349 while (blkCnt > 0U)
350 {
351
352 vec1 = vld1q_f32(pIn);
353 vec2a = vld1q_f32(pSupporta);
354 vec2b = vld1q_f32(pSupportb);
355 vec2c = vld1q_f32(pSupportc);
356 vec2d = vld1q_f32(pSupportd);
357
358 pIn += 4;
359 pSupporta += 4;
360 pSupportb += 4;
361 pSupportc += 4;
362 pSupportd += 4;
363
364 accuma = vmlaq_f32(accuma, vec1,vec2a);
365 accumb = vmlaq_f32(accumb, vec1,vec2b);
366 accumc = vmlaq_f32(accumc, vec1,vec2c);
367 accumd = vmlaq_f32(accumd, vec1,vec2d);
368
369 blkCnt -- ;
370 }
371 accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
372 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
373
374 accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
375 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
376
377 accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
378 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
379
380 accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
381 dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
382
383
384 blkCnt = S->vectorDimension & 3;
385 while (blkCnt > 0U)
386 {
387 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
388 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
389 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
390 dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
391
392 pIn++;
393
394 blkCnt -- ;
395 }
396
397 vec1 = vld1q_f32(pDualCoefs);
398 pDualCoefs += 4;
399
400 // To vectorize later
401 dotV = vmulq_n_f32(dotV, S->gamma);
402 dotV = vaddq_f32(dotV, coef0);
403
404 dotV = vtanhq_f32(dotV);
405
406 accum = vmulq_f32(vec1,dotV);
407 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
408 sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
409
410 pSupporta += 3*S->vectorDimension;
411 pSupportb += 3*S->vectorDimension;
412 pSupportc += 3*S->vectorDimension;
413 pSupportd += 3*S->vectorDimension;
414
415 vectorBlkCnt -- ;
416 }
417
418 pSupport = pSupporta;
419 vectorBlkCnt = S->nbOfSupportVectors & 3;
420
421 while (vectorBlkCnt > 0U)
422 {
423 accum = vdupq_n_f32(0);
424 dot = 0.0f;
425 pIn = in;
426
427 blkCnt = S->vectorDimension >> 2;
428 while (blkCnt > 0U)
429 {
430
431 vec1 = vld1q_f32(pIn);
432 vec2 = vld1q_f32(pSupport);
433 pIn += 4;
434 pSupport += 4;
435
436 accum = vmlaq_f32(accum, vec1,vec2);
437
438 blkCnt -- ;
439 }
440 accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
441 dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
442
443
444 blkCnt = S->vectorDimension & 3;
445 while (blkCnt > 0U)
446 {
447 dot = dot + *pIn++ * *pSupport++;
448
449 blkCnt -- ;
450 }
451
452 sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
453 vectorBlkCnt -- ;
454 }
455
456 *pResult=S->classes[STEP(sum)];
457 }
458 #else
arm_svm_sigmoid_predict_f32(const arm_svm_sigmoid_instance_f32 * S,const float32_t * in,int32_t * pResult)459 ARM_DSP_ATTRIBUTE void arm_svm_sigmoid_predict_f32(
460 const arm_svm_sigmoid_instance_f32 *S,
461 const float32_t * in,
462 int32_t * pResult)
463 {
464 float32_t sum=S->intercept;
465 float32_t dot=0;
466 uint32_t i,j;
467 const float32_t *pSupport = S->supportVectors;
468
469 for(i=0; i < S->nbOfSupportVectors; i++)
470 {
471 dot=0;
472 for(j=0; j < S->vectorDimension; j++)
473 {
474 dot = dot + in[j]* *pSupport++;
475 }
476 sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
477 }
478 *pResult=S->classes[STEP(sum)];
479 }
480
481 #endif
482 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
483
484 /**
485 * @} end of sigmoidsvm group
486 */
487