/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_svm_linear_predict_f32.c
 * Description:  SVM Linear Classifier
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/svm_functions.h"
#include <limits.h>
#include <math.h>


/**
 * @addtogroup linearsvm
 * @{
 */


/**
 * @brief SVM linear prediction
 * @param[in]  S        Pointer to an instance of the linear SVM structure.
 * @param[in]  in       Pointer to the input vector.
 * @param[out] pResult  Decision value.
 *
 */
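/*
 * Usage sketch (illustrative only, not part of the library): classify one
 * feature vector with a hypothetical 2-support-vector, 2-dimensional model.
 * The numeric values below are made-up example data; the init call follows
 * the CMSIS-DSP SVM API declared in dsp/svm_functions.h.
 *
 *     static const float32_t dualCoefs[2]   = { 0.5f, -0.5f };
 *     static const float32_t supportVecs[4] = { 1.0f, 2.0f,    // support vector 0
 *                                               3.0f, 4.0f };  // support vector 1
 *     static const int32_t   classes[2]     = { 0, 1 };
 *
 *     void predict_example(void)
 *     {
 *         arm_svm_linear_instance_f32 svm;
 *         const float32_t input[2] = { 0.5f, 1.5f };
 *         int32_t predictedClass;
 *
 *         arm_svm_linear_init_f32(&svm, 2, 2, -1.0f,
 *                                 dualCoefs, supportVecs, classes);
 *         arm_svm_linear_predict_f32(&svm, input, &predictedClass);
 *         // predictedClass now holds classes[0] or classes[1], depending
 *         // on the sign of the decision value.
 *     }
 */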
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"

ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    /* Inlined matrix x vector multiply interleaved with the dot product */
    uint32_t         numRows = S->nbOfSupportVectors;
    uint32_t         numCols = S->vectorDimension;
    const float32_t *pSupport = S->supportVectors;
    const float32_t *pSrcA = pSupport;
    const float32_t *pInA0;
    const float32_t *pInA1;
    uint32_t         row;
    uint32_t         blkCnt;     /* loop counters */
    const float32_t *pDualCoef = S->dualCoefficients;
    float32_t        sum = S->intercept;
    row = numRows;
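    /*
     * Decision value: sum = intercept + sum_i dualCoef[i] * (supportVec_i . in).
     * The support vectors form the rows of a matrix; multiplying that
     * matrix by the input vector yields all the dot products at once,
     * which are then weighted by the dual coefficients.
     */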

    /*
     * Compute 4 rows in parallel
     */
    while (row >= 4)
    {
        const float32_t *pInA2, *pInA3;
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f32x4_t          vecIn, acc0, acc1, acc2, acc3;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * Reset the accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        acc2 = vdupq_n_f32(0.0f);
        acc3 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        blkCnt = numCols >> 2;
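        /*
         * Main loop: 4 columns per iteration; each input load is shared
         * by the four row FMAs.
         */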
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 4;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 4;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * Tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
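            /*
             * p0 enables only the first blkCnt lanes; the zeroing loads
             * below clear the inactive lanes so they contribute nothing
             * to the accumulators.
             */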
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */

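        /*
         * Each accumulator holds the partial dot product of one support
         * vector with the input: scale each by its dual coefficient, fold
         * everything into acc0, then reduce acc0 across its lanes.
         */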
        acc0 = vmulq_n_f32(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f32(acc0, acc1, *pDualCoef++);
        acc0 = vfmaq_n_f32(acc0, acc2, *pDualCoef++);
        acc0 = vfmaq_n_f32(acc0, acc3, *pDualCoef++);

        sum += vecAddAcrossF32Mve(acc0);

        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * Compute 2 rows in parallel
     */
    if (row >= 2) {
        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f32x4_t          vecIn, acc0, acc1;
        float32_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * Reset the accumulators
         */
        acc0 = vdupq_n_f32(0.0f);
        acc1 = vdupq_n_f32(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 4;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * Tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        acc0 = vmulq_n_f32(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f32(acc0, acc1, *pDualCoef++);

        sum += vecAddAcrossF32Mve(acc0);

        pSrcA += numCols * 2;
        row -= 2;
    }

    if (row >= 1) {
        f32x4_t          vecIn, acc0;
        float32_t const *pSrcA0Vec, *pInVec;
        float32_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointer to the last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * Reset the accumulator
         */
        acc0 = vdupq_n_f32(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 2;
        while (blkCnt > 0U) {
            f32x4_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 4;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 4;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * Tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t vecA;

            vecIn = vldrwq_z_f32(pInVec, p0);
            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
    }

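    /*
     * STEP(sum) is 1 for a positive decision value and 0 otherwise, so
     * the sign of sum selects which of the two class labels is returned.
     */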
    *pResult = S->classes[STEP(sum)];
}

#else
#if defined(ARM_MATH_NEON)
ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
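    /*
     * Same decision function as the MVE version above: four support
     * vectors are processed per outer iteration, each with its own 4-lane
     * accumulator, and the four dot products are gathered into dotV so
     * that they can be weighted by four dual coefficients at once.
     */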
    float32_t sum = S->intercept;

    float32_t   dot;
    float32x4_t dotV = vdupq_n_f32(0.0f);   /* zero-initialized before its lanes are set below */

    float32x4_t accuma, accumb, accumc, accumd, accum;
    float32x2_t accum2;
    float32x4_t vec1;

    float32x4_t vec2, vec2a, vec2b, vec2c, vec2d;

    uint32_t blkCnt;
    uint32_t vectorBlkCnt;

    const float32_t *pIn = in;

    const float32_t *pSupport = S->supportVectors;

    const float32_t *pSupporta = S->supportVectors;
    const float32_t *pSupportb;
    const float32_t *pSupportc;
    const float32_t *pSupportd;

    pSupportb = pSupporta + S->vectorDimension;
    pSupportc = pSupportb + S->vectorDimension;
    pSupportd = pSupportc + S->vectorDimension;

    const float32_t *pDualCoefs = S->dualCoefficients;

    vectorBlkCnt = S->nbOfSupportVectors >> 2;

    while (vectorBlkCnt > 0U)
    {
        accuma = vdupq_n_f32(0);
        accumb = vdupq_n_f32(0);
        accumc = vdupq_n_f32(0);
        accumd = vdupq_n_f32(0);

        pIn = in;

        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {
            vec1  = vld1q_f32(pIn);
            vec2a = vld1q_f32(pSupporta);
            vec2b = vld1q_f32(pSupportb);
            vec2c = vld1q_f32(pSupportc);
            vec2d = vld1q_f32(pSupportd);

            pIn       += 4;
            pSupporta += 4;
            pSupportb += 4;
            pSupportc += 4;
            pSupportd += 4;

            accuma = vmlaq_f32(accuma, vec1, vec2a);
            accumb = vmlaq_f32(accumb, vec1, vec2b);
            accumc = vmlaq_f32(accumc, vec1, vec2c);
            accumd = vmlaq_f32(accumd, vec1, vec2d);

            blkCnt--;
        }
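        /*
         * Reduce each 4-lane accumulator to a scalar dot product:
         * vpadd_f32 adds adjacent pairs, the two remaining terms are
         * summed, and the result is written into one lane of dotV.
         */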
        accum2 = vpadd_f32(vget_low_f32(accuma), vget_high_f32(accuma));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 0);

        accum2 = vpadd_f32(vget_low_f32(accumb), vget_high_f32(accumb));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 1);

        accum2 = vpadd_f32(vget_low_f32(accumc), vget_high_f32(accumc));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 2);

        accum2 = vpadd_f32(vget_low_f32(accumd), vget_high_f32(accumd));
        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1), dotV, 3);


        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 0) + *pIn * *pSupporta++, dotV, 0);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 1) + *pIn * *pSupportb++, dotV, 1);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 2) + *pIn * *pSupportc++, dotV, 2);
            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV, 3) + *pIn * *pSupportd++, dotV, 3);

            pIn++;

            blkCnt--;
        }

        vec1 = vld1q_f32(pDualCoefs);
        pDualCoefs += 4;

        accum  = vmulq_f32(vec1, dotV);
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        sum   += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        pSupporta += 3 * S->vectorDimension;
        pSupportb += 3 * S->vectorDimension;
        pSupportc += 3 * S->vectorDimension;
        pSupportd += 3 * S->vectorDimension;

        vectorBlkCnt--;
    }
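    /*
     * Handle the 1 to 3 leftover support vectors one at a time, each with
     * a single vector accumulator for its dot product.
     */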

    pSupport = pSupporta;
    vectorBlkCnt = S->nbOfSupportVectors & 3;
    while (vectorBlkCnt > 0U)
    {
        accum = vdupq_n_f32(0);
        dot = 0.0f;
        pIn = in;

        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U)
        {
            vec1 = vld1q_f32(pIn);
            vec2 = vld1q_f32(pSupport);
            pIn += 4;
            pSupport += 4;

            accum = vmlaq_f32(accum, vec1, vec2);

            blkCnt--;
        }
        accum2 = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);

        blkCnt = S->vectorDimension & 3;
        while (blkCnt > 0U)
        {
            dot = dot + *pIn++ * *pSupport++;

            blkCnt--;
        }

        sum += *pDualCoefs++ * dot;
        vectorBlkCnt--;
    }

    *pResult = S->classes[STEP(sum)];
}
#else
ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f32(
    const arm_svm_linear_instance_f32 *S,
    const float32_t * in,
    int32_t * pResult)
{
    float32_t        sum = S->intercept;
    float32_t        dot = 0.0f;
    uint32_t         i, j;
    const float32_t *pSupport = S->supportVectors;

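    /*
     * Plain C fallback: accumulate dualCoefficients[i] * (supportVector_i . in)
     * over all support vectors, then threshold the decision value.
     */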
    for (i = 0; i < S->nbOfSupportVectors; i++)
    {
        dot = 0.0f;
        for (j = 0; j < S->vectorDimension; j++)
        {
            dot = dot + in[j] * *pSupport++;
        }
        sum += S->dualCoefficients[i] * dot;
    }
    *pResult = S->classes[STEP(sum)];
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
 * @} end of linearsvm group
 */