1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_linear_predict_f32.c
4  * Description:  SVM Linear Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 
34 /**
35  * @addtogroup linearsvm
36  * @{
37  */
38 
39 
40 /**
41  * @brief SVM linear prediction
42  * @param[in]    S          Pointer to an instance of the linear SVM structure.
43  * @param[in]    in         Pointer to input vector
44  * @param[out]   pResult    Decision value
45  *
46  */
47 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
48 
49 #include "arm_helium_utils.h"
50 
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)51 ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f32(
52     const arm_svm_linear_instance_f32 *S,
53     const float32_t * in,
54     int32_t * pResult)
55 {
56         /* inlined Matrix x Vector function interleaved with dot prod */
57     uint32_t        numRows = S->nbOfSupportVectors;
58     uint32_t        numCols = S->vectorDimension;
59     const float32_t *pSupport = S->supportVectors;
60     const float32_t *pSrcA = pSupport;
61     const float32_t *pInA0;
62     const float32_t *pInA1;
63     uint32_t         row;
64     uint32_t         blkCnt;     /* loop counters */
65     const float32_t *pDualCoef = S->dualCoefficients;
66     float32_t       sum = S->intercept;
67     row = numRows;
68 
69     /*
70      * compute 4 rows in parrallel
71      */
72     while (row >= 4)
73     {
74         const float32_t *pInA2, *pInA3;
75         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
76         f32x4_t         vecIn, acc0, acc1, acc2, acc3;
77         float32_t const *pSrcVecPtr = in;
78 
79         /*
80          * Initialize the pointers to 4 consecutive MatrixA rows
81          */
82         pInA0 = pSrcA;
83         pInA1 = pInA0 + numCols;
84         pInA2 = pInA1 + numCols;
85         pInA3 = pInA2 + numCols;
86         /*
87          * Initialize the vector pointer
88          */
89         pInVec = pSrcVecPtr;
90         /*
91          * reset accumulators
92          */
93         acc0 = vdupq_n_f32(0.0f);
94         acc1 = vdupq_n_f32(0.0f);
95         acc2 = vdupq_n_f32(0.0f);
96         acc3 = vdupq_n_f32(0.0f);
97 
98         pSrcA0Vec = pInA0;
99         pSrcA1Vec = pInA1;
100         pSrcA2Vec = pInA2;
101         pSrcA3Vec = pInA3;
102 
103         blkCnt = numCols >> 2;
104         while (blkCnt > 0U) {
105             f32x4_t         vecA;
106 
107             vecIn = vld1q(pInVec);
108             pInVec += 4;
109             vecA = vld1q(pSrcA0Vec);
110             pSrcA0Vec += 4;
111             acc0 = vfmaq(acc0, vecIn, vecA);
112             vecA = vld1q(pSrcA1Vec);
113             pSrcA1Vec += 4;
114             acc1 = vfmaq(acc1, vecIn, vecA);
115             vecA = vld1q(pSrcA2Vec);
116             pSrcA2Vec += 4;
117             acc2 = vfmaq(acc2, vecIn, vecA);
118             vecA = vld1q(pSrcA3Vec);
119             pSrcA3Vec += 4;
120             acc3 = vfmaq(acc3, vecIn, vecA);
121 
122             blkCnt--;
123         }
124         /*
125          * tail
126          * (will be merged thru tail predication)
127          */
128         blkCnt = numCols & 3;
129         if (blkCnt > 0U) {
130             mve_pred16_t    p0 = vctp32q(blkCnt);
131             f32x4_t         vecA;
132 
133             vecIn = vldrwq_z_f32(pInVec, p0);
134             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
135             acc0 = vfmaq(acc0, vecIn, vecA);
136             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
137             acc1 = vfmaq(acc1, vecIn, vecA);
138             vecA = vldrwq_z_f32(pSrcA2Vec, p0);
139             acc2 = vfmaq(acc2, vecIn, vecA);
140             vecA = vldrwq_z_f32(pSrcA3Vec, p0);
141             acc3 = vfmaq(acc3, vecIn, vecA);
142         }
143         /*
144          * Sum the partial parts
145          */
146 
147         acc0 = vmulq_n_f32(acc0,*pDualCoef++);
148         acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
149         acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
150         acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);
151 
152         sum += vecAddAcrossF32Mve(acc0);
153 
154         pSrcA += numCols * 4;
155         /*
156          * Decrement the row loop counter
157          */
158         row -= 4;
159     }
160 
161     /*
162      * compute 2 rows in parallel
163      */
164     if (row >= 2) {
165         float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
166         f32x4_t         vecIn, acc0, acc1;
167         float32_t const *pSrcVecPtr = in;
168 
169         /*
170          * Initialize the pointers to 2 consecutive MatrixA rows
171          */
172         pInA0 = pSrcA;
173         pInA1 = pInA0 + numCols;
174         /*
175          * Initialize the vector pointer
176          */
177         pInVec = pSrcVecPtr;
178         /*
179          * reset accumulators
180          */
181         acc0 = vdupq_n_f32(0.0f);
182         acc1 = vdupq_n_f32(0.0f);
183         pSrcA0Vec = pInA0;
184         pSrcA1Vec = pInA1;
185 
186         blkCnt = numCols >> 2;
187         while (blkCnt > 0U) {
188             f32x4_t         vecA;
189 
190             vecIn = vld1q(pInVec);
191             pInVec += 4;
192             vecA = vld1q(pSrcA0Vec);
193             pSrcA0Vec += 4;
194             acc0 = vfmaq(acc0, vecIn, vecA);
195             vecA = vld1q(pSrcA1Vec);
196             pSrcA1Vec += 4;
197             acc1 = vfmaq(acc1, vecIn, vecA);
198 
199             blkCnt--;
200         }
201         /*
202          * tail
203          * (will be merged thru tail predication)
204          */
205         blkCnt = numCols & 3;
206         if (blkCnt > 0U) {
207             mve_pred16_t    p0 = vctp32q(blkCnt);
208             f32x4_t         vecA;
209 
210             vecIn = vldrwq_z_f32(pInVec, p0);
211             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
212             acc0 = vfmaq(acc0, vecIn, vecA);
213             vecA = vldrwq_z_f32(pSrcA1Vec, p0);
214             acc1 = vfmaq(acc1, vecIn, vecA);
215         }
216         /*
217          * Sum the partial parts
218          */
219         acc0 = vmulq_n_f32(acc0,*pDualCoef++);
220         acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
221 
222         sum += vecAddAcrossF32Mve(acc0);
223 
224 
225         pSrcA += numCols * 2;
226         row -= 2;
227     }
228 
229     if (row >= 1) {
230         f32x4_t         vecIn, acc0;
231         float32_t const *pSrcA0Vec, *pInVec;
232         float32_t const *pSrcVecPtr = in;
233         /*
234          * Initialize the pointers to last MatrixA row
235          */
236         pInA0 = pSrcA;
237         /*
238          * Initialize the vector pointer
239          */
240         pInVec = pSrcVecPtr;
241         /*
242          * reset accumulators
243          */
244         acc0 = vdupq_n_f32(0.0f);
245 
246         pSrcA0Vec = pInA0;
247 
248         blkCnt = numCols >> 2;
249         while (blkCnt > 0U) {
250             f32x4_t         vecA;
251 
252             vecIn = vld1q(pInVec);
253             pInVec += 4;
254             vecA = vld1q(pSrcA0Vec);
255             pSrcA0Vec += 4;
256             acc0 = vfmaq(acc0, vecIn, vecA);
257 
258             blkCnt--;
259         }
260         /*
261          * tail
262          * (will be merged thru tail predication)
263          */
264         blkCnt = numCols & 3;
265         if (blkCnt > 0U) {
266             mve_pred16_t    p0 = vctp32q(blkCnt);
267             f32x4_t         vecA;
268 
269             vecIn = vldrwq_z_f32(pInVec, p0);
270             vecA = vldrwq_z_f32(pSrcA0Vec, p0);
271             acc0 = vfmaq(acc0, vecIn, vecA);
272         }
273         /*
274          * Sum the partial parts
275          */
276         sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
277 
278     }
279 
280     *pResult = S->classes[STEP(sum)];
281 }
282 
283 #else
284 #if defined(ARM_MATH_NEON)
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)285 ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f32(
286     const arm_svm_linear_instance_f32 *S,
287     const float32_t * in,
288     int32_t * pResult)
289 {
290     float32_t sum = S->intercept;
291 
292     float32_t dot;
293     float32x4_t dotV;
294 
295     float32x4_t accuma,accumb,accumc,accumd,accum;
296     float32x2_t accum2;
297     float32x4_t vec1;
298 
299     float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
300 
301     uint32_t blkCnt;
302     uint32_t vectorBlkCnt;
303 
304     const float32_t *pIn = in;
305 
306     const float32_t *pSupport = S->supportVectors;
307 
308     const float32_t *pSupporta = S->supportVectors;
309     const float32_t *pSupportb;
310     const float32_t *pSupportc;
311     const float32_t *pSupportd;
312 
313     pSupportb = pSupporta + S->vectorDimension;
314     pSupportc = pSupportb + S->vectorDimension;
315     pSupportd = pSupportc + S->vectorDimension;
316 
317     const float32_t *pDualCoefs = S->dualCoefficients;
318 
319     vectorBlkCnt = S->nbOfSupportVectors >> 2;
320 
321     while (vectorBlkCnt > 0U)
322     {
323         accuma = vdupq_n_f32(0);
324         accumb = vdupq_n_f32(0);
325         accumc = vdupq_n_f32(0);
326         accumd = vdupq_n_f32(0);
327 
328         pIn = in;
329 
330         blkCnt = S->vectorDimension >> 2;
331         while (blkCnt > 0U)
332         {
333 
334             vec1 = vld1q_f32(pIn);
335             vec2a = vld1q_f32(pSupporta);
336             vec2b = vld1q_f32(pSupportb);
337             vec2c = vld1q_f32(pSupportc);
338             vec2d = vld1q_f32(pSupportd);
339 
340             pIn += 4;
341             pSupporta += 4;
342             pSupportb += 4;
343             pSupportc += 4;
344             pSupportd += 4;
345 
346             accuma = vmlaq_f32(accuma, vec1,vec2a);
347             accumb = vmlaq_f32(accumb, vec1,vec2b);
348             accumc = vmlaq_f32(accumc, vec1,vec2c);
349             accumd = vmlaq_f32(accumd, vec1,vec2d);
350 
351             blkCnt -- ;
352         }
353         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
354         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
355 
356         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
357         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
358 
359         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
360         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
361 
362         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
363         dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
364 
365 
366         blkCnt = S->vectorDimension & 3;
367         while (blkCnt > 0U)
368         {
369             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
370             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
371             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
372             dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
373 
374             pIn++;
375 
376             blkCnt -- ;
377         }
378 
379         vec1 = vld1q_f32(pDualCoefs);
380         pDualCoefs += 4;
381 
382         accum = vmulq_f32(vec1,dotV);
383         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
384         sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
385 
386         pSupporta += 3*S->vectorDimension;
387         pSupportb += 3*S->vectorDimension;
388         pSupportc += 3*S->vectorDimension;
389         pSupportd += 3*S->vectorDimension;
390 
391         vectorBlkCnt -- ;
392     }
393 
394     pSupport = pSupporta;
395     vectorBlkCnt = S->nbOfSupportVectors & 3;
396     while (vectorBlkCnt > 0U)
397     {
398         accum = vdupq_n_f32(0);
399         dot = 0.0f;
400         pIn = in;
401 
402         blkCnt = S->vectorDimension >> 2;
403         while (blkCnt > 0U)
404         {
405 
406             vec1 = vld1q_f32(pIn);
407             vec2 = vld1q_f32(pSupport);
408             pIn += 4;
409             pSupport += 4;
410 
411             accum = vmlaq_f32(accum, vec1,vec2);
412 
413             blkCnt -- ;
414         }
415         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
416         dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
417 
418 
419         blkCnt = S->vectorDimension & 3;
420         while (blkCnt > 0U)
421         {
422             dot = dot + *pIn++ * *pSupport++;
423 
424             blkCnt -- ;
425         }
426 
427         sum += *pDualCoefs++ * dot;
428         vectorBlkCnt -- ;
429     }
430 
431     *pResult=S->classes[STEP(sum)];
432 }
433 #else
arm_svm_linear_predict_f32(const arm_svm_linear_instance_f32 * S,const float32_t * in,int32_t * pResult)434 ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f32(
435     const arm_svm_linear_instance_f32 *S,
436     const float32_t * in,
437     int32_t * pResult)
438 {
439     float32_t sum=S->intercept;
440     float32_t dot=0;
441     uint32_t i,j;
442     const float32_t *pSupport = S->supportVectors;
443 
444     for(i=0; i < S->nbOfSupportVectors; i++)
445     {
446         dot=0;
447         for(j=0; j < S->vectorDimension; j++)
448         {
449             dot = dot + in[j]* *pSupport++;
450         }
451         sum += S->dualCoefficients[i] * dot;
452     }
453     *pResult=S->classes[STEP(sum)];
454 }
455 #endif
456 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
457 
458 /**
459  * @} end of linearsvm group
460  */
461