1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_polynomial_predict_f16.c
4  * Description:  SVM Polynomial Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #include <limits.h>
34 #include <math.h>
35 
36 #if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
37 
38 /*
39 
40 _Float16 is not supported in g++ so we avoid putting _Float16 definitions
41 in the public headers.
42 
43 This function should at some point be moved in FastMath.
44 
45 */
arm_exponent_f16(float16_t x,int32_t nb)46 __STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
47 {
48     float16_t r = x;
49     nb --;
50     while(nb > 0)
51     {
52         r = (_Float16)r * (_Float16)x;
53         nb--;
54     }
55     return(r);
56 }
57 #endif
58 
59 /**
60  * @addtogroup polysvm
61  * @{
62  */
63 
64 
65 
66 
67 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
68 
69 #include "arm_helium_utils.h"
70 #include "arm_vec_math_f16.h"
71 
72 /**
73  * @brief SVM polynomial prediction
74  * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
75  * @param[in]    in         Pointer to input vector
76  * @param[out]   pResult    Decision value
77  *
78  */
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)79 ARM_DSP_ATTRIBUTE void arm_svm_polynomial_predict_f16(
80     const arm_svm_polynomial_instance_f16 *S,
81     const float16_t * in,
82     int32_t * pResult)
83 {
84         /* inlined Matrix x Vector function interleaved with dot prod */
85     uint32_t        numRows = S->nbOfSupportVectors;
86     uint32_t        numCols = S->vectorDimension;
87     const float16_t *pSupport = S->supportVectors;
88     const float16_t *pSrcA = pSupport;
89     const float16_t *pInA0;
90     const float16_t *pInA1;
91     uint32_t         row;
92     uint32_t         blkCnt;     /* loop counters */
93     const float16_t *pDualCoef = S->dualCoefficients;
94     _Float16       sum = S->intercept;
95     f16x8_t         vSum = vdupq_n_f16(0.0f);
96 
97     row = numRows;
98 
99     /*
100      * compute 4 rows in parrallel
101      */
102     while (row >= 4) {
103         const float16_t *pInA2, *pInA3;
104         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
105         f16x8_t         vecIn, acc0, acc1, acc2, acc3;
106         float16_t const *pSrcVecPtr = in;
107 
108         /*
109          * Initialize the pointers to 4 consecutive MatrixA rows
110          */
111         pInA0 = pSrcA;
112         pInA1 = pInA0 + numCols;
113         pInA2 = pInA1 + numCols;
114         pInA3 = pInA2 + numCols;
115         /*
116          * Initialize the vector pointer
117          */
118         pInVec = pSrcVecPtr;
119         /*
120          * reset accumulators
121          */
122         acc0 = vdupq_n_f16(0.0f);
123         acc1 = vdupq_n_f16(0.0f);
124         acc2 = vdupq_n_f16(0.0f);
125         acc3 = vdupq_n_f16(0.0f);
126 
127         pSrcA0Vec = pInA0;
128         pSrcA1Vec = pInA1;
129         pSrcA2Vec = pInA2;
130         pSrcA3Vec = pInA3;
131 
132         blkCnt = numCols >> 3;
133         while (blkCnt > 0U) {
134             f16x8_t         vecA;
135 
136             vecIn = vld1q(pInVec);
137             pInVec += 8;
138             vecA = vld1q(pSrcA0Vec);
139             pSrcA0Vec += 8;
140             acc0 = vfmaq(acc0, vecIn, vecA);
141             vecA = vld1q(pSrcA1Vec);
142             pSrcA1Vec += 8;
143             acc1 = vfmaq(acc1, vecIn, vecA);
144             vecA = vld1q(pSrcA2Vec);
145             pSrcA2Vec += 8;
146             acc2 = vfmaq(acc2, vecIn, vecA);
147             vecA = vld1q(pSrcA3Vec);
148             pSrcA3Vec += 8;
149             acc3 = vfmaq(acc3, vecIn, vecA);
150 
151             blkCnt--;
152         }
153         /*
154          * tail
155          * (will be merged thru tail predication)
156          */
157         blkCnt = numCols & 7;
158         if (blkCnt > 0U) {
159             mve_pred16_t    p0 = vctp16q(blkCnt);
160             f16x8_t         vecA;
161 
162             vecIn = vldrhq_z_f16(pInVec, p0);
163             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
164             acc0 = vfmaq(acc0, vecIn, vecA);
165             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
166             acc1 = vfmaq(acc1, vecIn, vecA);
167             vecA = vldrhq_z_f16(pSrcA2Vec, p0);
168             acc2 = vfmaq(acc2, vecIn, vecA);
169             vecA = vldrhq_z_f16(pSrcA3Vec, p0);
170             acc3 = vfmaq(acc3, vecIn, vecA);
171         }
172         /*
173          * Sum the partial parts
174          */
175         f16x8_t         vtmp = vuninitializedq_f16();
176         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
177         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
178         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
179         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
180 
181         vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
182                              arm_vec_exponent_f16
183                              (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
184                                 S->degree),vctp16q(4));
185 
186         pDualCoef += 4;
187 
188         pSrcA += numCols * 4;
189         /*
190          * Decrement the row loop counter
191          */
192         row -= 4;
193     }
194 
195     /*
196      * compute 2 rows in parrallel
197      */
198     if (row >= 2) {
199         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
200         f16x8_t         vecIn, acc0, acc1;
201         float16_t const *pSrcVecPtr = in;
202 
203         /*
204          * Initialize the pointers to 2 consecutive MatrixA rows
205          */
206         pInA0 = pSrcA;
207         pInA1 = pInA0 + numCols;
208         /*
209          * Initialize the vector pointer
210          */
211         pInVec = pSrcVecPtr;
212         /*
213          * reset accumulators
214          */
215         acc0 = vdupq_n_f16(0.0f);
216         acc1 = vdupq_n_f16(0.0f);
217         pSrcA0Vec = pInA0;
218         pSrcA1Vec = pInA1;
219 
220         blkCnt = numCols >> 3;
221         while (blkCnt > 0U) {
222             f16x8_t         vecA;
223 
224             vecIn = vld1q(pInVec);
225             pInVec += 8;
226             vecA = vld1q(pSrcA0Vec);
227             pSrcA0Vec += 8;
228             acc0 = vfmaq(acc0, vecIn, vecA);
229             vecA = vld1q(pSrcA1Vec);
230             pSrcA1Vec += 8;
231             acc1 = vfmaq(acc1, vecIn, vecA);
232 
233             blkCnt--;
234         }
235         /*
236          * tail
237          * (will be merged thru tail predication)
238          */
239         blkCnt = numCols & 7;
240         if (blkCnt > 0U) {
241             mve_pred16_t    p0 = vctp16q(blkCnt);
242             f16x8_t         vecA;
243 
244             vecIn = vldrhq_z_f16(pInVec, p0);
245             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
246             acc0 = vfmaq(acc0, vecIn, vecA);
247             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
248             acc1 = vfmaq(acc1, vecIn, vecA);
249         }
250         /*
251          * Sum the partial parts
252          */
253         f16x8_t         vtmp = vuninitializedq_f16();
254         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
255         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
256 
257         vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
258                              arm_vec_exponent_f16
259                              (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
260                              vctp16q(2));
261 
262         pDualCoef += 2;
263         pSrcA += numCols * 2;
264         row -= 2;
265     }
266 
267     if (row >= 1) {
268         f16x8_t         vecIn, acc0;
269         float16_t const *pSrcA0Vec, *pInVec;
270         float16_t const *pSrcVecPtr = in;
271         /*
272          * Initialize the pointers to last MatrixA row
273          */
274         pInA0 = pSrcA;
275         /*
276          * Initialize the vector pointer
277          */
278         pInVec = pSrcVecPtr;
279         /*
280          * reset accumulators
281          */
282         acc0 = vdupq_n_f16(0.0f);
283 
284         pSrcA0Vec = pInA0;
285 
286         blkCnt = numCols >> 3;
287         while (blkCnt > 0U) {
288             f16x8_t         vecA;
289 
290             vecIn = vld1q(pInVec);
291             pInVec += 8;
292             vecA = vld1q(pSrcA0Vec);
293             pSrcA0Vec += 8;
294             acc0 = vfmaq(acc0, vecIn, vecA);
295 
296             blkCnt--;
297         }
298         /*
299          * tail
300          * (will be merged thru tail predication)
301          */
302         blkCnt = numCols & 7;
303         if (blkCnt > 0U) {
304             mve_pred16_t    p0 = vctp16q(blkCnt);
305             f16x8_t         vecA;
306 
307             vecIn = vldrhq_z_f16(pInVec, p0);
308             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
309             acc0 = vfmaq(acc0, vecIn, vecA);
310         }
311         /*
312          * Sum the partial parts
313          */
314         f16x8_t         vtmp = vuninitializedq_f16();
315         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
316         vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
317                              arm_vec_exponent_f16
318                              (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
319                              vctp16q(1));
320     }
321     sum += (_Float16)vecAddAcrossF16Mve(vSum);
322 
323 
324     *pResult = S->classes[STEP(sum)];
325 }
326 
327 #else
328 
329 
330 /**
331  * @brief SVM polynomial prediction
332  * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
333  * @param[in]    in         Pointer to input vector
334  * @param[out]   pResult    Decision value
335  *
336  */
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)337 ARM_DSP_ATTRIBUTE void arm_svm_polynomial_predict_f16(
338     const arm_svm_polynomial_instance_f16 *S,
339     const float16_t * in,
340     int32_t * pResult)
341 {
342     _Float16 sum=S->intercept;
343     _Float16 dot=0;
344     uint32_t i,j;
345     const float16_t *pSupport = S->supportVectors;
346 
347     for(i=0; i < S->nbOfSupportVectors; i++)
348     {
349         dot=0;
350         for(j=0; j < S->vectorDimension; j++)
351         {
352             dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
353         }
354         sum += (_Float16)S->dualCoefficients[i] * (_Float16)arm_exponent_f16((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0, S->degree);
355     }
356 
357     *pResult=S->classes[STEP(sum)];
358 }
359 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
360 
361 
362 /**
363  * @} end of polysvm group
364  */
365 
366 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
367 
368