1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_svm_rbf_predict_f16.c
4  * Description:  SVM Radial Basis Function Classifier
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/svm_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #include <limits.h>
34 #include <math.h>
35 
36 
37 /**
38  * @addtogroup rbfsvm
39  * @{
40  */
41 
42 
43 /**
44  * @brief SVM rbf prediction
45  * @param[in]    S         Pointer to an instance of the rbf SVM structure.
46  * @param[in]    in        Pointer to input vector
47  * @param[out]   pResult   decision value
48  *
49  */
50 
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
53 #include "arm_helium_utils.h"
54 #include "arm_vec_math_f16.h"
55 
arm_svm_rbf_predict_f16(const arm_svm_rbf_instance_f16 * S,const float16_t * in,int32_t * pResult)56 ARM_DSP_ATTRIBUTE void arm_svm_rbf_predict_f16(
57     const arm_svm_rbf_instance_f16 *S,
58     const float16_t * in,
59     int32_t * pResult)
60 {
61         /* inlined Matrix x Vector function interleaved with dot prod */
62     uint32_t        numRows = S->nbOfSupportVectors;
63     uint32_t        numCols = S->vectorDimension;
64     const float16_t *pSupport = S->supportVectors;
65     const float16_t *pSrcA = pSupport;
66     const float16_t *pInA0;
67     const float16_t *pInA1;
68     uint32_t         row;
69     uint32_t         blkCnt;     /* loop counters */
70     const float16_t *pDualCoef = S->dualCoefficients;
71     _Float16       sum = S->intercept;
72     f16x8_t         vSum = vdupq_n_f16(0.0f16);
73 
74     row = numRows;
75 
76     /*
77      * compute 4 rows in parrallel
78      */
79     while (row >= 4) {
80         const float16_t *pInA2, *pInA3;
81         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
82         f16x8_t         vecIn, acc0, acc1, acc2, acc3;
83         float16_t const *pSrcVecPtr = in;
84 
85         /*
86          * Initialize the pointers to 4 consecutive MatrixA rows
87          */
88         pInA0 = pSrcA;
89         pInA1 = pInA0 + numCols;
90         pInA2 = pInA1 + numCols;
91         pInA3 = pInA2 + numCols;
92         /*
93          * Initialize the vector pointer
94          */
95         pInVec = pSrcVecPtr;
96         /*
97          * reset accumulators
98          */
99         acc0 = vdupq_n_f16(0.0f16);
100         acc1 = vdupq_n_f16(0.0f16);
101         acc2 = vdupq_n_f16(0.0f16);
102         acc3 = vdupq_n_f16(0.0f16);
103 
104         pSrcA0Vec = pInA0;
105         pSrcA1Vec = pInA1;
106         pSrcA2Vec = pInA2;
107         pSrcA3Vec = pInA3;
108 
109         blkCnt = numCols >> 3;
110         while (blkCnt > 0U) {
111             f16x8_t         vecA;
112             f16x8_t         vecDif;
113 
114             vecIn = vld1q(pInVec);
115             pInVec += 8;
116             vecA = vld1q(pSrcA0Vec);
117             pSrcA0Vec += 8;
118             vecDif = vsubq(vecIn, vecA);
119             acc0 = vfmaq(acc0, vecDif, vecDif);
120             vecA = vld1q(pSrcA1Vec);
121             pSrcA1Vec += 8;
122             vecDif = vsubq(vecIn, vecA);
123             acc1 = vfmaq(acc1, vecDif, vecDif);
124             vecA = vld1q(pSrcA2Vec);
125             pSrcA2Vec += 8;
126             vecDif = vsubq(vecIn, vecA);
127             acc2 = vfmaq(acc2, vecDif, vecDif);
128             vecA = vld1q(pSrcA3Vec);
129             pSrcA3Vec += 8;
130             vecDif = vsubq(vecIn, vecA);
131             acc3 = vfmaq(acc3, vecDif, vecDif);
132 
133             blkCnt--;
134         }
135         /*
136          * tail
137          * (will be merged thru tail predication)
138          */
139         blkCnt = numCols & 7;
140         if (blkCnt > 0U) {
141             mve_pred16_t    p0 = vctp16q(blkCnt);
142             f16x8_t         vecA;
143             f16x8_t         vecDif;
144 
145             vecIn = vldrhq_z_f16(pInVec, p0);
146             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
147             vecDif = vsubq(vecIn, vecA);
148             acc0 = vfmaq(acc0, vecDif, vecDif);
149             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
150             vecDif = vsubq(vecIn, vecA);
151             acc1 = vfmaq(acc1, vecDif, vecDif);
152             vecA = vldrhq_z_f16(pSrcA2Vec, p0);;
153             vecDif = vsubq(vecIn, vecA);
154             acc2 = vfmaq(acc2, vecDif, vecDif);
155             vecA = vldrhq_z_f16(pSrcA3Vec, p0);
156             vecDif = vsubq(vecIn, vecA);
157             acc3 = vfmaq(acc3, vecDif, vecDif);
158         }
159         /*
160          * Sum the partial parts
161          */
162 
163         //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF16Mve(acc0));
164         f16x8_t         vtmp = vuninitializedq_f16();
165         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
166         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
167         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
168         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
169 
170         vSum =
171             vfmaq_m_f16(vSum, vld1q(pDualCoef),
172                       vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)),vctp16q(4));
173         pDualCoef += 4;
174         pSrcA += numCols * 4;
175         /*
176          * Decrement the row loop counter
177          */
178         row -= 4;
179     }
180 
181     /*
182      * compute 2 rows in parrallel
183      */
184     if (row >= 2) {
185         float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
186         f16x8_t         vecIn, acc0, acc1;
187         float16_t const *pSrcVecPtr = in;
188 
189         /*
190          * Initialize the pointers to 2 consecutive MatrixA rows
191          */
192         pInA0 = pSrcA;
193         pInA1 = pInA0 + numCols;
194         /*
195          * Initialize the vector pointer
196          */
197         pInVec = pSrcVecPtr;
198         /*
199          * reset accumulators
200          */
201         acc0 = vdupq_n_f16(0.0f16);
202         acc1 = vdupq_n_f16(0.0f16);
203         pSrcA0Vec = pInA0;
204         pSrcA1Vec = pInA1;
205 
206         blkCnt = numCols >> 3;
207         while (blkCnt > 0U) {
208             f16x8_t         vecA;
209             f16x8_t         vecDif;
210 
211             vecIn = vld1q(pInVec);
212             pInVec += 8;
213             vecA = vld1q(pSrcA0Vec);
214             pSrcA0Vec += 8;
215             vecDif = vsubq(vecIn, vecA);
216             acc0 = vfmaq(acc0, vecDif, vecDif);;
217             vecA = vld1q(pSrcA1Vec);
218             pSrcA1Vec += 8;
219             vecDif = vsubq(vecIn, vecA);
220             acc1 = vfmaq(acc1, vecDif, vecDif);
221 
222             blkCnt--;
223         }
224         /*
225          * tail
226          * (will be merged thru tail predication)
227          */
228         blkCnt = numCols & 7;
229         if (blkCnt > 0U) {
230             mve_pred16_t    p0 = vctp16q(blkCnt);
231             f16x8_t         vecA, vecDif;
232 
233             vecIn = vldrhq_z_f16(pInVec, p0);
234             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
235             vecDif = vsubq(vecIn, vecA);
236             acc0 = vfmaq(acc0, vecDif, vecDif);
237             vecA = vldrhq_z_f16(pSrcA1Vec, p0);
238             vecDif = vsubq(vecIn, vecA);
239             acc1 = vfmaq(acc1, vecDif, vecDif);
240         }
241         /*
242          * Sum the partial parts
243          */
244         f16x8_t         vtmp = vuninitializedq_f16();
245         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
246         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
247 
248         vSum =
249             vfmaq_m_f16(vSum, vld1q(pDualCoef),
250                         vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(2));
251         pDualCoef += 2;
252 
253         pSrcA += numCols * 2;
254         row -= 2;
255     }
256 
257     if (row >= 1) {
258         f16x8_t         vecIn, acc0;
259         float16_t const *pSrcA0Vec, *pInVec;
260         float16_t const *pSrcVecPtr = in;
261         /*
262          * Initialize the pointers to last MatrixA row
263          */
264         pInA0 = pSrcA;
265         /*
266          * Initialize the vector pointer
267          */
268         pInVec = pSrcVecPtr;
269         /*
270          * reset accumulators
271          */
272         acc0 = vdupq_n_f16(0.0f);
273 
274         pSrcA0Vec = pInA0;
275 
276         blkCnt = numCols >> 3;
277         while (blkCnt > 0U) {
278             f16x8_t         vecA, vecDif;
279 
280             vecIn = vld1q(pInVec);
281             pInVec += 8;
282             vecA = vld1q(pSrcA0Vec);
283             pSrcA0Vec += 8;
284             vecDif = vsubq(vecIn, vecA);
285             acc0 = vfmaq(acc0, vecDif, vecDif);
286 
287             blkCnt--;
288         }
289         /*
290          * tail
291          * (will be merged thru tail predication)
292          */
293         blkCnt = numCols & 7;
294         if (blkCnt > 0U) {
295             mve_pred16_t    p0 = vctp16q(blkCnt);
296             f16x8_t         vecA, vecDif;
297 
298             vecIn = vldrhq_z_f16(pInVec, p0);
299             vecA = vldrhq_z_f16(pSrcA0Vec, p0);
300             vecDif = vsubq(vecIn, vecA);
301             acc0 = vfmaq(acc0, vecDif, vecDif);
302         }
303         /*
304          * Sum the partial parts
305          */
306         f16x8_t         vtmp = vuninitializedq_f16();
307         vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
308 
309         vSum =
310             vfmaq_m_f16(vSum, vld1q(pDualCoef),
311                         vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(1));
312 
313     }
314 
315 
316     sum += (_Float16)vecAddAcrossF16Mve(vSum);
317     *pResult = S->classes[STEP(sum)];
318 }
319 
320 #else
arm_svm_rbf_predict_f16(const arm_svm_rbf_instance_f16 * S,const float16_t * in,int32_t * pResult)321 ARM_DSP_ATTRIBUTE void arm_svm_rbf_predict_f16(
322     const arm_svm_rbf_instance_f16 *S,
323     const float16_t * in,
324     int32_t * pResult)
325 {
326     _Float16 sum=S->intercept;
327     _Float16 dot=00.f16;
328     uint32_t i,j;
329     const float16_t *pSupport = S->supportVectors;
330 
331     for(i=0; i < S->nbOfSupportVectors; i++)
332     {
333         dot=0.0f16;
334         for(j=0; j < S->vectorDimension; j++)
335         {
336             dot = dot + ARM_SQ((_Float16)in[j] - (_Float16) *pSupport);
337             pSupport++;
338         }
339         sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf((float32_t)(-(_Float16)S->gamma * (_Float16)dot));
340     }
341     *pResult=S->classes[STEP(sum)];
342 }
343 
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
345 
346 /**
347  * @} end of rbfsvm group
348  */
349 
350 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
351 
352