1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_rbf_predict_f16.c
4 * Description: SVM Radial Basis Function Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 #include <limits.h>
34 #include <math.h>
35
36
37 /**
38 * @addtogroup rbfsvm
39 * @{
40 */
41
42
43 /**
44 * @brief SVM rbf prediction
45 * @param[in] S Pointer to an instance of the rbf SVM structure.
46 * @param[in] in Pointer to input vector
47 * @param[out] pResult decision value
48 *
49 */
50
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52
53 #include "arm_helium_utils.h"
54 #include "arm_vec_math_f16.h"
55
arm_svm_rbf_predict_f16(const arm_svm_rbf_instance_f16 * S,const float16_t * in,int32_t * pResult)56 ARM_DSP_ATTRIBUTE void arm_svm_rbf_predict_f16(
57 const arm_svm_rbf_instance_f16 *S,
58 const float16_t * in,
59 int32_t * pResult)
60 {
61 /* inlined Matrix x Vector function interleaved with dot prod */
62 uint32_t numRows = S->nbOfSupportVectors;
63 uint32_t numCols = S->vectorDimension;
64 const float16_t *pSupport = S->supportVectors;
65 const float16_t *pSrcA = pSupport;
66 const float16_t *pInA0;
67 const float16_t *pInA1;
68 uint32_t row;
69 uint32_t blkCnt; /* loop counters */
70 const float16_t *pDualCoef = S->dualCoefficients;
71 _Float16 sum = S->intercept;
72 f16x8_t vSum = vdupq_n_f16(0.0f16);
73
74 row = numRows;
75
76 /*
77 * compute 4 rows in parrallel
78 */
79 while (row >= 4) {
80 const float16_t *pInA2, *pInA3;
81 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
82 f16x8_t vecIn, acc0, acc1, acc2, acc3;
83 float16_t const *pSrcVecPtr = in;
84
85 /*
86 * Initialize the pointers to 4 consecutive MatrixA rows
87 */
88 pInA0 = pSrcA;
89 pInA1 = pInA0 + numCols;
90 pInA2 = pInA1 + numCols;
91 pInA3 = pInA2 + numCols;
92 /*
93 * Initialize the vector pointer
94 */
95 pInVec = pSrcVecPtr;
96 /*
97 * reset accumulators
98 */
99 acc0 = vdupq_n_f16(0.0f16);
100 acc1 = vdupq_n_f16(0.0f16);
101 acc2 = vdupq_n_f16(0.0f16);
102 acc3 = vdupq_n_f16(0.0f16);
103
104 pSrcA0Vec = pInA0;
105 pSrcA1Vec = pInA1;
106 pSrcA2Vec = pInA2;
107 pSrcA3Vec = pInA3;
108
109 blkCnt = numCols >> 3;
110 while (blkCnt > 0U) {
111 f16x8_t vecA;
112 f16x8_t vecDif;
113
114 vecIn = vld1q(pInVec);
115 pInVec += 8;
116 vecA = vld1q(pSrcA0Vec);
117 pSrcA0Vec += 8;
118 vecDif = vsubq(vecIn, vecA);
119 acc0 = vfmaq(acc0, vecDif, vecDif);
120 vecA = vld1q(pSrcA1Vec);
121 pSrcA1Vec += 8;
122 vecDif = vsubq(vecIn, vecA);
123 acc1 = vfmaq(acc1, vecDif, vecDif);
124 vecA = vld1q(pSrcA2Vec);
125 pSrcA2Vec += 8;
126 vecDif = vsubq(vecIn, vecA);
127 acc2 = vfmaq(acc2, vecDif, vecDif);
128 vecA = vld1q(pSrcA3Vec);
129 pSrcA3Vec += 8;
130 vecDif = vsubq(vecIn, vecA);
131 acc3 = vfmaq(acc3, vecDif, vecDif);
132
133 blkCnt--;
134 }
135 /*
136 * tail
137 * (will be merged thru tail predication)
138 */
139 blkCnt = numCols & 7;
140 if (blkCnt > 0U) {
141 mve_pred16_t p0 = vctp16q(blkCnt);
142 f16x8_t vecA;
143 f16x8_t vecDif;
144
145 vecIn = vldrhq_z_f16(pInVec, p0);
146 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
147 vecDif = vsubq(vecIn, vecA);
148 acc0 = vfmaq(acc0, vecDif, vecDif);
149 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
150 vecDif = vsubq(vecIn, vecA);
151 acc1 = vfmaq(acc1, vecDif, vecDif);
152 vecA = vldrhq_z_f16(pSrcA2Vec, p0);;
153 vecDif = vsubq(vecIn, vecA);
154 acc2 = vfmaq(acc2, vecDif, vecDif);
155 vecA = vldrhq_z_f16(pSrcA3Vec, p0);
156 vecDif = vsubq(vecIn, vecA);
157 acc3 = vfmaq(acc3, vecDif, vecDif);
158 }
159 /*
160 * Sum the partial parts
161 */
162
163 //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF16Mve(acc0));
164 f16x8_t vtmp = vuninitializedq_f16();
165 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
166 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
167 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
168 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
169
170 vSum =
171 vfmaq_m_f16(vSum, vld1q(pDualCoef),
172 vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)),vctp16q(4));
173 pDualCoef += 4;
174 pSrcA += numCols * 4;
175 /*
176 * Decrement the row loop counter
177 */
178 row -= 4;
179 }
180
181 /*
182 * compute 2 rows in parrallel
183 */
184 if (row >= 2) {
185 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
186 f16x8_t vecIn, acc0, acc1;
187 float16_t const *pSrcVecPtr = in;
188
189 /*
190 * Initialize the pointers to 2 consecutive MatrixA rows
191 */
192 pInA0 = pSrcA;
193 pInA1 = pInA0 + numCols;
194 /*
195 * Initialize the vector pointer
196 */
197 pInVec = pSrcVecPtr;
198 /*
199 * reset accumulators
200 */
201 acc0 = vdupq_n_f16(0.0f16);
202 acc1 = vdupq_n_f16(0.0f16);
203 pSrcA0Vec = pInA0;
204 pSrcA1Vec = pInA1;
205
206 blkCnt = numCols >> 3;
207 while (blkCnt > 0U) {
208 f16x8_t vecA;
209 f16x8_t vecDif;
210
211 vecIn = vld1q(pInVec);
212 pInVec += 8;
213 vecA = vld1q(pSrcA0Vec);
214 pSrcA0Vec += 8;
215 vecDif = vsubq(vecIn, vecA);
216 acc0 = vfmaq(acc0, vecDif, vecDif);;
217 vecA = vld1q(pSrcA1Vec);
218 pSrcA1Vec += 8;
219 vecDif = vsubq(vecIn, vecA);
220 acc1 = vfmaq(acc1, vecDif, vecDif);
221
222 blkCnt--;
223 }
224 /*
225 * tail
226 * (will be merged thru tail predication)
227 */
228 blkCnt = numCols & 7;
229 if (blkCnt > 0U) {
230 mve_pred16_t p0 = vctp16q(blkCnt);
231 f16x8_t vecA, vecDif;
232
233 vecIn = vldrhq_z_f16(pInVec, p0);
234 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
235 vecDif = vsubq(vecIn, vecA);
236 acc0 = vfmaq(acc0, vecDif, vecDif);
237 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
238 vecDif = vsubq(vecIn, vecA);
239 acc1 = vfmaq(acc1, vecDif, vecDif);
240 }
241 /*
242 * Sum the partial parts
243 */
244 f16x8_t vtmp = vuninitializedq_f16();
245 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
246 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
247
248 vSum =
249 vfmaq_m_f16(vSum, vld1q(pDualCoef),
250 vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(2));
251 pDualCoef += 2;
252
253 pSrcA += numCols * 2;
254 row -= 2;
255 }
256
257 if (row >= 1) {
258 f16x8_t vecIn, acc0;
259 float16_t const *pSrcA0Vec, *pInVec;
260 float16_t const *pSrcVecPtr = in;
261 /*
262 * Initialize the pointers to last MatrixA row
263 */
264 pInA0 = pSrcA;
265 /*
266 * Initialize the vector pointer
267 */
268 pInVec = pSrcVecPtr;
269 /*
270 * reset accumulators
271 */
272 acc0 = vdupq_n_f16(0.0f);
273
274 pSrcA0Vec = pInA0;
275
276 blkCnt = numCols >> 3;
277 while (blkCnt > 0U) {
278 f16x8_t vecA, vecDif;
279
280 vecIn = vld1q(pInVec);
281 pInVec += 8;
282 vecA = vld1q(pSrcA0Vec);
283 pSrcA0Vec += 8;
284 vecDif = vsubq(vecIn, vecA);
285 acc0 = vfmaq(acc0, vecDif, vecDif);
286
287 blkCnt--;
288 }
289 /*
290 * tail
291 * (will be merged thru tail predication)
292 */
293 blkCnt = numCols & 7;
294 if (blkCnt > 0U) {
295 mve_pred16_t p0 = vctp16q(blkCnt);
296 f16x8_t vecA, vecDif;
297
298 vecIn = vldrhq_z_f16(pInVec, p0);
299 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
300 vecDif = vsubq(vecIn, vecA);
301 acc0 = vfmaq(acc0, vecDif, vecDif);
302 }
303 /*
304 * Sum the partial parts
305 */
306 f16x8_t vtmp = vuninitializedq_f16();
307 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
308
309 vSum =
310 vfmaq_m_f16(vSum, vld1q(pDualCoef),
311 vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(1));
312
313 }
314
315
316 sum += (_Float16)vecAddAcrossF16Mve(vSum);
317 *pResult = S->classes[STEP(sum)];
318 }
319
320 #else
arm_svm_rbf_predict_f16(const arm_svm_rbf_instance_f16 * S,const float16_t * in,int32_t * pResult)321 ARM_DSP_ATTRIBUTE void arm_svm_rbf_predict_f16(
322 const arm_svm_rbf_instance_f16 *S,
323 const float16_t * in,
324 int32_t * pResult)
325 {
326 _Float16 sum=S->intercept;
327 _Float16 dot=00.f16;
328 uint32_t i,j;
329 const float16_t *pSupport = S->supportVectors;
330
331 for(i=0; i < S->nbOfSupportVectors; i++)
332 {
333 dot=0.0f16;
334 for(j=0; j < S->vectorDimension; j++)
335 {
336 dot = dot + ARM_SQ((_Float16)in[j] - (_Float16) *pSupport);
337 pSupport++;
338 }
339 sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf((float32_t)(-(_Float16)S->gamma * (_Float16)dot));
340 }
341 *pResult=S->classes[STEP(sum)];
342 }
343
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
345
346 /**
347 * @} end of rbfsvm group
348 */
349
350 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
351
352