1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_polynomial_predict_f16.c
4 * Description: SVM Polynomial Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 #include <limits.h>
34 #include <math.h>
35
36 #if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
37
38 /*
39
40 _Float16 is not supported in g++ so we avoid putting _Float16 definitions
41 in the public headers.
42
43 This function should at some point be moved in FastMath.
44
45 */
arm_exponent_f16(float16_t x,int32_t nb)46 __STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
47 {
48 float16_t r = x;
49 nb --;
50 while(nb > 0)
51 {
52 r = (_Float16)r * (_Float16)x;
53 nb--;
54 }
55 return(r);
56 }
57 #endif
58
59 /**
60 * @addtogroup polysvm
61 * @{
62 */
63
64
65
66
67 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
68
69 #include "arm_helium_utils.h"
70 #include "arm_vec_math_f16.h"
71
72 /**
73 * @brief SVM polynomial prediction
74 * @param[in] S Pointer to an instance of the polynomial SVM structure.
75 * @param[in] in Pointer to input vector
76 * @param[out] pResult Decision value
77 *
78 */
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)79 ARM_DSP_ATTRIBUTE void arm_svm_polynomial_predict_f16(
80 const arm_svm_polynomial_instance_f16 *S,
81 const float16_t * in,
82 int32_t * pResult)
83 {
84 /* inlined Matrix x Vector function interleaved with dot prod */
85 uint32_t numRows = S->nbOfSupportVectors;
86 uint32_t numCols = S->vectorDimension;
87 const float16_t *pSupport = S->supportVectors;
88 const float16_t *pSrcA = pSupport;
89 const float16_t *pInA0;
90 const float16_t *pInA1;
91 uint32_t row;
92 uint32_t blkCnt; /* loop counters */
93 const float16_t *pDualCoef = S->dualCoefficients;
94 _Float16 sum = S->intercept;
95 f16x8_t vSum = vdupq_n_f16(0.0f);
96
97 row = numRows;
98
99 /*
100 * compute 4 rows in parrallel
101 */
102 while (row >= 4) {
103 const float16_t *pInA2, *pInA3;
104 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
105 f16x8_t vecIn, acc0, acc1, acc2, acc3;
106 float16_t const *pSrcVecPtr = in;
107
108 /*
109 * Initialize the pointers to 4 consecutive MatrixA rows
110 */
111 pInA0 = pSrcA;
112 pInA1 = pInA0 + numCols;
113 pInA2 = pInA1 + numCols;
114 pInA3 = pInA2 + numCols;
115 /*
116 * Initialize the vector pointer
117 */
118 pInVec = pSrcVecPtr;
119 /*
120 * reset accumulators
121 */
122 acc0 = vdupq_n_f16(0.0f);
123 acc1 = vdupq_n_f16(0.0f);
124 acc2 = vdupq_n_f16(0.0f);
125 acc3 = vdupq_n_f16(0.0f);
126
127 pSrcA0Vec = pInA0;
128 pSrcA1Vec = pInA1;
129 pSrcA2Vec = pInA2;
130 pSrcA3Vec = pInA3;
131
132 blkCnt = numCols >> 3;
133 while (blkCnt > 0U) {
134 f16x8_t vecA;
135
136 vecIn = vld1q(pInVec);
137 pInVec += 8;
138 vecA = vld1q(pSrcA0Vec);
139 pSrcA0Vec += 8;
140 acc0 = vfmaq(acc0, vecIn, vecA);
141 vecA = vld1q(pSrcA1Vec);
142 pSrcA1Vec += 8;
143 acc1 = vfmaq(acc1, vecIn, vecA);
144 vecA = vld1q(pSrcA2Vec);
145 pSrcA2Vec += 8;
146 acc2 = vfmaq(acc2, vecIn, vecA);
147 vecA = vld1q(pSrcA3Vec);
148 pSrcA3Vec += 8;
149 acc3 = vfmaq(acc3, vecIn, vecA);
150
151 blkCnt--;
152 }
153 /*
154 * tail
155 * (will be merged thru tail predication)
156 */
157 blkCnt = numCols & 7;
158 if (blkCnt > 0U) {
159 mve_pred16_t p0 = vctp16q(blkCnt);
160 f16x8_t vecA;
161
162 vecIn = vldrhq_z_f16(pInVec, p0);
163 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
164 acc0 = vfmaq(acc0, vecIn, vecA);
165 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
166 acc1 = vfmaq(acc1, vecIn, vecA);
167 vecA = vldrhq_z_f16(pSrcA2Vec, p0);
168 acc2 = vfmaq(acc2, vecIn, vecA);
169 vecA = vldrhq_z_f16(pSrcA3Vec, p0);
170 acc3 = vfmaq(acc3, vecIn, vecA);
171 }
172 /*
173 * Sum the partial parts
174 */
175 f16x8_t vtmp = vuninitializedq_f16();
176 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
177 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
178 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
179 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
180
181 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
182 arm_vec_exponent_f16
183 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
184 S->degree),vctp16q(4));
185
186 pDualCoef += 4;
187
188 pSrcA += numCols * 4;
189 /*
190 * Decrement the row loop counter
191 */
192 row -= 4;
193 }
194
195 /*
196 * compute 2 rows in parrallel
197 */
198 if (row >= 2) {
199 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
200 f16x8_t vecIn, acc0, acc1;
201 float16_t const *pSrcVecPtr = in;
202
203 /*
204 * Initialize the pointers to 2 consecutive MatrixA rows
205 */
206 pInA0 = pSrcA;
207 pInA1 = pInA0 + numCols;
208 /*
209 * Initialize the vector pointer
210 */
211 pInVec = pSrcVecPtr;
212 /*
213 * reset accumulators
214 */
215 acc0 = vdupq_n_f16(0.0f);
216 acc1 = vdupq_n_f16(0.0f);
217 pSrcA0Vec = pInA0;
218 pSrcA1Vec = pInA1;
219
220 blkCnt = numCols >> 3;
221 while (blkCnt > 0U) {
222 f16x8_t vecA;
223
224 vecIn = vld1q(pInVec);
225 pInVec += 8;
226 vecA = vld1q(pSrcA0Vec);
227 pSrcA0Vec += 8;
228 acc0 = vfmaq(acc0, vecIn, vecA);
229 vecA = vld1q(pSrcA1Vec);
230 pSrcA1Vec += 8;
231 acc1 = vfmaq(acc1, vecIn, vecA);
232
233 blkCnt--;
234 }
235 /*
236 * tail
237 * (will be merged thru tail predication)
238 */
239 blkCnt = numCols & 7;
240 if (blkCnt > 0U) {
241 mve_pred16_t p0 = vctp16q(blkCnt);
242 f16x8_t vecA;
243
244 vecIn = vldrhq_z_f16(pInVec, p0);
245 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
246 acc0 = vfmaq(acc0, vecIn, vecA);
247 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
248 acc1 = vfmaq(acc1, vecIn, vecA);
249 }
250 /*
251 * Sum the partial parts
252 */
253 f16x8_t vtmp = vuninitializedq_f16();
254 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
255 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
256
257 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
258 arm_vec_exponent_f16
259 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
260 vctp16q(2));
261
262 pDualCoef += 2;
263 pSrcA += numCols * 2;
264 row -= 2;
265 }
266
267 if (row >= 1) {
268 f16x8_t vecIn, acc0;
269 float16_t const *pSrcA0Vec, *pInVec;
270 float16_t const *pSrcVecPtr = in;
271 /*
272 * Initialize the pointers to last MatrixA row
273 */
274 pInA0 = pSrcA;
275 /*
276 * Initialize the vector pointer
277 */
278 pInVec = pSrcVecPtr;
279 /*
280 * reset accumulators
281 */
282 acc0 = vdupq_n_f16(0.0f);
283
284 pSrcA0Vec = pInA0;
285
286 blkCnt = numCols >> 3;
287 while (blkCnt > 0U) {
288 f16x8_t vecA;
289
290 vecIn = vld1q(pInVec);
291 pInVec += 8;
292 vecA = vld1q(pSrcA0Vec);
293 pSrcA0Vec += 8;
294 acc0 = vfmaq(acc0, vecIn, vecA);
295
296 blkCnt--;
297 }
298 /*
299 * tail
300 * (will be merged thru tail predication)
301 */
302 blkCnt = numCols & 7;
303 if (blkCnt > 0U) {
304 mve_pred16_t p0 = vctp16q(blkCnt);
305 f16x8_t vecA;
306
307 vecIn = vldrhq_z_f16(pInVec, p0);
308 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
309 acc0 = vfmaq(acc0, vecIn, vecA);
310 }
311 /*
312 * Sum the partial parts
313 */
314 f16x8_t vtmp = vuninitializedq_f16();
315 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
316 vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
317 arm_vec_exponent_f16
318 (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
319 vctp16q(1));
320 }
321 sum += (_Float16)vecAddAcrossF16Mve(vSum);
322
323
324 *pResult = S->classes[STEP(sum)];
325 }
326
327 #else
328
329
330 /**
331 * @brief SVM polynomial prediction
332 * @param[in] S Pointer to an instance of the polynomial SVM structure.
333 * @param[in] in Pointer to input vector
334 * @param[out] pResult Decision value
335 *
336 */
arm_svm_polynomial_predict_f16(const arm_svm_polynomial_instance_f16 * S,const float16_t * in,int32_t * pResult)337 ARM_DSP_ATTRIBUTE void arm_svm_polynomial_predict_f16(
338 const arm_svm_polynomial_instance_f16 *S,
339 const float16_t * in,
340 int32_t * pResult)
341 {
342 _Float16 sum=S->intercept;
343 _Float16 dot=0;
344 uint32_t i,j;
345 const float16_t *pSupport = S->supportVectors;
346
347 for(i=0; i < S->nbOfSupportVectors; i++)
348 {
349 dot=0;
350 for(j=0; j < S->vectorDimension; j++)
351 {
352 dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
353 }
354 sum += (_Float16)S->dualCoefficients[i] * (_Float16)arm_exponent_f16((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0, S->degree);
355 }
356
357 *pResult=S->classes[STEP(sum)];
358 }
359 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
360
361
362 /**
363 * @} end of polysvm group
364 */
365
366 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
367
368