1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_sigmoid_predict_f16.c
4 * Description: SVM Sigmoid Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 #include <limits.h>
34 #include <math.h>
35
36 /**
37 * @addtogroup sigmoidsvm
38 * @{
39 */
40
41
42
43 /**
44 * @brief SVM sigmoid prediction
45 * @param[in] S Pointer to an instance of the sigmoid SVM structure.
46 * @param[in] in Pointer to input vector
47 * @param[out] pResult Decision value
48 *
49 */
50
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52
53 #include "arm_helium_utils.h"
54 #include "arm_vec_math_f16.h"
55
/*
 * Helium (MVE) half-precision implementation of the sigmoid-kernel SVM
 * prediction.
 *
 * For each support vector (one row of S->supportVectors, S->vectorDimension
 * elements long) the dot product with the input vector is computed, mapped
 * through tanh(gamma * dot + coef0), multiplied by the matching dual
 * coefficient and accumulated. The accumulated value is added to
 * S->intercept, and STEP() of that sum (macro defined outside this file)
 * indexes S->classes to produce the result.
 *
 * Rows are processed four at a time, then a remaining pair, then a remaining
 * single row. The dual coefficient pointer is advanced after every group so
 * that each row is always paired with its own coefficient.
 *
 * S        sigmoid SVM instance (support vectors, dual coefficients,
 *          intercept, gamma, coef0, classes)
 * in       input vector, S->vectorDimension elements are read
 * pResult  receives the selected entry of S->classes
 */
void arm_svm_sigmoid_predict_f16(
    const arm_svm_sigmoid_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    /* inlined Matrix x Vector function interleaved with dot prod */
    uint32_t        numRows = S->nbOfSupportVectors;
    uint32_t        numCols = S->vectorDimension;
    const float16_t *pSupport = S->supportVectors;
    const float16_t *pSrcA = pSupport;
    const float16_t *pInA0;
    const float16_t *pInA1;
    uint32_t         row;
    uint32_t         blkCnt;     /* loop counters */
    const float16_t *pDualCoef = S->dualCoefficients;
    _Float16         sum = S->intercept;
    f16x8_t          vSum = vdupq_n_f16(0.0f);

    row = numRows;

    /*
     * compute 4 rows in parallel
     */
    while (row >= 4) {
        const float16_t *pInA2, *pInA3;
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
        float16_t const *pSrcVecPtr = in;
        /* Only the 4 lanes holding the row results take part in the update */
        mve_pred16_t    pRows = vctp16q(4);

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        acc2 = vdupq_n_f16(0.0f);
        acc3 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* Full vectors: 8 half-precision elements per iteration */
        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 8;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 8;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;

            /* Zeroing loads: lanes past the row end contribute 0 to the sums */
            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        f16x8_t vtmp = vuninitializedq_f16();
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);

        /*
         * vSum += dualCoef * tanh(gamma * dot + coef0) on the 4 active lanes.
         * The coefficients are loaded with the same predicate so that no
         * element beyond the 4 consumed here is read.
         */
        vSum =
            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, pRows),
                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
                        pRows);

        pDualCoef += 4;

        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * compute 2 rows in parallel
     */
    if (row >= 2) {
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f16x8_t         vecIn, acc0, acc1;
        float16_t const *pSrcVecPtr = in;
        mve_pred16_t    pRows = vctp16q(2);

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        f16x8_t vtmp = vuninitializedq_f16();
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);

        vSum =
            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, pRows),
                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
                        pRows);

        /*
         * Two coefficients were consumed: step past them so that a remaining
         * single row below is paired with its own dual coefficient.
         */
        pDualCoef += 2;
        pSrcA += numCols * 2;
        row -= 2;
    }

    /*
     * compute the last remaining row
     */
    if (row >= 1) {
        f16x8_t         vecIn, acc0;
        float16_t const *pSrcA0Vec, *pInVec;
        float16_t const *pSrcVecPtr = in;
        mve_pred16_t    pRows = vctp16q(1);

        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * tail
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        f16x8_t vtmp = vuninitializedq_f16();
        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);

        vSum =
            vfmaq_m_f16(vSum, vldrhq_z_f16(pDualCoef, pRows),
                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
                        pRows);
    }

    /* Fold the per-lane weighted kernel values into the intercept */
    sum += (_Float16)vecAddAcrossF16Mve(vSum);

    *pResult = S->classes[STEP(sum)];
}
301
302 #else
/*
 * Scalar half-precision implementation of the sigmoid-kernel SVM prediction.
 *
 * Starting from S->intercept, adds for every support vector the term
 * dualCoefficient * tanh(gamma * dot(in, supportVector) + coef0), then stores
 * S->classes[STEP(sum)] in *pResult (STEP is a macro defined outside this
 * file).
 *
 * S        sigmoid SVM instance
 * in       input vector, S->vectorDimension elements are read
 * pResult  receives the selected entry of S->classes
 */
void arm_svm_sigmoid_predict_f16(
    const arm_svm_sigmoid_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    const uint32_t   nbVectors = S->nbOfSupportVectors;
    const uint32_t   dimension = S->vectorDimension;
    const float16_t *pCoef = S->dualCoefficients;
    const float16_t *pVector = S->supportVectors;   /* start of the current support vector */
    _Float16         decision = S->intercept;
    uint32_t         vecIdx;

    for (vecIdx = 0U; vecIdx < nbVectors; vecIdx++, pVector += dimension)
    {
        _Float16  dotProd = 0.0f16;
        float32_t kernelArg;
        uint32_t  k;

        /* Dot product between the input and the current support vector */
        for (k = 0U; k < dimension; k++)
        {
            dotProd = (_Float16)dotProd + (_Float16)in[k] * (_Float16)pVector[k];
        }

        /* tanhf works on single precision, so the argument is widened first */
        kernelArg = (float32_t)((_Float16)S->gamma * (_Float16)dotProd + (_Float16)S->coef0);
        decision += (_Float16)pCoef[vecIdx] * (_Float16)tanhf(kernelArg);
    }

    *pResult = S->classes[STEP(decision)];
}
324
325 #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
326
327 /**
328 * @} end of sigmoidsvm group
329 */
330
331 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
332
333