1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_svm_sigmoid_predict_f16.c
4 * Description: SVM Sigmoid Classifier
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/svm_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 #include <limits.h>
34 #include <math.h>
35
36 /**
37 * @addtogroup sigmoidsvm
38 * @{
39 */
40
41
42
/**
 * @brief SVM sigmoid prediction
 * @param[in]    S          Pointer to an instance of the sigmoid SVM structure.
 * @param[in]    in         Pointer to input vector
 * @param[out]   pResult    Predicted class (the entry of S->classes selected by the decision value)
 * @return none.
 *
 */
51
52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
53
54 #include "arm_helium_utils.h"
55 #include "arm_vec_math_f16.h"
56
arm_svm_sigmoid_predict_f16(const arm_svm_sigmoid_instance_f16 * S,const float16_t * in,int32_t * pResult)57 void arm_svm_sigmoid_predict_f16(
58 const arm_svm_sigmoid_instance_f16 *S,
59 const float16_t * in,
60 int32_t * pResult)
61 {
62 /* inlined Matrix x Vector function interleaved with dot prod */
63 uint32_t numRows = S->nbOfSupportVectors;
64 uint32_t numCols = S->vectorDimension;
65 const float16_t *pSupport = S->supportVectors;
66 const float16_t *pSrcA = pSupport;
67 const float16_t *pInA0;
68 const float16_t *pInA1;
69 uint32_t row;
70 uint32_t blkCnt; /* loop counters */
71 const float16_t *pDualCoef = S->dualCoefficients;
72 _Float16 sum = S->intercept;
73 f16x8_t vSum = vdupq_n_f16(0.0f);
74
75 row = numRows;
76
77 /*
78 * compute 4 rows in parrallel
79 */
80 while (row >= 4) {
81 const float16_t *pInA2, *pInA3;
82 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
83 f16x8_t vecIn, acc0, acc1, acc2, acc3;
84 float16_t const *pSrcVecPtr = in;
85
86 /*
87 * Initialize the pointers to 4 consecutive MatrixA rows
88 */
89 pInA0 = pSrcA;
90 pInA1 = pInA0 + numCols;
91 pInA2 = pInA1 + numCols;
92 pInA3 = pInA2 + numCols;
93 /*
94 * Initialize the vector pointer
95 */
96 pInVec = pSrcVecPtr;
97 /*
98 * reset accumulators
99 */
100 acc0 = vdupq_n_f16(0.0f);
101 acc1 = vdupq_n_f16(0.0f);
102 acc2 = vdupq_n_f16(0.0f);
103 acc3 = vdupq_n_f16(0.0f);
104
105 pSrcA0Vec = pInA0;
106 pSrcA1Vec = pInA1;
107 pSrcA2Vec = pInA2;
108 pSrcA3Vec = pInA3;
109
110 blkCnt = numCols >> 3;
111 while (blkCnt > 0U) {
112 f16x8_t vecA;
113
114 vecIn = vld1q(pInVec);
115 pInVec += 8;
116 vecA = vld1q(pSrcA0Vec);
117 pSrcA0Vec += 8;
118 acc0 = vfmaq(acc0, vecIn, vecA);
119 vecA = vld1q(pSrcA1Vec);
120 pSrcA1Vec += 8;
121 acc1 = vfmaq(acc1, vecIn, vecA);
122 vecA = vld1q(pSrcA2Vec);
123 pSrcA2Vec += 8;
124 acc2 = vfmaq(acc2, vecIn, vecA);
125 vecA = vld1q(pSrcA3Vec);
126 pSrcA3Vec += 8;
127 acc3 = vfmaq(acc3, vecIn, vecA);
128
129 blkCnt--;
130 }
131 /*
132 * tail
133 * (will be merged thru tail predication)
134 */
135 blkCnt = numCols & 7;
136 if (blkCnt > 0U) {
137 mve_pred16_t p0 = vctp16q(blkCnt);
138 f16x8_t vecA;
139
140 vecIn = vldrhq_z_f16(pInVec, p0);
141 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
142 acc0 = vfmaq(acc0, vecIn, vecA);
143 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
144 acc1 = vfmaq(acc1, vecIn, vecA);
145 vecA = vldrhq_z_f16(pSrcA2Vec, p0);
146 acc2 = vfmaq(acc2, vecIn, vecA);
147 vecA = vldrhq_z_f16(pSrcA3Vec, p0);
148 acc3 = vfmaq(acc3, vecIn, vecA);
149 }
150 /*
151 * Sum the partial parts
152 */
153 f16x8_t vtmp = vuninitializedq_f16();
154 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
155 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
156 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
157 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
158
159 vSum =
160 vfmaq_m_f16(vSum, vld1q(pDualCoef),
161 vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),vctp16q(4));
162
163 pDualCoef += 4;
164
165 pSrcA += numCols * 4;
166 /*
167 * Decrement the row loop counter
168 */
169 row -= 4;
170 }
171
172 /*
173 * compute 2 rows in parrallel
174 */
175 if (row >= 2) {
176 float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
177 f16x8_t vecIn, acc0, acc1;
178 float16_t const *pSrcVecPtr = in;
179
180 /*
181 * Initialize the pointers to 2 consecutive MatrixA rows
182 */
183 pInA0 = pSrcA;
184 pInA1 = pInA0 + numCols;
185 /*
186 * Initialize the vector pointer
187 */
188 pInVec = pSrcVecPtr;
189 /*
190 * reset accumulators
191 */
192 acc0 = vdupq_n_f16(0.0f);
193 acc1 = vdupq_n_f16(0.0f);
194 pSrcA0Vec = pInA0;
195 pSrcA1Vec = pInA1;
196
197 blkCnt = numCols >> 3;
198 while (blkCnt > 0U) {
199 f16x8_t vecA;
200
201 vecIn = vld1q(pInVec);
202 pInVec += 8;
203 vecA = vld1q(pSrcA0Vec);
204 pSrcA0Vec += 8;
205 acc0 = vfmaq(acc0, vecIn, vecA);
206 vecA = vld1q(pSrcA1Vec);
207 pSrcA1Vec += 8;
208 acc1 = vfmaq(acc1, vecIn, vecA);
209
210 blkCnt--;
211 }
212 /*
213 * tail
214 * (will be merged thru tail predication)
215 */
216 blkCnt = numCols & 7;
217 if (blkCnt > 0U) {
218 mve_pred16_t p0 = vctp16q(blkCnt);
219 f16x8_t vecA;
220
221 vecIn = vldrhq_z_f16(pInVec, p0);
222 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
223 acc0 = vfmaq(acc0, vecIn, vecA);
224 vecA = vldrhq_z_f16(pSrcA1Vec, p0);
225 acc1 = vfmaq(acc1, vecIn, vecA);
226 }
227 /*
228 * Sum the partial parts
229 */
230 f16x8_t vtmp = vuninitializedq_f16();
231 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
232 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
233
234 vSum =
235 vfmaq_m_f16(vSum, vld1q(pDualCoef),
236 vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
237 vctp16q(2));
238
239 pSrcA += numCols * 2;
240 row -= 2;
241 }
242
243 if (row >= 1) {
244 f16x8_t vecIn, acc0;
245 float16_t const *pSrcA0Vec, *pInVec;
246 float16_t const *pSrcVecPtr = in;
247 /*
248 * Initialize the pointers to last MatrixA row
249 */
250 pInA0 = pSrcA;
251 /*
252 * Initialize the vector pointer
253 */
254 pInVec = pSrcVecPtr;
255 /*
256 * reset accumulators
257 */
258 acc0 = vdupq_n_f16(0.0f);
259
260 pSrcA0Vec = pInA0;
261
262 blkCnt = numCols >> 3;
263 while (blkCnt > 0U) {
264 f16x8_t vecA;
265
266 vecIn = vld1q(pInVec);
267 pInVec += 8;
268 vecA = vld1q(pSrcA0Vec);
269 pSrcA0Vec += 8;
270 acc0 = vfmaq(acc0, vecIn, vecA);
271
272 blkCnt--;
273 }
274 /*
275 * tail
276 * (will be merged thru tail predication)
277 */
278 blkCnt = numCols & 7;
279 if (blkCnt > 0U) {
280 mve_pred16_t p0 = vctp16q(blkCnt);
281 f16x8_t vecA;
282
283 vecIn = vldrhq_z_f16(pInVec, p0);
284 vecA = vldrhq_z_f16(pSrcA0Vec, p0);
285 acc0 = vfmaq(acc0, vecIn, vecA);
286 }
287 /*
288 * Sum the partial parts
289 */
290 f16x8_t vtmp = vuninitializedq_f16();
291 vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
292
293 vSum =
294 vfmaq_m_f16(vSum, vld1q(pDualCoef),
295 vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
296 vctp16q(1));
297 }
298 sum += (_Float16)vecAddAcrossF16Mve(vSum);
299
300 *pResult = S->classes[STEP(sum)];
301 }
302
303 #else
arm_svm_sigmoid_predict_f16(const arm_svm_sigmoid_instance_f16 * S,const float16_t * in,int32_t * pResult)304 void arm_svm_sigmoid_predict_f16(
305 const arm_svm_sigmoid_instance_f16 *S,
306 const float16_t * in,
307 int32_t * pResult)
308 {
309 _Float16 sum=S->intercept;
310 _Float16 dot=0.0f16;
311 uint32_t i,j;
312 const float16_t *pSupport = S->supportVectors;
313
314 for(i=0; i < S->nbOfSupportVectors; i++)
315 {
316 dot=0.0f16;
317 for(j=0; j < S->vectorDimension; j++)
318 {
319 dot = (_Float16)dot + (_Float16)in[j] * (_Float16)*pSupport++;
320 }
321 sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((float32_t)((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0));
322 }
323 *pResult=S->classes[STEP(sum)];
324 }
325
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
327
328 /**
329 * @} end of sigmoidsvm group
330 */
331
332 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
333
334