/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_svm_linear_predict_f16.c
 * Description:  SVM Linear Classifier
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/svm_functions_f16.h"

#if defined(ARM_FLOAT16_SUPPORTED)

#include <limits.h>
#include <math.h>


/**
 * @addtogroup linearsvm
 * @{
 */


/**
 * @brief SVM linear prediction
 * @param[in]    S          Pointer to an instance of the linear SVM structure.
 * @param[in]    in         Pointer to the input vector.
 * @param[out]   pResult    Decision value (predicted class label taken from S->classes).
 *
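 * @par Example
 * A minimal usage sketch. The support vectors, dual coefficients, intercept and
 * class labels below are illustrative placeholders, not values from a trained
 * model; a real application would obtain them from its training framework
 * (or fill the instance with arm_svm_linear_init_f16()).
 * @code
 * #define NB_SV   2
 * #define VEC_DIM 3
 *
 * static const float16_t supportVectors[NB_SV * VEC_DIM] = {
 *     (float16_t)0.5f, (float16_t)-1.0f, (float16_t) 2.0f,
 *     (float16_t)1.5f, (float16_t) 0.5f, (float16_t)-0.5f
 * };
 * static const float16_t dualCoefficients[NB_SV] = {
 *     (float16_t)0.25f, (float16_t)-0.75f
 * };
 * static const int32_t classes[2] = { -1, 1 };
 *
 * arm_svm_linear_instance_f16 S;
 * float16_t input[VEC_DIM] = { (float16_t)1.0f, (float16_t)0.0f, (float16_t)-1.0f };
 * int32_t predictedClass;
 *
 * S.nbOfSupportVectors = NB_SV;
 * S.vectorDimension    = VEC_DIM;
 * S.intercept          = (float16_t)0.1f;
 * S.supportVectors     = supportVectors;
 * S.dualCoefficients   = dualCoefficients;
 * S.classes            = classes;
 *
 * arm_svm_linear_predict_f16(&S, input, &predictedClass);
 * @endcode
 *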
 */
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"

ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f16(
    const arm_svm_linear_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    /* inlined Matrix x Vector function interleaved with dot prod */
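    /*
     * The decision value accumulated here is
     *   sum = intercept + sum_i dualCoefficients[i] * dot(supportVectors[i], in)
     * and its sign (through the STEP() macro) selects the reported class.
     */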
    uint32_t         numRows = S->nbOfSupportVectors;
    uint32_t         numCols = S->vectorDimension;
    const float16_t *pSupport = S->supportVectors;
    const float16_t *pSrcA = pSupport;
    const float16_t *pInA0;
    const float16_t *pInA1;
    uint32_t         row;
    uint32_t         blkCnt;     /* loop counters */
    const float16_t *pDualCoef = S->dualCoefficients;
    _Float16         sum = S->intercept;
    row = numRows;

    /*
     * Compute 4 rows in parallel
     */
    while (row >= 4)
    {
        const float16_t *pInA2, *pInA3;
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f16x8_t          vecIn, acc0, acc1, acc2, acc3;
        float16_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * Reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        acc2 = vdupq_n_f16(0.0f);
        acc3 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;

        /* Process 8 float16 elements per vector iteration */
        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 8;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 8;
            acc3 = vfmaq(acc3, vecIn, vecA);

            blkCnt--;
        }
        /*
         * Tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t      vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Scale the row accumulators by the dual coefficients and reduce
         */
        acc0 = vmulq_n_f16(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc1, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc2, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc3, *pDualCoef++);

        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }

    /*
     * Compute 2 rows in parallel
     */
    if (row >= 2) {
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f16x8_t          vecIn, acc0, acc1;
        float16_t const *pSrcVecPtr = in;

        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * Reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);

            blkCnt--;
        }
        /*
         * Tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t      vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Scale the row accumulators by the dual coefficients and reduce
         */
        acc0 = vmulq_n_f16(acc0, *pDualCoef++);
        acc0 = vfmaq_n_f16(acc0, acc1, *pDualCoef++);

        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 2;
        row -= 2;
    }

    if (row >= 1) {
        f16x8_t          vecIn, acc0;
        float16_t const *pSrcA0Vec, *pInVec;
        float16_t const *pSrcVecPtr = in;
        /*
         * Initialize the pointers to the last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * Reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);

        pSrcA0Vec = pInA0;

        blkCnt = numCols >> 3;
        while (blkCnt > 0U) {
            f16x8_t vecA;

            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);

            blkCnt--;
        }
        /*
         * Tail
         * (will be merged through tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t      vecA;

            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Scale by the dual coefficient and reduce
         */
        sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);

    }

    *pResult = S->classes[STEP(sum)];
}

#else
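
/*
 * Scalar implementation, used when Helium/MVE float16 vectorization is not
 * enabled: plain nested loops over the support vectors and their components.
 */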
ARM_DSP_ATTRIBUTE void arm_svm_linear_predict_f16(
    const arm_svm_linear_instance_f16 *S,
    const float16_t * in,
    int32_t * pResult)
{
    _Float16 sum = S->intercept;
    _Float16 dot = 0;
    uint32_t i, j;
    const float16_t *pSupport = S->supportVectors;

    for (i = 0; i < S->nbOfSupportVectors; i++)
    {
        /* Dot product between the input vector and the current support vector */
        dot = 0;
        for (j = 0; j < S->vectorDimension; j++)
        {
            dot = (_Float16)dot + (_Float16)in[j] * (_Float16)*pSupport++;
        }
        /* Weight the dot product by the dual coefficient and accumulate */
        sum += (_Float16)S->dualCoefficients[i] * (_Float16)dot;
    }
    *pResult = S->classes[STEP(sum)];
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
 * @} end of linearsvm group
 */

#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
