1 /*
2 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 /* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_pool_q7_HWC.c
22 * Description: Pooling function implementations
23 *
24 * $Date: 09. October 2020
25 * $Revision: V.1.0.1
26 *
27 * Target Processor: Cortex-M cores
28 *
29 * -------------------------------------------------------------------- */
30
31 #include "arm_nnfunctions.h"
32 #include "arm_nnsupportfunctions.h"
33
34 #if defined(ARM_MATH_DSP)
35
36 /**
37 * @brief A few utility functions used by pooling functions
38 *
39 *
40 */
41
/**
 * @brief Scale a q15 accumulator buffer back down to q7.
 *
 * Each output element is target[i] = buffer[i] / scale using plain
 * (truncating) integer division — used by the average-pool kernels to
 * divide the window sum by the window element count.
 *
 * @param[in]  buffer  q15 accumulator values (read-only here)
 * @param[out] target  q7 destination buffer
 * @param[in]  length  number of elements to convert
 * @param[in]  scale   divisor (pooling window element count)
 */
static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale)
{
    const q15_t *src = buffer;
    q7_t *dst = target;
    uint16_t remaining = length;

    while (remaining > 0u)
    {
        *dst++ = (q7_t)(*src++ / scale);
        remaining--;
    }
}
51
/**
 * @brief In-place element-wise maximum over two q7 vectors:
 *        base[i] = max(base[i], target[i]) for i in [0, length).
 *
 * Processes four q7 elements per iteration via 32-bit word accesses,
 * then handles the remaining 0..3 elements scalar-wise.
 *
 * @param[in,out] base    data that is compared and overwritten with the maxima
 * @param[in]     target  data compared against base (read-only)
 * @param[in]     length  number of q7 elements to process
 */
static void compare_and_replace_if_larger_q7(q7_t *base,           // base data
                                             const q7_t *target,   // compare target
                                             const uint16_t length // data length
)
{
    q7_t *pIn = base;
    const q7_t *pCom = target;
    union arm_nnword in;
    union arm_nnword com;
    uint16_t cnt = length >> 2; /* number of 4-element word iterations */

    while (cnt > 0u)
    {
        /* pIn is read without post-increment here because the __SIMD32
           store below advances it; pCom uses the increment-after read. */
        in.word = arm_nn_read_q7x4((const q7_t *)pIn);
        com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom);

        // if version
        if (com.bytes[0] > in.bytes[0])
            in.bytes[0] = com.bytes[0];
        if (com.bytes[1] > in.bytes[1])
            in.bytes[1] = com.bytes[1];
        if (com.bytes[2] > in.bytes[2])
            in.bytes[2] = com.bytes[2];
        if (com.bytes[3] > in.bytes[3])
            in.bytes[3] = com.bytes[3];

        /* Write the four maxima back and advance pIn by one word. */
        *__SIMD32(pIn)++ = in.word;

        cnt--;
    }

    /* Scalar tail: remaining length % 4 elements. */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        if (*pCom > *pIn)
        {
            *pIn = *pCom;
        }
        pIn++;
        pCom++;
        cnt--;
    }
}
95
/**
 * @brief Accumulate a q7 vector into a q15 accumulator buffer:
 *        base[i] += target[i] for i in [0, length).
 *
 * The vectorized path widens four packed q7 values to q15 pairs and adds
 * them with saturating 16-bit SIMD additions; the scalar tail uses a
 * plain (non-saturating) C addition.
 *
 * @param[in,out] base    q15 accumulators
 * @param[in]     target  q7 values to add (pointer is not const in the
 *                        original signature, but the data is only read)
 * @param[in]     length  number of elements to accumulate
 */
static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length)
{
    q15_t *pCnt = base;
    q7_t *pV = target;
    q31_t v1, v2, vo1, vo2;
    uint16_t cnt = length >> 2; /* number of 4-element word iterations */
    q31_t in;

    while (cnt > 0u)
    {
        q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV);
        /* __SXTB16 sign-extends bytes 0 and 2 of its operand into two
           q15 halfwords; rotating by 8 first picks up bytes 1 and 3. */
        v1 = __SXTB16(__ROR(value, 8));
        v2 = __SXTB16(value);
#ifndef ARM_MATH_BIG_ENDIAN

        /* Repack so vo1/vo2 hold the elements in original memory order. */
        vo2 = __PKHTB(v1, v2, 16);
        vo1 = __PKHBT(v2, v1, 16);

#else

        vo1 = __PKHTB(v1, v2, 16);
        vo2 = __PKHBT(v2, v1, 16);

#endif

        /* Saturating add of two q15 accumulators at a time; the __SIMD32
           store post-increments pCnt. */
        in = arm_nn_read_q15x2(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo1, in);

        in = arm_nn_read_q15x2(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo2, in);

        cnt--;
    }
    /* Scalar tail: remaining length % 4 elements. */
    cnt = length & 0x3;
    while (cnt > 0u)
    {
        *pCnt++ += *pV++;
        cnt--;
    }
}
136
137 #endif // ARM_MATH_DSP
138
139 /**
140 * @ingroup groupNN
141 */
142
143 /**
144 * @addtogroup Pooling
145 * @{
146 */
147
148 /**
149 * @brief Q7 max pooling function
150 * @param[in, out] Im_in pointer to input tensor
 151  * @param[in]       dim_im_in    input tensor dimension
152 * @param[in] ch_im_in number of input tensor channels
153 * @param[in] dim_kernel filter kernel size
154 * @param[in] padding padding sizes
155 * @param[in] stride convolution stride
156 * @param[in] dim_im_out output tensor dimension
157 * @param[in,out] bufferA Not used
158 * @param[in,out] Im_out pointer to output tensor
159 *
160 * @details
161 *
162 * The pooling function is implemented as split x-pooling then
163 * y-pooling.
164 *
165 * This pooling function is input-destructive. Input data is undefined
166 * after calling this function.
167 *
168 */
169
void arm_maxpool_q7_HWC(q7_t *Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t *bufferA,
                        q7_t *Im_out)
{
    (void)bufferA;
#if defined(ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    int16_t i_x, i_y;

    /* First pass: pool along the x axis. Results are written back into
       Im_in (packed at the start of each input row), which is why this
       function is input-destructive. */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            /* Clamp the pooling window to the left image border
               (positions in the padding area contribute nothing). */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            }
            else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* Clamp the pooling window to the right image border. */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            }
            else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data */
            /* arm_copy_q7(win_start, target, ch_im_in); */
            /* memmove, not memcpy: target and win_start may overlap. */
            memmove(target, win_start, ch_im_in);

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
            }
        }
    }

    /* Second pass: pool the x-pooled rows along the y axis, writing the
       final result into Im_out. */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {

        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* setting the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        }
        else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        }
        else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
        memmove(target, row_start, dim_im_out * ch_im_in);

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                /* -129 is below the q7 minimum (-128), so any in-range
                   value replaces it on the first valid position. */
                int max = -129;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        /* Skip window positions that fall in the padding. */
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }

#endif /* ARM_MATH_DSP */
}
299
300 /**
301 * @brief Q7 average pooling function
302 * @param[in,out] Im_in pointer to input tensor
 303  * @param[in]       dim_im_in    input tensor dimension
304 * @param[in] ch_im_in number of input tensor channels
305 * @param[in] dim_kernel filter kernel size
306 * @param[in] padding padding sizes
307 * @param[in] stride convolution stride
308 * @param[in] dim_im_out output tensor dimension
309 * @param[in,out] bufferA pointer to buffer space for input
310 * @param[in,out] Im_out pointer to output tensor
311 *
312 * @details
313 *
314 * <b>Buffer size:</b>
315 *
316 * bufferA size: 2*dim_im_out*ch_im_in
317 *
318 * The pooling function is implemented as split x-pooling then
319 * y-pooling.
320 *
321 * This pooling function is input-destructive. Input data is undefined
322 * after calling this function.
323 *
324 */
325
void arm_avepool_q7_HWC(q7_t *Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t *bufferA,
                        q7_t *Im_out)
{

#if defined(ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* bufferA holds q15 accumulators; the caller must provide
       2 * dim_im_out * ch_im_in bytes (see the doxygen header). */
    q15_t *buffer = (q15_t *)bufferA;
    int16_t i_x, i_y;
    int16_t count = 0; /* number of window positions summed (the divisor) */

    /* First pass: average along the x axis. Results are written back
       into Im_in (packed at the start of each input row), which is why
       this function is input-destructive. */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            /* Clamp the pooling window to the left image border. */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            }
            else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* Clamp the pooling window to the right image border. */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            }
            else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data */
            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
            count = 1;

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                count++;
            }
            /* Divide the accumulated sums by the window count. */
            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
        }
    }

    /* Second pass: average the x-pooled rows along the y axis, writing
       the final result into Im_out. */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {
        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* setting the starting row */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        }
        else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        }
        else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
        count = 1;

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
            count++;
        }
        /* Divide the accumulated sums by the window count. */
        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    (void)bufferA;
    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int sum = 0;
                int count = 0;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        /* Only in-image positions contribute to the sum
                           and the divisor (padding is excluded). */
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            count++;
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
            }
        }
    }

#endif /* ARM_MATH_DSP */
}
461
462 /**
463 * @} end of Pooling group
464 */
465