1 /*
2  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_pool_q7_HWC.c
22  * Description:  Pooling function implementations
23  *
24  * $Date:        09. October 2020
25  * $Revision:    V.1.0.1
26  *
27  * Target Processor:  Cortex-M cores
28  *
29  * -------------------------------------------------------------------- */
30 
31 #include "arm_nnfunctions.h"
32 #include "arm_nnsupportfunctions.h"
33 
34 #if defined(ARM_MATH_DSP)
35 
36 /**
37  * @brief A few utility functions used by pooling functions
38  *
39  *
40  */
41 
/* Narrow a q15 accumulator buffer back to q7, dividing every element by
 * `scale` (the number of samples that were accumulated into it). */
static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale)
{
    uint16_t idx;

    for (idx = 0U; idx < length; idx++)
    {
        target[idx] = (q7_t)(buffer[idx] / scale);
    }
}
51 
/* Element-wise in-place maximum: base[i] = max(base[i], target[i]) for
 * `length` q7 elements.  The main loop handles four lanes per iteration
 * via word loads; the final 0-3 elements are handled one at a time. */
static void compare_and_replace_if_larger_q7(q7_t *base,           // base data
                                             const q7_t *target,   // compare target
                                             const uint16_t length // data length
)
{
    q7_t *pDst = base;
    const q7_t *pSrc = target;
    union arm_nnword cur;
    union arm_nnword cand;
    uint16_t blkCnt = length >> 2;
    uint16_t tailCnt = length & 0x3;

    while (blkCnt > 0u)
    {
        cur.word = arm_nn_read_q7x4((const q7_t *)pDst);
        cand.word = arm_nn_read_q7x4_ia((const q7_t **)&pSrc);

        /* keep the larger byte in each of the four lanes */
        if (cand.bytes[0] > cur.bytes[0])
            cur.bytes[0] = cand.bytes[0];
        if (cand.bytes[1] > cur.bytes[1])
            cur.bytes[1] = cand.bytes[1];
        if (cand.bytes[2] > cur.bytes[2])
            cur.bytes[2] = cand.bytes[2];
        if (cand.bytes[3] > cur.bytes[3])
            cur.bytes[3] = cand.bytes[3];

        *__SIMD32(pDst)++ = cur.word;

        blkCnt--;
    }

    /* scalar tail for the leftover 0-3 elements */
    while (tailCnt > 0u)
    {
        if (*pSrc > *pDst)
        {
            *pDst = *pSrc;
        }
        pDst++;
        pSrc++;
        tailCnt--;
    }
}
95 
/* Accumulate `length` q7 elements from `target` into the q15 accumulator
 * `base` (base[i] += target[i]).  The SIMD path sign-extends four q7
 * values into two q15x2 words and adds them with saturating __QADD16;
 * the 0-3 leftover elements are added with a plain, non-saturating +=. */
static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length)
{
    q15_t *pCnt = base; /* running pointer into the q15 accumulator */
    q7_t *pV = target;  /* running pointer into the q7 source */
    q31_t v1, v2, vo1, vo2;
    uint16_t cnt = length >> 2; /* number of 4-element groups */
    q31_t in;

    while (cnt > 0u)
    {
        /* load 4 q7 values; __SXTB16 sign-extends bytes 0/2, and after
         * an 8-bit rotate, bytes 1/3 */
        q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV);
        v1 = __SXTB16(__ROR(value, 8));
        v2 = __SXTB16(value);
#ifndef ARM_MATH_BIG_ENDIAN

        /* repack so vo1/vo2 hold the elements in memory order */
        vo2 = __PKHTB(v1, v2, 16);
        vo1 = __PKHBT(v2, v1, 16);

#else

        vo1 = __PKHTB(v1, v2, 16);
        vo2 = __PKHBT(v2, v1, 16);

#endif

        /* saturating add of two q15 pairs into the accumulator */
        in = arm_nn_read_q15x2(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo1, in);

        in = arm_nn_read_q15x2(pCnt);
        *__SIMD32(pCnt)++ = __QADD16(vo2, in);

        cnt--;
    }
    cnt = length & 0x3; /* remaining 0-3 elements, scalar tail */
    while (cnt > 0u)
    {
        *pCnt++ += *pV++;
        cnt--;
    }
}
136 
137 #endif // ARM_MATH_DSP
138 
139 /**
140  *  @ingroup groupNN
141  */
142 
143 /**
144  * @addtogroup Pooling
145  * @{
146  */
147 
148 /**
149  * @brief Q7 max pooling function
150  * @param[in, out]  Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
152  * @param[in]       ch_im_in    number of input tensor channels
153  * @param[in]       dim_kernel  filter kernel size
154  * @param[in]       padding     padding sizes
155  * @param[in]       stride      convolution stride
156  * @param[in]       dim_im_out  output tensor dimension
157  * @param[in,out]   bufferA     Not used
158  * @param[in,out]   Im_out      pointer to output tensor
159  *
160  * @details
161  *
162  * The pooling function is implemented as split x-pooling then
163  * y-pooling.
164  *
165  * This pooling function is input-destructive. Input data is undefined
166  * after calling this function.
167  *
168  */
169 
void arm_maxpool_q7_HWC(q7_t *Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t *bufferA,
                        q7_t *Im_out)
{
    (void)bufferA; /* no scratch buffer is needed for max pooling */
#if defined(ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    int16_t i_x, i_y;

    /* First pass: pool along the x axis, writing per-row maxima back
     * into Im_in — this is what makes the function input-destructive. */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel: target is where the x-pooled result lands */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            /* clip the kernel window against the left edge (padding region) */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            }
            else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* clip the kernel window against the right edge */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            }
            else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data; memmove because
             * target and the window may overlap inside Im_in */
            /* arm_copy_q7(win_start, target, ch_im_in); */
            memmove(target, win_start, ch_im_in);

            /* start the max operation from the second part */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
            }
        }
    }

    /* Second pass: pool along the y axis, reading the x-pooled rows
     * from Im_in and writing the final result to Im_out. */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {

        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* setting the starting row (clipped against the top padding) */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        }
        else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row (clipped against the bottom edge) */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        }
        else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row */
        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
        memmove(target, row_start, dim_im_out * ch_im_in);

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
        }
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                /* start below the q7 minimum (-128) so any in-range sample wins */
                int max = -129;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        /* skip taps that fall into the padding region */
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }

#endif /* ARM_MATH_DSP */
}
299 
300 /**
301  * @brief Q7 average pooling function
302  * @param[in,out]   Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
304  * @param[in]       ch_im_in    number of input tensor channels
305  * @param[in]       dim_kernel  filter kernel size
306  * @param[in]       padding     padding sizes
307  * @param[in]       stride      convolution stride
308  * @param[in]       dim_im_out  output tensor dimension
309  * @param[in,out]   bufferA     pointer to buffer space for input
310  * @param[in,out]   Im_out      pointer to output tensor
311  *
312  * @details
313  *
314  * <b>Buffer size:</b>
315  *
316  * bufferA size:  2*dim_im_out*ch_im_in
317  *
318  * The pooling function is implemented as split x-pooling then
319  * y-pooling.
320  *
321  * This pooling function is input-destructive. Input data is undefined
322  * after calling this function.
323  *
324  */
325 
void arm_avepool_q7_HWC(q7_t *Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t *bufferA,
                        q7_t *Im_out)
{

#if defined(ARM_MATH_DSP)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /* bufferA holds one pooled row widened to q15 so sums do not overflow
     * q7; per the function doc it must be 2*dim_im_out*ch_im_in bytes */
    q15_t *buffer = (q15_t *)bufferA;
    int16_t i_x, i_y;
    int16_t count = 0; /* samples accumulated in the window; used as divisor */

    /* First pass: average along the x axis, writing the results back
     * into Im_in — this is what makes the function input-destructive. */
    for (i_y = 0; i_y < dim_im_in; i_y++)
    {

        for (i_x = 0; i_x < dim_im_out; i_x++)
        {
            /* for each output pixel */
            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
            q7_t *win_start;
            q7_t *win_stop;
            /* clip the kernel window against the left edge (padding region) */
            if (i_x * stride - padding < 0)
            {
                win_start = target;
            }
            else
            {
                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
            }

            /* clip the kernel window against the right edge */
            if (i_x * stride - padding + dim_kernel >= dim_im_in)
            {
                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
            }
            else
            {
                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
            }

            /* first step is to copy over initial data, widened to q15 */
            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
            count = 1;

            /* accumulate the remaining columns of the window */
            win_start += ch_im_in;
            for (; win_start < win_stop; win_start += ch_im_in)
            {
                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                count++;
            }
            /* divide by the (clipped) window width and narrow back to q7 */
            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
        }
    }

    /* Second pass: average along the y axis, reading the x-pooled rows
     * from Im_in and writing the final result to Im_out. */
    for (i_y = 0; i_y < dim_im_out; i_y++)
    {
        /* for each output row */
        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
        q7_t *row_start;
        q7_t *row_end;
        /* setting the starting row (clipped against the top padding) */
        if (i_y * stride - padding < 0)
        {
            row_start = Im_in;
        }
        else
        {
            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
        }
        /* setting the stopping row (clipped against the bottom edge) */
        if (i_y * stride - padding + dim_kernel >= dim_im_in)
        {
            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
        }
        else
        {
            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
        }

        /* copy over the first row, widened to q15 */
        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
        count = 1;

        /* move over to next row */
        row_start += ch_im_in * dim_im_in;

        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
        {
            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
            count++;
        }
        /* divide by the (clipped) window height and narrow back to q7 */
        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
    }

#else
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    (void)bufferA; /* scratch buffer unused by the reference path */
    int16_t i_ch_in, i_x, i_y;
    int16_t k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                int sum = 0;
                int count = 0;
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        /* only in-bounds taps contribute to sum and count */
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            count++;
                        }
                    }
                }
                /* average over the valid (non-padding) taps only */
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
            }
        }
    }

#endif /* ARM_MATH_DSP */
}
461 
462 /**
463  * @} end of Pooling group
464  */
465