1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_max_f16.c
4  * Description:  Maximum value of a floating-point vector
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
34 #include <limits.h>
35 #endif
36 
37 /**
38   @ingroup groupStats
39  */
40 
41 
42 /**
43   @addtogroup Max
44   @{
45  */
46 
47 /**
48   @brief         Maximum value of a floating-point vector.
49   @param[in]     pSrc       points to the input vector
50   @param[in]     blockSize  number of samples in input vector
51   @param[out]    pResult    maximum value returned here
52   @param[out]    pIndex     index of maximum value returned here
53  */
54 
55 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
56 
arm_max_f16(const float16_t * pSrc,uint32_t blockSize,float16_t * pResult,uint32_t * pIndex)57 ARM_DSP_ATTRIBUTE void arm_max_f16(
58   const float16_t * pSrc,
59   uint32_t blockSize,
60   float16_t * pResult,
61   uint32_t * pIndex)
62 {
63      int32_t blkCnt;
64     f16x8_t vecSrc;
65     f16x8_t curExtremValVec = vdupq_n_f16(F16_MIN);
66     float16_t maxValue = F16_MIN;
67     uint32_t idx = blockSize;
68     uint16x8_t indexVec;
69     uint16x8_t curExtremIdxVec;
70     uint32_t curIdx = 0;
71     mve_pred16_t p0;
72     float16_t tmp;
73 
74 
75     indexVec = vidupq_wb_u16(&curIdx, 1);
76     curExtremIdxVec = vdupq_n_u16(0);
77 
78     /* Compute 4 outputs at a time */
79     blkCnt = blockSize >> 3;
80     while (blkCnt > 0)
81     {
82         vecSrc = vldrhq_f16(pSrc);
83         /*
84          * Get current max per lane and current index per lane
85          * when a max is selected
86          */
87         p0 = vcmpgeq(vecSrc, curExtremValVec);
88         curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
89         curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
90 
91         indexVec = vidupq_wb_u16(&curIdx, 1);
92 
93         pSrc += 8;
94         /* Decrement the loop counter */
95         blkCnt--;
96     }
97 
98 
99     /*
100      * Get max value across the vector
101      */
102     maxValue = vmaxnmvq(maxValue, curExtremValVec);
103     /*
104      * set index for lower values to max possible index
105      */
106     p0 = vcmpgeq(curExtremValVec, maxValue);
107     indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
108     /*
109      * Get min index which is thus for a max value
110      */
111     idx = vminvq(idx, indexVec);
112 
113     /* Tail */
114     blkCnt = blockSize & 7;
115 
116     while (blkCnt > 0)
117     {
118       /* Initialize tmp to the next consecutive values one by one */
119       tmp = *pSrc++;
120 
121       /* compare for the maximum value */
122       if ((_Float16)maxValue < (_Float16)tmp)
123       {
124         /* Update the maximum value and it's index */
125         maxValue = tmp;
126         idx = blockSize - blkCnt;
127       }
128 
129       /* Decrement loop counter */
130       blkCnt--;
131     }
132 
133     /*
134      * Save result
135      */
136     *pIndex = idx;
137     *pResult = maxValue;
138 }
139 
140 #else
arm_max_f16(const float16_t * pSrc,uint32_t blockSize,float16_t * pResult,uint32_t * pIndex)141 ARM_DSP_ATTRIBUTE void arm_max_f16(
142   const float16_t * pSrc,
143         uint32_t blockSize,
144         float16_t * pResult,
145         uint32_t * pIndex)
146 {
147         float16_t maxVal, out;                         /* Temporary variables to store the output value. */
148         uint32_t blkCnt, outIndex;                     /* Loop counter */
149 
150 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
151         uint32_t index;                                /* index of maximum value */
152 #endif
153 
154   /* Initialise index value to zero. */
155   outIndex = 0U;
156 
157   /* Load first input value that act as reference value for comparision */
158   out = *pSrc++;
159 
160 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
161   /* Initialise index of maximum value. */
162   index = 0U;
163 
164   /* Loop unrolling: Compute 4 outputs at a time */
165   blkCnt = (blockSize - 1U) >> 2U;
166 
167   while (blkCnt > 0U)
168   {
169     /* Initialize maxVal to next consecutive values one by one */
170     maxVal = *pSrc++;
171 
172     /* compare for the maximum value */
173     if ((_Float16)out < (_Float16)maxVal)
174     {
175       /* Update the maximum value and it's index */
176       out = maxVal;
177       outIndex = index + 1U;
178     }
179 
180     maxVal = *pSrc++;
181     if ((_Float16)out < (_Float16)maxVal)
182     {
183       out = maxVal;
184       outIndex = index + 2U;
185     }
186 
187     maxVal = *pSrc++;
188     if ((_Float16)out < (_Float16)maxVal)
189     {
190       out = maxVal;
191       outIndex = index + 3U;
192     }
193 
194     maxVal = *pSrc++;
195     if ((_Float16)out < (_Float16)maxVal)
196     {
197       out = maxVal;
198       outIndex = index + 4U;
199     }
200 
201     index += 4U;
202 
203     /* Decrement loop counter */
204     blkCnt--;
205   }
206 
207   /* Loop unrolling: Compute remaining outputs */
208   blkCnt = (blockSize - 1U) % 4U;
209 
210 #else
211 
212   /* Initialize blkCnt with number of samples */
213   blkCnt = (blockSize - 1U);
214 
215 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
216 
217   while (blkCnt > 0U)
218   {
219     /* Initialize maxVal to the next consecutive values one by one */
220     maxVal = *pSrc++;
221 
222     /* compare for the maximum value */
223     if ((_Float16)out < (_Float16)maxVal)
224     {
225       /* Update the maximum value and it's index */
226       out = maxVal;
227       outIndex = blockSize - blkCnt;
228     }
229 
230     /* Decrement loop counter */
231     blkCnt--;
232   }
233 
234   /* Store the maximum value and it's index into destination pointers */
235   *pResult = out;
236   *pIndex = outIndex;
237 }
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
239 
240 /**
241   @} end of Max group
242  */
243 
244 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
245 
246