1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_max_f32.c
4  * Description:  Maximum value of a floating-point vector
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions.h"
30 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
31 #include <limits.h>
32 #endif
33 
34 /**
35   @ingroup groupStats
36  */
37 
38 /**
39   @defgroup Max Maximum
40 
41   Computes the maximum value of an array of data.
42   The function returns both the maximum value and its position within the array.
43   There are separate functions for floating-point, Q31, Q15, and Q7 data types.
44  */
45 
46 /**
47   @addtogroup Max
48   @{
49  */
50 
51 /**
52   @brief         Maximum value of a floating-point vector.
53   @param[in]     pSrc       points to the input vector
54   @param[in]     blockSize  number of samples in input vector
55   @param[out]    pResult    maximum value returned here
56   @param[out]    pIndex     index of maximum value returned here
57   @return        none
58  */
59 
60 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * Helium (MVE) implementation.
 *
 * Four samples are processed per iteration. Each of the four lanes keeps its
 * own running maximum together with the index of the sample that produced it.
 * After the vector loop the lanes are reduced to a single value/index pair,
 * and the remaining (blockSize % 4) samples are handled one by one.
 *
 * Ties are resolved in favour of the lowest index: a lane is only updated by
 * a strictly greater sample, the cross-lane reduction takes the minimum index
 * among the lanes holding the maximum, and the tail loop also compares
 * strictly.
 *
 * The running maximum is seeded with F32_MIN, so a sample that compares below
 * F32_MIN can never be selected. With blockSize == 0 nothing is read and the
 * outputs are F32_MIN and index 0.
 */
void arm_max_f32(
  const float32_t * pSrc,
  uint32_t blockSize,
  float32_t * pResult,
  uint32_t * pIndex)
{
    uint32_t blkCnt;
    f32x4_t vecSrc;
    f32x4_t curExtremValVec = vdupq_n_f32(F32_MIN);  /* per-lane running maximum */
    float32_t maxValue = F32_MIN;
    uint32_t idx = blockSize;                        /* larger than any valid index */
    uint32x4_t indexVec;
    uint32x4_t curExtremIdxVec;                      /* per-lane index of the running maximum */
    uint32_t curIdx = 0;
    mve_pred16_t p0;
    float32_t tmp;

    /* Indices of the first four samples; curIdx is advanced by the intrinsic */
    indexVec = vidupq_wb_u32(&curIdx, 1);
    curExtremIdxVec = vdupq_n_u32(0);

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecSrc = vldrwq_f32(pSrc);

        /*
         * Select the lanes whose new sample is strictly greater than the
         * lane's running maximum. A strict comparison keeps the index of the
         * first occurrence when the same value shows up again in that lane.
         */
        p0 = vcmpgtq(vecSrc, curExtremValVec);
        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);

        /* Indices of the next four samples */
        indexVec = vidupq_wb_u32(&curIdx, 1);

        pSrc += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Reduce the four lanes to the overall maximum value */
    maxValue = vmaxnmvq(maxValue, curExtremValVec);

    /*
     * Keep the recorded index only in lanes that hold the overall maximum;
     * every other lane gets blockSize so that it cannot win the minimum below.
     */
    p0 = vcmpgeq(curExtremValVec, maxValue);
    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);

    /* The smallest surviving index is the first occurrence of the maximum */
    idx = vminvq(idx, indexVec);

    /* Tail: the last (blockSize % 4) samples */
    blkCnt = blockSize & 0x3;

    while (blkCnt > 0U)
    {
      tmp = *pSrc++;

      /* Strict comparison: an equal value found later does not replace the result */
      if (maxValue < tmp)
      {
        maxValue = tmp;
        /* blkCnt samples remain, so the current one sits at blockSize - blkCnt */
        idx = blockSize - blkCnt;
      }

      /* Decrement loop counter */
      blkCnt--;
    }

    /* Save result */
    *pIndex = idx;
    *pResult = maxValue;
}
143 
144 #else
145 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * NEON implementation.
 *
 * Vectors shorter than four samples are scanned one element at a time.
 * Longer vectors are processed four samples per iteration, with each lane
 * tracking its own running maximum and the index that produced it; the lanes
 * are then reduced and the leftover samples are scanned one at a time.
 *
 * All comparisons are strict, so when the maximum occurs more than once the
 * lowest index is reported.
 *
 * An empty vector (blockSize == 0) has no maximum: the function returns
 * without reading pSrc and without writing pResult or pIndex.
 */
void arm_max_f32(
  const float32_t * pSrc,
  uint32_t blockSize,
  float32_t * pResult,
  uint32_t * pIndex)
{
  float32_t sample, out;            /* current sample and running maximum */
  uint32_t blkCnt;                  /* loop counter */
  uint32_t outIndex = 0U;           /* index of the running maximum */

  /* Without this guard (blockSize - 1) below would wrap around and the
     loops would read far beyond the input buffer. */
  if (blockSize == 0U)
  {
      return;
  }

  if (blockSize <= 3)
  {
      /* The first sample is the reference value for the comparisons */
      out = *pSrc++;

      blkCnt = blockSize - 1;

      while (blkCnt > 0U)
      {
        sample = *pSrc++;

        /* compare for the maximum value */
        if (out < sample)
        {
          /* Update the maximum value and its index */
          out = sample;
          outIndex = blockSize - blkCnt;
        }

        /* Decrement the loop counter */
        blkCnt--;
      }
  }
  else
  {
      static const uint32_t firstIdxInit[4] = {0, 1, 2, 3};
      static const uint32_t nextIdxInit[4] = {4, 5, 6, 7};

      /* All-ones index: larger than any index a lane can hold */
      const uint32x4_t noIdx = vdupq_n_u32(0xFFFFFFFFU);
      const uint32x4_t step = vdupq_n_u32(4);

      uint32x4_t candIdx = vld1q_u32(nextIdxInit);   /* indices of the samples being loaded */
      uint32x4_t bestIdx = vld1q_u32(firstIdxInit);  /* per-lane index of the running maximum */
      uint32x4_t mask;
      uint32x2_t bestIdx2;

      float32x4_t bestV, srcV;
      float32x2_t bestV2;

      /* The first four samples seed the per-lane running maxima */
      bestV = vld1q_f32(pSrc);
      pSrc += 4;

      /* Compute 4 outputs at a time */
      blkCnt = (blockSize - 4) >> 2U;

      while (blkCnt > 0U)
      {
        srcV = vld1q_f32(pSrc);
        pSrc += 4;

        /* Lanes where the new sample is strictly greater take its value and index */
        mask = vcgtq_f32(srcV, bestV);
        bestV = vbslq_f32(mask, srcV, bestV);
        bestIdx = vbslq_u32(mask, candIdx, bestIdx);

        candIdx = vaddq_u32(candIdx, step);

        /* Decrement the loop counter */
        blkCnt--;
      }

      /* Horizontal maximum of the four lanes */
      bestV2 = vpmax_f32(vget_low_f32(bestV), vget_high_f32(bestV));
      bestV2 = vpmax_f32(bestV2, bestV2);
      out = vget_lane_f32(bestV2, 0);

      /* Keep the index only in lanes equal to the maximum, then take the
         smallest one so that the first occurrence is reported */
      mask = vceqq_f32(bestV, vdupq_n_f32(out));
      bestIdx = vbslq_u32(mask, bestIdx, noIdx);

      bestIdx2 = vpmin_u32(vget_low_u32(bestIdx), vget_high_u32(bestIdx));
      bestIdx2 = vpmin_u32(bestIdx2, bestIdx2);
      outIndex = vget_lane_u32(bestIdx2, 0);

      /* Samples left over when (blockSize - 4) is not a multiple of 4 */
      blkCnt = (blockSize - 4) % 4U;

      while (blkCnt > 0U)
      {
        sample = *pSrc++;

        /* compare for the maximum value */
        if (out < sample)
        {
          /* Update the maximum value and its index */
          out = sample;
          outIndex = blockSize - blkCnt;
        }

        /* Decrement the loop counter */
        blkCnt--;
      }
  }

  /* Store the maximum value and its index into destination pointers */
  *pResult = out;
  *pIndex = outIndex;
}
262 #else
/*
 * Portable scalar implementation, optionally unrolled by four when
 * ARM_MATH_LOOPUNROLL is defined and ARM_MATH_AUTOVECTORIZE is not.
 *
 * The first sample is taken as the initial maximum and every following
 * sample is compared against it with a strict "less than", so when the
 * maximum occurs more than once the lowest index is reported.
 *
 * An empty vector (blockSize == 0) has no maximum: the function returns
 * without reading pSrc and without writing pResult or pIndex.
 */
void arm_max_f32(
  const float32_t * pSrc,
        uint32_t blockSize,
        float32_t * pResult,
        uint32_t * pIndex)
{
        float32_t maxVal, out;                         /* Current sample and running maximum */
        uint32_t blkCnt, outIndex;                     /* Loop counter and index of the running maximum */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
        uint32_t index;                                /* Index of the last sample already examined */
#endif

  /* Without this guard (blockSize - 1U) below would wrap around and the
     loops would read far beyond the input buffer. */
  if (blockSize == 0U)
  {
    return;
  }

  /* The maximum is initially the first sample, at index zero. */
  outIndex = 0U;

  /* Load first input value that acts as reference value for comparison */
  out = *pSrc++;

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
  /* Sample 0 has been consumed above. */
  index = 0U;

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = (blockSize - 1U) >> 2U;

  while (blkCnt > 0U)
  {
    /* The four samples of this group sit at index + 1 ... index + 4 */
    maxVal = *pSrc++;

    /* compare for the maximum value */
    if (out < maxVal)
    {
      /* Update the maximum value and its index */
      out = maxVal;
      outIndex = index + 1U;
    }

    maxVal = *pSrc++;
    if (out < maxVal)
    {
      out = maxVal;
      outIndex = index + 2U;
    }

    maxVal = *pSrc++;
    if (out < maxVal)
    {
      out = maxVal;
      outIndex = index + 3U;
    }

    maxVal = *pSrc++;
    if (out < maxVal)
    {
      out = maxVal;
      outIndex = index + 4U;
    }

    index += 4U;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = (blockSize - 1U) % 4U;

#else

  /* Every sample after the first one is still to be examined */
  blkCnt = (blockSize - 1U);

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    maxVal = *pSrc++;

    /* compare for the maximum value */
    if (out < maxVal)
    {
      /* Update the maximum value and its index */
      out = maxVal;
      /* blkCnt samples remain, so the current one sits at blockSize - blkCnt */
      outIndex = blockSize - blkCnt;
    }

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Store the maximum value and its index into destination pointers */
  *pResult = out;
  *pIndex = outIndex;
}
360 #endif /* #if defined(ARM_MATH_NEON) */
361 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
362 
363 /**
364   @} end of Max group
365  */
366