1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_max_f32.c
4 * Description: Maximum value of a floating-point vector
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
#include "dsp/statistics_functions.h"

#include <float.h>
#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
#include <limits.h>
#endif
33
34 /**
35 @ingroup groupStats
36 */
37
38 /**
39 @defgroup Max Maximum
40
41 Computes the maximum value of an array of data.
42 The function returns both the maximum value and its position within the array.
43 There are separate functions for floating-point, Q31, Q15, and Q7 data types.
44 */
45
46 /**
47 @addtogroup Max
48 @{
49 */
50
51 /**
52 @brief Maximum value of a floating-point vector.
53 @param[in] pSrc points to the input vector
54 @param[in] blockSize number of samples in input vector
55 @param[out] pResult maximum value returned here
56 @param[out] pIndex index of maximum value returned here
57 */
58
59 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_max_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult,uint32_t * pIndex)60 ARM_DSP_ATTRIBUTE void arm_max_f32(
61 const float32_t * pSrc,
62 uint32_t blockSize,
63 float32_t * pResult,
64 uint32_t * pIndex)
65 {
66 uint32_t blkCnt;
67 f32x4_t vecSrc;
68 f32x4_t curExtremValVec = vdupq_n_f32(F32_MIN);
69 float32_t maxValue = F32_MIN;
70 uint32_t idx = blockSize;
71 uint32x4_t indexVec;
72 uint32x4_t curExtremIdxVec;
73 uint32_t curIdx = 0;
74 mve_pred16_t p0;
75 float32_t tmp;
76
77
78 indexVec = vidupq_wb_u32(&curIdx, 1);
79 curExtremIdxVec = vdupq_n_u32(0);
80
81 /* Compute 4 outputs at a time */
82 blkCnt = blockSize >> 2U;
83 while (blkCnt > 0U)
84 {
85 vecSrc = vldrwq_f32(pSrc);
86 /*
87 * Get current max per lane and current index per lane
88 * when a max is selected
89 */
90 p0 = vcmpgeq(vecSrc, curExtremValVec);
91 curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
92 curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
93
94 indexVec = vidupq_wb_u32(&curIdx, 1);
95
96 pSrc += 4;
97 /* Decrement the loop counter */
98 blkCnt--;
99 }
100
101
102 /*
103 * Get max value across the vector
104 */
105 maxValue = vmaxnmvq(maxValue, curExtremValVec);
106 /*
107 * set index for lower values to max possible index
108 */
109 p0 = vcmpgeq(curExtremValVec, maxValue);
110 indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
111 /*
112 * Get min index which is thus for a max value
113 */
114 idx = vminvq(idx, indexVec);
115
116 /* Tail */
117 blkCnt = blockSize & 0x3;
118
119 while (blkCnt > 0U)
120 {
121 /* Initialize tmp to the next consecutive values one by one */
122 tmp = *pSrc++;
123
124 /* compare for the maximum value */
125 if (maxValue < tmp)
126 {
127 /* Update the maximum value and it's index */
128 maxValue = tmp;
129 idx = blockSize - blkCnt;
130 }
131
132 /* Decrement loop counter */
133 blkCnt--;
134 }
135
136 /*
137 * Save result
138 */
139 *pIndex = idx;
140 *pResult = maxValue;
141 }
142
143 #else
144 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_max_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult,uint32_t * pIndex)145 ARM_DSP_ATTRIBUTE void arm_max_f32(
146 const float32_t * pSrc,
147 uint32_t blockSize,
148 float32_t * pResult,
149 uint32_t * pIndex)
150 {
151 float32_t maxVal1, out; /* Temporary variables to store the output value. */
152 uint32_t blkCnt, outIndex; /* loop counter */
153
154 float32x4_t outV, srcV;
155 float32x2_t outV2;
156
157 uint32x4_t idxV;
158 uint32x4_t maxIdx;
159 static const uint32_t indexInit[4]={4,5,6,7};
160 static const uint32_t countVInit[4]={0,1,2,3};
161
162 uint32x4_t index;
163 uint32x4_t delta;
164 uint32x4_t countV;
165 uint32x2_t countV2;
166
167 maxIdx = vdupq_n_u32(UINT_MAX);
168 delta = vdupq_n_u32(4);
169 index = vld1q_u32(indexInit);
170 countV = vld1q_u32(countVInit);
171
172
173 /* Initialise the index value to zero. */
174 outIndex = 0U;
175
176 /* Load first input value that act as reference value for comparison */
177 if (blockSize <= 3)
178 {
179 out = *pSrc++;
180
181 blkCnt = blockSize - 1;
182
183 while (blkCnt > 0U)
184 {
185 /* Initialize maxVal to the next consecutive values one by one */
186 maxVal1 = *pSrc++;
187
188 /* compare for the maximum value */
189 if (out < maxVal1)
190 {
191 /* Update the maximum value and it's index */
192 out = maxVal1;
193 outIndex = blockSize - blkCnt;
194 }
195
196 /* Decrement the loop counter */
197 blkCnt--;
198 }
199 }
200 else
201 {
202 outV = vld1q_f32(pSrc);
203 pSrc += 4;
204
205 /* Compute 4 outputs at a time */
206 blkCnt = (blockSize - 4 ) >> 2U;
207
208 while (blkCnt > 0U)
209 {
210 srcV = vld1q_f32(pSrc);
211 pSrc += 4;
212
213 idxV = vcgtq_f32(srcV, outV);
214 outV = vbslq_f32(idxV, srcV, outV );
215 countV = vbslq_u32(idxV, index,countV );
216
217 index = vaddq_u32(index,delta);
218
219 /* Decrement the loop counter */
220 blkCnt--;
221 }
222
223 outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV));
224 outV2 = vpmax_f32(outV2,outV2);
225 out = vget_lane_f32(outV2, 0);
226
227 idxV = vceqq_f32(outV, vdupq_n_f32(out));
228 countV = vbslq_u32(idxV, countV,maxIdx);
229
230 countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
231 countV2 = vpmin_u32(countV2,countV2);
232 outIndex = vget_lane_u32(countV2,0);
233
234 /* if (blockSize - 1U) is not multiple of 4 */
235 blkCnt = (blockSize - 4 ) % 4U;
236
237 while (blkCnt > 0U)
238 {
239 /* Initialize maxVal to the next consecutive values one by one */
240 maxVal1 = *pSrc++;
241
242 /* compare for the maximum value */
243 if (out < maxVal1)
244 {
245 /* Update the maximum value and it's index */
246 out = maxVal1;
247 outIndex = blockSize - blkCnt ;
248 }
249
250 /* Decrement the loop counter */
251 blkCnt--;
252 }
253
254
255 }
256
257 /* Store the maximum value and it's index into destination pointers */
258 *pResult = out;
259 *pIndex = outIndex;
260 }
261 #else
arm_max_f32(const float32_t * pSrc,uint32_t blockSize,float32_t * pResult,uint32_t * pIndex)262 ARM_DSP_ATTRIBUTE void arm_max_f32(
263 const float32_t * pSrc,
264 uint32_t blockSize,
265 float32_t * pResult,
266 uint32_t * pIndex)
267 {
268 float32_t maxVal, out; /* Temporary variables to store the output value. */
269 uint32_t blkCnt, outIndex; /* Loop counter */
270
271 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
272 uint32_t index; /* index of maximum value */
273 #endif
274
275 /* Initialise index value to zero. */
276 outIndex = 0U;
277
278 /* Load first input value that act as reference value for comparision */
279 out = *pSrc++;
280
281 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
282 /* Initialise index of maximum value. */
283 index = 0U;
284
285 /* Loop unrolling: Compute 4 outputs at a time */
286 blkCnt = (blockSize - 1U) >> 2U;
287
288 while (blkCnt > 0U)
289 {
290 /* Initialize maxVal to next consecutive values one by one */
291 maxVal = *pSrc++;
292
293 /* compare for the maximum value */
294 if (out < maxVal)
295 {
296 /* Update the maximum value and it's index */
297 out = maxVal;
298 outIndex = index + 1U;
299 }
300
301 maxVal = *pSrc++;
302 if (out < maxVal)
303 {
304 out = maxVal;
305 outIndex = index + 2U;
306 }
307
308 maxVal = *pSrc++;
309 if (out < maxVal)
310 {
311 out = maxVal;
312 outIndex = index + 3U;
313 }
314
315 maxVal = *pSrc++;
316 if (out < maxVal)
317 {
318 out = maxVal;
319 outIndex = index + 4U;
320 }
321
322 index += 4U;
323
324 /* Decrement loop counter */
325 blkCnt--;
326 }
327
328 /* Loop unrolling: Compute remaining outputs */
329 blkCnt = (blockSize - 1U) % 4U;
330
331 #else
332
333 /* Initialize blkCnt with number of samples */
334 blkCnt = (blockSize - 1U);
335
336 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
337
338 while (blkCnt > 0U)
339 {
340 /* Initialize maxVal to the next consecutive values one by one */
341 maxVal = *pSrc++;
342
343 /* compare for the maximum value */
344 if (out < maxVal)
345 {
346 /* Update the maximum value and it's index */
347 out = maxVal;
348 outIndex = blockSize - blkCnt;
349 }
350
351 /* Decrement loop counter */
352 blkCnt--;
353 }
354
355 /* Store the maximum value and it's index into destination pointers */
356 *pResult = out;
357 *pIndex = outIndex;
358 }
359 #endif /* #if defined(ARM_MATH_NEON) */
360 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
361
362 /**
363 @} end of Max group
364 */
365