1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_max_f32.c
4 * Description: Maximum value of a floating-point vector
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions.h"
30 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
31 #include <limits.h>
32 #endif
33
34 /**
35 @ingroup groupStats
36 */
37
38 /**
39 @defgroup Max Maximum
40
41 Computes the maximum value of an array of data.
42 The function returns both the maximum value and its position within the array.
43 There are separate functions for floating-point, Q31, Q15, and Q7 data types.
44 */
45
46 /**
47 @addtogroup Max
48 @{
49 */
50
51 /**
52 @brief Maximum value of a floating-point vector.
53 @param[in] pSrc points to the input vector
54 @param[in] blockSize number of samples in input vector
55 @param[out] pResult maximum value returned here
56 @param[out] pIndex index of maximum value returned here
57 @return none
58 */
59
60 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * Helium (MVE) implementation, built when ARM_MATH_MVEF is defined and
 * ARM_MATH_AUTOVECTORIZE is not.
 *
 * pSrc      : input vector
 * blockSize : number of samples in the input vector
 * pResult   : receives the maximum value
 * pIndex    : receives the index of the maximum value
 *
 * Whole groups of four samples are handled by a vector loop that tracks,
 * for every lane, the largest sample seen so far and the index it came from.
 * The lanes are then reduced to one value/index pair and the remaining
 * (blockSize & 3) samples are folded in one by one.
 *
 * When the maximum occurs more than once, the lowest index is reported,
 * which is also what the Neon and plain C implementations in this file do.
 *
 * For blockSize == 0 no sample is read; the outputs are F32_MIN and 0.
 */
void arm_max_f32(
  const float32_t * pSrc,
        uint32_t blockSize,
        float32_t * pResult,
        uint32_t * pIndex)
{
    uint32_t     blkCnt;                                  /* Loop counter */
    f32x4_t      vecSrc;                                  /* Four consecutive input samples */
    f32x4_t      curExtremValVec = vdupq_n_f32(F32_MIN);  /* Running maximum of each lane */
    float32_t    maxValue = F32_MIN;                      /* Maximum over the whole vector */
    uint32_t     idx = blockSize;                         /* Index of maxValue */
    uint32x4_t   indexVec;                                /* Indices of the samples in vecSrc */
    uint32x4_t   curExtremIdxVec;                         /* Index of each lane's running maximum */
    uint32_t     curIdx = 0;                              /* Index counter advanced by vidupq_wb_u32 */
    mve_pred16_t p0;                                      /* Per-lane selection predicate */
    float32_t    tmp;                                     /* Sample read by the tail loop */

    indexVec = vidupq_wb_u32(&curIdx, 1);
    curExtremIdxVec = vdupq_n_u32(0);

    /* Process four samples per iteration */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecSrc = vldrwq_f32(pSrc);

        /*
         * Replace a lane's running maximum (and the index that goes with it)
         * only when the new sample is strictly greater. A sample that merely
         * ties must not displace the earlier index, otherwise the index
         * reported for duplicated maxima would depend on the lane layout.
         */
        p0 = vcmpgtq(vecSrc, curExtremValVec);
        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);

        /* Indices for the next group of four samples */
        indexVec = vidupq_wb_u32(&curIdx, 1);

        pSrc += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Reduce the per-lane maxima to a single value */
    maxValue = vmaxnmvq(maxValue, curExtremValVec);

    /*
     * Lanes that do not hold the maximum get blockSize as their index so
     * that they cannot win the minimum-index reduction below.
     */
    p0 = vcmpgeq(curExtremValVec, maxValue);
    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);

    /* The smallest remaining index belongs to the earliest maximum */
    idx = vminvq(idx, indexVec);

    /* Tail: the samples left over after the groups of four */
    blkCnt = blockSize & 0x3;

    while (blkCnt > 0U)
    {
        tmp = *pSrc++;

        /* Strict comparison: an earlier, equal maximum keeps its index */
        if (maxValue < tmp)
        {
            /* Update the maximum value and its index */
            maxValue = tmp;
            idx = blockSize - blkCnt;
        }

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Store the results */
    *pIndex = idx;
    *pResult = maxValue;
}
143
144 #else
145 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * Neon implementation, built when ARM_MATH_NEON is defined and
 * ARM_MATH_AUTOVECTORIZE is not (and the Helium variant is not selected).
 *
 * pSrc      : input vector
 * blockSize : number of samples in the input vector
 * pResult   : receives the maximum value
 * pIndex    : receives the index of the maximum value
 *
 * Vectors of up to three samples are scanned sample by sample. Longer
 * vectors are processed four samples at a time while tracking a per-lane
 * maximum and its index; the lanes are then reduced and the remaining
 * (blockSize % 4) samples are folded in one by one.
 *
 * All comparisons are strict, so when the maximum occurs more than once the
 * lowest index is reported.
 *
 * For blockSize == 0 no sample is read; the outputs are F32_MIN and 0, the
 * same pair the Helium implementation in this file produces for an empty
 * vector.
 */
void arm_max_f32(
  const float32_t * pSrc,
  uint32_t blockSize,
  float32_t * pResult,
  uint32_t * pIndex)
{
    float32_t maxVal1, out;      /* Candidate sample and running maximum */
    uint32_t blkCnt, outIndex;   /* Loop counter and index of the running maximum */

    /*
     * Empty input: without this check the first sample would be read from a
     * vector that has none, and (blockSize - 1) would wrap around to
     * 0xFFFFFFFF iterations.
     */
    if (blockSize == 0U)
    {
        *pResult = F32_MIN;
        *pIndex = 0U;
        return;
    }

    /* The first sample is the initial reference, so the index starts at zero */
    outIndex = 0U;

    if (blockSize <= 3U)
    {
        /* Too short for a full vector: plain scalar scan */
        out = *pSrc++;

        blkCnt = blockSize - 1U;

        while (blkCnt > 0U)
        {
            maxVal1 = *pSrc++;

            /* Compare for the maximum value */
            if (out < maxVal1)
            {
                /* Update the maximum value and its index */
                out = maxVal1;
                outIndex = blockSize - blkCnt;
            }

            /* Decrement the loop counter */
            blkCnt--;
        }
    }
    else
    {
        static const uint32_t indexInit[4] = {4, 5, 6, 7};   /* Indices of the second group of four */
        static const uint32_t countVInit[4] = {0, 1, 2, 3};  /* Indices of the first group of four */

        float32x4_t outV, srcV;    /* Per-lane running maximum, current samples */
        float32x2_t outV2;         /* Scratch for the pairwise value reduction */
        uint32x4_t idxV;           /* Per-lane comparison mask */
        uint32x2_t countV2;        /* Scratch for the pairwise index reduction */

        /* All-ones 32-bit index: larger than any index a lane can hold */
        const uint32x4_t maxIdx = vdupq_n_u32(0xFFFFFFFFU);
        const uint32x4_t delta = vdupq_n_u32(4);
        uint32x4_t index = vld1q_u32(indexInit);     /* Indices of the samples in srcV */
        uint32x4_t countV = vld1q_u32(countVInit);   /* Index of each lane's running maximum */

        /* The first four samples seed the per-lane maxima */
        outV = vld1q_f32(pSrc);
        pSrc += 4;

        /* Remaining full groups of four samples */
        blkCnt = (blockSize - 4U) >> 2U;

        while (blkCnt > 0U)
        {
            srcV = vld1q_f32(pSrc);
            pSrc += 4;

            /* Lanes where the new sample is strictly greater take value and index */
            idxV = vcgtq_f32(srcV, outV);
            outV = vbslq_f32(idxV, srcV, outV);
            countV = vbslq_u32(idxV, index, countV);

            index = vaddq_u32(index, delta);

            /* Decrement the loop counter */
            blkCnt--;
        }

        /* Reduce the four lane maxima to one value */
        outV2 = vpmax_f32(vget_low_f32(outV), vget_high_f32(outV));
        outV2 = vpmax_f32(outV2, outV2);
        out = vget_lane_f32(outV2, 0);

        /* Lanes that do not hold the maximum get the all-ones index ... */
        idxV = vceqq_f32(outV, vdupq_n_f32(out));
        countV = vbslq_u32(idxV, countV, maxIdx);

        /* ... so the minimum over the lanes is the earliest index of the maximum */
        countV2 = vpmin_u32(vget_low_u32(countV), vget_high_u32(countV));
        countV2 = vpmin_u32(countV2, countV2);
        outIndex = vget_lane_u32(countV2, 0);

        /* Samples left over when (blockSize - 4) is not a multiple of 4 */
        blkCnt = (blockSize - 4U) % 4U;

        while (blkCnt > 0U)
        {
            maxVal1 = *pSrc++;

            /* Compare for the maximum value */
            if (out < maxVal1)
            {
                /* Update the maximum value and its index */
                out = maxVal1;
                outIndex = blockSize - blkCnt;
            }

            /* Decrement the loop counter */
            blkCnt--;
        }
    }

    /* Store the maximum value and its index into the destination pointers */
    *pResult = out;
    *pIndex = outIndex;
}
262 #else
/*
 * Portable C implementation, built when neither the Helium nor the Neon
 * variant is selected.
 *
 * pSrc      : input vector
 * blockSize : number of samples in the input vector
 * pResult   : receives the maximum value
 * pIndex    : receives the index of the maximum value
 *
 * The first sample is taken as the initial maximum and every following
 * sample is compared against it. With ARM_MATH_LOOPUNROLL (and without
 * ARM_MATH_AUTOVECTORIZE) four samples are compared per loop iteration and
 * the leftovers are handled by the final loop.
 *
 * All comparisons are strict, so when the maximum occurs more than once the
 * lowest index is reported.
 *
 * For blockSize == 0 no sample is read; the outputs are F32_MIN and 0, the
 * same pair the Helium implementation in this file produces for an empty
 * vector.
 */
void arm_max_f32(
  const float32_t * pSrc,
        uint32_t blockSize,
        float32_t * pResult,
        uint32_t * pIndex)
{
        float32_t maxVal, out;       /* Candidate sample and running maximum */
        uint32_t blkCnt, outIndex;   /* Loop counter and index of the running maximum */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
        uint32_t index;              /* Index of the last sample consumed by the unrolled loop */
#endif

  /*
   * Empty input: without this check the first sample would be read from a
   * vector that has none, and (blockSize - 1U) would wrap around to
   * 0xFFFFFFFF iterations.
   */
  if (blockSize == 0U)
  {
    *pResult = F32_MIN;
    *pIndex = 0U;
    return;
  }

  /* The first sample is the initial reference, so the index starts at zero */
  outIndex = 0U;

  /* Load the first input value; it acts as the reference for the comparisons */
  out = *pSrc++;

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
  /* Sample 0 has been consumed above */
  index = 0U;

  /* Loop unrolling: compare four samples per iteration */
  blkCnt = (blockSize - 1U) >> 2U;

  while (blkCnt > 0U)
  {
    maxVal = *pSrc++;

    /* Compare for the maximum value */
    if (out < maxVal)
    {
      /* Update the maximum value and its index */
      out = maxVal;
      outIndex = index + 1U;
    }

    maxVal = *pSrc++;
    if (out < maxVal)
    {
      out = maxVal;
      outIndex = index + 2U;
    }

    maxVal = *pSrc++;
    if (out < maxVal)
    {
      out = maxVal;
      outIndex = index + 3U;
    }

    maxVal = *pSrc++;
    if (out < maxVal)
    {
      out = maxVal;
      outIndex = index + 4U;
    }

    index += 4U;

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: samples left over after the groups of four */
  blkCnt = (blockSize - 1U) % 4U;

#else

  /* Every sample after the first one is compared in the loop below */
  blkCnt = (blockSize - 1U);

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    maxVal = *pSrc++;

    /* Compare for the maximum value */
    if (out < maxVal)
    {
      /* Update the maximum value and its index */
      out = maxVal;
      outIndex = blockSize - blkCnt;
    }

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Store the maximum value and its index into the destination pointers */
  *pResult = out;
  *pIndex = outIndex;
}
360 #endif /* #if defined(ARM_MATH_NEON) */
361 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
362
363 /**
364 @} end of Max group
365 */
366