/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_bitonic_sort_f32.c
 * Description:  Floating point bitonic sort
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/support_functions.h"
#include "arm_sorting.h"


#if !defined(ARM_MATH_NEON)

static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
{
    uint32_t step;
    uint32_t k, j;
    float32_t *leftPtr, *rightPtr;
    float32_t temp;

    step = n>>1;
    leftPtr = pSrc;
    rightPtr = pSrc+n-1;

    for(k=0; k<step; k++)
    {
        if(dir == (*leftPtr > *rightPtr))
        {
            // Swap
            temp=*leftPtr;
            *leftPtr=*rightPtr;
            *rightPtr=temp;
        }

        leftPtr++;  // Move right
        rightPtr--; // Move left
    }

    // Merge
    for(step=(n>>2); step>0; step/=2)
    {
        for(j=0; j<n; j=j+step*2)
        {
            leftPtr = pSrc+j;
            rightPtr = pSrc+j+step;

            for(k=0; k<step; k++)
            {
                // Comparator must follow dir as well, or descending sorts break
                if(dir == (*leftPtr > *rightPtr))
                {
                    // Swap
                    temp=*leftPtr;
                    *leftPtr=*rightPtr;
                    *rightPtr=temp;
                }

                leftPtr++;
                rightPtr++;
            }
        }
    }
}
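
/* Illustrative sketch (not part of the library build): driving the core above
 * in doubling stages sorts a whole power-of-two block, mirroring the loop in
 * arm_bitonic_sort_f32() at the bottom of this file. The
 * ARM_BITONIC_SORT_DEMO guard is hypothetical, so the example is never built
 * by default.
 */
#ifdef ARM_BITONIC_SORT_DEMO
static void arm_bitonic_sort_demo_f32(void)
{
    float32_t buf[8] = {3.0f, 7.0f, 1.0f, 5.0f, 8.0f, 2.0f, 6.0f, 4.0f};
    uint32_t s, i;

    /* Ascending sort (dir = 1): merge sorted sub-blocks of 2, 4, then 8 samples */
    for (s = 2; s <= 8; s = s * 2)
        for (i = 0; i < 8; i = i + s)
            arm_bitonic_sort_core_f32(buf + i, s, 1);

    /* buf now holds {1, 2, 3, 4, 5, 6, 7, 8} */
}
#endif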
#endif

#if defined(ARM_MATH_NEON)

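/* The vminmaxq()/vminmax256q() helpers used throughout this file come from
 * arm_sorting.h. As a reference, the hypothetical helper below sketches the
 * compare-exchange that vminmaxq(a, b) is assumed to perform: afterwards each
 * lane of a holds the smaller of the pair and each lane of b the larger.
 * vminq_f32()/vmaxq_f32() are standard NEON intrinsics. The
 * ARM_BITONIC_SORT_DEMO guard is hypothetical, so the sketch is not built.
 */
#ifdef ARM_BITONIC_SORT_DEMO
static void arm_bitonic_minmax_sketch_f32(float32x4_t *a, float32x4_t *b)
{
    float32x4_t t = *a;
    *a = vminq_f32(*a, *b); /* per-lane minima */
    *b = vmaxq_f32(t,  *b); /* per-lane maxima */
}
#endif
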
static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+
     * | a | b | c | d |
     * +---+---+---+---+
     * +---+---+---+---+
     * | e | f | g | h |
     * +---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     *   vminmax([abef],[cdgh])
     *   vminmax([aceg],[bdfh])
     */
    vtrn128_64q(a, b);
    /* +---+---+---+---+
     * | a | b | e | f |
     * +---+---+---+---+
     * +---+---+---+---+
     * | c | d | g | h |
     * +---+---+---+---+
     */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    vtrn128_32q(a, b);
    /* +---+---+---+---+
     * | a | c | e | g |
     * +---+---+---+---+
     * +---+---+---+---+
     * | b | d | f | h |
     * +---+---+---+---+
     */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    return vzipq_f32(a, b);
}

static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* a and b are guaranteed to be bitonic */
    // Reverse the elements of the second vector
    b = vrev128q_f32(b);

    // Compare the two vectors
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    // Merge the two vectors
    float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);

    return ab;
}
151
arm_bitonic_resort_16_f32(float32_t * pOut,float32x4x2_t a,float32x4x2_t b,uint8_t dir)152 static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
153 {
154 /* Start with two vectors:
155 * +---+---+---+---+---+---+---+---+
156 * | a | b | c | d | e | f | g | h |
157 * +---+---+---+---+---+---+---+---+
158 * +---+---+---+---+---+---+---+---+
159 * | i | j | k | l | m | n | o | p |
160 * +---+---+---+---+---+---+---+---+
161 * All the elements of the first are guaranteed to be less than or equal to
162 * all of the elements in the second, and both vectors are bitonic.
163 * We need to perform these operations to completely sort both lists:
164 * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
165 * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
166 * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp])
167 */
168
169 vtrn256_128q(a, b);
170 /* +---+---+---+---+---+---+---+---+
171 * | a | b | c | d | i | j | k | l |
172 * +---+---+---+---+---+---+---+---+
173 * +---+---+---+---+---+---+---+---+
174 * | e | f | g | h | m | n | o | p |
175 * +---+---+---+---+---+---+---+---+
176 */
177 if(dir)
178 vminmax256q(a, b);
179 else
180 vminmax256q(b, a);
181
182 vtrn256_64q(a, b);
183
184 /* +---+---+---+---+---+---+---+---+
185 * | a | b | e | f | i | j | m | n |
186 * +---+---+---+---+---+---+---+---+
187 * +---+---+---+---+---+---+---+---+
188 * | c | d | g | h | k | l | o | p |
189 * +---+---+---+---+---+---+---+---+
190 */
191 if(dir)
192 vminmax256q(a, b);
193 else
194 vminmax256q(b, a);
195
196 vtrn256_32q(a, b);
197 /* We now have:
198 * +---+---+---+---+---+---+---+---+
199 * | a | c | e | g | i | k | m | o |
200 * +---+---+---+---+---+---+---+---+
201 * +---+---+---+---+---+---+---+---+
202 * | b | d | f | h | j | l | n | p |
203 * +---+---+---+---+---+---+---+---+
204 */
205 if(dir)
206 vminmax256q(a, b);
207 else
208 vminmax256q(b, a);
209
210 float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
211 float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
212
213 vst1q_f32(pOut, out1.val[0]);
214 vst1q_f32(pOut+4, out1.val[1]);
215 vst1q_f32(pOut+8, out2.val[0]);
216 vst1q_f32(pOut+12, out2.val[1]);
217 }

static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    // Merge two preordered float32x4x2_t
    vrev256q_f32(b);

    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    arm_bitonic_resort_16_f32(pOut, a, b, dir);
}

static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
{
    float32x4_t a;
    float32x4_t b;
    float32x4_t c;
    float32x4_t d;

    // Load 16 samples
    a = vld1q_f32(pSrc);
    b = vld1q_f32(pSrc+4);
    c = vld1q_f32(pSrc+8);
    d = vld1q_f32(pSrc+12);

    // 4-sample bitonic sorting network, applied lane-wise to 4 arrays at once
    if(dir)
    {
        vminmaxq(a, b);
        vminmaxq(c, d);

        vminmaxq(a, d);
        vminmaxq(b, c);

        vminmaxq(a, b);
        vminmaxq(c, d);
    }
    else
    {
        vminmaxq(b, a);
        vminmaxq(d, c);

        vminmaxq(d, a);
        vminmaxq(c, b);

        vminmaxq(b, a);
        vminmaxq(d, c);
    }

    // Transpose the 4 ordered arrays of 4 samples
    float32x4x2_t ab = vtrnq_f32(a, b);
    float32x4x2_t cd = vtrnq_f32(c, d);

    a = vcombine_f32(vget_low_f32(ab.val[0]),  vget_low_f32(cd.val[0]));
    b = vcombine_f32(vget_low_f32(ab.val[1]),  vget_low_f32(cd.val[1]));
    c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
    d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));

    // Merge pairs of arrays of 4 samples
    ab = arm_bitonic_merge_8_f32(a, b, dir);
    cd = arm_bitonic_merge_8_f32(c, d, dir);

    // Merge arrays of 8 samples
    arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
}
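
/* For reference: the vtrnq_f32()/vcombine_f32() sequence above is a 4x4
 * transpose. The hypothetical helper below (not used by the library) shows
 * the same idiom in isolation: rows in, columns out. The
 * ARM_BITONIC_SORT_DEMO guard is hypothetical, so the sketch is not built.
 */
#ifdef ARM_BITONIC_SORT_DEMO
static void arm_bitonic_transpose_4x4_sketch_f32(float32x4_t r[4])
{
    float32x4x2_t r01 = vtrnq_f32(r[0], r[1]); /* interleave rows 0 and 1 */
    float32x4x2_t r23 = vtrnq_f32(r[2], r[3]); /* interleave rows 2 and 3 */

    r[0] = vcombine_f32(vget_low_f32(r01.val[0]),  vget_low_f32(r23.val[0]));
    r[1] = vcombine_f32(vget_low_f32(r01.val[1]),  vget_low_f32(r23.val[1]));
    r[2] = vcombine_f32(vget_high_f32(r01.val[0]), vget_high_f32(r23.val[0]));
    r[3] = vcombine_f32(vget_high_f32(r01.val[1]), vget_high_f32(r23.val[1]));
}
#endif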
static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
{
    // Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }

    // Transpose 256
    float32x4_t temp;

    temp = ab2.val[0];
    ab2.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab2.val[1];
    ab2.val[1] = cd1.val[1];
    cd1.val[1] = temp;

    // Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }

    // Merge the two 16-sample halves
    arm_bitonic_merge_16_f32(pSrc+0,  ab1, cd1, dir);
    arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
}
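
/* vldrev128q_f32(v, p), used below, comes from arm_sorting.h and is assumed
 * to load four floats from p into v in reversed lane order. A hypothetical
 * plain-intrinsics sketch of the same operation (not built by default):
 */
#ifdef ARM_BITONIC_SORT_DEMO
static float32x4_t arm_bitonic_ldrev_sketch_f32(const float32_t *p)
{
    float32x4_t v = vld1q_f32(p);         /* lanes: p[0] p[1] p[2] p[3] */
    v = vrev64q_f32(v);                   /* lanes: p[1] p[0] p[3] p[2] */
    return vcombine_f32(vget_high_f32(v), /* lanes: p[3] p[2] p[1] p[0] */
                        vget_low_f32(v));
}
#endif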

static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4;
    float32x4x2_t cd1, cd2, cd3, cd4;

    // Load the first half in order and the second half reversed
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 );
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24);
    ab4.val[1] = vld1q_f32(pSrc+28);

    vldrev128q_f32(cd4.val[1], pSrc+32);
    vldrev128q_f32(cd4.val[0], pSrc+36);
    vldrev128q_f32(cd3.val[1], pSrc+40);
    vldrev128q_f32(cd3.val[0], pSrc+44);
    vldrev128q_f32(cd2.val[1], pSrc+48);
    vldrev128q_f32(cd2.val[0], pSrc+52);
    vldrev128q_f32(cd1.val[1], pSrc+56);
    vldrev128q_f32(cd1.val[0], pSrc+60);

    // Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }

    // Transpose 512: swap the middle quarters so each half is bitonic again
    float32x4_t temp;

    temp = ab3.val[0];
    ab3.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab3.val[1];
    ab3.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab4.val[0];
    ab4.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab4.val[1];
    ab4.val[1] = cd2.val[1];
    cd2.val[1] = temp;

    // Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }

    // Merge the two 32-sample halves
    arm_bitonic_merge_32_f32(pSrc+0,  ab1, ab2, cd1, cd2, dir);
    arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
}
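
/* arm_bitonic_merge_128_f32() and arm_bitonic_merge_256_f32() below follow
 * the same scheme as arm_bitonic_merge_64_f32() above: load the first half in
 * order and the second half reversed, compare-exchange the two halves, swap
 * the middle quarters so each half is bitonic again, compare-exchange once
 * more, then hand each half to the next smaller merge.
 */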

static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;

    // Load the first half in order and the second half reversed
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 );
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24);
    ab4.val[1] = vld1q_f32(pSrc+28);
    ab5.val[0] = vld1q_f32(pSrc+32);
    ab5.val[1] = vld1q_f32(pSrc+36);
    ab6.val[0] = vld1q_f32(pSrc+40);
    ab6.val[1] = vld1q_f32(pSrc+44);
    ab7.val[0] = vld1q_f32(pSrc+48);
    ab7.val[1] = vld1q_f32(pSrc+52);
    ab8.val[0] = vld1q_f32(pSrc+56);
    ab8.val[1] = vld1q_f32(pSrc+60);

    vldrev128q_f32(cd8.val[1], pSrc+64);
    vldrev128q_f32(cd8.val[0], pSrc+68);
    vldrev128q_f32(cd7.val[1], pSrc+72);
    vldrev128q_f32(cd7.val[0], pSrc+76);
    vldrev128q_f32(cd6.val[1], pSrc+80);
    vldrev128q_f32(cd6.val[0], pSrc+84);
    vldrev128q_f32(cd5.val[1], pSrc+88);
    vldrev128q_f32(cd5.val[0], pSrc+92);
    vldrev128q_f32(cd4.val[1], pSrc+96);
    vldrev128q_f32(cd4.val[0], pSrc+100);
    vldrev128q_f32(cd3.val[1], pSrc+104);
    vldrev128q_f32(cd3.val[0], pSrc+108);
    vldrev128q_f32(cd2.val[1], pSrc+112);
    vldrev128q_f32(cd2.val[0], pSrc+116);
    vldrev128q_f32(cd1.val[1], pSrc+120);
    vldrev128q_f32(cd1.val[0], pSrc+124);

    // Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }

    // Transpose: swap the middle quarters so each half is bitonic again
    float32x4_t temp;

    temp = ab5.val[0];
    ab5.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab5.val[1];
    ab5.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab6.val[0];
    ab6.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab6.val[1];
    ab6.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    temp = ab7.val[0];
    ab7.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab7.val[1];
    ab7.val[1] = cd3.val[1];
    cd3.val[1] = temp;
    temp = ab8.val[0];
    ab8.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab8.val[1];
    ab8.val[1] = cd4.val[1];
    cd4.val[1] = temp;

    // Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }

    vst1q_f32(pSrc,     ab1.val[0]);
    vst1q_f32(pSrc+4,   ab1.val[1]);
    vst1q_f32(pSrc+8,   ab2.val[0]);
    vst1q_f32(pSrc+12,  ab2.val[1]);
    vst1q_f32(pSrc+16,  ab3.val[0]);
    vst1q_f32(pSrc+20,  ab3.val[1]);
    vst1q_f32(pSrc+24,  ab4.val[0]);
    vst1q_f32(pSrc+28,  ab4.val[1]);
    vst1q_f32(pSrc+32,  cd1.val[0]);
    vst1q_f32(pSrc+36,  cd1.val[1]);
    vst1q_f32(pSrc+40,  cd2.val[0]);
    vst1q_f32(pSrc+44,  cd2.val[1]);
    vst1q_f32(pSrc+48,  cd3.val[0]);
    vst1q_f32(pSrc+52,  cd3.val[1]);
    vst1q_f32(pSrc+56,  cd4.val[0]);
    vst1q_f32(pSrc+60,  cd4.val[1]);
    vst1q_f32(pSrc+64,  ab5.val[0]);
    vst1q_f32(pSrc+68,  ab5.val[1]);
    vst1q_f32(pSrc+72,  ab6.val[0]);
    vst1q_f32(pSrc+76,  ab6.val[1]);
    vst1q_f32(pSrc+80,  ab7.val[0]);
    vst1q_f32(pSrc+84,  ab7.val[1]);
    vst1q_f32(pSrc+88,  ab8.val[0]);
    vst1q_f32(pSrc+92,  ab8.val[1]);
    vst1q_f32(pSrc+96,  cd5.val[0]);
    vst1q_f32(pSrc+100, cd5.val[1]);
    vst1q_f32(pSrc+104, cd6.val[0]);
    vst1q_f32(pSrc+108, cd6.val[1]);
    vst1q_f32(pSrc+112, cd7.val[0]);
    vst1q_f32(pSrc+116, cd7.val[1]);
    vst1q_f32(pSrc+120, cd8.val[0]);
    vst1q_f32(pSrc+124, cd8.val[1]);

    // Merge the two 64-sample halves
    arm_bitonic_merge_64_f32(pSrc+0 , dir);
    arm_bitonic_merge_64_f32(pSrc+64, dir);
}

static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
    float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;

    // Load the first half in order and the second half reversed
    ab1.val[0]  = vld1q_f32(pSrc+0  );
    ab1.val[1]  = vld1q_f32(pSrc+4  );
    ab2.val[0]  = vld1q_f32(pSrc+8  );
    ab2.val[1]  = vld1q_f32(pSrc+12 );
    ab3.val[0]  = vld1q_f32(pSrc+16 );
    ab3.val[1]  = vld1q_f32(pSrc+20 );
    ab4.val[0]  = vld1q_f32(pSrc+24 );
    ab4.val[1]  = vld1q_f32(pSrc+28 );
    ab5.val[0]  = vld1q_f32(pSrc+32 );
    ab5.val[1]  = vld1q_f32(pSrc+36 );
    ab6.val[0]  = vld1q_f32(pSrc+40 );
    ab6.val[1]  = vld1q_f32(pSrc+44 );
    ab7.val[0]  = vld1q_f32(pSrc+48 );
    ab7.val[1]  = vld1q_f32(pSrc+52 );
    ab8.val[0]  = vld1q_f32(pSrc+56 );
    ab8.val[1]  = vld1q_f32(pSrc+60 );
    ab9.val[0]  = vld1q_f32(pSrc+64 );
    ab9.val[1]  = vld1q_f32(pSrc+68 );
    ab10.val[0] = vld1q_f32(pSrc+72 );
    ab10.val[1] = vld1q_f32(pSrc+76 );
    ab11.val[0] = vld1q_f32(pSrc+80 );
    ab11.val[1] = vld1q_f32(pSrc+84 );
    ab12.val[0] = vld1q_f32(pSrc+88 );
    ab12.val[1] = vld1q_f32(pSrc+92 );
    ab13.val[0] = vld1q_f32(pSrc+96 );
    ab13.val[1] = vld1q_f32(pSrc+100);
    ab14.val[0] = vld1q_f32(pSrc+104);
    ab14.val[1] = vld1q_f32(pSrc+108);
    ab15.val[0] = vld1q_f32(pSrc+112);
    ab15.val[1] = vld1q_f32(pSrc+116);
    ab16.val[0] = vld1q_f32(pSrc+120);
    ab16.val[1] = vld1q_f32(pSrc+124);

    vldrev128q_f32(cd16.val[1], pSrc+128);
    vldrev128q_f32(cd16.val[0], pSrc+132);
    vldrev128q_f32(cd15.val[1], pSrc+136);
    vldrev128q_f32(cd15.val[0], pSrc+140);
    vldrev128q_f32(cd14.val[1], pSrc+144);
    vldrev128q_f32(cd14.val[0], pSrc+148);
    vldrev128q_f32(cd13.val[1], pSrc+152);
    vldrev128q_f32(cd13.val[0], pSrc+156);
    vldrev128q_f32(cd12.val[1], pSrc+160);
    vldrev128q_f32(cd12.val[0], pSrc+164);
    vldrev128q_f32(cd11.val[1], pSrc+168);
    vldrev128q_f32(cd11.val[0], pSrc+172);
    vldrev128q_f32(cd10.val[1], pSrc+176);
    vldrev128q_f32(cd10.val[0], pSrc+180);
    vldrev128q_f32(cd9.val[1] , pSrc+184);
    vldrev128q_f32(cd9.val[0] , pSrc+188);
    vldrev128q_f32(cd8.val[1] , pSrc+192);
    vldrev128q_f32(cd8.val[0] , pSrc+196);
    vldrev128q_f32(cd7.val[1] , pSrc+200);
    vldrev128q_f32(cd7.val[0] , pSrc+204);
    vldrev128q_f32(cd6.val[1] , pSrc+208);
    vldrev128q_f32(cd6.val[0] , pSrc+212);
    vldrev128q_f32(cd5.val[1] , pSrc+216);
    vldrev128q_f32(cd5.val[0] , pSrc+220);
    vldrev128q_f32(cd4.val[1] , pSrc+224);
    vldrev128q_f32(cd4.val[0] , pSrc+228);
    vldrev128q_f32(cd3.val[1] , pSrc+232);
    vldrev128q_f32(cd3.val[0] , pSrc+236);
    vldrev128q_f32(cd2.val[1] , pSrc+240);
    vldrev128q_f32(cd2.val[0] , pSrc+244);
    vldrev128q_f32(cd1.val[1] , pSrc+248);
    vldrev128q_f32(cd1.val[0] , pSrc+252);

    // Compare
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }

    // Transpose: swap the middle quarters so each half is bitonic again
    float32x4_t temp;

    temp = ab9.val[0];
    ab9.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab9.val[1];
    ab9.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab10.val[0];
    ab10.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab10.val[1];
    ab10.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    temp = ab11.val[0];
    ab11.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab11.val[1];
    ab11.val[1] = cd3.val[1];
    cd3.val[1] = temp;
    temp = ab12.val[0];
    ab12.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab12.val[1];
    ab12.val[1] = cd4.val[1];
    cd4.val[1] = temp;
    temp = ab13.val[0];
    ab13.val[0] = cd5.val[0];
    cd5.val[0] = temp;
    temp = ab13.val[1];
    ab13.val[1] = cd5.val[1];
    cd5.val[1] = temp;
    temp = ab14.val[0];
    ab14.val[0] = cd6.val[0];
    cd6.val[0] = temp;
    temp = ab14.val[1];
    ab14.val[1] = cd6.val[1];
    cd6.val[1] = temp;
    temp = ab15.val[0];
    ab15.val[0] = cd7.val[0];
    cd7.val[0] = temp;
    temp = ab15.val[1];
    ab15.val[1] = cd7.val[1];
    cd7.val[1] = temp;
    temp = ab16.val[0];
    ab16.val[0] = cd8.val[0];
    cd8.val[0] = temp;
    temp = ab16.val[1];
    ab16.val[1] = cd8.val[1];
    cd8.val[1] = temp;

    // Compare
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }

    vst1q_f32(pSrc,     ab1.val[0] );
    vst1q_f32(pSrc+4,   ab1.val[1] );
    vst1q_f32(pSrc+8,   ab2.val[0] );
    vst1q_f32(pSrc+12,  ab2.val[1] );
    vst1q_f32(pSrc+16,  ab3.val[0] );
    vst1q_f32(pSrc+20,  ab3.val[1] );
    vst1q_f32(pSrc+24,  ab4.val[0] );
    vst1q_f32(pSrc+28,  ab4.val[1] );
    vst1q_f32(pSrc+32,  ab5.val[0] );
    vst1q_f32(pSrc+36,  ab5.val[1] );
    vst1q_f32(pSrc+40,  ab6.val[0] );
    vst1q_f32(pSrc+44,  ab6.val[1] );
    vst1q_f32(pSrc+48,  ab7.val[0] );
    vst1q_f32(pSrc+52,  ab7.val[1] );
    vst1q_f32(pSrc+56,  ab8.val[0] );
    vst1q_f32(pSrc+60,  ab8.val[1] );
    vst1q_f32(pSrc+64,  cd1.val[0] );
    vst1q_f32(pSrc+68,  cd1.val[1] );
    vst1q_f32(pSrc+72,  cd2.val[0] );
    vst1q_f32(pSrc+76,  cd2.val[1] );
    vst1q_f32(pSrc+80,  cd3.val[0] );
    vst1q_f32(pSrc+84,  cd3.val[1] );
    vst1q_f32(pSrc+88,  cd4.val[0] );
    vst1q_f32(pSrc+92,  cd4.val[1] );
    vst1q_f32(pSrc+96,  cd5.val[0] );
    vst1q_f32(pSrc+100, cd5.val[1] );
    vst1q_f32(pSrc+104, cd6.val[0] );
    vst1q_f32(pSrc+108, cd6.val[1] );
    vst1q_f32(pSrc+112, cd7.val[0] );
    vst1q_f32(pSrc+116, cd7.val[1] );
    vst1q_f32(pSrc+120, cd8.val[0] );
    vst1q_f32(pSrc+124, cd8.val[1] );
    vst1q_f32(pSrc+128, ab9.val[0] );
    vst1q_f32(pSrc+132, ab9.val[1] );
    vst1q_f32(pSrc+136, ab10.val[0]);
    vst1q_f32(pSrc+140, ab10.val[1]);
    vst1q_f32(pSrc+144, ab11.val[0]);
    vst1q_f32(pSrc+148, ab11.val[1]);
    vst1q_f32(pSrc+152, ab12.val[0]);
    vst1q_f32(pSrc+156, ab12.val[1]);
    vst1q_f32(pSrc+160, ab13.val[0]);
    vst1q_f32(pSrc+164, ab13.val[1]);
    vst1q_f32(pSrc+168, ab14.val[0]);
    vst1q_f32(pSrc+172, ab14.val[1]);
    vst1q_f32(pSrc+176, ab15.val[0]);
    vst1q_f32(pSrc+180, ab15.val[1]);
    vst1q_f32(pSrc+184, ab16.val[0]);
    vst1q_f32(pSrc+188, ab16.val[1]);
    vst1q_f32(pSrc+192, cd9.val[0] );
    vst1q_f32(pSrc+196, cd9.val[1] );
    vst1q_f32(pSrc+200, cd10.val[0]);
    vst1q_f32(pSrc+204, cd10.val[1]);
    vst1q_f32(pSrc+208, cd11.val[0]);
    vst1q_f32(pSrc+212, cd11.val[1]);
    vst1q_f32(pSrc+216, cd12.val[0]);
    vst1q_f32(pSrc+220, cd12.val[1]);
    vst1q_f32(pSrc+224, cd13.val[0]);
    vst1q_f32(pSrc+228, cd13.val[1]);
    vst1q_f32(pSrc+232, cd14.val[0]);
    vst1q_f32(pSrc+236, cd14.val[1]);
    vst1q_f32(pSrc+240, cd15.val[0]);
    vst1q_f32(pSrc+244, cd15.val[1]);
    vst1q_f32(pSrc+248, cd16.val[0]);
    vst1q_f32(pSrc+252, cd16.val[1]);

    // Merge the two 128-sample halves
    arm_bitonic_merge_128_f32(pSrc+0  , dir);
    arm_bitonic_merge_128_f32(pSrc+128, dir);
}

#define SWAP(a,i,j)                                 \
    temp = vgetq_lane_f32(a, j);                    \
    a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j); \
    a = vsetq_lane_f32(temp, a, i);
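
/* Note: vgetq_lane_f32() and vsetq_lane_f32() require compile-time constant
 * lane indices, so SWAP() must only be used with literal i and j, and it
 * relies on a float32_t temp variable being in scope at the call site. */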

static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
{
    float32_t temp;

    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
        SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
    {
        SWAP(a,2,3);
    }

    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
    {
        SWAP(a,0,3);
    }
    if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
    {
        SWAP(a,1,2);
    }

    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
        SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
    {
        SWAP(a,2,3);
    }

    return a;
}
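
/* arm_bitonic_sort_4_f32() sorts the four lanes of a single vector with
 * scalar lane extracts, which is adequate for the tiny blockSize == 4 and
 * blockSize == 8 cases; larger blocks go through the fully vectorized
 * network in arm_bitonic_sort_16_f32() instead. */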

static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    a = arm_bitonic_sort_4_f32(a, dir);
    b = arm_bitonic_sort_4_f32(b, dir);
    return arm_bitonic_merge_8_f32(a, b, dir);
}

#endif

/**
  @ingroup groupSupport
 */

/**
  @defgroup Sorting Vector sorting algorithms

  Sort the elements of a vector.

  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
 */

/**
  @addtogroup Sorting
  @{
 */

/**
 * @private
 * @param[in]  S          points to an instance of the sorting structure.
 * @param[in]  pSrc       points to the block of input data.
 * @param[out] pDst       points to the block of output data.
 * @param[in]  blockSize  number of samples to process.
 */
void arm_bitonic_sort_f32(
  const arm_sort_instance_f32 * S,
        float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
{
    uint32_t s, i;
    uint8_t dir = S->dir;

#ifdef ARM_MATH_NEON
    (void)s;

    float32_t * pOut;
    uint32_t counter = blockSize>>5;

    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        if(pSrc == pDst) // in-place
            pOut = pSrc;
        else
            pOut = pDst;

        float32x4x2_t ab1, ab2;
        float32x4x2_t cd1, cd2;

        if(blockSize == 1)
            pOut[0] = pSrc[0];
        else if(blockSize == 2)
        {
            float32_t temp;

            if( dir==(pSrc[0]>pSrc[1]) )
            {
                temp = pSrc[1];
                pOut[1] = pSrc[0];
                pOut[0] = temp;
            }
            else
            {
                // Copy through so the result also lands in pDst when sorting out-of-place
                pOut[0] = pSrc[0];
                pOut[1] = pSrc[1];
            }
        }
        else if(blockSize == 4)
        {
            float32x4_t a = vld1q_f32(pSrc);

            a = arm_bitonic_sort_4_f32(a, dir);

            vst1q_f32(pOut, a);
        }
        else if(blockSize == 8)
        {
            float32x4_t a;
            float32x4_t b;
            float32x4x2_t ab;

            a = vld1q_f32(pSrc);
            b = vld1q_f32(pSrc+4);

            ab = arm_bitonic_sort_8_f32(a, b, dir);

            vst1q_f32(pOut, ab.val[0]);
            vst1q_f32(pOut+4, ab.val[1]);
        }
        else if(blockSize >= 16)
        {
            // Sort subarrays of 16 samples
            for(i=0; i<blockSize; i=i+16)
                arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);

            // Merge pairs of sorted 16-sample subarrays
            for(i=0; i<counter; i++)
            {
                // Load the first half in order and the second half reversed
                ab1.val[0] = vld1q_f32(pOut+32*i+0 );
                ab1.val[1] = vld1q_f32(pOut+32*i+4 );
                ab2.val[0] = vld1q_f32(pOut+32*i+8 );
                ab2.val[1] = vld1q_f32(pOut+32*i+12);

                vldrev128q_f32(cd2.val[1], pOut+32*i+16);
                vldrev128q_f32(cd2.val[0], pOut+32*i+20);
                vldrev128q_f32(cd1.val[1], pOut+32*i+24);
                vldrev128q_f32(cd1.val[0], pOut+32*i+28);

                arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
            }

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_64_f32(pOut+64*i, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_128_f32(pOut+128*i, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_256_f32(pOut+256*i, dir);

            // Larger power-of-two block sizes would require further merge stages
        }
    }

#else

    float32_t * pA;

    if(pSrc != pDst) // out-of-place
    {
        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
        pA = pDst;
    }
    else
        pA = pSrc;

    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        // Sort sub-blocks of doubling size; each pass merges pairs of
        // already sorted sub-blocks with arm_bitonic_sort_core_f32()
        for(s=2; s<=blockSize; s=s*2)
        {
            for(i=0; i<blockSize; i=i+s)
                arm_bitonic_sort_core_f32(pA+i, s, dir);
        }
    }
#endif
}
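
/* Example usage (illustrative): the bitonic kernel is normally reached
 * through the generic sorting API. Assuming arm_sort_init_f32(),
 * ARM_SORT_BITONIC and ARM_SORT_ASCENDING from the CMSIS-DSP support
 * functions:
 *
 *     arm_sort_instance_f32 S;
 *     float32_t in[16]  = { ... };
 *     float32_t out[16];
 *
 *     arm_sort_init_f32(&S, ARM_SORT_BITONIC, ARM_SORT_ASCENDING);
 *     arm_sort_f32(&S, in, out, 16);  // blockSize must be a power of 2
 */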

/**
  @} end of Sorting group
 */