/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_bitonic_sort_f32.c
 * Description:  Floating point bitonic sort
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
28 
29 #include "dsp/support_functions.h"
30 #include "arm_sorting.h"
31 
32 
33 #if !defined(ARM_MATH_NEON)
34 
/**
 * Merge the two sorted halves of pSrc[0..n-1] into a single run sorted in
 * the requested direction.
 *
 * On entry, pSrc[0..n/2-1] and pSrc[n/2..n-1] must each already be sorted
 * in direction dir (this is how the caller builds runs bottom-up).
 *
 * @param[in,out] pSrc  points to the n samples to merge (modified in place)
 * @param[in]     n     run length; must be a power of two >= 2
 * @param[in]     dir   sorting direction: 1 = ascending, 0 = descending
 */
static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
{
    uint32_t step;
    uint32_t k, j;
    float32_t *leftPtr, *rightPtr;
    float32_t temp;

    /* First stage: compare element k against element n-1-k. Because the
     * second half is traversed backwards, this turns two same-direction
     * sorted halves into two halves that can be cleaned independently. */
    step = n >> 1;
    leftPtr  = pSrc;
    rightPtr = pSrc + n - 1;

    for (k = 0; k < step; k++)
    {
        /* dir==1: swap when left > right (ascending);
         * dir==0: swap when left <= right (descending). */
        if (dir == (*leftPtr > *rightPtr))
        {
            /* Swap */
            temp      = *leftPtr;
            *leftPtr  = *rightPtr;
            *rightPtr = temp;
        }

        leftPtr++;  /* Move right */
        rightPtr--; /* Move left  */
    }

    /* Merge: half-cleaner stages with halving compare distance. */
    for (step = (n >> 2); step > 0; step /= 2)
    {
        for (j = 0; j < n; j = j + step * 2)
        {
            leftPtr  = pSrc + j;
            rightPtr = pSrc + j + step;

            for (k = 0; k < step; k++)
            {
                /* BUG FIX: the comparison must honour the requested
                 * direction, exactly as in the first stage. The previous
                 * unconditional (*leftPtr > *rightPtr) test always cleaned
                 * ascending, which corrupted descending (dir==0) sorts. */
                if (dir == (*leftPtr > *rightPtr))
                {
                    /* Swap */
                    temp      = *leftPtr;
                    *leftPtr  = *rightPtr;
                    *rightPtr = temp;
                }

                leftPtr++;
                rightPtr++;
            }
        }
    }
}
84 #endif
85 
86 #if defined(ARM_MATH_NEON)
87 
88 
/**
 * Final clean-up stages of an 8-sample bitonic merge.
 *
 * On entry, every element of a is already ordered with respect to every
 * element of b (a holds the mins for dir==1, the maxes for dir==0), and
 * both vectors are bitonic. vtrn128_64q / vtrn128_32q / vminmaxq are
 * helper macros from arm_sorting.h that modify their arguments in place.
 *
 * Returns the fully sorted 8 samples as a zipped float32x4x2_t.
 */
static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+
     * | a | b | c | d |
     * +---+---+---+---+
     * +---+---+---+---+
     * | e | f | g | h |
     * +---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abcd],[efgh])
     * vminmax([acbd],[egfh])
     */
    /* Swap the 64-bit halves of a and b (in place). */
    vtrn128_64q(a, b);
    /* +---+---+---+---+
     * | a | b | e | f |
     * +---+---+---+---+
     * +---+---+---+---+
     * | c | d | g | h |
     * +---+---+---+---+
     */
    /* Distance-2 compare/exchange; operand order selects the direction. */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    /* Interleave 32-bit lanes of a and b (in place). */
    vtrn128_32q(a, b);
    /* +---+---+---+---+
     * | a | c | e | g |
     * +---+---+---+---+
     * +---+---+---+---+
     * | b | d | f | h |
     * +---+---+---+---+
     */
    /* Distance-1 compare/exchange. */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    /* Zip the lanes back into natural order. */
    return vzipq_f32(a, b);
}
132 
133 
arm_bitonic_merge_8_f32(float32x4_t a,float32x4_t b,uint8_t dir)134 static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
135 {
136     /* a and b are guaranteed to be bitonic */
137     // Reverse the element of the second vector
138     b = vrev128q_f32(b);
139 
140     // Compare the two vectors
141     if(dir)
142         vminmaxq(a, b);
143     else
144     vminmaxq(b, a);
145 
146     // Merge the two vectors
147     float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);
148 
149     return ab;
150 }
151 
/**
 * Final clean-up stages of a 16-sample bitonic merge; stores the sorted
 * result to pOut[0..15].
 *
 * On entry, every element of a is already ordered with respect to every
 * element of b, and both 8-sample groups are bitonic. vtrn256_* and
 * vminmax256q are helper macros from arm_sorting.h operating in place on
 * the named float32x4x2_t variables.
 */
static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+---+---+---+---+
     * | a | b | c | d | e | f | g | h |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | i | j | k | l | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
     * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
     * vminmax([aceg],[bdfh]) vminmax([ikmo],[jlnp])
     */

    /* Exchange 128-bit halves across the two groups. */
    vtrn256_128q(a, b);
    /* +---+---+---+---+---+---+---+---+
     * | a | b | c | d | i | j | k | l |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | e | f | g | h | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    /* Distance-4 compare/exchange; operand order selects the direction. */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    /* Exchange 64-bit sub-groups. */
    vtrn256_64q(a, b);

    /* +---+---+---+---+---+---+---+---+
     * | a | b | e | f | i | j | m | n |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | c | d | g | h | k | l | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    /* Distance-2 compare/exchange. */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    /* Interleave 32-bit lanes. */
    vtrn256_32q(a, b);
    /* We now have:
     * +---+---+---+---+---+---+---+---+
     * | a | c | e | g | i | k | m | o |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | b | d | f | h | j | l | n | p |
     * +---+---+---+---+---+---+---+---+
     */
    /* Distance-1 compare/exchange. */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    /* Zip lanes back into natural order and store the 16 sorted samples. */
    float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
    float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);

    vst1q_f32(pOut, out1.val[0]);
    vst1q_f32(pOut+4, out1.val[1]);
    vst1q_f32(pOut+8, out2.val[0]);
    vst1q_f32(pOut+12, out2.val[1]);
}
218 
/**
 * Merge two sorted 8-sample groups (a and b) into one sorted 16-sample
 * run stored at pOut[0..15].
 *
 * Both inputs must already be sorted in direction dir.
 */
static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    // Merge two preordered float32x4x2_t
    /* Reverse b in place (helper macro) so a|b forms a bitonic sequence. */
    vrev256q_f32(b);

    /* Distance-8 compare/exchange; operand order selects the direction. */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    /* Clean up both halves and store the sorted run. */
    arm_bitonic_resort_16_f32(pOut, a, b, dir);
}
231 
/**
 * Sort 16 samples from pSrc and store the sorted run to pDst.
 *
 * Strategy: run four 4-element sorting networks in parallel across the
 * lanes of four vectors, transpose so each vector holds one sorted run,
 * then merge 4+4 -> 8 and 8+8 -> 16.
 *
 * @param[in]  pSrc  points to 16 input samples
 * @param[out] pDst  points to 16 output samples (may alias pSrc)
 * @param[in]  dir   sorting direction: 1 = ascending, 0 = descending
 */
static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
{
    float32x4_t a;
    float32x4_t b;
    float32x4_t c;
    float32x4_t d;

    // Load 16 samples
    a = vld1q_f32(pSrc);
    b = vld1q_f32(pSrc+4);
    c = vld1q_f32(pSrc+8);
    d = vld1q_f32(pSrc+12);

    // Bitonic sorting network for 4 samples x 4 times
    /* Lane i of (a,b,c,d) is one independent 4-element list; the
     * compare/exchange sequence (a,b)(c,d)(a,d)(b,c)(a,b)(c,d) sorts each
     * list. Operand order of vminmaxq selects the direction. */
    if(dir)
    {
        vminmaxq(a, b);
        vminmaxq(c, d);

        vminmaxq(a, d);
        vminmaxq(b, c);

        vminmaxq(a, b);
        vminmaxq(c, d);
    }
    else
    {
        vminmaxq(b, a);
        vminmaxq(d, c);

        vminmaxq(d, a);
        vminmaxq(c, b);

        vminmaxq(b, a);
        vminmaxq(d, c);
    }

    float32x4x2_t ab = vtrnq_f32 (a, b);
    float32x4x2_t cd = vtrnq_f32 (c, d);

    // Transpose 4 ordered arrays of 4 samples
    /* After the transpose each vector holds one sorted 4-sample run. */
    a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0]));
    b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1]));
    c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
    d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));

    // Merge pairs of arrays of 4 samples
    ab = arm_bitonic_merge_8_f32(a, b, dir);
    cd = arm_bitonic_merge_8_f32(c, d, dir);

    // Merge arrays of 8 samples
    arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
}
285 
286 
287 
288 
289 
/**
 * Finish merging 32 samples held in registers and store them to pSrc.
 *
 * ab1/ab2 hold the first 16 samples, cd1/cd2 the second 16; the caller
 * has already arranged them so the whole 32-sample sequence is bitonic.
 * The sorted run is written to pSrc[0..31].
 */
static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
{
    float32x4x2_t exchanged;

    /* Distance-16 compare/exchange; operand order selects the direction. */
    if (dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }

    /* 256-bit transpose: exchange the middle 8-sample groups via whole
     * struct assignment. */
    exchanged = ab2;
    ab2 = cd1;
    cd1 = exchanged;

    /* Distance-8 compare/exchange. */
    if (dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }

    /* Finish each 16-sample half and store it. */
    arm_bitonic_merge_16_f32(pSrc,      ab1, cd1, dir);
    arm_bitonic_merge_16_f32(pSrc + 16, ab2, cd2, dir);
}
329 
/**
 * Merge two sorted 32-sample runs stored back to back at pSrc[0..63]
 * into one sorted 64-sample run (in place).
 */
static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab[4];
    float32x4x2_t cd[4];
    float32x4x2_t exchanged;
    int k;

    /* Load the first run in natural order. */
    for (k = 0; k < 4; k++)
    {
        ab[k].val[0] = vld1q_f32(pSrc + 8*k);
        ab[k].val[1] = vld1q_f32(pSrc + 8*k + 4);
    }

    /* Load the second run reversed (vldrev128q_f32 is a load+reverse
     * helper macro) so ab|cd forms a bitonic sequence. */
    for (k = 0; k < 4; k++)
    {
        vldrev128q_f32(cd[3-k].val[1], pSrc + 32 + 8*k);
        vldrev128q_f32(cd[3-k].val[0], pSrc + 32 + 8*k + 4);
    }

    /* Distance-32 compare/exchange; operand order selects the direction. */
    for (k = 0; k < 4; k++)
    {
        if (dir)
        {
            vminmax256q(ab[k], cd[k]);
        }
        else
        {
            vminmax256q(cd[k], ab[k]);
        }
    }

    /* 512-bit transpose: exchange the middle 16-sample groups. */
    exchanged = ab[2];
    ab[2] = cd[0];
    cd[0] = exchanged;
    exchanged = ab[3];
    ab[3] = cd[1];
    cd[1] = exchanged;

    /* Distance-16 compare/exchange. */
    for (k = 0; k < 4; k++)
    {
        if (dir)
        {
            vminmax256q(ab[k], cd[k]);
        }
        else
        {
            vminmax256q(cd[k], ab[k]);
        }
    }

    /* Finish each 32-sample half. */
    arm_bitonic_merge_32_f32(pSrc,      ab[0], ab[1], cd[0], cd[1], dir);
    arm_bitonic_merge_32_f32(pSrc + 32, ab[2], ab[3], cd[2], cd[3], dir);
}
406 
/**
 * Merge two sorted 64-sample runs stored back to back at pSrc[0..127]
 * into one sorted 128-sample run (in place).
 */
static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab[8];
    float32x4x2_t cd[8];
    float32x4x2_t exchanged;
    int k;

    /* Load the first run in natural order. */
    for (k = 0; k < 8; k++)
    {
        ab[k].val[0] = vld1q_f32(pSrc + 8*k);
        ab[k].val[1] = vld1q_f32(pSrc + 8*k + 4);
    }

    /* Load the second run reversed so ab|cd forms a bitonic sequence. */
    for (k = 0; k < 8; k++)
    {
        vldrev128q_f32(cd[7-k].val[1], pSrc + 64 + 8*k);
        vldrev128q_f32(cd[7-k].val[0], pSrc + 64 + 8*k + 4);
    }

    /* Distance-64 compare/exchange; operand order selects the direction. */
    for (k = 0; k < 8; k++)
    {
        if (dir)
        {
            vminmax256q(ab[k], cd[k]);
        }
        else
        {
            vminmax256q(cd[k], ab[k]);
        }
    }

    /* 1024-bit transpose: exchange the middle 32-sample groups. */
    for (k = 0; k < 4; k++)
    {
        exchanged = ab[4+k];
        ab[4+k] = cd[k];
        cd[k] = exchanged;
    }

    /* Distance-32 compare/exchange. */
    for (k = 0; k < 8; k++)
    {
        if (dir)
        {
            vminmax256q(ab[k], cd[k]);
        }
        else
        {
            vminmax256q(cd[k], ab[k]);
        }
    }

    /* Spill the partially merged data so each 64-sample half can be
     * finished independently: ab[0..3], cd[0..3] form the first half,
     * ab[4..7], cd[4..7] the second. */
    for (k = 0; k < 4; k++)
    {
        vst1q_f32(pSrc + 8*k,          ab[k].val[0]);
        vst1q_f32(pSrc + 8*k + 4,      ab[k].val[1]);
        vst1q_f32(pSrc + 32 + 8*k,     cd[k].val[0]);
        vst1q_f32(pSrc + 32 + 8*k + 4, cd[k].val[1]);
        vst1q_f32(pSrc + 64 + 8*k,     ab[4+k].val[0]);
        vst1q_f32(pSrc + 64 + 8*k + 4, ab[4+k].val[1]);
        vst1q_f32(pSrc + 96 + 8*k,     cd[4+k].val[0]);
        vst1q_f32(pSrc + 96 + 8*k + 4, cd[4+k].val[1]);
    }

    /* Finish each 64-sample half. */
    arm_bitonic_merge_64_f32(pSrc,      dir);
    arm_bitonic_merge_64_f32(pSrc + 64, dir);
}
560 
/**
 * Merge two sorted 128-sample runs stored back to back at pSrc[0..255]
 * into one sorted 256-sample run (in place).
 *
 * Same scheme as arm_bitonic_merge_64/128_f32 scaled up: load the second
 * run reversed so the whole sequence is bitonic, run two compare/exchange
 * stages (distance 128, then 64), store, and finish each 128-sample half
 * recursively.
 */
static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
    float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;

    //Load and reverse second array
    /* First run (128 samples) in natural order. */
    ab1.val[0]  = vld1q_f32(pSrc+0  );
    ab1.val[1]  = vld1q_f32(pSrc+4  );
    ab2.val[0]  = vld1q_f32(pSrc+8  );
    ab2.val[1]  = vld1q_f32(pSrc+12 );
    ab3.val[0]  = vld1q_f32(pSrc+16 );
    ab3.val[1]  = vld1q_f32(pSrc+20 );
    ab4.val[0]  = vld1q_f32(pSrc+24 );
    ab4.val[1]  = vld1q_f32(pSrc+28 );
    ab5.val[0]  = vld1q_f32(pSrc+32 );
    ab5.val[1]  = vld1q_f32(pSrc+36 );
    ab6.val[0]  = vld1q_f32(pSrc+40 );
    ab6.val[1]  = vld1q_f32(pSrc+44 );
    ab7.val[0]  = vld1q_f32(pSrc+48 );
    ab7.val[1]  = vld1q_f32(pSrc+52 );
    ab8.val[0]  = vld1q_f32(pSrc+56 );
    ab8.val[1]  = vld1q_f32(pSrc+60 );
    ab9.val[0]  = vld1q_f32(pSrc+64 );
    ab9.val[1]  = vld1q_f32(pSrc+68 );
    ab10.val[0] = vld1q_f32(pSrc+72 );
    ab10.val[1] = vld1q_f32(pSrc+76 );
    ab11.val[0] = vld1q_f32(pSrc+80 );
    ab11.val[1] = vld1q_f32(pSrc+84 );
    ab12.val[0] = vld1q_f32(pSrc+88 );
    ab12.val[1] = vld1q_f32(pSrc+92 );
    ab13.val[0] = vld1q_f32(pSrc+96 );
    ab13.val[1] = vld1q_f32(pSrc+100);
    ab14.val[0] = vld1q_f32(pSrc+104);
    ab14.val[1] = vld1q_f32(pSrc+108);
    ab15.val[0] = vld1q_f32(pSrc+112);
    ab15.val[1] = vld1q_f32(pSrc+116);
    ab16.val[0] = vld1q_f32(pSrc+120);
    ab16.val[1] = vld1q_f32(pSrc+124);

    /* Second run loaded reversed (vldrev128q_f32 = load + lane reverse)
     * so ab|cd forms a bitonic sequence. */
    vldrev128q_f32(cd16.val[1], pSrc+128);
    vldrev128q_f32(cd16.val[0], pSrc+132);
    vldrev128q_f32(cd15.val[1], pSrc+136);
    vldrev128q_f32(cd15.val[0], pSrc+140);
    vldrev128q_f32(cd14.val[1], pSrc+144);
    vldrev128q_f32(cd14.val[0], pSrc+148);
    vldrev128q_f32(cd13.val[1], pSrc+152);
    vldrev128q_f32(cd13.val[0], pSrc+156);
    vldrev128q_f32(cd12.val[1], pSrc+160);
    vldrev128q_f32(cd12.val[0], pSrc+164);
    vldrev128q_f32(cd11.val[1], pSrc+168);
    vldrev128q_f32(cd11.val[0], pSrc+172);
    vldrev128q_f32(cd10.val[1], pSrc+176);
    vldrev128q_f32(cd10.val[0], pSrc+180);
    vldrev128q_f32(cd9.val[1] , pSrc+184);
    vldrev128q_f32(cd9.val[0] , pSrc+188);
    vldrev128q_f32(cd8.val[1] , pSrc+192);
    vldrev128q_f32(cd8.val[0] , pSrc+196);
    vldrev128q_f32(cd7.val[1] , pSrc+200);
    vldrev128q_f32(cd7.val[0] , pSrc+204);
    vldrev128q_f32(cd6.val[1] , pSrc+208);
    vldrev128q_f32(cd6.val[0] , pSrc+212);
    vldrev128q_f32(cd5.val[1] , pSrc+216);
    vldrev128q_f32(cd5.val[0] , pSrc+220);
    vldrev128q_f32(cd4.val[1] , pSrc+224);
    vldrev128q_f32(cd4.val[0] , pSrc+228);
    vldrev128q_f32(cd3.val[1] , pSrc+232);
    vldrev128q_f32(cd3.val[0] , pSrc+236);
    vldrev128q_f32(cd2.val[1] , pSrc+240);
    vldrev128q_f32(cd2.val[0] , pSrc+244);
    vldrev128q_f32(cd1.val[1] , pSrc+248);
    vldrev128q_f32(cd1.val[0] , pSrc+252);

    //Compare
    /* Distance-128 compare/exchange; operand order selects the direction. */
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }

    //Transpose
    /* 2048-bit transpose: exchange the middle 64-sample groups
     * (ab9..ab16 <-> cd1..cd8). */
    float32x4_t temp;

    temp = ab9.val[0];
    ab9.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab9.val[1];
    ab9.val[1] = cd1.val[1];
    cd1.val[1] = temp;
    temp = ab10.val[0];
    ab10.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab10.val[1];
    ab10.val[1] = cd2.val[1];
    cd2.val[1] = temp;
    temp = ab11.val[0];
    ab11.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab11.val[1];
    ab11.val[1] = cd3.val[1];
    cd3.val[1] = temp;
    temp = ab12.val[0];
    ab12.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab12.val[1];
    ab12.val[1] = cd4.val[1];
    cd4.val[1] = temp;
    temp = ab13.val[0];
    ab13.val[0] = cd5.val[0];
    cd5.val[0] = temp;
    temp = ab13.val[1];
    ab13.val[1] = cd5.val[1];
    cd5.val[1] = temp;
    temp = ab14.val[0];
    ab14.val[0] = cd6.val[0];
    cd6.val[0] = temp;
    temp = ab14.val[1];
    ab14.val[1] = cd6.val[1];
    cd6.val[1] = temp;
    temp = ab15.val[0];
    ab15.val[0] = cd7.val[0];
    cd7.val[0] = temp;
    temp = ab15.val[1];
    ab15.val[1] = cd7.val[1];
    cd7.val[1] = temp;
    temp = ab16.val[0];
    ab16.val[0] = cd8.val[0];
    cd8.val[0] = temp;
    temp = ab16.val[1];
    ab16.val[1] = cd8.val[1];
    cd8.val[1] = temp;

    //Compare
    /* Distance-64 compare/exchange. */
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }

    /* Spill so each 128-sample half can be finished independently:
     * first half = ab1..ab8, cd1..cd8; second half = ab9..ab16, cd9..cd16. */
    vst1q_f32(pSrc,     ab1.val[0] );
    vst1q_f32(pSrc+4,   ab1.val[1] );
    vst1q_f32(pSrc+8,   ab2.val[0] );
    vst1q_f32(pSrc+12,  ab2.val[1] );
    vst1q_f32(pSrc+16,  ab3.val[0] );
    vst1q_f32(pSrc+20,  ab3.val[1] );
    vst1q_f32(pSrc+24,  ab4.val[0] );
    vst1q_f32(pSrc+28,  ab4.val[1] );
    vst1q_f32(pSrc+32,  ab5.val[0] );
    vst1q_f32(pSrc+36,  ab5.val[1] );
    vst1q_f32(pSrc+40,  ab6.val[0] );
    vst1q_f32(pSrc+44,  ab6.val[1] );
    vst1q_f32(pSrc+48,  ab7.val[0] );
    vst1q_f32(pSrc+52,  ab7.val[1] );
    vst1q_f32(pSrc+56,  ab8.val[0] );
    vst1q_f32(pSrc+60,  ab8.val[1] );
    vst1q_f32(pSrc+64,  cd1.val[0] );
    vst1q_f32(pSrc+68,  cd1.val[1] );
    vst1q_f32(pSrc+72,  cd2.val[0] );
    vst1q_f32(pSrc+76,  cd2.val[1] );
    vst1q_f32(pSrc+80,  cd3.val[0] );
    vst1q_f32(pSrc+84,  cd3.val[1] );
    vst1q_f32(pSrc+88,  cd4.val[0] );
    vst1q_f32(pSrc+92,  cd4.val[1] );
    vst1q_f32(pSrc+96,  cd5.val[0] );
    vst1q_f32(pSrc+100, cd5.val[1] );
    vst1q_f32(pSrc+104, cd6.val[0] );
    vst1q_f32(pSrc+108, cd6.val[1] );
    vst1q_f32(pSrc+112, cd7.val[0] );
    vst1q_f32(pSrc+116, cd7.val[1] );
    vst1q_f32(pSrc+120, cd8.val[0] );
    vst1q_f32(pSrc+124, cd8.val[1] );
    vst1q_f32(pSrc+128, ab9.val[0] );
    vst1q_f32(pSrc+132, ab9.val[1] );
    vst1q_f32(pSrc+136, ab10.val[0]);
    vst1q_f32(pSrc+140, ab10.val[1]);
    vst1q_f32(pSrc+144, ab11.val[0]);
    vst1q_f32(pSrc+148, ab11.val[1]);
    vst1q_f32(pSrc+152, ab12.val[0]);
    vst1q_f32(pSrc+156, ab12.val[1]);
    vst1q_f32(pSrc+160, ab13.val[0]);
    vst1q_f32(pSrc+164, ab13.val[1]);
    vst1q_f32(pSrc+168, ab14.val[0]);
    vst1q_f32(pSrc+172, ab14.val[1]);
    vst1q_f32(pSrc+176, ab15.val[0]);
    vst1q_f32(pSrc+180, ab15.val[1]);
    vst1q_f32(pSrc+184, ab16.val[0]);
    vst1q_f32(pSrc+188, ab16.val[1]);
    vst1q_f32(pSrc+192, cd9.val[0] );
    vst1q_f32(pSrc+196, cd9.val[1] );
    vst1q_f32(pSrc+200, cd10.val[0]);
    vst1q_f32(pSrc+204, cd10.val[1]);
    vst1q_f32(pSrc+208, cd11.val[0]);
    vst1q_f32(pSrc+212, cd11.val[1]);
    vst1q_f32(pSrc+216, cd12.val[0]);
    vst1q_f32(pSrc+220, cd12.val[1]);
    vst1q_f32(pSrc+224, cd13.val[0]);
    vst1q_f32(pSrc+228, cd13.val[1]);
    vst1q_f32(pSrc+232, cd14.val[0]);
    vst1q_f32(pSrc+236, cd14.val[1]);
    vst1q_f32(pSrc+240, cd15.val[0]);
    vst1q_f32(pSrc+244, cd15.val[1]);
    vst1q_f32(pSrc+248, cd16.val[0]);
    vst1q_f32(pSrc+252, cd16.val[1]);

    /* Finish each 128-sample half. */
    arm_bitonic_merge_128_f32(pSrc+0  , dir);
    arm_bitonic_merge_128_f32(pSrc+128, dir);
}
836 
/* Exchange lanes i and j of the NEON vector a; i and j must be
 * compile-time constants (vgetq_lane_f32/vsetq_lane_f32 requirement).
 * NOTE: relies on a float32_t variable named `temp` in the caller's scope
 * (see arm_bitonic_sort_4_f32).
 * BUG FIX: wrapped in do { } while (0) so the three statements behave as
 * one — without it, an unbraced `if (cond) SWAP(a,0,1);` would execute
 * two of the statements unconditionally. */
#define SWAP(a,i,j)                                             \
    do {                                                        \
        temp = vgetq_lane_f32((a), (j));                        \
        (a) = vsetq_lane_f32(vgetq_lane_f32((a), (i)), (a), (j)); \
        (a) = vsetq_lane_f32(temp, (a), (i));                   \
    } while (0)
841 
/**
 * Sort the four lanes of a with a scalar compare/exchange network.
 *
 * Network: (0,1)(2,3)(0,3)(1,2)(0,1)(2,3) — the bitonic network for 4
 * elements. Each swap triggers when dir == (lane_i > lane_j), so dir==1
 * yields ascending lane order and dir==0 descending.
 *
 * @param[in] a    vector whose lanes are sorted
 * @param[in] dir  sorting direction: 1 = ascending, 0 = descending
 * @return the vector with sorted lanes
 */
static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
{
    /* `temp` is used by the SWAP macro. */
    float32_t temp;


    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
        SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
    {
       SWAP(a,2,3);
    }

    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
    {
      SWAP(a,0,3);
    }
    if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
    {
      SWAP(a,1,2);
    }

    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
      SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) )
    {
      SWAP(a,2,3);
    }

    return a;
}
876 
arm_bitonic_sort_8_f32(float32x4_t a,float32x4_t b,uint8_t dir)877 static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
878 {
879     a = arm_bitonic_sort_4_f32(a, dir);
880     b = arm_bitonic_sort_4_f32(b, dir);
881     return arm_bitonic_merge_8_f32(a, b, dir);
882 }
883 
884 
885 
886 #endif
887 
/**
  @ingroup groupSupport
 */

/**
  @defgroup Sorting Vector sorting algorithms

  Sort the elements of a vector

  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
 */

/**
  @addtogroup Sorting
  @{
 */

/**
   * @private
   * @param[in]  S          points to an instance of the sorting structure.
   * @param[in]  pSrc       points to the block of input data.
   * @param[out] pDst       points to the block of output data.
   * @param[in]  blockSize  number of samples to process.
   */
void arm_bitonic_sort_f32(
const arm_sort_instance_f32 * S,
      float32_t * pSrc,
      float32_t * pDst,
      uint32_t blockSize)
{
    /* BUG FIX: counters widened from uint16_t to uint32_t so block sizes
     * of 65536 and above cannot wrap and loop forever. */
    uint32_t s, i;
    uint8_t dir = S->dir;

#ifdef ARM_MATH_NEON
    (void)s;

    float32_t * pOut;
    uint32_t counter = blockSize>>5;

    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        if(pSrc == pDst) // in-place
            pOut = pSrc;
        else
    	    pOut = pDst;

        float32x4x2_t ab1, ab2;
        float32x4x2_t cd1, cd2;

        if(blockSize == 1)
        {
            /* BUG FIX: the previous code redirected pOut to pSrc, leaving
             * an out-of-place destination untouched; copy the sample. */
            if(pOut != pSrc)
                pOut[0] = pSrc[0];
        }
        else if(blockSize == 2)
        {
            /* Read both samples first so in-place and out-of-place behave
             * identically.
             * BUG FIX: previously the unswapped out-of-place case never
             * wrote pDst at all. */
            float32_t first  = pSrc[0];
            float32_t second = pSrc[1];

            if( dir == (first > second) )
            {
                pOut[0] = second;
                pOut[1] = first;
            }
            else
            {
                pOut[0] = first;
                pOut[1] = second;
            }
        }
        else if(blockSize == 4)
        {
            float32x4_t a = vld1q_f32(pSrc);

            a = arm_bitonic_sort_4_f32(a, dir);

            vst1q_f32(pOut, a);
        }
        else if(blockSize == 8)
        {
            float32x4_t a;
            float32x4_t b;
            float32x4x2_t ab;

            a = vld1q_f32(pSrc);
            b = vld1q_f32(pSrc+4);

            ab = arm_bitonic_sort_8_f32(a, b, dir);

            vst1q_f32(pOut,   ab.val[0]);
            vst1q_f32(pOut+4, ab.val[1]);
        }
        else if(blockSize >= 16)
        {
            /* Sort each 16-sample chunk with the NEON sorting network. */
            for(i=0; i<blockSize; i=i+16)
                arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);

            /* Merge pairs of 16-sample runs into 32-sample runs. */
            for(i=0; i<counter; i++)
            {
                /* Load the first run in order and the second reversed so
                 * the 32 samples form a bitonic sequence. */
                ab1.val[0] = vld1q_f32(pOut+32*i+0 );
                ab1.val[1] = vld1q_f32(pOut+32*i+4 );
                ab2.val[0] = vld1q_f32(pOut+32*i+8 );
                ab2.val[1] = vld1q_f32(pOut+32*i+12);

                vldrev128q_f32(cd2.val[1], pOut+32*i+16);
                vldrev128q_f32(cd2.val[0], pOut+32*i+20);
                vldrev128q_f32(cd1.val[1], pOut+32*i+24);
                vldrev128q_f32(cd1.val[0], pOut+32*i+28);

                arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
            }

            /* Keep doubling the run length. */
            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_64_f32(pOut+64*i, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_128_f32(pOut+128*i, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_256_f32(pOut+256*i, dir);

            /* NOTE(review): merging stops at 512-sample runs; blockSize
             * larger than 512 is not fully merged by this path — confirm
             * the intended size limit. */
        }
    }

#else

    float32_t * pA;

    /* Work in the destination buffer; copy the input there first when
     * sorting out-of-place. */
    if(pSrc != pDst) // out-of-place
    {
        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
        pA = pDst;
    }
    else
        pA = pSrc;


    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        /* Bottom-up: build sorted runs of length s = 2, 4, ... blockSize
         * by merging pairs of sorted runs of length s/2. */
        for(s=2; s<=blockSize; s=s*2)
        {
    	    for(i=0; i<blockSize; i=i+s)
    	        arm_bitonic_sort_core_f32(pA+i, s, dir);
        }
    }
#endif
}
1036 
/**
  @} end of Sorting group
 */