/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_bitonic_sort_f32.c
 * Description:  Floating point bitonic sort
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/support_functions.h"
#include "arm_sorting.h"

#if !defined(ARM_MATH_NEON)
static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
{
    uint32_t step;
    uint32_t k, j;
    float32_t *leftPtr, *rightPtr;
    float32_t temp;

    step = n>>1;
    leftPtr = pSrc;
    rightPtr = pSrc+n-1;

    for(k=0; k<step; k++)
    {
        if(dir == (*leftPtr > *rightPtr))
        {
            // Swap
            temp=*leftPtr;
            *leftPtr=*rightPtr;
            *rightPtr=temp;
        }

        leftPtr++;  // Move right
        rightPtr--; // Move left
    }

    // Merge
    for(step=(n>>2); step>0; step/=2)
    {
        for(j=0; j<n; j=j+step*2)
        {
            leftPtr  = pSrc+j;
            rightPtr = pSrc+j+step;

            for(k=0; k<step; k++)
            {
                if(dir == (*leftPtr > *rightPtr))
                {
                    // Swap
                    temp=*leftPtr;
                    *leftPtr=*rightPtr;
                    *rightPtr=temp;
                }

                leftPtr++;
                rightPtr++;
            }
        }
    }
}
#endif
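/*
 * Illustration (not part of the library): with dir=1 and n=8 the core above
 * merges two ascending runs of four into one ascending run of eight:
 *
 *   input           : { 1, 4, 6, 7,  2, 3, 5, 8 }   two sorted runs of 4
 *   first stage     : { 1, 4, 3, 2,  7, 6, 5, 8 }   a[k] compared with a[n-1-k]
 *   merge, step = 2 : { 1, 2, 3, 4,  5, 6, 7, 8 }
 *   merge, step = 1 : { 1, 2, 3, 4,  5, 6, 7, 8 }   already in place
 *
 * Calling the core with s = 2, 4, ..., blockSize over every aligned run of s
 * samples therefore sorts a whole power-of-2 block; this is how the non-NEON
 * branch of arm_bitonic_sort_f32() below drives it.
 */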
#if defined(ARM_MATH_NEON)

static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+
     * | a | b | c | d |
     * +---+---+---+---+
     * +---+---+---+---+
     * | e | f | g | h |
     * +---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abef],[cdgh])
     * vminmax([aceg],[bdfh])
     */
    vtrn128_64q(a, b);
    /* +---+---+---+---+
     * | a | b | e | f |
     * +---+---+---+---+
     * +---+---+---+---+
     * | c | d | g | h |
     * +---+---+---+---+
     */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    vtrn128_32q(a, b);
    /* +---+---+---+---+
     * | a | c | e | g |
     * +---+---+---+---+
     * +---+---+---+---+
     * | b | d | f | h |
     * +---+---+---+---+
     */
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    return vzipq_f32(a, b);
}

static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    /* a and b are guaranteed to be bitonic */

    // Reverse the elements of the second vector
    b = vrev128q_f32(b);

    // Compare the two vectors
    if(dir)
        vminmaxq(a, b);
    else
        vminmaxq(b, a);

    // Merge the two vectors
    float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);

    return ab;
}
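/*
 * vminmaxq(a, b) is a helper macro from arm_sorting.h. For reference, it is
 * assumed to perform the lane-wise compare-exchange sketched below, leaving
 * the minima in its first argument and the maxima in its second (which is
 * why the dir==0 branches above simply swap the argument order). The sketch
 * is illustrative only and not compiled:
 */
#if 0
static void example_vminmax(float32x4_t *lo, float32x4_t *hi)
{
    float32x4_t min = vminq_f32(*lo, *hi);  /* lane-wise minima */
    float32x4_t max = vmaxq_f32(*lo, *hi);  /* lane-wise maxima */

    *lo = min;
    *hi = max;
}
#endif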
static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    /* Start with two vectors:
     * +---+---+---+---+---+---+---+---+
     * | a | b | c | d | e | f | g | h |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | i | j | k | l | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     * All the elements of the first are guaranteed to be less than or equal to
     * all of the elements in the second, and both vectors are bitonic.
     * We need to perform these operations to completely sort both lists:
     * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
     * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
     * vminmax([aceg],[bdfh]) vminmax([ikmo],[jlnp])
     */
    vtrn256_128q(a, b);
    /* +---+---+---+---+---+---+---+---+
     * | a | b | c | d | i | j | k | l |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | e | f | g | h | m | n | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    vtrn256_64q(a, b);
    /* +---+---+---+---+---+---+---+---+
     * | a | b | e | f | i | j | m | n |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | c | d | g | h | k | l | o | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    vtrn256_32q(a, b);
    /* We now have:
     * +---+---+---+---+---+---+---+---+
     * | a | c | e | g | i | k | m | o |
     * +---+---+---+---+---+---+---+---+
     * +---+---+---+---+---+---+---+---+
     * | b | d | f | h | j | l | n | p |
     * +---+---+---+---+---+---+---+---+
     */
    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
    float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);

    vst1q_f32(pOut,    out1.val[0]);
    vst1q_f32(pOut+4,  out1.val[1]);
    vst1q_f32(pOut+8,  out2.val[0]);
    vst1q_f32(pOut+12, out2.val[1]);
}

static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
{
    // Merge two preordered float32x4x2_t
    vrev256q_f32(b);

    if(dir)
        vminmax256q(a, b);
    else
        vminmax256q(b, a);

    arm_bitonic_resort_16_f32(pOut, a, b, dir);
}

static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
{
    float32x4_t a;
    float32x4_t b;
    float32x4_t c;
    float32x4_t d;

    // Load 16 samples
    a = vld1q_f32(pSrc);
    b = vld1q_f32(pSrc+4);
    c = vld1q_f32(pSrc+8);
    d = vld1q_f32(pSrc+12);

    // Lane-wise 4-input sorting network: sorts four columns of 4 samples at once
    if(dir)
    {
        vminmaxq(a, b);
        vminmaxq(c, d);
        vminmaxq(a, d);
        vminmaxq(b, c);
        vminmaxq(a, b);
        vminmaxq(c, d);
    }
    else
    {
        vminmaxq(b, a);
        vminmaxq(d, c);
        vminmaxq(d, a);
        vminmaxq(c, b);
        vminmaxq(b, a);
        vminmaxq(d, c);
    }

    // Transpose 4 ordered arrays of 4 samples
    float32x4x2_t ab = vtrnq_f32(a, b);
    float32x4x2_t cd = vtrnq_f32(c, d);

    a = vcombine_f32(vget_low_f32(ab.val[0]),  vget_low_f32(cd.val[0]));
    b = vcombine_f32(vget_low_f32(ab.val[1]),  vget_low_f32(cd.val[1]));
    c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
    d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));

    // Merge pairs of arrays of 4 samples
    ab = arm_bitonic_merge_8_f32(a, b, dir);
    cd = arm_bitonic_merge_8_f32(c, d, dir);

    // Merge arrays of 8 samples
    arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
}
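/*
 * vminmax256q(a, b), also from arm_sorting.h, is assumed to apply the same
 * compare-exchange lane-wise to 256-bit values held as float32x4x2_t pairs.
 * Equivalent sketch (illustrative only, not compiled):
 */
#if 0
static void example_vminmax256(float32x4x2_t *lo, float32x4x2_t *hi)
{
    float32x4_t t;

    t = vminq_f32(lo->val[0], hi->val[0]);
    hi->val[0] = vmaxq_f32(lo->val[0], hi->val[0]);
    lo->val[0] = t;

    t = vminq_f32(lo->val[1], hi->val[1]);
    hi->val[1] = vmaxq_f32(lo->val[1], hi->val[1]);
    lo->val[1] = t;
}
#endif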
static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
{
    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }

    //Transpose 256
    float32x4_t temp;

    temp = ab2.val[0];
    ab2.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab2.val[1];
    ab2.val[1] = cd1.val[1];
    cd1.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
    }

    //Merge the two halves
    arm_bitonic_merge_16_f32(pSrc+0,  ab1, cd1, dir);
    arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
}

static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4;
    float32x4x2_t cd1, cd2, cd3, cd4;

    //Load and reverse second array
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 );
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24);
    ab4.val[1] = vld1q_f32(pSrc+28);

    vldrev128q_f32(cd4.val[1], pSrc+32);
    vldrev128q_f32(cd4.val[0], pSrc+36);
    vldrev128q_f32(cd3.val[1], pSrc+40);
    vldrev128q_f32(cd3.val[0], pSrc+44);
    vldrev128q_f32(cd2.val[1], pSrc+48);
    vldrev128q_f32(cd2.val[0], pSrc+52);
    vldrev128q_f32(cd1.val[1], pSrc+56);
    vldrev128q_f32(cd1.val[0], pSrc+60);

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }

    //Transpose 512
    float32x4_t temp;

    temp = ab3.val[0];
    ab3.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab3.val[1];
    ab3.val[1] = cd1.val[1];
    cd1.val[1] = temp;

    temp = ab4.val[0];
    ab4.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab4.val[1];
    ab4.val[1] = cd2.val[1];
    cd2.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
    }

    //Merge the two halves
    arm_bitonic_merge_32_f32(pSrc+0,  ab1, ab2, cd1, cd2, dir);
    arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
}
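/*
 * arm_bitonic_merge_32/64/128/256_f32 (this function and the larger merges
 * below) all follow the same halving pattern: load the first half normally
 * and the second half reversed (vldrev128q_f32), apply one global
 * compare-exchange, swap the middle quarters so each half is contiguous
 * again, compare once more, then recurse into two merges of half the size.
 * A scalar rendering of the first stage, for illustration only (hypothetical
 * helper, not compiled):
 */
#if 0
static void example_merge_first_stage(float32_t *x, uint32_t n, uint8_t dir)
{
    /* x[0..n/2) and x[n/2..n) are each sorted in direction dir. Comparing
       x[k] against x[n-1-k] leaves two bitonic halves, with every element of
       the first half ordered before every element of the second. */
    uint32_t k;
    float32_t lo, hi;

    for (k = 0; k < n/2; k++)
    {
        lo = x[k];
        hi = x[n-1-k];
        if (dir == (lo > hi))
        {
            x[k] = hi;
            x[n-1-k] = lo;
        }
    }
}
#endif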
static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;

    //Load and reverse second array
    ab1.val[0] = vld1q_f32(pSrc+0 );
    ab1.val[1] = vld1q_f32(pSrc+4 );
    ab2.val[0] = vld1q_f32(pSrc+8 );
    ab2.val[1] = vld1q_f32(pSrc+12);
    ab3.val[0] = vld1q_f32(pSrc+16);
    ab3.val[1] = vld1q_f32(pSrc+20);
    ab4.val[0] = vld1q_f32(pSrc+24);
    ab4.val[1] = vld1q_f32(pSrc+28);
    ab5.val[0] = vld1q_f32(pSrc+32);
    ab5.val[1] = vld1q_f32(pSrc+36);
    ab6.val[0] = vld1q_f32(pSrc+40);
    ab6.val[1] = vld1q_f32(pSrc+44);
    ab7.val[0] = vld1q_f32(pSrc+48);
    ab7.val[1] = vld1q_f32(pSrc+52);
    ab8.val[0] = vld1q_f32(pSrc+56);
    ab8.val[1] = vld1q_f32(pSrc+60);

    vldrev128q_f32(cd8.val[1], pSrc+64);
    vldrev128q_f32(cd8.val[0], pSrc+68);
    vldrev128q_f32(cd7.val[1], pSrc+72);
    vldrev128q_f32(cd7.val[0], pSrc+76);
    vldrev128q_f32(cd6.val[1], pSrc+80);
    vldrev128q_f32(cd6.val[0], pSrc+84);
    vldrev128q_f32(cd5.val[1], pSrc+88);
    vldrev128q_f32(cd5.val[0], pSrc+92);
    vldrev128q_f32(cd4.val[1], pSrc+96);
    vldrev128q_f32(cd4.val[0], pSrc+100);
    vldrev128q_f32(cd3.val[1], pSrc+104);
    vldrev128q_f32(cd3.val[0], pSrc+108);
    vldrev128q_f32(cd2.val[1], pSrc+112);
    vldrev128q_f32(cd2.val[0], pSrc+116);
    vldrev128q_f32(cd1.val[1], pSrc+120);
    vldrev128q_f32(cd1.val[0], pSrc+124);

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }

    //Transpose
    float32x4_t temp;

    temp = ab5.val[0];
    ab5.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab5.val[1];
    ab5.val[1] = cd1.val[1];
    cd1.val[1] = temp;

    temp = ab6.val[0];
    ab6.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab6.val[1];
    ab6.val[1] = cd2.val[1];
    cd2.val[1] = temp;

    temp = ab7.val[0];
    ab7.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab7.val[1];
    ab7.val[1] = cd3.val[1];
    cd3.val[1] = temp;

    temp = ab8.val[0];
    ab8.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab8.val[1];
    ab8.val[1] = cd4.val[1];
    cd4.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1, cd1);
        vminmax256q(ab2, cd2);
        vminmax256q(ab3, cd3);
        vminmax256q(ab4, cd4);
        vminmax256q(ab5, cd5);
        vminmax256q(ab6, cd6);
        vminmax256q(ab7, cd7);
        vminmax256q(ab8, cd8);
    }
    else
    {
        vminmax256q(cd1, ab1);
        vminmax256q(cd2, ab2);
        vminmax256q(cd3, ab3);
        vminmax256q(cd4, ab4);
        vminmax256q(cd5, ab5);
        vminmax256q(cd6, ab6);
        vminmax256q(cd7, ab7);
        vminmax256q(cd8, ab8);
    }

    vst1q_f32(pSrc,     ab1.val[0]);
    vst1q_f32(pSrc+4,   ab1.val[1]);
    vst1q_f32(pSrc+8,   ab2.val[0]);
    vst1q_f32(pSrc+12,  ab2.val[1]);
    vst1q_f32(pSrc+16,  ab3.val[0]);
    vst1q_f32(pSrc+20,  ab3.val[1]);
    vst1q_f32(pSrc+24,  ab4.val[0]);
    vst1q_f32(pSrc+28,  ab4.val[1]);
    vst1q_f32(pSrc+32,  cd1.val[0]);
    vst1q_f32(pSrc+36,  cd1.val[1]);
    vst1q_f32(pSrc+40,  cd2.val[0]);
    vst1q_f32(pSrc+44,  cd2.val[1]);
    vst1q_f32(pSrc+48,  cd3.val[0]);
    vst1q_f32(pSrc+52,  cd3.val[1]);
    vst1q_f32(pSrc+56,  cd4.val[0]);
    vst1q_f32(pSrc+60,  cd4.val[1]);
    vst1q_f32(pSrc+64,  ab5.val[0]);
    vst1q_f32(pSrc+68,  ab5.val[1]);
    vst1q_f32(pSrc+72,  ab6.val[0]);
    vst1q_f32(pSrc+76,  ab6.val[1]);
    vst1q_f32(pSrc+80,  ab7.val[0]);
    vst1q_f32(pSrc+84,  ab7.val[1]);
    vst1q_f32(pSrc+88,  ab8.val[0]);
    vst1q_f32(pSrc+92,  ab8.val[1]);
    vst1q_f32(pSrc+96,  cd5.val[0]);
    vst1q_f32(pSrc+100, cd5.val[1]);
    vst1q_f32(pSrc+104, cd6.val[0]);
    vst1q_f32(pSrc+108, cd6.val[1]);
    vst1q_f32(pSrc+112, cd7.val[0]);
    vst1q_f32(pSrc+116, cd7.val[1]);
    vst1q_f32(pSrc+120, cd8.val[0]);
    vst1q_f32(pSrc+124, cd8.val[1]);

    //Merge the two halves
    arm_bitonic_merge_64_f32(pSrc+0 , dir);
    arm_bitonic_merge_64_f32(pSrc+64, dir);
}

static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
{
    float32x4x2_t ab1, ab2,  ab3,  ab4,  ab5,  ab6,  ab7,  ab8;
    float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
    float32x4x2_t cd1, cd2,  cd3,  cd4,  cd5,  cd6,  cd7,  cd8;
    float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;

    //Load and reverse second array
    ab1.val[0]  = vld1q_f32(pSrc+0  );
    ab1.val[1]  = vld1q_f32(pSrc+4  );
    ab2.val[0]  = vld1q_f32(pSrc+8  );
    ab2.val[1]  = vld1q_f32(pSrc+12 );
    ab3.val[0]  = vld1q_f32(pSrc+16 );
    ab3.val[1]  = vld1q_f32(pSrc+20 );
    ab4.val[0]  = vld1q_f32(pSrc+24 );
    ab4.val[1]  = vld1q_f32(pSrc+28 );
    ab5.val[0]  = vld1q_f32(pSrc+32 );
    ab5.val[1]  = vld1q_f32(pSrc+36 );
    ab6.val[0]  = vld1q_f32(pSrc+40 );
    ab6.val[1]  = vld1q_f32(pSrc+44 );
    ab7.val[0]  = vld1q_f32(pSrc+48 );
    ab7.val[1]  = vld1q_f32(pSrc+52 );
    ab8.val[0]  = vld1q_f32(pSrc+56 );
    ab8.val[1]  = vld1q_f32(pSrc+60 );
    ab9.val[0]  = vld1q_f32(pSrc+64 );
    ab9.val[1]  = vld1q_f32(pSrc+68 );
    ab10.val[0] = vld1q_f32(pSrc+72 );
    ab10.val[1] = vld1q_f32(pSrc+76 );
    ab11.val[0] = vld1q_f32(pSrc+80 );
    ab11.val[1] = vld1q_f32(pSrc+84 );
    ab12.val[0] = vld1q_f32(pSrc+88 );
    ab12.val[1] = vld1q_f32(pSrc+92 );
    ab13.val[0] = vld1q_f32(pSrc+96 );
    ab13.val[1] = vld1q_f32(pSrc+100);
    ab14.val[0] = vld1q_f32(pSrc+104);
    ab14.val[1] = vld1q_f32(pSrc+108);
    ab15.val[0] = vld1q_f32(pSrc+112);
    ab15.val[1] = vld1q_f32(pSrc+116);
    ab16.val[0] = vld1q_f32(pSrc+120);
    ab16.val[1] = vld1q_f32(pSrc+124);

    vldrev128q_f32(cd16.val[1], pSrc+128);
    vldrev128q_f32(cd16.val[0], pSrc+132);
    vldrev128q_f32(cd15.val[1], pSrc+136);
    vldrev128q_f32(cd15.val[0], pSrc+140);
    vldrev128q_f32(cd14.val[1], pSrc+144);
    vldrev128q_f32(cd14.val[0], pSrc+148);
    vldrev128q_f32(cd13.val[1], pSrc+152);
    vldrev128q_f32(cd13.val[0], pSrc+156);
    vldrev128q_f32(cd12.val[1], pSrc+160);
    vldrev128q_f32(cd12.val[0], pSrc+164);
    vldrev128q_f32(cd11.val[1], pSrc+168);
    vldrev128q_f32(cd11.val[0], pSrc+172);
    vldrev128q_f32(cd10.val[1], pSrc+176);
    vldrev128q_f32(cd10.val[0], pSrc+180);
    vldrev128q_f32(cd9.val[1] , pSrc+184);
    vldrev128q_f32(cd9.val[0] , pSrc+188);
    vldrev128q_f32(cd8.val[1] , pSrc+192);
    vldrev128q_f32(cd8.val[0] , pSrc+196);
    vldrev128q_f32(cd7.val[1] , pSrc+200);
    vldrev128q_f32(cd7.val[0] , pSrc+204);
    vldrev128q_f32(cd6.val[1] , pSrc+208);
    vldrev128q_f32(cd6.val[0] , pSrc+212);
    vldrev128q_f32(cd5.val[1] , pSrc+216);
    vldrev128q_f32(cd5.val[0] , pSrc+220);
    vldrev128q_f32(cd4.val[1] , pSrc+224);
    vldrev128q_f32(cd4.val[0] , pSrc+228);
    vldrev128q_f32(cd3.val[1] , pSrc+232);
    vldrev128q_f32(cd3.val[0] , pSrc+236);
    vldrev128q_f32(cd2.val[1] , pSrc+240);
    vldrev128q_f32(cd2.val[0] , pSrc+244);
    vldrev128q_f32(cd1.val[1] , pSrc+248);
    vldrev128q_f32(cd1.val[0] , pSrc+252);

    //Compare
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }

    //Transpose
    float32x4_t temp;

    temp = ab9.val[0];
    ab9.val[0] = cd1.val[0];
    cd1.val[0] = temp;
    temp = ab9.val[1];
    ab9.val[1] = cd1.val[1];
    cd1.val[1] = temp;

    temp = ab10.val[0];
    ab10.val[0] = cd2.val[0];
    cd2.val[0] = temp;
    temp = ab10.val[1];
    ab10.val[1] = cd2.val[1];
    cd2.val[1] = temp;

    temp = ab11.val[0];
    ab11.val[0] = cd3.val[0];
    cd3.val[0] = temp;
    temp = ab11.val[1];
    ab11.val[1] = cd3.val[1];
    cd3.val[1] = temp;

    temp = ab12.val[0];
    ab12.val[0] = cd4.val[0];
    cd4.val[0] = temp;
    temp = ab12.val[1];
    ab12.val[1] = cd4.val[1];
    cd4.val[1] = temp;

    temp = ab13.val[0];
    ab13.val[0] = cd5.val[0];
    cd5.val[0] = temp;
    temp = ab13.val[1];
    ab13.val[1] = cd5.val[1];
    cd5.val[1] = temp;

    temp = ab14.val[0];
    ab14.val[0] = cd6.val[0];
    cd6.val[0] = temp;
    temp = ab14.val[1];
    ab14.val[1] = cd6.val[1];
    cd6.val[1] = temp;

    temp = ab15.val[0];
    ab15.val[0] = cd7.val[0];
    cd7.val[0] = temp;
    temp = ab15.val[1];
    ab15.val[1] = cd7.val[1];
    cd7.val[1] = temp;

    temp = ab16.val[0];
    ab16.val[0] = cd8.val[0];
    cd8.val[0] = temp;
    temp = ab16.val[1];
    ab16.val[1] = cd8.val[1];
    cd8.val[1] = temp;

    //Compare
    if(dir)
    {
        vminmax256q(ab1 , cd1 );
        vminmax256q(ab2 , cd2 );
        vminmax256q(ab3 , cd3 );
        vminmax256q(ab4 , cd4 );
        vminmax256q(ab5 , cd5 );
        vminmax256q(ab6 , cd6 );
        vminmax256q(ab7 , cd7 );
        vminmax256q(ab8 , cd8 );
        vminmax256q(ab9 , cd9 );
        vminmax256q(ab10, cd10);
        vminmax256q(ab11, cd11);
        vminmax256q(ab12, cd12);
        vminmax256q(ab13, cd13);
        vminmax256q(ab14, cd14);
        vminmax256q(ab15, cd15);
        vminmax256q(ab16, cd16);
    }
    else
    {
        vminmax256q(cd1 , ab1 );
        vminmax256q(cd2 , ab2 );
        vminmax256q(cd3 , ab3 );
        vminmax256q(cd4 , ab4 );
        vminmax256q(cd5 , ab5 );
        vminmax256q(cd6 , ab6 );
        vminmax256q(cd7 , ab7 );
        vminmax256q(cd8 , ab8 );
        vminmax256q(cd9 , ab9 );
        vminmax256q(cd10, ab10);
        vminmax256q(cd11, ab11);
        vminmax256q(cd12, ab12);
        vminmax256q(cd13, ab13);
        vminmax256q(cd14, ab14);
        vminmax256q(cd15, ab15);
        vminmax256q(cd16, ab16);
    }
    vst1q_f32(pSrc,     ab1.val[0] );
    vst1q_f32(pSrc+4,   ab1.val[1] );
    vst1q_f32(pSrc+8,   ab2.val[0] );
    vst1q_f32(pSrc+12,  ab2.val[1] );
    vst1q_f32(pSrc+16,  ab3.val[0] );
    vst1q_f32(pSrc+20,  ab3.val[1] );
    vst1q_f32(pSrc+24,  ab4.val[0] );
    vst1q_f32(pSrc+28,  ab4.val[1] );
    vst1q_f32(pSrc+32,  ab5.val[0] );
    vst1q_f32(pSrc+36,  ab5.val[1] );
    vst1q_f32(pSrc+40,  ab6.val[0] );
    vst1q_f32(pSrc+44,  ab6.val[1] );
    vst1q_f32(pSrc+48,  ab7.val[0] );
    vst1q_f32(pSrc+52,  ab7.val[1] );
    vst1q_f32(pSrc+56,  ab8.val[0] );
    vst1q_f32(pSrc+60,  ab8.val[1] );
    vst1q_f32(pSrc+64,  cd1.val[0] );
    vst1q_f32(pSrc+68,  cd1.val[1] );
    vst1q_f32(pSrc+72,  cd2.val[0] );
    vst1q_f32(pSrc+76,  cd2.val[1] );
    vst1q_f32(pSrc+80,  cd3.val[0] );
    vst1q_f32(pSrc+84,  cd3.val[1] );
    vst1q_f32(pSrc+88,  cd4.val[0] );
    vst1q_f32(pSrc+92,  cd4.val[1] );
    vst1q_f32(pSrc+96,  cd5.val[0] );
    vst1q_f32(pSrc+100, cd5.val[1] );
    vst1q_f32(pSrc+104, cd6.val[0] );
    vst1q_f32(pSrc+108, cd6.val[1] );
    vst1q_f32(pSrc+112, cd7.val[0] );
    vst1q_f32(pSrc+116, cd7.val[1] );
    vst1q_f32(pSrc+120, cd8.val[0] );
    vst1q_f32(pSrc+124, cd8.val[1] );
    vst1q_f32(pSrc+128, ab9.val[0] );
    vst1q_f32(pSrc+132, ab9.val[1] );
    vst1q_f32(pSrc+136, ab10.val[0]);
    vst1q_f32(pSrc+140, ab10.val[1]);
    vst1q_f32(pSrc+144, ab11.val[0]);
    vst1q_f32(pSrc+148, ab11.val[1]);
    vst1q_f32(pSrc+152, ab12.val[0]);
    vst1q_f32(pSrc+156, ab12.val[1]);
    vst1q_f32(pSrc+160, ab13.val[0]);
    vst1q_f32(pSrc+164, ab13.val[1]);
    vst1q_f32(pSrc+168, ab14.val[0]);
    vst1q_f32(pSrc+172, ab14.val[1]);
    vst1q_f32(pSrc+176, ab15.val[0]);
    vst1q_f32(pSrc+180, ab15.val[1]);
    vst1q_f32(pSrc+184, ab16.val[0]);
    vst1q_f32(pSrc+188, ab16.val[1]);
    vst1q_f32(pSrc+192, cd9.val[0] );
    vst1q_f32(pSrc+196, cd9.val[1] );
    vst1q_f32(pSrc+200, cd10.val[0]);
    vst1q_f32(pSrc+204, cd10.val[1]);
    vst1q_f32(pSrc+208, cd11.val[0]);
    vst1q_f32(pSrc+212, cd11.val[1]);
    vst1q_f32(pSrc+216, cd12.val[0]);
    vst1q_f32(pSrc+220, cd12.val[1]);
    vst1q_f32(pSrc+224, cd13.val[0]);
    vst1q_f32(pSrc+228, cd13.val[1]);
    vst1q_f32(pSrc+232, cd14.val[0]);
    vst1q_f32(pSrc+236, cd14.val[1]);
    vst1q_f32(pSrc+240, cd15.val[0]);
    vst1q_f32(pSrc+244, cd15.val[1]);
    vst1q_f32(pSrc+248, cd16.val[0]);
    vst1q_f32(pSrc+252, cd16.val[1]);

    //Merge the two halves
    arm_bitonic_merge_128_f32(pSrc+0  , dir);
    arm_bitonic_merge_128_f32(pSrc+128, dir);
}

#define SWAP(a,i,j)                                 \
    temp = vgetq_lane_f32(a, j);                    \
    a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j); \
    a = vsetq_lane_f32(temp, a, i);

static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
{
    float32_t temp;

    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
        SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
    {
        SWAP(a,2,3);
    }
    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
    {
        SWAP(a,0,3);
    }
    if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
    {
        SWAP(a,1,2);
    }
    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
    {
        SWAP(a,0,1);
    }
    if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
    {
        SWAP(a,2,3);
    }

    return a;
}

static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
{
    a = arm_bitonic_sort_4_f32(a, dir);
    b = arm_bitonic_sort_4_f32(b, dir);

    return arm_bitonic_merge_8_f32(a, b, dir);
}
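/*
 * The six compare-exchanges in arm_bitonic_sort_4_f32() above form a 4-input
 * bitonic sorting network. Worked example with dir=1 (ascending) on lanes
 * { 3, 1, 4, 2 }:
 *
 *   (0,1): 3 > 1, swap -> { 1, 3, 4, 2 }
 *   (2,3): 4 > 2, swap -> { 1, 3, 2, 4 }
 *   (0,3): 1 < 4       -> { 1, 3, 2, 4 }
 *   (1,2): 3 > 2, swap -> { 1, 2, 3, 4 }
 *   (0,1), (2,3): already ordered, no swaps
 */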
#endif

/**
  @ingroup groupSupport
 */

/**
  @defgroup Sorting Vector sorting algorithms

  Sort the elements of a vector

  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
 */

/**
  @addtogroup Sorting
  @{
 */

/**
 * @private
 * @param[in]  S          points to an instance of the sorting structure.
 * @param[in]  pSrc       points to the block of input data.
 * @param[out] pDst       points to the block of output data.
 * @param[in]  blockSize  number of samples to process.
 */
void arm_bitonic_sort_f32(
  const arm_sort_instance_f32 * S,
        float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
{
    uint16_t s, i;
    uint8_t dir = S->dir;

#ifdef ARM_MATH_NEON
    (void)s;

    float32_t * pOut;
    uint16_t counter = blockSize>>5;

    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        if(pSrc == pDst) // in-place
            pOut = pSrc;
        else
            pOut = pDst;

        float32x4x2_t ab1, ab2;
        float32x4x2_t cd1, cd2;

        if(blockSize == 1)
        {
            // Single sample: copy so that pDst is valid when sorting out-of-place
            pOut[0] = pSrc[0];
        }
        else if(blockSize == 2)
        {
            float32_t temp;

            if( dir==(pSrc[0]>pSrc[1]) )
            {
                // Swap
                temp = pSrc[1];
                pOut[1] = pSrc[0];
                pOut[0] = temp;
            }
            else
            {
                // Already ordered: copy so that pDst is valid out-of-place
                pOut[0] = pSrc[0];
                pOut[1] = pSrc[1];
            }
        }
        else if(blockSize == 4)
        {
            float32x4_t a = vld1q_f32(pSrc);
            a = arm_bitonic_sort_4_f32(a, dir);
            vst1q_f32(pOut, a);
        }
        else if(blockSize == 8)
        {
            float32x4_t a;
            float32x4_t b;
            float32x4x2_t ab;

            a = vld1q_f32(pSrc);
            b = vld1q_f32(pSrc+4);
            ab = arm_bitonic_sort_8_f32(a, b, dir);
            vst1q_f32(pOut,   ab.val[0]);
            vst1q_f32(pOut+4, ab.val[1]);
        }
        else if(blockSize >= 16)
        {
            // Sort each aligned run of 16 samples
            for(i=0; i<blockSize; i=i+16)
                arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);

            // Merge pairs of 16-sample runs into 32-sample runs
            for(i=0; i<counter; i++)
            {
                ab1.val[0] = vld1q_f32(pOut+i*32   );
                ab1.val[1] = vld1q_f32(pOut+i*32+4 );
                ab2.val[0] = vld1q_f32(pOut+i*32+8 );
                ab2.val[1] = vld1q_f32(pOut+i*32+12);

                vldrev128q_f32(cd1.val[1], pOut+i*32+16);
                vldrev128q_f32(cd1.val[0], pOut+i*32+20);
                vldrev128q_f32(cd2.val[1], pOut+i*32+24);
                vldrev128q_f32(cd2.val[0], pOut+i*32+28);

                arm_bitonic_merge_32_f32(pOut+i*32, ab1, ab2, cd1, cd2, dir);
            }

            // Merge 32-sample runs into 64-sample runs, and so on
            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_64_f32(pOut+i*64, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_128_f32(pOut+i*128, dir);

            counter = counter>>1;
            for(i=0; i<counter; i++)
                arm_bitonic_merge_256_f32(pOut+i*256, dir);
        }
    }
#else
    float32_t * pA;

    if(pSrc != pDst) // out-of-place: work in the destination buffer
    {
        memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
        pA = pDst;
    }
    else
        pA = pSrc;

    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
    {
        // Sort bitonic runs of doubling size with the scalar core
        for(s=2; s<=blockSize; s=s*2)
        {
            for(i=0; i<blockSize; i=i+s)
                arm_bitonic_sort_core_f32(pA+i, s, dir);
        }
    }
#endif
}

/**
  @} end of Sorting group
 */
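/*
 * Usage sketch (illustrative only, not compiled). This function is private;
 * callers are expected to go through the public sorting API declared in
 * dsp/support_functions.h, which dispatches here when the bitonic algorithm
 * is selected. Note that the bitonic sort only handles power-of-2 block sizes.
 */
#if 0
void example_sort(void)
{
    float32_t input[16] = {  7.0f, 1.0f,  5.0f,  3.0f,  6.0f,  2.0f,  8.0f,  4.0f,
                            15.0f, 9.0f, 13.0f, 11.0f, 14.0f, 10.0f, 16.0f, 12.0f };
    float32_t output[16];
    arm_sort_instance_f32 S;

    arm_sort_init_f32(&S, ARM_SORT_BITONIC, ARM_SORT_ASCENDING);
    arm_sort_f32(&S, input, output, 16);
}
#endif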