1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_levinson_durbin_f16.c
4  * Description:  f16 version of Levinson Durbin algorithm
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions_f16.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 
36 
37 /**
38   @addtogroup LD
39   @{
40  */
41 
42 /**
43   @brief         Levinson Durbin
44   @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
45   @param[out]    a        autoregressive coefficients
46   @param[out]    err      prediction error (variance)
47   @param[in]     nbCoefs  number of autoregressive coefficients
48  */
49 
50 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
51 #pragma message "Scalar version of arm_levinson_durbin_f16 built. Helium version has build issues with gcc."
52 #endif
53 
54 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(ARM_DSP_BUILT_WITH_GCC)
55 
56 #include "arm_helium_utils.h"
57 
58 #define LANE4567_MASK 0xFF00
59 
/* Solve the Toeplitz normal equations of an AR(nbCoefs) model with the
   Levinson-Durbin recursion, vectorized for Helium/MVE (8 x f16 lanes). */
ARM_DSP_ATTRIBUTE void arm_levinson_durbin_f16(const float16_t *phi,
  float16_t *a,
  float16_t *err,
  int nbCoefs)
{
   _Float16 e;   /* running prediction-error variance, one update per order */
   /* Gather offsets that reverse the 8 f16 lanes of a vector load. */
   static const uint16_t revOffsetArray[8] = {7,6,5,4,3,2,1,0};

   /* Order-1 initialization: a[0] = phi[1]/phi[0].
      NOTE(review): divides by phi[0] — assumes the caller supplies a
      non-degenerate autocovariance sequence (phi[0] != 0); confirm. */
   a[0] = (_Float16)phi[1] / (_Float16)phi[0];

   e = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
   /* Grow the model one order at a time. */
   for(int p=1; p < nbCoefs; p++)
   {
      _Float16 suma = 0.0f16;   /* sum of a[i]*phi[p-i], i = 0..p-1 */
      _Float16 sumb = 0.0f16;   /* sum of a[i]*phi[i+1], i = 0..p-1 */
      f16x8_t vecA,vecRevPhi,vecPhi,vecSumA, vecSumB;
      _Float16 k;               /* reflection coefficient for this order */
      uint32_t blkCnt;
      const float16_t *pPhi,*pRevPhi,*pA;
      uint16x8_t revOffset;

      int nb,j,i;

      revOffset = vld1q(revOffsetArray);
      vecSumA = vdupq_n_f16(0.0f16);
      vecSumB = vdupq_n_f16(0.0f16);

      /* pRevPhi walks backward through phi[p-i]; the reversing gather puts
         each backward chunk into forward lane order.
         NOTE(review): &phi[p-7] points before phi when p < 7 — never
         dereferenced then (blkCnt == 0 below), but forming the pointer is
         technically out of bounds. */
      pRevPhi = &phi[p-7];
      pPhi = &phi[1];
      pA = a;

      i = 0;
      /* Vector part of both dot products: 8 coefficients per iteration. */
      blkCnt = p >> 3;
      while(blkCnt > 0)
      {
         vecA = vld1q(pA);
         pA += 8;

         vecPhi = vld1q(pPhi);
         pPhi += 8;

         /* Load phi[p-i-7 .. p-i] reversed, so lane n holds phi[p-i-n]. */
         vecRevPhi = vldrhq_gather_shifted_offset_f16(pRevPhi,revOffset);
         pRevPhi -= 8;

         vecSumA = vfmaq(vecSumA,vecA,vecRevPhi);
         vecSumB = vfmaq(vecSumB,vecA,vecPhi);

         i += 8;
         blkCnt--;

      }

      /* Horizontal add of the 8 partial sums of each accumulator. */
      suma = vecAddAcrossF16Mve(vecSumA);
      sumb = vecAddAcrossF16Mve(vecSumB);

      /* Scalar tail: the remaining p & 7 dot-product terms. */
      blkCnt = p & 7;
      while(blkCnt > 0)
      {
         suma += (_Float16)a[i] * (_Float16)phi[p - i];
         sumb += (_Float16)a[i] * (_Float16)phi[i + 1];

         i++;
         blkCnt--;
      }

      /* Reflection coefficient of the extended model. */
      k = ((_Float16)phi[p+1] - suma)/((_Float16)phi[0] - sumb);

      /* In-place symmetric update of a[0..p-1], 4 mirrored pairs per
         vector iteration.  Lanes 0-3 of 'offset' index a[j..j+3] from the
         front; lanes 4-7 (after the predicated add of p below) index
         a[p-1-j .. p-4-j] from the back. */
      f16x8_t vecRevA,tmp;
      static int16_t orgOffsetArray[8]={0,1,2,3,-1,-2,-3,-4};
      static const int16_t offsetIncArray[8]={4,4,4,4,-4,-4,-4,-4};

      uint16x8_t offset,offsetInc,vecTmp;


      offset = vld1q_u16((uint16_t*)orgOffsetArray);
      vecTmp = vdupq_n_u16(p);

      /* Add p only to lanes 4-7 (LANE4567_MASK), turning {-1,-2,-3,-4}
         into {p-1, p-2, p-3, p-4}. */
      offset = vaddq_m_u16(offset,offset,vecTmp,LANE4567_MASK);
      offsetInc = vld1q_u16((uint16_t*)offsetIncArray);

      nb = p >> 3;
      j=0;
      for(int i = 0; i < nb ; i++)
      {

          /*
            x0=a[j] - k * a[p-1-j];
            x1=a[j+1] - k * a[p-2-j];
            x2=a[p-1-j] - k * a[j];
            x3=a[p-2-j] - k * a[j+1];

            a[j] = x0;
            a[j+1] = x1;
            a[p-1-j] = x2;
            a[p-2-j] = x3;
          */

          uint64_t tmpa,tmpb;
          /* Gathered lanes: {a[j..j+3], a[p-1-j..p-4-j]}. */
          vecA = vldrhq_gather_shifted_offset_f16(a,offset);


          /* Swap the two 64-bit halves so every lane of vecRevA holds the
             mirror partner of the same lane in vecA.
             NOTE(review): the first vsetq_lane reads vecRevA before it is
             initialized; both halves are overwritten, so the final value is
             well defined, but compilers may warn. */
          tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
          tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
          vecRevA = (f16x8_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0);
          vecRevA = (f16x8_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);


          tmp = vsubq(vecA,vmulq_n_f16(vecRevA,k));
          vstrhq_scatter_shifted_offset_f16(a, offset, tmp);

          /* Front lanes advance by +4, back lanes retreat by -4. */
          offset = vaddq(offset,offsetInc);

          j+=4;

      }

      /* Scalar tail: remaining mirrored pairs, then the middle element
         when the leftover count is odd. */
      blkCnt = p & 7;

      if (blkCnt)
      {
         nb = blkCnt >> 1;
         for(int i =0;i < nb ; i++)
         {
             _Float16 x,y;

             x=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
             y=(_Float16)a[p-1-j] - (_Float16)k * (_Float16)a[j];

             a[j] = x;
             a[p-1-j] = y;

             j++;
         }

         nb = blkCnt & 1;
         if (nb)
         {
               /* Centre coefficient: p-1-j == j here. */
               a[j]=(_Float16)a[j]- (_Float16)k * (_Float16)a[p-1-j];
         }
      }


      /* Append the new coefficient and shrink the error variance. */
      a[p] = k;
      e = e * (1.0f16 - k*k);


   }
   *err = e;
}
209 
210 #else
211 
212 #if defined(ARM_FLOAT16_SUPPORTED)
213 
arm_levinson_durbin_f16(const float16_t * phi,float16_t * a,float16_t * err,int nbCoefs)214 ARM_DSP_ATTRIBUTE void arm_levinson_durbin_f16(const float16_t *phi,
215   float16_t *a,
216   float16_t *err,
217   int nbCoefs)
218 {
219    _Float16 e;
220 
221    a[0] = (_Float16)phi[1] / (_Float16)phi[0];
222 
223    e = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
224    for(int p=1; p < nbCoefs; p++)
225    {
226       _Float16 suma=0.0f16;
227       _Float16 sumb=0.0f16;
228       _Float16 k;
229       int nb,j;
230 
231       for(int i=0; i < p; i++)
232       {
233          suma += (_Float16)a[i] * (_Float16)phi[p - i];
234          sumb += (_Float16)a[i] * (_Float16)phi[i + 1];
235       }
236 
237       k = ((_Float16)phi[p+1]-suma)/((_Float16)phi[0] - sumb);
238 
239 
240       nb = p >> 1;
241       j=0;
242       for(int i =0;i < nb ; i++)
243       {
244           _Float16 x,y;
245 
246           x=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
247           y=(_Float16)a[p-1-j] - (_Float16)k * (_Float16)a[j];
248 
249           a[j] = x;
250           a[p-1-j] = y;
251 
252           j++;
253       }
254 
255       nb = p & 1;
256       if (nb)
257       {
258             a[j]=(_Float16)a[j]- (_Float16)k * (_Float16)a[p-1-j];
259       }
260 
261       a[p] = k;
262       e = e * (1.0f16 - k*k);
263 
264 
265    }
266    *err = e;
267 }
#endif /* defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(ARM_DSP_BUILT_WITH_GCC) */
270 /**
271   @} end of LD group
272  */
273