1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_levinson_durbin_q31.c
4  * Description:  q31 version of Levinson Durbin algorithm
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 #define ONE_Q31 0x7FFFFFFFL
32 #define TWO_Q30 0x7FFFFFFFL
33 
34 #define HALF_Q31 0x00008000L
35 #define ONE_Q15 0x7FFF
36 #define HALF_Q15 0x3FFF
37 #define LOWPART_MASK 0x07FFF
38 
mul32x16(q31_t a,q15_t b)39 __STATIC_FORCEINLINE q31_t mul32x16(q31_t a, q15_t b)
40 {
41   q31_t r = ((q63_t)a * (q63_t)b) >> 15;
42 
43   return(r);
44 
45 }
46 
mul32x32(q31_t a,q31_t b)47 __STATIC_FORCEINLINE q31_t mul32x32(q31_t a, q31_t b)
48 {
49   //q31_t r = __SSAT(((q63_t)a * b) >> 31,31);
50   q31_t r = ((q63_t)a * b) >> 31;
51 
52   return(r);
53 
54 }
55 
divide(q31_t n,q31_t d)56 __STATIC_FORCEINLINE q31_t divide(q31_t n, q31_t d)
57 {
58   arm_status status;
59   int16_t shift;
60   q15_t inverse;
61   q31_t r;
62   // We are computing:
63   // n / d = n / (h + l) where h and l are the high end and low end part.
64   // 1 / (h + l) = 1 / h (1 - l / h)
65   // Our division algorithm has a shift. So it is returning a scaled value sh.
66   // So we need a << shift to convert 1/ sh to 1/h.
67   // In below code, we are organizing the computation differently. Instead of computing:
68   // 1 / h (1 - l / h)
69   // we are computing
70   // 1 / h (2 - (l + h) / h)
71   // 1 / h (2 - d / h)
72   // Also, we are not computing 1/h in Q15 but in Q14.
73   // 2 is expressed in Q30.
74   // So at the end of all computation we need a << 2
75 
76   // Result is in Q14 because of use of HALF_Q15 instead of ONE_Q15.
77   status=arm_divide_q15(HALF_Q15,d>>16,&inverse,&shift);
78   (void)status;
79 
80   // d is used instead of l
81   // So we will need to substract to 2 instead of 1.
82   r = mul32x16(d,inverse);
83   r = TWO_Q30 - (r << shift);
84   r = mul32x16(r, inverse);
85   r = mul32x32(r,n) ;
86   r = r << (shift + 2);
87 
88   return(r);
89 
90 }
91 
92 /**
93   @ingroup groupFilters
94  */
95 
96 
97 
98 /**
99   @addtogroup LD
100   @{
101  */
102 
103 /**
104   @brief         Levinson Durbin
105   @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
106   @param[out]    a        autoregressive coefficients
107   @param[out]    err      prediction error (variance)
108   @param[in]     nbCoefs  number of autoregressive coefficients
109  */
110 
111 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
112 #pragma message "Scalar version of arm_levinson_durbin_q31 built. Helium version has build issues with gcc."
113 #endif
114 
115 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(ARM_DSP_BUILT_WITH_GCC)
116 
117 #define LANE23_MASK 0xFF00
118 
119 #include "arm_helium_utils.h"
ARM_DSP_ATTRIBUTE void arm_levinson_durbin_q31(const q31_t *phi,
  q31_t *a,
  q31_t *err,
  int nbCoefs)
{
    /* Running prediction-error (variance) estimate, Q31. */
    q31_t e;

    /* Gather offsets to load 4 consecutive q31 samples in reversed order. */
    static const uint32_t revOffsetArray[4] = {3,2,1,0};

   /* Order-1 initialization. */
   //a[0] = phi[1] / phi[0];
   a[0] = divide(phi[1], phi[0]);


   //e = phi[0] - phi[1] * a[0];
   e = phi[0] - mul32x32(phi[1],a[0]);

   /* Raise the model order one step at a time up to nbCoefs. */
   for(int p=1; p < nbCoefs; p++)
   {
      q63_t suma=0;   /* accumulates a[i]*phi[p-i] (64-bit, Q62) */
      q63_t sumb=0;   /* accumulates a[i]*phi[i+1] (64-bit, Q62) */
      q31x4_t vecA,vecRevPhi,vecPhi;
      q31_t k;        /* reflection coefficient for order p */
      uint32_t blkCnt;
      const q31_t *pPhi,*pRevPhi,*pA;
      uint32x4_t revOffset;


      int nb,j,i;

      revOffset = vld1q(revOffsetArray);

      /* pRevPhi points 3 below phi[p] so the reversed gather yields
         phi[p], phi[p-1], phi[p-2], phi[p-3]. */
      pRevPhi = &phi[p-3];
      pPhi = &phi[1];
      pA = a;

      /* Vectorized accumulation of suma/sumb, 4 taps per iteration. */
      i = 0;
      blkCnt = p >> 2;
      while(blkCnt > 0)
      {
         vecA = vld1q(pA);
         pA += 4;

         vecPhi = vld1q(pPhi);
         pPhi += 4;

         /* Reversed load of phi for the suma dot product. */
         vecRevPhi = vldrwq_gather_shifted_offset_s32(pRevPhi,revOffset);
         pRevPhi -= 4;

         /* 64-bit multiply-accumulate across all 4 lanes. */
         suma = vmlaldavaq(suma,vecA,vecRevPhi);
         sumb = vmlaldavaq(sumb,vecA,vecPhi);

         i += 4;
         blkCnt--;
      }


      /* Scalar tail: remaining 0..3 taps. */
      blkCnt = p & 3;
      while(blkCnt > 0)
      {
         suma += ((q63_t)a[i] * phi[p - i]);
         sumb += ((q63_t)a[i] * phi[i + 1]);

         i++;
         blkCnt--;
      }

      /* Bring the Q62 accumulators back to Q31. */
      suma = asrl(suma, 31);
      sumb = asrl(sumb, 31);



      /* Reflection coefficient for this order. */
      //k = (phi[p+1]-suma)/(phi[0] - sumb);
      k = divide(phi[p+1]-(q31_t)suma,phi[0] - (q31_t)sumb);

      q31x4_t vecRevA,tmp;
      /* Offsets {j, j+1, p-1-j, p-2-j}: LANE23_MASK predicates the add of p
         onto lanes 2 and 3 only. */
      static int32_t orgOffsetArray[4]={0,1,-1,-2};
      static const int32_t offsetIncArray[4]={2,2,-2,-2};

      uint32x4_t offset,offsetInc,vecTmp;


      offset = vld1q_u32((uint32_t*)orgOffsetArray);
      vecTmp = vdupq_n_u32(p);

      offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK);
      offsetInc = vld1q_u32((uint32_t*)offsetIncArray);


      /* Vectorized symmetric update: two coefficient pairs per iteration. */
      nb = p >> 2;
      j=0;
      for(int i =0;i < nb ; i++)
      {
        /* Scalar equivalent of this iteration:
          q31_t x0,x1,x2,x3;

          //x = a[j] - k * a[p-1-j];
          x0 = a[j] - mul32x32(k,a[p-1-j]);
          x1 = a[j+1] - mul32x32(k,a[p-2-j]);

          //y = a[p-1-j] - k * a[j];
          x2 = a[p-1-j] - mul32x32(k , a[j]);
          x3 = a[p-2-j] - mul32x32(k , a[j+1]);

          a[j] = x0;
          a[j+1] = x1;
          a[p-1-j] = x2;
          a[p-2-j] = x3;
        */

          uint64_t tmpa,tmpb;
          /* vecA = [a[j], a[j+1], a[p-1-j], a[p-2-j]] via the gather. */
          vecA = vldrwq_gather_shifted_offset_s32(a,offset);


          /* Swap the two 64-bit halves so vecRevA pairs each lane of vecA
             with its mirror coefficient:
             vecRevA = [a[p-1-j], a[p-2-j], a[j], a[j+1]].
             NOTE(review): vecRevA is read by vsetq_lane before being fully
             written; both lanes are set before use, but the first read is of
             an uninitialized vector — confirm this is benign for the target
             toolchain. */
          tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
          tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
          vecRevA = (q31x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0);
          vecRevA = (q31x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);


          /* a[...] -= k * mirror, all four updates at once; scatter back. */
          tmp = vsubq(vecA,vqdmulhq_n_s32(vecRevA,k));
          vstrwq_scatter_shifted_offset_s32(a, offset, tmp);

          offset = vaddq(offset,offsetInc);

          j+=2;
      }

      /* Tail of the symmetric update: 0..3 coefficients left. */
      switch(p & 3)
      {
         case 3:
         {
          q31_t x,y;

          //x = a[j] - k * a[p-1-j];
          x = a[j] - mul32x32(k,a[p-1-j]);

          //y = a[p-1-j] - k * a[j];
          y = a[p-1-j] - mul32x32(k , a[j]);

          a[j] = x;
          a[p-1-j] = y;

          /* Middle element (its mirror is itself). */
          //a[j] = a[j]- k * a[p-1-j];
          a[j+1] = a[j+1] - mul32x32(k,a[p-2-j]);
         }
         break;

         case 2:
         {
          q31_t x,y;

          //x = a[j] - k * a[p-1-j];
          x = a[j] - mul32x32(k,a[p-1-j]);

          //y = a[p-1-j] - k * a[j];
          y = a[p-1-j] - mul32x32(k , a[j]);

          a[j] = x;
          a[p-1-j] = y;
         }
         break;

         case 1:
            /* Single middle element. */
            //a[j] = a[j]- k * a[p-1-j];
            a[j] = a[j] - mul32x32(k,a[p-1-j]);
         break;
      }

      /* New highest-order coefficient is the reflection coefficient. */
      a[p] = k;

      /* Update prediction error. */
      // e = e * (1 - k*k);
      e = mul32x32(e,ONE_Q31 - mul32x32(k,k));


   }
   *err = e;
}
297 
298 #else
299 
arm_levinson_durbin_q31(const q31_t * phi,q31_t * a,q31_t * err,int nbCoefs)300 ARM_DSP_ATTRIBUTE void arm_levinson_durbin_q31(const q31_t *phi,
301   q31_t *a,
302   q31_t *err,
303   int nbCoefs)
304 {
305    q31_t e;
306    int p;
307 
308    //a[0] = phi[1] / phi[0];
309    a[0] = divide(phi[1], phi[0]);
310 
311 
312    //e = phi[0] - phi[1] * a[0];
313    e = phi[0] - mul32x32(phi[1],a[0]);
314 
315    for(p=1; p < nbCoefs; p++)
316    {
317       q63_t suma=0;
318       q63_t sumb=0;
319       q31_t k;
320       int nb,j,i;
321 
322       for(i=0; i < p; i++)
323       {
324          suma += ((q63_t)a[i] * phi[p - i]);
325          sumb += ((q63_t)a[i] * phi[i + 1]);
326       }
327 
328       suma = suma >> 31;
329       sumb = sumb >> 31;
330 
331 
332 
333       //k = (phi[p+1]-suma)/(phi[0] - sumb);
334       k = divide(phi[p+1]-(q31_t)suma,phi[0] - (q31_t)sumb);
335 
336 
337       nb = p >> 1;
338       j=0;
339       for(i =0;i < nb ; i++)
340       {
341           q31_t x,y;
342 
343           //x = a[j] - k * a[p-1-j];
344           x = a[j] - mul32x32(k,a[p-1-j]);
345 
346           //y = a[p-1-j] - k * a[j];
347           y = a[p-1-j] - mul32x32(k , a[j]);
348 
349           a[j] = x;
350           a[p-1-j] = y;
351 
352           j++;
353       }
354 
355       nb = p & 1;
356       if (nb)
357       {
358             //a[j] = a[j]- k * a[p-1-j];
359             a[j] = a[j] - mul32x32(k,a[p-1-j]);
360       }
361 
362       a[p] = k;
363 
364       // e = e * (1 - k*k);
365       e = mul32x32(e,ONE_Q31 - mul32x32(k,k));
366 
367 
368    }
369    *err = e;
370 }
371 #endif /* defined(ARM_MATH_MVEI) */
372 
373 /**
374   @} end of LD group
375  */
376