/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_levinson_durbin_q31.c
 * Description:  Q31 version of the Levinson-Durbin algorithm
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/filtering_functions.h"

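/* Fixed-point constants used by the helpers below. 1.0 in Q31 and 2.0 in Q30
   do not fit exactly in 32 bits, so ONE_Q31 and TWO_Q30 both saturate to
   0x7FFFFFFF; HALF_Q15 is one LSB below 0.5 in Q15. */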
#define ONE_Q31      0x7FFFFFFFL
#define TWO_Q30      0x7FFFFFFFL

#define HALF_Q31     0x00008000L
#define ONE_Q15      0x7FFF
#define HALF_Q15     0x3FFF
#define LOWPART_MASK 0x07FFF

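/* Multiply a 32-bit by a 16-bit fixed-point value, dropping 15 fractional
   bits (e.g. Q31 * Q15 -> Q31). */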
__STATIC_FORCEINLINE q31_t mul32x16(q31_t a, q15_t b)
{
    q31_t r = ((q63_t)a * (q63_t)b) >> 15;

    return(r);
}

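/* Multiply two Q31 values, dropping 31 fractional bits (Q31 * Q31 -> Q31).
   No saturation is applied, unlike the commented-out __SSAT variant. */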
__STATIC_FORCEINLINE q31_t mul32x32(q31_t a, q31_t b)
{
    //q31_t r = __SSAT(((q63_t)a * b) >> 31,31);
    q31_t r = ((q63_t)a * b) >> 31;

    return(r);
}

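/* Approximate fixed-point division n/d for Q31 operands: the reciprocal of
   the high part of d is obtained with arm_divide_q15 and refined with one
   correction step, as detailed in the comments below. */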
__STATIC_FORCEINLINE q31_t divide(q31_t n, q31_t d)
{
    arm_status status;
    int16_t shift;
    q15_t inverse;
    q31_t r;
    // We are computing:
    // n / d = n / (h + l) where h and l are the high and low parts of d.
    // 1 / (h + l) = (1 / h) (1 - l / h)
    // The division algorithm arm_divide_q15 returns a scaled value sh together
    // with a shift, so a << shift is needed to convert 1/sh back to 1/h.
    // The code below organizes the computation differently. Instead of computing:
    // (1 / h) (1 - l / h)
    // it computes
    // (1 / h) (2 - (l + h) / h)
    // (1 / h) (2 - d / h)
    // Also, 1/h is computed in Q14 rather than Q15, and 2 is expressed in Q30,
    // so a final << 2 is required at the end of the computation.

    // Result is in Q14 because HALF_Q15 is used instead of ONE_Q15.
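    // Fixed-point formats step by step (shift comes from arm_divide_q15):
    //   inverse                 ~ 1/h            in Q14, scaled by 2^-shift
    //   mul32x16(d, inverse)    ~ d/h            in Q30, scaled by 2^-shift
    //   after << shift          ~ d/h            in Q30
    //   TWO_Q30 - ...           ~ 2 - d/h        in Q30
    //   mul32x16(., inverse)    ~ (1/h)(2 - d/h) in Q29, scaled by 2^-shift
    //   mul32x32(., n)          ~ n/d            in Q29, scaled by 2^-shift
    //   after << (shift + 2)    ~ n/d            in Q31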
    status = arm_divide_q15(HALF_Q15, d >> 16, &inverse, &shift);
    (void)status;

    // d is used instead of l,
    // so we subtract from 2 instead of from 1.
    r = mul32x16(d, inverse);
    r = TWO_Q30 - (r << shift);
    r = mul32x16(r, inverse);
    r = mul32x32(r, n);
    r = r << (shift + 2);

    return(r);
}


/**
  @ingroup groupFilters
 */

/**
  @addtogroup LD
  @{
 */

/**
  @brief         Levinson Durbin
  @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
  @param[out]    a        autoregressive coefficients
  @param[out]    err      prediction error (variance)
  @param[in]     nbCoefs  number of autoregressive coefficients
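  @par           Example
                 A minimal usage sketch (illustrative only; the buffer contents
                 and the NB_COEFS value are assumptions, not fixed by the library):
  @code
      #define NB_COEFS 4
      q31_t phi[NB_COEFS + 1]; // autocovariance for lags 0..NB_COEFS, in Q31
      q31_t a[NB_COEFS];       // output: autoregressive coefficients
      q31_t err;               // output: prediction error (variance)

      arm_levinson_durbin_q31(phi, a, &err, NB_COEFS);
  @endcode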
 */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
#pragma message "Scalar version of arm_levinson_durbin_q31 built. Helium version has build issues with gcc."
#endif

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(ARM_DSP_BUILT_WITH_GCC)

#define LANE23_MASK 0xFF00

#include "arm_helium_utils.h"
ARM_DSP_ATTRIBUTE void arm_levinson_durbin_q31(const q31_t *phi,
                                               q31_t *a,
                                               q31_t *err,
                                               int nbCoefs)
{
    q31_t e;

    static const uint32_t revOffsetArray[4] = {3,2,1,0};

    //a[0] = phi[1] / phi[0];
    a[0] = divide(phi[1], phi[0]);

    //e = phi[0] - phi[1] * a[0];
    e = phi[0] - mul32x32(phi[1], a[0]);

    for(int p=1; p < nbCoefs; p++)
    {
        q63_t suma=0;
        q63_t sumb=0;
        q31x4_t vecA,vecRevPhi,vecPhi;
        q31_t k;
        uint32_t blkCnt;
        const q31_t *pPhi,*pRevPhi,*pA;
        uint32x4_t revOffset;

        int nb,j,i;

        revOffset = vld1q(revOffsetArray);

        pRevPhi = &phi[p-3];
        pPhi = &phi[1];
        pA = a;

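        /* phi[1..p] is read forward through pPhi and backward through pRevPhi:
           with pRevPhi = &phi[p-3] and offsets {3,2,1,0}, the gather places
           phi[p-i] in the same lane as a[i], so one vmlaldavaq call per
           accumulator adds the a[i]*phi[p-i] products to suma and the
           a[i]*phi[i+1] products to sumb. */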
        i = 0;
        blkCnt = p >> 2;
        while(blkCnt > 0)
        {
            vecA = vld1q(pA);
            pA += 4;

            vecPhi = vld1q(pPhi);
            pPhi += 4;

            vecRevPhi = vldrwq_gather_shifted_offset_s32(pRevPhi,revOffset);
            pRevPhi -= 4;

            suma = vmlaldavaq(suma,vecA,vecRevPhi);
            sumb = vmlaldavaq(sumb,vecA,vecPhi);

            i += 4;
            blkCnt--;
        }

        blkCnt = p & 3;
        while(blkCnt > 0)
        {
            suma += ((q63_t)a[i] * phi[p - i]);
            sumb += ((q63_t)a[i] * phi[i + 1]);

            i++;
            blkCnt--;
        }

        suma = asrl(suma, 31);
        sumb = asrl(sumb, 31);

        //k = (phi[p+1]-suma)/(phi[0] - sumb);
        k = divide(phi[p+1] - (q31_t)suma, phi[0] - (q31_t)sumb);

        q31x4_t vecRevA,tmp;
        static int32_t orgOffsetArray[4]={0,1,-1,-2};
        static const int32_t offsetIncArray[4]={2,2,-2,-2};

        uint32x4_t offset,offsetInc,vecTmp;

        offset = vld1q_u32((uint32_t*)orgOffsetArray);
        vecTmp = vdupq_n_u32(p);

        offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK);
        offsetInc = vld1q_u32((uint32_t*)offsetIncArray);
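        /* offset now holds {j, j+1, p-1-j, p-2-j} with j = 0 (LANE23_MASK adds p
           to lanes 2 and 3 only); offsetInc walks the coefficient array from
           both ends towards the middle, two entries per end per iteration. */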

        nb = p >> 2;
        j = 0;
        for(int i = 0; i < nb; i++)
        {
            /*
            q31_t x0,x1,x2,x3;

            //x = a[j] - k * a[p-1-j];
            x0 = a[j] - mul32x32(k,a[p-1-j]);
            x1 = a[j+1] - mul32x32(k,a[p-2-j]);

            //y = a[p-1-j] - k * a[j];
            x2 = a[p-1-j] - mul32x32(k , a[j]);
            x3 = a[p-2-j] - mul32x32(k , a[j+1]);

            a[j] = x0;
            a[j+1] = x1;
            a[p-1-j] = x2;
            a[p-2-j] = x3;
            */

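            /* Load {a[j], a[j+1], a[p-1-j], a[p-2-j]}, swap the two 64-bit
               halves so each lane faces its mirrored partner, then a single
               vqdmulh/vsub pair applies the reflection update (the scalar
               code commented out above) to all four coefficients at once. */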
            uint64_t tmpa,tmpb;
            vecA = vldrwq_gather_shifted_offset_s32(a,offset);

            tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
            tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
            vecRevA = (q31x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0);
            vecRevA = (q31x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);

            tmp = vsubq(vecA,vqdmulhq_n_s32(vecRevA,k));
            vstrwq_scatter_shifted_offset_s32(a, offset, tmp);

            offset = vaddq(offset,offsetInc);

            j += 2;
        }

        switch(p & 3)
        {
            case 3:
            {
                q31_t x,y;

                //x = a[j] - k * a[p-1-j];
                x = a[j] - mul32x32(k,a[p-1-j]);

                //y = a[p-1-j] - k * a[j];
                y = a[p-1-j] - mul32x32(k , a[j]);

                a[j] = x;
                a[p-1-j] = y;

                //a[j] = a[j] - k * a[p-1-j];
                a[j+1] = a[j+1] - mul32x32(k,a[p-2-j]);
            }
            break;

            case 2:
            {
                q31_t x,y;

                //x = a[j] - k * a[p-1-j];
                x = a[j] - mul32x32(k,a[p-1-j]);

                //y = a[p-1-j] - k * a[j];
                y = a[p-1-j] - mul32x32(k , a[j]);

                a[j] = x;
                a[p-1-j] = y;
            }
            break;

            case 1:
                //a[j] = a[j] - k * a[p-1-j];
                a[j] = a[j] - mul32x32(k,a[p-1-j]);
            break;
        }

        a[p] = k;

        // e = e * (1 - k*k);
        e = mul32x32(e, ONE_Q31 - mul32x32(k,k));
    }
    *err = e;
}

#else

ARM_DSP_ATTRIBUTE void arm_levinson_durbin_q31(const q31_t *phi,
                                               q31_t *a,
                                               q31_t *err,
                                               int nbCoefs)
{
    q31_t e;
    int p;

    //a[0] = phi[1] / phi[0];
    a[0] = divide(phi[1], phi[0]);

    //e = phi[0] - phi[1] * a[0];
    e = phi[0] - mul32x32(phi[1], a[0]);

    for(p=1; p < nbCoefs; p++)
    {
        q63_t suma=0;
        q63_t sumb=0;
        q31_t k;
        int nb,j,i;

        for(i=0; i < p; i++)
        {
            suma += ((q63_t)a[i] * phi[p - i]);
            sumb += ((q63_t)a[i] * phi[i + 1]);
        }

        suma = suma >> 31;
        sumb = sumb >> 31;

        //k = (phi[p+1]-suma)/(phi[0] - sumb);
        k = divide(phi[p+1] - (q31_t)suma, phi[0] - (q31_t)sumb);

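        /* Update the coefficients in mirrored pairs: a[j] and a[p-1-j] are
           refreshed together using the new reflection coefficient k. */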
        nb = p >> 1;
        j = 0;
        for(i = 0; i < nb; i++)
        {
            q31_t x,y;

            //x = a[j] - k * a[p-1-j];
            x = a[j] - mul32x32(k,a[p-1-j]);

            //y = a[p-1-j] - k * a[j];
            y = a[p-1-j] - mul32x32(k , a[j]);

            a[j] = x;
            a[p-1-j] = y;

            j++;
        }

        nb = p & 1;
        if (nb)
        {
            //a[j] = a[j] - k * a[p-1-j];
            a[j] = a[j] - mul32x32(k,a[p-1-j]);
        }

        a[p] = k;

        // e = e * (1 - k*k);
        e = mul32x32(e, ONE_Q31 - mul32x32(k,k));
    }
    *err = e;
}
#endif /* defined(ARM_MATH_MVEI) */

/**
  @} end of LD group
 */