1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_lms_f32.c
4 * Description: Processing function for the floating-point LMS filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup LMS Least Mean Square (LMS) Filters
37
38 LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions.
39 LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal.
40 Adaptive filters are often used in communication systems, equalizers, and noise removal.
41 The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types.
42 The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal.
43
44 An LMS filter consists of two components as shown below.
45 The first component is a standard transversal or FIR filter.
46 The second component is a coefficient update mechanism.
47 The LMS filter has two input signals.
48 The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
49 That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
50 The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
51 This "error signal" tends towards zero as the filter adapts.
52 The LMS processing functions accept the input and reference input signals and generate the filter output and error signal.
53 \image html LMS.gif "Internal structure of the Least Mean Square filter"
54
55 The functions operate on blocks of data and each call to the function processes
56 <code>blockSize</code> samples through the filter.
57 <code>pSrc</code> points to input signal, <code>pRef</code> points to reference signal,
58 <code>pOut</code> points to output signal and <code>pErr</code> points to error signal.
59 All arrays contain <code>blockSize</code> values.
60
61 The functions operate on a block-by-block basis.
62 Internally, the filter coefficients <code>b[n]</code> are updated on a sample-by-sample basis.
63 The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
64
65 @par Algorithm
66 The output signal <code>y[n]</code> is computed by a standard FIR filter:
67 <pre>
68 y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
69 </pre>
70
71 @par
72 The error signal equals the difference between the reference signal <code>d[n]</code> and the filter output:
73 <pre>
74 e[n] = d[n] - y[n].
75 </pre>
76
77 @par
78 After each sample of the error signal is computed, the filter coefficients <code>b[k]</code> are updated on a sample-by-sample basis:
79 <pre>
80 b[k] = b[k] + e[n] * mu * x[n-k], for k=0, 1, ..., numTaps-1
81 </pre>
82 where <code>mu</code> is the step size and controls the rate of coefficient convergence.
83 @par
84 In the APIs, <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
85 Coefficients are stored in time reversed order.
86 @par
87 <pre>
88 {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
89 </pre>
90 @par
91 <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
92 Samples in the state buffer are stored in the order:
93 @par
94 <pre>
95 {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
96 </pre>
97 @par
98 Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code> samples.
99 The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
100 to be avoided and yields a significant speed improvement.
101 The state variables are updated after each block of data is processed.
102 @par Instance Structure
103 The coefficients and state variables for a filter are stored together in an instance data structure.
104 A separate instance structure must be defined for each filter and
105 coefficient and state arrays cannot be shared among instances.
106 There are separate instance structure declarations for each of the 3 supported data types.
107
108 @par Initialization Functions
109 There is also an associated initialization function for each data type.
110 The initialization function performs the following operations:
111 - Sets the values of the internal structure fields.
112 - Zeros out the values in the state buffer.
113 To do this manually without calling the init function, assign the follow subfields of the instance structure:
114 numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero.
115
116 @par
117 Use of the initialization function is optional.
118 However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
119 To place an instance structure into a const data section, the instance structure must be manually initialized.
120 Set the values in the state buffer to zeros before static initialization.
121 The code below statically initializes each of the 3 different data type filter instance structures
122 <pre>
123 arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu};
124 arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift};
125 arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift};
126 </pre>
127 where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
128 <code>pCoeffs</code> is the address of the coefficient buffer; <code>mu</code> is the step size parameter; and <code>postShift</code> is the shift applied to coefficients.
129
130 @par Fixed-Point Behavior
131 Care must be taken when using the Q15 and Q31 versions of the LMS filter.
132 The following issues must be considered:
133 - Scaling of coefficients
134 - Overflow and saturation
135
136 @par Scaling of Coefficients
137 Filter coefficients are represented as fractional values and
138 coefficients are restricted to lie in the range <code>[-1 +1)</code>.
139 The fixed-point functions have an additional scaling parameter <code>postShift</code>.
140 At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
141 This essentially scales the filter coefficients by <code>2^postShift</code> and
142 allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
143 The value of <code>postShift</code> is set by the user based on the expected gain through the system being modeled.
144
145 @par Overflow and Saturation
146 Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
147 described separately as part of the function specific documentation below.
148 */
149
150 /**
151 @addtogroup LMS
152 @{
153 */
154
155 /**
156 @brief Processing function for floating-point LMS filter.
157 @param[in] S points to an instance of the floating-point LMS filter structure
158 @param[in] pSrc points to the block of input data
159 @param[in] pRef points to the block of reference data
160 @param[out] pOut points to the block of output data
161 @param[out] pErr points to the block of error data
162 @param[in] blockSize number of samples to process
163 */
164 #if defined(ARM_MATH_NEON)
arm_lms_f32(const arm_lms_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)165 ARM_DSP_ATTRIBUTE void arm_lms_f32(
166 const arm_lms_instance_f32 * S,
167 const float32_t * pSrc,
168 float32_t * pRef,
169 float32_t * pOut,
170 float32_t * pErr,
171 uint32_t blockSize)
172 {
173 float32_t *pState = S->pState; /* State pointer */
174 float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
175 float32_t *pStateCurnt; /* Points to the current sample of the state */
176 float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
177 float32_t mu = S->mu; /* Adaptive factor */
178 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
179 uint32_t tapCnt, blkCnt; /* Loop counters */
180 float32_t sum, e, d; /* accumulator, error, reference data sample */
181 float32_t w = 0.0f; /* weight factor */
182
183 float32x4_t tempV, sumV, xV, bV;
184 float32x2_t tempV2;
185
186 e = 0.0f;
187 d = 0.0f;
188
189 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
190 /* pStateCurnt points to the location where the new input data should be written */
191 pStateCurnt = &(S->pState[(numTaps - 1U)]);
192
193 blkCnt = blockSize;
194
195 while (blkCnt > 0U)
196 {
197 /* Copy the new input sample into the state buffer */
198 *pStateCurnt++ = *pSrc++;
199
200 /* Initialize pState pointer */
201 px = pState;
202
203 /* Initialize coeff pointer */
204 pb = (pCoeffs);
205
206 /* Set the accumulator to zero */
207 sum = 0.0f;
208 sumV = vdupq_n_f32(0.0);
209
210 /* Process 4 taps at a time. */
211 tapCnt = numTaps >> 2;
212
213 while (tapCnt > 0U)
214 {
215 /* Perform the multiply-accumulate */
216 xV = vld1q_f32(px);
217 bV = vld1q_f32(pb);
218 sumV = vmlaq_f32(sumV, xV, bV);
219
220 px += 4;
221 pb += 4;
222
223 /* Decrement the loop counter */
224 tapCnt--;
225 }
226 tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
227 sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
228
229
230 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
231 tapCnt = numTaps % 0x4U;
232
233 while (tapCnt > 0U)
234 {
235 /* Perform the multiply-accumulate */
236 sum += (*px++) * (*pb++);
237
238 /* Decrement the loop counter */
239 tapCnt--;
240 }
241
242 /* The result in the accumulator, store in the destination buffer. */
243 *pOut++ = sum;
244
245 /* Compute and store error */
246 d = (float32_t) (*pRef++);
247 e = d - sum;
248 *pErr++ = e;
249
250 /* Calculation of Weighting factor for the updating filter coefficients */
251 w = e * mu;
252
253 /* Initialize pState pointer */
254 px = pState;
255
256 /* Initialize coeff pointer */
257 pb = (pCoeffs);
258
259 /* Process 4 taps at a time. */
260 tapCnt = numTaps >> 2;
261
262 /* Update filter coefficients */
263 while (tapCnt > 0U)
264 {
265 /* Perform the multiply-accumulate */
266 xV = vld1q_f32(px);
267 bV = vld1q_f32(pb);
268 px += 4;
269 bV = vmlaq_n_f32(bV,xV,w);
270
271 vst1q_f32(pb,bV);
272 pb += 4;
273
274
275 /* Decrement the loop counter */
276 tapCnt--;
277 }
278
279 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
280 tapCnt = numTaps % 0x4U;
281
282 while (tapCnt > 0U)
283 {
284 /* Perform the multiply-accumulate */
285 *pb = *pb + (w * (*px++));
286 pb++;
287
288 /* Decrement the loop counter */
289 tapCnt--;
290 }
291
292 /* Advance state pointer by 1 for the next sample */
293 pState = pState + 1;
294
295 /* Decrement the loop counter */
296 blkCnt--;
297 }
298
299
300 /* Processing is complete. Now copy the last numTaps - 1 samples to the
301 satrt of the state buffer. This prepares the state buffer for the
302 next function call. */
303
304 /* Points to the start of the pState buffer */
305 pStateCurnt = S->pState;
306
307 /* Process 4 taps at a time for (numTaps - 1U) samples copy */
308 tapCnt = (numTaps - 1U) >> 2U;
309
310 /* copy data */
311 while (tapCnt > 0U)
312 {
313 tempV = vld1q_f32(pState);
314 vst1q_f32(pStateCurnt,tempV);
315 pState += 4;
316 pStateCurnt += 4;
317
318 /* Decrement the loop counter */
319 tapCnt--;
320 }
321
322 /* Calculate remaining number of copies */
323 tapCnt = (numTaps - 1U) % 0x4U;
324
325 /* Copy the remaining q31_t data */
326 while (tapCnt > 0U)
327 {
328 *pStateCurnt++ = *pState++;
329
330 /* Decrement the loop counter */
331 tapCnt--;
332 }
333
334
335 }
336 #else
arm_lms_f32(const arm_lms_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)337 ARM_DSP_ATTRIBUTE void arm_lms_f32(
338 const arm_lms_instance_f32 * S,
339 const float32_t * pSrc,
340 float32_t * pRef,
341 float32_t * pOut,
342 float32_t * pErr,
343 uint32_t blockSize)
344 {
345 float32_t *pState = S->pState; /* State pointer */
346 float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
347 float32_t *pStateCurnt; /* Points to the current sample of the state */
348 float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
349 float32_t mu = S->mu; /* Adaptive factor */
350 float32_t acc, e; /* Accumulator, error */
351 float32_t w; /* Weight factor */
352 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
353 uint32_t tapCnt, blkCnt; /* Loop counters */
354
355 /* Initializations of error, difference, Coefficient update */
356 e = 0.0f;
357 w = 0.0f;
358
359 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
360 /* pStateCurnt points to the location where the new input data should be written */
361 pStateCurnt = &(S->pState[(numTaps - 1U)]);
362
363 /* initialise loop count */
364 blkCnt = blockSize;
365
366 while (blkCnt > 0U)
367 {
368 /* Copy the new input sample into the state buffer */
369 *pStateCurnt++ = *pSrc++;
370
371 /* Initialize pState pointer */
372 px = pState;
373
374 /* Initialize coefficient pointer */
375 pb = pCoeffs;
376
377 /* Set the accumulator to zero */
378 acc = 0.0f;
379
380 #if defined (ARM_MATH_LOOPUNROLL)
381
382 /* Loop unrolling: Compute 4 taps at a time. */
383 tapCnt = numTaps >> 2U;
384
385 while (tapCnt > 0U)
386 {
387 /* Perform the multiply-accumulate */
388 acc += (*px++) * (*pb++);
389
390 acc += (*px++) * (*pb++);
391
392 acc += (*px++) * (*pb++);
393
394 acc += (*px++) * (*pb++);
395
396 /* Decrement loop counter */
397 tapCnt--;
398 }
399
400 /* Loop unrolling: Compute remaining taps */
401 tapCnt = numTaps % 0x4U;
402
403 #else
404
405 /* Initialize tapCnt with number of samples */
406 tapCnt = numTaps;
407
408 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
409
410 while (tapCnt > 0U)
411 {
412 /* Perform the multiply-accumulate */
413 acc += (*px++) * (*pb++);
414
415 /* Decrement the loop counter */
416 tapCnt--;
417 }
418
419 /* Store the result from accumulator into the destination buffer. */
420 *pOut++ = acc;
421
422 /* Compute and store error */
423 e = (float32_t) *pRef++ - acc;
424 *pErr++ = e;
425
426 /* Calculation of Weighting factor for updating filter coefficients */
427 w = e * mu;
428
429 /* Initialize pState pointer */
430 /* Advance state pointer by 1 for the next sample */
431 px = pState++;
432
433 /* Initialize coefficient pointer */
434 pb = pCoeffs;
435
436 #if defined (ARM_MATH_LOOPUNROLL)
437
438 /* Loop unrolling: Compute 4 taps at a time. */
439 tapCnt = numTaps >> 2U;
440
441 /* Update filter coefficients */
442 while (tapCnt > 0U)
443 {
444 /* Perform the multiply-accumulate */
445 *pb += w * (*px++);
446 pb++;
447
448 *pb += w * (*px++);
449 pb++;
450
451 *pb += w * (*px++);
452 pb++;
453
454 *pb += w * (*px++);
455 pb++;
456
457 /* Decrement loop counter */
458 tapCnt--;
459 }
460
461 /* Loop unrolling: Compute remaining taps */
462 tapCnt = numTaps % 0x4U;
463
464 #else
465
466 /* Initialize tapCnt with number of samples */
467 tapCnt = numTaps;
468
469 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
470
471 while (tapCnt > 0U)
472 {
473 /* Perform the multiply-accumulate */
474 *pb += w * (*px++);
475 pb++;
476
477 /* Decrement loop counter */
478 tapCnt--;
479 }
480
481 /* Decrement loop counter */
482 blkCnt--;
483 }
484
485 /* Processing is complete.
486 Now copy the last numTaps - 1 samples to the start of the state buffer.
487 This prepares the state buffer for the next function call. */
488
489 /* Points to the start of the pState buffer */
490 pStateCurnt = S->pState;
491
492 /* copy data */
493 #if defined (ARM_MATH_LOOPUNROLL)
494
495 /* Loop unrolling: Compute 4 taps at a time. */
496 tapCnt = (numTaps - 1U) >> 2U;
497
498 while (tapCnt > 0U)
499 {
500 *pStateCurnt++ = *pState++;
501 *pStateCurnt++ = *pState++;
502 *pStateCurnt++ = *pState++;
503 *pStateCurnt++ = *pState++;
504
505 /* Decrement loop counter */
506 tapCnt--;
507 }
508
509 /* Loop unrolling: Compute remaining taps */
510 tapCnt = (numTaps - 1U) % 0x4U;
511
512 #else
513
514 /* Initialize tapCnt with number of samples */
515 tapCnt = (numTaps - 1U);
516
517 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
518
519 while (tapCnt > 0U)
520 {
521 *pStateCurnt++ = *pState++;
522
523 /* Decrement loop counter */
524 tapCnt--;
525 }
526
527 }
528 #endif /* #if defined(ARM_MATH_NEON) */
529
530 /**
531 @} end of LMS group
532 */
533