1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_lms_norm_f32.c
4 * Description: Processing function for the floating-point NLMS filter
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @defgroup LMS_NORM Normalized LMS Filters
37
38 This set of functions implements a commonly used adaptive filter.
39 It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization
40 factor which increases the adaptation rate of the filter.
41 The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types.
42
43 A normalized least mean square (NLMS) filter consists of two components as shown below.
44 The first component is a standard transversal or FIR filter.
45 The second component is a coefficient update mechanism.
46 The NLMS filter has two input signals.
47 The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
48 That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
49 The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
50 This "error signal" tends towards zero as the filter adapts.
51 The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal.
52 \image html LMS.gif "Internal structure of the NLMS adaptive filter"
53
54 The functions operate on blocks of data and each call to the function processes
55 <code>blockSize</code> samples through the filter.
56 <code>pSrc</code> points to input signal, <code>pRef</code> points to reference signal,
57 <code>pOut</code> points to output signal and <code>pErr</code> points to error signal.
58 All arrays contain <code>blockSize</code> values.
59
60 The functions operate on a block-by-block basis.
61 Internally, the filter coefficients <code>b[n]</code> are updated on a sample-by-sample basis.
62 The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
63
64 @par Algorithm
65 The output signal <code>y[n]</code> is computed by a standard FIR filter:
66 <pre>
67 y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
68 </pre>
69
70 @par
71 The error signal equals the difference between the reference signal <code>d[n]</code> and the filter output:
72 <pre>
73 e[n] = d[n] - y[n].
74 </pre>
75
76 @par
77 After each sample of the error signal is computed the instanteous energy of the filter state variables is calculated:
78 <pre>
79 E = x[n]^2 + x[n-1]^2 + ... + x[n-numTaps+1]^2.
80 </pre>
81 The filter coefficients <code>b[k]</code> are then updated on a sample-by-sample basis:
82 <pre>
83 b[k] = b[k] + e[n] * (mu/E) * x[n-k], for k=0, 1, ..., numTaps-1
84 </pre>
85 where <code>mu</code> is the step size and controls the rate of coefficient convergence.
86 @par
87 In the APIs, <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
88 Coefficients are stored in time reversed order.
89 @par
90 <pre>
91 {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
92 </pre>
93 @par
94 <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
95 Samples in the state buffer are stored in the order:
96 @par
97 <pre>
98 {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
99 </pre>
100 @par
101 Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code> samples.
102 The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
103 to be avoided and yields a significant speed improvement.
104 The state variables are updated after each block of data is processed.
105
106 @par Instance Structure
107 The coefficients and state variables for a filter are stored together in an instance data structure.
108 A separate instance structure must be defined for each filter and
109 coefficient and state arrays cannot be shared among instances.
110 There are separate instance structure declarations for each of the 3 supported data types.
111
112 @par Initialization Functions
113 There is also an associated initialization function for each data type.
114 The initialization function performs the following operations:
115 - Sets the values of the internal structure fields.
116 - Zeros out the values in the state buffer.
117 To do this manually without calling the init function, assign the follow subfields of the instance structure:
118 numTaps, pCoeffs, mu, energy, x0, pState. Also set all of the values in pState to zero.
119 For Q7, Q15, and Q31 the following fields must also be initialized;
120 recipTable, postShift
121 @par
122 Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
123 @par Fixed-Point Behavior
124 Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter.
125 The following issues must be considered:
126 - Scaling of coefficients
127 - Overflow and saturation
128
129 @par Scaling of Coefficients (fixed point versions)
130 Filter coefficients are represented as fractional values and
131 coefficients are restricted to lie in the range <code>[-1 +1)</code>.
132 The fixed-point functions have an additional scaling parameter <code>postShift</code>.
133 At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
134 This essentially scales the filter coefficients by <code>2^postShift</code> and
135 allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
136 The value of <code>postShift</code> is set by the user based on the expected gain through the system being modeled.
137
138 @par Overflow and Saturation (fixed point versions)
139 Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
140 described separately as part of the function specific documentation below.
141 */
142
143 /**
144 @addtogroup LMS_NORM
145 @{
146 */
147
148 /**
149 @brief Processing function for floating-point normalized LMS filter.
150 @param[in] S points to an instance of the floating-point normalized LMS filter structure
151 @param[in] pSrc points to the block of input data
152 @param[in] pRef points to the block of reference data
153 @param[out] pOut points to the block of output data
154 @param[out] pErr points to the block of error data
155 @param[in] blockSize number of samples to process
156 */
157
158 #if defined(ARM_MATH_NEON)
arm_lms_norm_f32(arm_lms_norm_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)159 ARM_DSP_ATTRIBUTE void arm_lms_norm_f32(
160 arm_lms_norm_instance_f32 * S,
161 const float32_t * pSrc,
162 float32_t * pRef,
163 float32_t * pOut,
164 float32_t * pErr,
165 uint32_t blockSize)
166 {
167 float32_t *pState = S->pState; /* State pointer */
168 float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
169 float32_t *pStateCurnt; /* Points to the current sample of the state */
170 float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
171 float32_t mu = S->mu; /* Adaptive factor */
172 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
173 uint32_t tapCnt, blkCnt; /* Loop counters */
174 float32_t energy; /* Energy of the input */
175 float32_t sum, e, d; /* accumulator, error, reference data sample */
176 float32_t w, x0, in; /* weight factor, temporary variable to hold input sample and state */
177
178 float32x4_t tempV, sumV, xV, bV;
179 float32x2_t tempV2;
180
181 /* Initializations of error, difference, Coefficient update */
182 e = 0.0f;
183 d = 0.0f;
184 w = 0.0f;
185
186 energy = S->energy;
187 x0 = S->x0;
188
189 /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
190 /* pStateCurnt points to the location where the new input data should be written */
191 pStateCurnt = &(S->pState[(numTaps - 1U)]);
192
193 /* Loop over blockSize number of values */
194 blkCnt = blockSize;
195
196 while (blkCnt > 0U)
197 {
198 /* Copy the new input sample into the state buffer */
199 *pStateCurnt++ = *pSrc;
200
201 /* Initialize pState pointer */
202 px = pState;
203
204 /* Initialize coeff pointer */
205 pb = (pCoeffs);
206
207 /* Read the sample from input buffer */
208 in = *pSrc++;
209
210 /* Update the energy calculation */
211 energy -= x0 * x0;
212 energy += in * in;
213
214 /* Set the accumulator to zero */
215 sum = 0.0f;
216 sumV = vdupq_n_f32(0.0);
217
218 /* Process 4 taps at a time. */
219 tapCnt = numTaps >> 2;
220
221 while (tapCnt > 0U)
222 {
223 /* Perform the multiply-accumulate */
224 xV = vld1q_f32(px);
225 bV = vld1q_f32(pb);
226 sumV = vmlaq_f32(sumV, xV, bV);
227
228 px += 4;
229 pb += 4;
230
231 /* Decrement the loop counter */
232 tapCnt--;
233 }
234 tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
235 sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
236
237 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
238 tapCnt = numTaps % 0x4U;
239
240 while (tapCnt > 0U)
241 {
242 /* Perform the multiply-accumulate */
243 sum += (*px++) * (*pb++);
244
245 /* Decrement the loop counter */
246 tapCnt--;
247 }
248
249 /* The result in the accumulator, store in the destination buffer. */
250 *pOut++ = sum;
251
252 /* Compute and store error */
253 d = (float32_t) (*pRef++);
254 e = d - sum;
255 *pErr++ = e;
256
257 /* Calculation of Weighting factor for updating filter coefficients */
258 /* epsilon value 0.000000119209289f */
259 w = (e * mu) / (energy + 0.000000119209289f);
260
261 /* Initialize pState pointer */
262 px = pState;
263
264 /* Initialize coeff pointer */
265 pb = (pCoeffs);
266
267 /* Process 4 taps at a time. */
268 tapCnt = numTaps >> 2;
269
270 /* Update filter coefficients */
271 while (tapCnt > 0U)
272 {
273 /* Perform the multiply-accumulate */
274 xV = vld1q_f32(px);
275 bV = vld1q_f32(pb);
276 px += 4;
277 bV = vmlaq_n_f32(bV,xV,w);
278
279 vst1q_f32(pb,bV);
280 pb += 4;
281
282
283 /* Decrement the loop counter */
284 tapCnt--;
285 }
286
287 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
288 tapCnt = numTaps % 0x4U;
289
290 while (tapCnt > 0U)
291 {
292 /* Perform the multiply-accumulate */
293 *pb += w * (*px++);
294 pb++;
295
296 /* Decrement the loop counter */
297 tapCnt--;
298 }
299
300 x0 = *pState;
301
302 /* Advance state pointer by 1 for the next sample */
303 pState = pState + 1;
304
305 /* Decrement the loop counter */
306 blkCnt--;
307 }
308
309 S->energy = energy;
310 S->x0 = x0;
311
312 /* Processing is complete. Now copy the last numTaps - 1 samples to the
313 satrt of the state buffer. This prepares the state buffer for the
314 next function call. */
315
316 /* Points to the start of the pState buffer */
317 pStateCurnt = S->pState;
318
319 /* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */
320 tapCnt = (numTaps - 1U) >> 2U;
321
322 /* copy data */
323 while (tapCnt > 0U)
324 {
325 tempV = vld1q_f32(pState);
326 vst1q_f32(pStateCurnt,tempV);
327 pState += 4;
328 pStateCurnt += 4;
329
330 /* Decrement the loop counter */
331 tapCnt--;
332 }
333
334 /* Calculate remaining number of copies */
335 tapCnt = (numTaps - 1U) % 0x4U;
336
337 /* Copy the remaining q31_t data */
338 while (tapCnt > 0U)
339 {
340 *pStateCurnt++ = *pState++;
341
342 /* Decrement the loop counter */
343 tapCnt--;
344 }
345
346 }
347 #else
arm_lms_norm_f32(arm_lms_norm_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)348 ARM_DSP_ATTRIBUTE void arm_lms_norm_f32(
349 arm_lms_norm_instance_f32 * S,
350 const float32_t * pSrc,
351 float32_t * pRef,
352 float32_t * pOut,
353 float32_t * pErr,
354 uint32_t blockSize)
355 {
356 float32_t *pState = S->pState; /* State pointer */
357 float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
358 float32_t *pStateCurnt; /* Points to the current sample of the state */
359 float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
360 float32_t mu = S->mu; /* Adaptive factor */
361 float32_t acc, e; /* Accumulator, error */
362 float32_t w; /* Weight factor */
363 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
364 uint32_t tapCnt, blkCnt; /* Loop counters */
365 float32_t energy; /* Energy of the input */
366 float32_t x0, in; /* Temporary variable to hold input sample and state */
367
368 /* Initializations of error, difference, Coefficient update */
369 e = 0.0f;
370 w = 0.0f;
371
372 energy = S->energy;
373 x0 = S->x0;
374
375 /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
376 /* pStateCurnt points to the location where the new input data should be written */
377 pStateCurnt = &(S->pState[(numTaps - 1U)]);
378
379 /* initialise loop count */
380 blkCnt = blockSize;
381
382 while (blkCnt > 0U)
383 {
384 /* Copy the new input sample into the state buffer */
385 *pStateCurnt++ = *pSrc;
386
387 /* Initialize pState pointer */
388 px = pState;
389
390 /* Initialize coefficient pointer */
391 pb = pCoeffs;
392
393 /* Read the sample from input buffer */
394 in = *pSrc++;
395
396 /* Update the energy calculation */
397 energy -= x0 * x0;
398 energy += in * in;
399
400 /* Set the accumulator to zero */
401 acc = 0.0f;
402
403 #if defined (ARM_MATH_LOOPUNROLL)
404
405 /* Loop unrolling: Compute 4 taps at a time. */
406 tapCnt = numTaps >> 2U;
407
408 while (tapCnt > 0U)
409 {
410 /* Perform the multiply-accumulate */
411 acc += (*px++) * (*pb++);
412
413 acc += (*px++) * (*pb++);
414
415 acc += (*px++) * (*pb++);
416
417 acc += (*px++) * (*pb++);
418
419 /* Decrement loop counter */
420 tapCnt--;
421 }
422
423 /* Loop unrolling: Compute remaining taps */
424 tapCnt = numTaps % 0x4U;
425
426 #else
427
428 /* Initialize tapCnt with number of samples */
429 tapCnt = numTaps;
430
431 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
432
433 while (tapCnt > 0U)
434 {
435 /* Perform the multiply-accumulate */
436 acc += (*px++) * (*pb++);
437
438 /* Decrement the loop counter */
439 tapCnt--;
440 }
441
442 /* Store the result from accumulator into the destination buffer. */
443 *pOut++ = acc;
444
445 /* Compute and store error */
446 e = (float32_t) *pRef++ - acc;
447 *pErr++ = e;
448
449 /* Calculation of Weighting factor for updating filter coefficients */
450 /* epsilon value 0.000000119209289f */
451 w = (e * mu) / (energy + 0.000000119209289f);
452
453 /* Initialize pState pointer */
454 px = pState;
455
456 /* Initialize coefficient pointer */
457 pb = pCoeffs;
458
459 #if defined (ARM_MATH_LOOPUNROLL)
460
461 /* Loop unrolling: Compute 4 taps at a time. */
462 tapCnt = numTaps >> 2U;
463
464 /* Update filter coefficients */
465 while (tapCnt > 0U)
466 {
467 /* Perform the multiply-accumulate */
468 *pb += w * (*px++);
469 pb++;
470
471 *pb += w * (*px++);
472 pb++;
473
474 *pb += w * (*px++);
475 pb++;
476
477 *pb += w * (*px++);
478 pb++;
479
480 /* Decrement loop counter */
481 tapCnt--;
482 }
483
484 /* Loop unrolling: Compute remaining taps */
485 tapCnt = numTaps % 0x4U;
486
487 #else
488
489 /* Initialize tapCnt with number of samples */
490 tapCnt = numTaps;
491
492 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
493
494 while (tapCnt > 0U)
495 {
496 /* Perform the multiply-accumulate */
497 *pb += w * (*px++);
498 pb++;
499
500 /* Decrement loop counter */
501 tapCnt--;
502 }
503
504 x0 = *pState;
505
506 /* Advance state pointer by 1 for the next sample */
507 pState = pState + 1;
508
509 /* Decrement loop counter */
510 blkCnt--;
511 }
512
513 /* Save energy and x0 values for the next frame */
514 S->energy = energy;
515 S->x0 = x0;
516
517 /* Processing is complete.
518 Now copy the last numTaps - 1 samples to the start of the state buffer.
519 This prepares the state buffer for the next function call. */
520
521 /* Points to the start of the pState buffer */
522 pStateCurnt = S->pState;
523
524 /* copy data */
525 #if defined (ARM_MATH_LOOPUNROLL)
526
527 /* Loop unrolling: Compute 4 taps at a time. */
528 tapCnt = (numTaps - 1U) >> 2U;
529
530 while (tapCnt > 0U)
531 {
532 *pStateCurnt++ = *pState++;
533 *pStateCurnt++ = *pState++;
534 *pStateCurnt++ = *pState++;
535 *pStateCurnt++ = *pState++;
536
537 /* Decrement loop counter */
538 tapCnt--;
539 }
540
541 /* Loop unrolling: Compute remaining taps */
542 tapCnt = (numTaps - 1U) % 0x4U;
543
544 #else
545
546 /* Initialize tapCnt with number of samples */
547 tapCnt = (numTaps - 1U);
548
549 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
550
551 while (tapCnt > 0U)
552 {
553 *pStateCurnt++ = *pState++;
554
555 /* Decrement loop counter */
556 tapCnt--;
557 }
558
559 }
560 #endif /* #if defined(ARM_MATH_NEON) */
561 /**
562 @} end of LMS_NORM group
563 */
564