1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_lms_norm_f32.c
4  * Description:  Processing function for the floating-point NLMS filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup LMS_NORM Normalized LMS Filters
37 
38   This set of functions implements a commonly used adaptive filter.
39   It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization
40   factor which increases the adaptation rate of the filter.
41   The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types.
42 
43   A normalized least mean square (NLMS) filter consists of two components as shown below.
44   The first component is a standard transversal or FIR filter.
45   The second component is a coefficient update mechanism.
46   The NLMS filter has two input signals.
47   The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
48   That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
49   The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
50   This "error signal" tends towards zero as the filter adapts.
51   The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal.
52   \image html LMS.gif "Internal structure of the NLMS adaptive filter"
53 
54   The functions operate on blocks of data and each call to the function processes
55   <code>blockSize</code> samples through the filter.
56   <code>pSrc</code> points to input signal, <code>pRef</code> points to reference signal,
57   <code>pOut</code> points to output signal and <code>pErr</code> points to error signal.
58   All arrays contain <code>blockSize</code> values.
59 
60   The functions operate on a block-by-block basis.
61   Internally, the filter coefficients <code>b[n]</code> are updated on a sample-by-sample basis.
62   The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
63 
64  @par            Algorithm
65                    The output signal <code>y[n]</code> is computed by a standard FIR filter:
66   <pre>
67       y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
68   </pre>
69 
70  @par
71                    The error signal equals the difference between the reference signal <code>d[n]</code> and the filter output:
72   <pre>
73       e[n] = d[n] - y[n].
74   </pre>
75 
76  @par
77                    After each sample of the error signal is computed the instanteous energy of the filter state variables is calculated:
78   <pre>
79      E = x[n]^2 + x[n-1]^2 + ... + x[n-numTaps+1]^2.
80   </pre>
81                    The filter coefficients <code>b[k]</code> are then updated on a sample-by-sample basis:
82   <pre>
83       b[k] = b[k] + e[n] * (mu/E) * x[n-k],  for k=0, 1, ..., numTaps-1
84   </pre>
85                    where <code>mu</code> is the step size and controls the rate of coefficient convergence.
86  @par
87                    In the APIs, <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
88                    Coefficients are stored in time reversed order.
89  @par
90   <pre>
91      {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
92   </pre>
93  @par
94                    <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
95                    Samples in the state buffer are stored in the order:
96  @par
97   <pre>
98      {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
99   </pre>
100  @par
101                    Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code> samples.
102                    The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
103                    to be avoided and yields a significant speed improvement.
104                    The state variables are updated after each block of data is processed.
105 
106  @par            Instance Structure
107                    The coefficients and state variables for a filter are stored together in an instance data structure.
108                    A separate instance structure must be defined for each filter and
109                    coefficient and state arrays cannot be shared among instances.
110                    There are separate instance structure declarations for each of the 3 supported data types.
111 
112  @par            Initialization Functions
113                    There is also an associated initialization function for each data type.
114                    The initialization function performs the following operations:
115                    - Sets the values of the internal structure fields.
116                    - Zeros out the values in the state buffer.
117                    To do this manually without calling the init function, assign the follow subfields of the instance structure:
118                    numTaps, pCoeffs, mu, energy, x0, pState. Also set all of the values in pState to zero.
119                    For Q7, Q15, and Q31 the following fields must also be initialized;
120                    recipTable, postShift
121  @par
122                    Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
123  @par            Fixed-Point Behavior
124                    Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter.
125                    The following issues must be considered:
126                    - Scaling of coefficients
127                    - Overflow and saturation
128 
129  @par            Scaling of Coefficients (fixed point versions)
130                    Filter coefficients are represented as fractional values and
131                    coefficients are restricted to lie in the range <code>[-1 +1)</code>.
132                    The fixed-point functions have an additional scaling parameter <code>postShift</code>.
133                    At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
134                    This essentially scales the filter coefficients by <code>2^postShift</code> and
135                    allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
136                    The value of <code>postShift</code> is set by the user based on the expected gain through the system being modeled.
137 
138  @par            Overflow and Saturation (fixed point versions)
139                    Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
140                    described separately as part of the function specific documentation below.
141  */
142 
143 /**
144   @addtogroup LMS_NORM
145   @{
146  */
147 
148 /**
149   @brief         Processing function for floating-point normalized LMS filter.
150   @param[in]     S         points to an instance of the floating-point normalized LMS filter structure
151   @param[in]     pSrc      points to the block of input data
152   @param[in]     pRef      points to the block of reference data
153   @param[out]    pOut      points to the block of output data
154   @param[out]    pErr      points to the block of error data
155   @param[in]     blockSize number of samples to process
156  */
157 
158 #if defined(ARM_MATH_NEON)
arm_lms_norm_f32(arm_lms_norm_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)159 ARM_DSP_ATTRIBUTE void arm_lms_norm_f32(
160   arm_lms_norm_instance_f32 * S,
161   const float32_t * pSrc,
162   float32_t * pRef,
163   float32_t * pOut,
164   float32_t * pErr,
165   uint32_t blockSize)
166 {
167   float32_t *pState = S->pState;                 /* State pointer */
168   float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
169   float32_t *pStateCurnt;                        /* Points to the current sample of the state */
170   float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
171   float32_t mu = S->mu;                          /* Adaptive factor */
172   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
173   uint32_t tapCnt, blkCnt;                       /* Loop counters */
174   float32_t energy;                              /* Energy of the input */
175   float32_t sum, e, d;                           /* accumulator, error, reference data sample */
176   float32_t w, x0, in;                           /* weight factor, temporary variable to hold input sample and state */
177 
178   float32x4_t tempV, sumV, xV, bV;
179   float32x2_t tempV2;
180 
181   /* Initializations of error,  difference, Coefficient update */
182   e = 0.0f;
183   d = 0.0f;
184   w = 0.0f;
185 
186   energy = S->energy;
187   x0 = S->x0;
188 
189   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
190   /* pStateCurnt points to the location where the new input data should be written */
191   pStateCurnt = &(S->pState[(numTaps - 1U)]);
192 
193   /* Loop over blockSize number of values */
194   blkCnt = blockSize;
195 
196   while (blkCnt > 0U)
197   {
198     /* Copy the new input sample into the state buffer */
199     *pStateCurnt++ = *pSrc;
200 
201     /* Initialize pState pointer */
202     px = pState;
203 
204     /* Initialize coeff pointer */
205     pb = (pCoeffs);
206 
207     /* Read the sample from input buffer */
208     in = *pSrc++;
209 
210     /* Update the energy calculation */
211     energy -= x0 * x0;
212     energy += in * in;
213 
214     /* Set the accumulator to zero */
215     sum = 0.0f;
216     sumV = vdupq_n_f32(0.0);
217 
218     /* Process 4 taps at a time. */
219     tapCnt = numTaps >> 2;
220 
221     while (tapCnt > 0U)
222     {
223       /* Perform the multiply-accumulate */
224       xV = vld1q_f32(px);
225       bV = vld1q_f32(pb);
226       sumV = vmlaq_f32(sumV, xV, bV);
227 
228       px += 4;
229       pb += 4;
230 
231       /* Decrement the loop counter */
232       tapCnt--;
233     }
234     tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
235     sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
236 
237     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
238     tapCnt = numTaps % 0x4U;
239 
240     while (tapCnt > 0U)
241     {
242       /* Perform the multiply-accumulate */
243       sum += (*px++) * (*pb++);
244 
245       /* Decrement the loop counter */
246       tapCnt--;
247     }
248 
249     /* The result in the accumulator, store in the destination buffer. */
250     *pOut++ = sum;
251 
252     /* Compute and store error */
253     d = (float32_t) (*pRef++);
254     e = d - sum;
255     *pErr++ = e;
256 
257     /* Calculation of Weighting factor for updating filter coefficients */
258     /* epsilon value 0.000000119209289f */
259     w = (e * mu) / (energy + 0.000000119209289f);
260 
261     /* Initialize pState pointer */
262     px = pState;
263 
264     /* Initialize coeff pointer */
265     pb = (pCoeffs);
266 
267     /* Process 4 taps at a time. */
268     tapCnt = numTaps >> 2;
269 
270     /* Update filter coefficients */
271     while (tapCnt > 0U)
272     {
273       /* Perform the multiply-accumulate */
274       xV = vld1q_f32(px);
275       bV = vld1q_f32(pb);
276       px += 4;
277       bV = vmlaq_n_f32(bV,xV,w);
278 
279       vst1q_f32(pb,bV);
280       pb += 4;
281 
282 
283       /* Decrement the loop counter */
284       tapCnt--;
285     }
286 
287     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
288     tapCnt = numTaps % 0x4U;
289 
290     while (tapCnt > 0U)
291     {
292       /* Perform the multiply-accumulate */
293       *pb += w * (*px++);
294       pb++;
295 
296       /* Decrement the loop counter */
297       tapCnt--;
298     }
299 
300     x0 = *pState;
301 
302     /* Advance state pointer by 1 for the next sample */
303     pState = pState + 1;
304 
305     /* Decrement the loop counter */
306     blkCnt--;
307   }
308 
309   S->energy = energy;
310   S->x0 = x0;
311 
312   /* Processing is complete. Now copy the last numTaps - 1 samples to the
313      satrt of the state buffer. This prepares the state buffer for the
314      next function call. */
315 
316   /* Points to the start of the pState buffer */
317   pStateCurnt = S->pState;
318 
319   /* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */
320   tapCnt = (numTaps - 1U) >> 2U;
321 
322   /* copy data */
323   while (tapCnt > 0U)
324   {
325     tempV = vld1q_f32(pState);
326     vst1q_f32(pStateCurnt,tempV);
327     pState += 4;
328     pStateCurnt += 4;
329 
330     /* Decrement the loop counter */
331     tapCnt--;
332   }
333 
334   /* Calculate remaining number of copies */
335   tapCnt = (numTaps - 1U) % 0x4U;
336 
337   /* Copy the remaining q31_t data */
338   while (tapCnt > 0U)
339   {
340     *pStateCurnt++ = *pState++;
341 
342     /* Decrement the loop counter */
343     tapCnt--;
344   }
345 
346 }
347 #else
arm_lms_norm_f32(arm_lms_norm_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)348 ARM_DSP_ATTRIBUTE void arm_lms_norm_f32(
349         arm_lms_norm_instance_f32 * S,
350   const float32_t * pSrc,
351         float32_t * pRef,
352         float32_t * pOut,
353         float32_t * pErr,
354         uint32_t blockSize)
355 {
356         float32_t *pState = S->pState;                 /* State pointer */
357         float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
358         float32_t *pStateCurnt;                        /* Points to the current sample of the state */
359         float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
360         float32_t mu = S->mu;                          /* Adaptive factor */
361         float32_t acc, e;                              /* Accumulator, error */
362         float32_t w;                                   /* Weight factor */
363         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
364         uint32_t tapCnt, blkCnt;                       /* Loop counters */
365         float32_t energy;                              /* Energy of the input */
366         float32_t x0, in;                              /* Temporary variable to hold input sample and state */
367 
368   /* Initializations of error,  difference, Coefficient update */
369   e = 0.0f;
370   w = 0.0f;
371 
372   energy = S->energy;
373   x0 = S->x0;
374 
375   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
376   /* pStateCurnt points to the location where the new input data should be written */
377   pStateCurnt = &(S->pState[(numTaps - 1U)]);
378 
379   /* initialise loop count */
380   blkCnt = blockSize;
381 
382   while (blkCnt > 0U)
383   {
384     /* Copy the new input sample into the state buffer */
385     *pStateCurnt++ = *pSrc;
386 
387     /* Initialize pState pointer */
388     px = pState;
389 
390     /* Initialize coefficient pointer */
391     pb = pCoeffs;
392 
393     /* Read the sample from input buffer */
394     in = *pSrc++;
395 
396     /* Update the energy calculation */
397     energy -= x0 * x0;
398     energy += in * in;
399 
400     /* Set the accumulator to zero */
401     acc = 0.0f;
402 
403 #if defined (ARM_MATH_LOOPUNROLL)
404 
405     /* Loop unrolling: Compute 4 taps at a time. */
406     tapCnt = numTaps >> 2U;
407 
408     while (tapCnt > 0U)
409     {
410       /* Perform the multiply-accumulate */
411       acc += (*px++) * (*pb++);
412 
413       acc += (*px++) * (*pb++);
414 
415       acc += (*px++) * (*pb++);
416 
417       acc += (*px++) * (*pb++);
418 
419       /* Decrement loop counter */
420       tapCnt--;
421     }
422 
423     /* Loop unrolling: Compute remaining taps */
424     tapCnt = numTaps % 0x4U;
425 
426 #else
427 
428     /* Initialize tapCnt with number of samples */
429     tapCnt = numTaps;
430 
431 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
432 
433     while (tapCnt > 0U)
434     {
435       /* Perform the multiply-accumulate */
436       acc += (*px++) * (*pb++);
437 
438       /* Decrement the loop counter */
439       tapCnt--;
440     }
441 
442     /* Store the result from accumulator into the destination buffer. */
443     *pOut++ = acc;
444 
445     /* Compute and store error */
446     e = (float32_t) *pRef++ - acc;
447     *pErr++ = e;
448 
449     /* Calculation of Weighting factor for updating filter coefficients */
450     /* epsilon value 0.000000119209289f */
451     w = (e * mu) / (energy + 0.000000119209289f);
452 
453     /* Initialize pState pointer */
454     px = pState;
455 
456     /* Initialize coefficient pointer */
457     pb = pCoeffs;
458 
459 #if defined (ARM_MATH_LOOPUNROLL)
460 
461     /* Loop unrolling: Compute 4 taps at a time. */
462     tapCnt = numTaps >> 2U;
463 
464     /* Update filter coefficients */
465     while (tapCnt > 0U)
466     {
467       /* Perform the multiply-accumulate */
468       *pb += w * (*px++);
469       pb++;
470 
471       *pb += w * (*px++);
472       pb++;
473 
474       *pb += w * (*px++);
475       pb++;
476 
477       *pb += w * (*px++);
478       pb++;
479 
480       /* Decrement loop counter */
481       tapCnt--;
482     }
483 
484     /* Loop unrolling: Compute remaining taps */
485     tapCnt = numTaps % 0x4U;
486 
487 #else
488 
489     /* Initialize tapCnt with number of samples */
490     tapCnt = numTaps;
491 
492 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
493 
494     while (tapCnt > 0U)
495     {
496       /* Perform the multiply-accumulate */
497       *pb += w * (*px++);
498       pb++;
499 
500       /* Decrement loop counter */
501       tapCnt--;
502     }
503 
504     x0 = *pState;
505 
506     /* Advance state pointer by 1 for the next sample */
507     pState = pState + 1;
508 
509     /* Decrement loop counter */
510     blkCnt--;
511   }
512 
513   /* Save energy and x0 values for the next frame */
514   S->energy = energy;
515   S->x0 = x0;
516 
517   /* Processing is complete.
518      Now copy the last numTaps - 1 samples to the start of the state buffer.
519      This prepares the state buffer for the next function call. */
520 
521   /* Points to the start of the pState buffer */
522   pStateCurnt = S->pState;
523 
524   /* copy data */
525 #if defined (ARM_MATH_LOOPUNROLL)
526 
527   /* Loop unrolling: Compute 4 taps at a time. */
528   tapCnt = (numTaps - 1U) >> 2U;
529 
530   while (tapCnt > 0U)
531   {
532     *pStateCurnt++ = *pState++;
533     *pStateCurnt++ = *pState++;
534     *pStateCurnt++ = *pState++;
535     *pStateCurnt++ = *pState++;
536 
537     /* Decrement loop counter */
538     tapCnt--;
539   }
540 
541   /* Loop unrolling: Compute remaining taps */
542   tapCnt = (numTaps - 1U) % 0x4U;
543 
544 #else
545 
546   /* Initialize tapCnt with number of samples */
547   tapCnt = (numTaps - 1U);
548 
549 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
550 
551   while (tapCnt > 0U)
552   {
553     *pStateCurnt++ = *pState++;
554 
555     /* Decrement loop counter */
556     tapCnt--;
557   }
558 
559 }
560 #endif /* #if defined(ARM_MATH_NEON) */
561 /**
562   @} end of LMS_NORM group
563  */
564