1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_lms_f32.c
4  * Description:  Processing function for the floating-point LMS filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @defgroup LMS Least Mean Square (LMS) Filters
37 
38   LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions.
39   LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal.
40   Adaptive filters are often used in communication systems, equalizers, and noise removal.
41   The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types.
42   The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal.
43 
44   An LMS filter consists of two components as shown below.
45   The first component is a standard transversal or FIR filter.
46   The second component is a coefficient update mechanism.
47   The LMS filter has two input signals.
48   The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
49   That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
50   The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
51   This "error signal" tends towards zero as the filter adapts.
52   The LMS processing functions accept the input and reference input signals and generate the filter output and error signal.
53   \image html LMS.gif "Internal structure of the Least Mean Square filter"
54 
55   The functions operate on blocks of data and each call to the function processes
56   <code>blockSize</code> samples through the filter.
57   <code>pSrc</code> points to input signal, <code>pRef</code> points to reference signal,
58   <code>pOut</code> points to output signal and <code>pErr</code> points to error signal.
59   All arrays contain <code>blockSize</code> values.
60 
61   The functions operate on a block-by-block basis.
62   Internally, the filter coefficients <code>b[n]</code> are updated on a sample-by-sample basis.
63   The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
64 
65   @par           Algorithm
66                    The output signal <code>y[n]</code> is computed by a standard FIR filter:
67   <pre>
68       y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
69   </pre>
70 
71   @par
72                    The error signal equals the difference between the reference signal <code>d[n]</code> and the filter output:
73   <pre>
74       e[n] = d[n] - y[n].
75   </pre>
76 
77   @par
78                    After each sample of the error signal is computed, the filter coefficients <code>b[k]</code> are updated on a sample-by-sample basis:
79   <pre>
80       b[k] = b[k] + e[n] * mu * x[n-k],  for k=0, 1, ..., numTaps-1
81   </pre>
82                    where <code>mu</code> is the step size and controls the rate of coefficient convergence.
83   @par
84                    In the APIs, <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
85                    Coefficients are stored in time reversed order.
86   @par
87   <pre>
88      {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
89   </pre>
90   @par
91                    <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
92                    Samples in the state buffer are stored in the order:
93   @par
94   <pre>
95      {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
96   </pre>
97   @par
98                    Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code> samples.
99                    The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
100                    to be avoided and yields a significant speed improvement.
101                    The state variables are updated after each block of data is processed.
102   @par           Instance Structure
103                    The coefficients and state variables for a filter are stored together in an instance data structure.
104                    A separate instance structure must be defined for each filter and
105                    coefficient and state arrays cannot be shared among instances.
106                    There are separate instance structure declarations for each of the 3 supported data types.
107 
108   @par           Initialization Functions
109                    There is also an associated initialization function for each data type.
110                    The initialization function performs the following operations:
111                    - Sets the values of the internal structure fields.
112                    - Zeros out the values in the state buffer.
113                    To do this manually without calling the init function, assign the follow subfields of the instance structure:
114                    numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero.
115 
116   @par
117                  Use of the initialization function is optional.
118                  However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
119                  To place an instance structure into a const data section, the instance structure must be manually initialized.
120                  Set the values in the state buffer to zeros before static initialization.
121                  The code below statically initializes each of the 3 different data type filter instance structures
122   <pre>
123      arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu};
124      arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift};
125      arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift};
126   </pre>
127                  where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
128                  <code>pCoeffs</code> is the address of the coefficient buffer; <code>mu</code> is the step size parameter; and <code>postShift</code> is the shift applied to coefficients.
129 
130   @par           Fixed-Point Behavior
131                    Care must be taken when using the Q15 and Q31 versions of the LMS filter.
132                    The following issues must be considered:
133                    - Scaling of coefficients
134                    - Overflow and saturation
135 
136   @par           Scaling of Coefficients
137                    Filter coefficients are represented as fractional values and
138                    coefficients are restricted to lie in the range <code>[-1 +1)</code>.
139                    The fixed-point functions have an additional scaling parameter <code>postShift</code>.
140                    At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.
141                    This essentially scales the filter coefficients by <code>2^postShift</code> and
142                    allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
143                    The value of <code>postShift</code> is set by the user based on the expected gain through the system being modeled.
144 
145   @par           Overflow and Saturation
146                    Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
147                    described separately as part of the function specific documentation below.
148  */
149 
150 /**
151   @addtogroup LMS
152   @{
153  */
154 
155 /**
156   @brief         Processing function for floating-point LMS filter.
157   @param[in]     S          points to an instance of the floating-point LMS filter structure
158   @param[in]     pSrc       points to the block of input data
159   @param[in]     pRef       points to the block of reference data
160   @param[out]    pOut       points to the block of output data
161   @param[out]    pErr       points to the block of error data
162   @param[in]     blockSize  number of samples to process
163  */
164 #if defined(ARM_MATH_NEON)
arm_lms_f32(const arm_lms_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)165 ARM_DSP_ATTRIBUTE void arm_lms_f32(
166   const arm_lms_instance_f32 * S,
167   const float32_t * pSrc,
168   float32_t * pRef,
169   float32_t * pOut,
170   float32_t * pErr,
171   uint32_t blockSize)
172 {
173   float32_t *pState = S->pState;                 /* State pointer */
174   float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
175   float32_t *pStateCurnt;                        /* Points to the current sample of the state */
176   float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
177   float32_t mu = S->mu;                          /* Adaptive factor */
178   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
179   uint32_t tapCnt, blkCnt;                       /* Loop counters */
180   float32_t sum, e, d;                           /* accumulator, error, reference data sample */
181   float32_t w = 0.0f;                            /* weight factor */
182 
183   float32x4_t tempV, sumV, xV, bV;
184   float32x2_t tempV2;
185 
186   e = 0.0f;
187   d = 0.0f;
188 
189   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
190   /* pStateCurnt points to the location where the new input data should be written */
191   pStateCurnt = &(S->pState[(numTaps - 1U)]);
192 
193   blkCnt = blockSize;
194 
195   while (blkCnt > 0U)
196   {
197     /* Copy the new input sample into the state buffer */
198     *pStateCurnt++ = *pSrc++;
199 
200     /* Initialize pState pointer */
201     px = pState;
202 
203     /* Initialize coeff pointer */
204     pb = (pCoeffs);
205 
206     /* Set the accumulator to zero */
207     sum = 0.0f;
208     sumV = vdupq_n_f32(0.0);
209 
210     /* Process 4 taps at a time. */
211     tapCnt = numTaps >> 2;
212 
213     while (tapCnt > 0U)
214     {
215       /* Perform the multiply-accumulate */
216       xV = vld1q_f32(px);
217       bV = vld1q_f32(pb);
218       sumV = vmlaq_f32(sumV, xV, bV);
219 
220       px += 4;
221       pb += 4;
222 
223       /* Decrement the loop counter */
224       tapCnt--;
225     }
226     tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
227     sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
228 
229 
230     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
231     tapCnt = numTaps % 0x4U;
232 
233     while (tapCnt > 0U)
234     {
235       /* Perform the multiply-accumulate */
236       sum += (*px++) * (*pb++);
237 
238       /* Decrement the loop counter */
239       tapCnt--;
240     }
241 
242     /* The result in the accumulator, store in the destination buffer. */
243     *pOut++ = sum;
244 
245     /* Compute and store error */
246     d = (float32_t) (*pRef++);
247     e = d - sum;
248     *pErr++ = e;
249 
250     /* Calculation of Weighting factor for the updating filter coefficients */
251     w = e * mu;
252 
253     /* Initialize pState pointer */
254     px = pState;
255 
256     /* Initialize coeff pointer */
257     pb = (pCoeffs);
258 
259     /* Process 4 taps at a time. */
260     tapCnt = numTaps >> 2;
261 
262     /* Update filter coefficients */
263     while (tapCnt > 0U)
264     {
265       /* Perform the multiply-accumulate */
266       xV = vld1q_f32(px);
267       bV = vld1q_f32(pb);
268       px += 4;
269       bV = vmlaq_n_f32(bV,xV,w);
270 
271       vst1q_f32(pb,bV);
272       pb += 4;
273 
274 
275       /* Decrement the loop counter */
276       tapCnt--;
277     }
278 
279     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
280     tapCnt = numTaps % 0x4U;
281 
282     while (tapCnt > 0U)
283     {
284       /* Perform the multiply-accumulate */
285       *pb = *pb + (w * (*px++));
286       pb++;
287 
288       /* Decrement the loop counter */
289       tapCnt--;
290     }
291 
292     /* Advance state pointer by 1 for the next sample */
293     pState = pState + 1;
294 
295     /* Decrement the loop counter */
296     blkCnt--;
297   }
298 
299 
300   /* Processing is complete. Now copy the last numTaps - 1 samples to the
301      satrt of the state buffer. This prepares the state buffer for the
302      next function call. */
303 
304   /* Points to the start of the pState buffer */
305   pStateCurnt = S->pState;
306 
307   /* Process 4 taps at a time for (numTaps - 1U) samples copy */
308   tapCnt = (numTaps - 1U) >> 2U;
309 
310   /* copy data */
311   while (tapCnt > 0U)
312   {
313     tempV = vld1q_f32(pState);
314     vst1q_f32(pStateCurnt,tempV);
315     pState += 4;
316     pStateCurnt += 4;
317 
318     /* Decrement the loop counter */
319     tapCnt--;
320   }
321 
322   /* Calculate remaining number of copies */
323   tapCnt = (numTaps - 1U) % 0x4U;
324 
325   /* Copy the remaining q31_t data */
326   while (tapCnt > 0U)
327   {
328     *pStateCurnt++ = *pState++;
329 
330     /* Decrement the loop counter */
331     tapCnt--;
332   }
333 
334 
335 }
336 #else
arm_lms_f32(const arm_lms_instance_f32 * S,const float32_t * pSrc,float32_t * pRef,float32_t * pOut,float32_t * pErr,uint32_t blockSize)337 ARM_DSP_ATTRIBUTE void arm_lms_f32(
338   const arm_lms_instance_f32 * S,
339   const float32_t * pSrc,
340         float32_t * pRef,
341         float32_t * pOut,
342         float32_t * pErr,
343         uint32_t blockSize)
344 {
345         float32_t *pState = S->pState;                 /* State pointer */
346         float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
347         float32_t *pStateCurnt;                        /* Points to the current sample of the state */
348         float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
349         float32_t mu = S->mu;                          /* Adaptive factor */
350         float32_t acc, e;                              /* Accumulator, error */
351         float32_t w;                                   /* Weight factor */
352         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
353         uint32_t tapCnt, blkCnt;                       /* Loop counters */
354 
355   /* Initializations of error,  difference, Coefficient update */
356   e = 0.0f;
357   w = 0.0f;
358 
359   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
360   /* pStateCurnt points to the location where the new input data should be written */
361   pStateCurnt = &(S->pState[(numTaps - 1U)]);
362 
363   /* initialise loop count */
364   blkCnt = blockSize;
365 
366   while (blkCnt > 0U)
367   {
368     /* Copy the new input sample into the state buffer */
369     *pStateCurnt++ = *pSrc++;
370 
371     /* Initialize pState pointer */
372     px = pState;
373 
374     /* Initialize coefficient pointer */
375     pb = pCoeffs;
376 
377     /* Set the accumulator to zero */
378     acc = 0.0f;
379 
380 #if defined (ARM_MATH_LOOPUNROLL)
381 
382     /* Loop unrolling: Compute 4 taps at a time. */
383     tapCnt = numTaps >> 2U;
384 
385     while (tapCnt > 0U)
386     {
387       /* Perform the multiply-accumulate */
388       acc += (*px++) * (*pb++);
389 
390       acc += (*px++) * (*pb++);
391 
392       acc += (*px++) * (*pb++);
393 
394       acc += (*px++) * (*pb++);
395 
396       /* Decrement loop counter */
397       tapCnt--;
398     }
399 
400     /* Loop unrolling: Compute remaining taps */
401     tapCnt = numTaps % 0x4U;
402 
403 #else
404 
405     /* Initialize tapCnt with number of samples */
406     tapCnt = numTaps;
407 
408 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
409 
410     while (tapCnt > 0U)
411     {
412       /* Perform the multiply-accumulate */
413       acc += (*px++) * (*pb++);
414 
415       /* Decrement the loop counter */
416       tapCnt--;
417     }
418 
419     /* Store the result from accumulator into the destination buffer. */
420     *pOut++ = acc;
421 
422     /* Compute and store error */
423     e = (float32_t) *pRef++ - acc;
424     *pErr++ = e;
425 
426     /* Calculation of Weighting factor for updating filter coefficients */
427     w = e * mu;
428 
429     /* Initialize pState pointer */
430     /* Advance state pointer by 1 for the next sample */
431     px = pState++;
432 
433     /* Initialize coefficient pointer */
434     pb = pCoeffs;
435 
436 #if defined (ARM_MATH_LOOPUNROLL)
437 
438     /* Loop unrolling: Compute 4 taps at a time. */
439     tapCnt = numTaps >> 2U;
440 
441     /* Update filter coefficients */
442     while (tapCnt > 0U)
443     {
444       /* Perform the multiply-accumulate */
445       *pb += w * (*px++);
446       pb++;
447 
448       *pb += w * (*px++);
449       pb++;
450 
451       *pb += w * (*px++);
452       pb++;
453 
454       *pb += w * (*px++);
455       pb++;
456 
457       /* Decrement loop counter */
458       tapCnt--;
459     }
460 
461     /* Loop unrolling: Compute remaining taps */
462     tapCnt = numTaps % 0x4U;
463 
464 #else
465 
466     /* Initialize tapCnt with number of samples */
467     tapCnt = numTaps;
468 
469 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
470 
471     while (tapCnt > 0U)
472     {
473       /* Perform the multiply-accumulate */
474       *pb += w * (*px++);
475       pb++;
476 
477       /* Decrement loop counter */
478       tapCnt--;
479     }
480 
481     /* Decrement loop counter */
482     blkCnt--;
483   }
484 
485   /* Processing is complete.
486      Now copy the last numTaps - 1 samples to the start of the state buffer.
487      This prepares the state buffer for the next function call. */
488 
489   /* Points to the start of the pState buffer */
490   pStateCurnt = S->pState;
491 
492   /* copy data */
493 #if defined (ARM_MATH_LOOPUNROLL)
494 
495   /* Loop unrolling: Compute 4 taps at a time. */
496   tapCnt = (numTaps - 1U) >> 2U;
497 
498   while (tapCnt > 0U)
499   {
500     *pStateCurnt++ = *pState++;
501     *pStateCurnt++ = *pState++;
502     *pStateCurnt++ = *pState++;
503     *pStateCurnt++ = *pState++;
504 
505     /* Decrement loop counter */
506     tapCnt--;
507   }
508 
509   /* Loop unrolling: Compute remaining taps */
510   tapCnt = (numTaps - 1U) % 0x4U;
511 
512 #else
513 
514   /* Initialize tapCnt with number of samples */
515   tapCnt = (numTaps - 1U);
516 
517 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
518 
519   while (tapCnt > 0U)
520   {
521     *pStateCurnt++ = *pState++;
522 
523     /* Decrement loop counter */
524     tapCnt--;
525   }
526 
527 }
528 #endif /* #if defined(ARM_MATH_NEON) */
529 
530 /**
531   @} end of LMS group
532  */
533