1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_lms_norm_q15.c
4  * Description:  Processing function for Q15 normalized LMS filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup LMS_NORM
37   @{
38  */
39 
40 /**
41   @brief         Processing function for Q15 normalized LMS filter.
42   @param[in]     S         points to an instance of the Q15 normalized LMS filter structure
43   @param[in]     pSrc      points to the block of input data
44   @param[in]     pRef      points to the block of reference data
45   @param[out]    pOut      points to the block of output data
46   @param[out]    pErr      points to the block of error data
47   @param[in]     blockSize number of samples to process
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using a 64-bit internal accumulator.
51                    Both coefficients and state variables are represented in 1.15 format and
52                    multiplications yield a 2.30 result. The 2.30 intermediate results are
53                    accumulated in a 64-bit accumulator in 34.30 format.
54                    There is no risk of internal overflow with this approach and the full
55                    precision of intermediate multiplications is preserved. After all additions
56                    have been performed, the accumulator is truncated to 34.15 format by
57                    discarding low 15 bits. Lastly, the accumulator is saturated to yield a
58                    result in 1.15 format.
59  @par
60   	               In this filter, filter coefficients are updated for each sample and the
61                    updation of filter cofficients are saturted.
62  */
63 
arm_lms_norm_q15(arm_lms_norm_instance_q15 * S,const q15_t * pSrc,q15_t * pRef,q15_t * pOut,q15_t * pErr,uint32_t blockSize)64 void arm_lms_norm_q15(
65         arm_lms_norm_instance_q15 * S,
66   const q15_t * pSrc,
67         q15_t * pRef,
68         q15_t * pOut,
69         q15_t * pErr,
70         uint32_t blockSize)
71 {
72         q15_t *pState = S->pState;                     /* State pointer */
73         q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
74         q15_t *pStateCurnt;                            /* Points to the current sample of the state */
75         q15_t *px, *pb;                                /* Temporary pointers for state and coefficient buffers */
76         q15_t mu = S->mu;                              /* Adaptive factor */
77         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
78         uint32_t tapCnt, blkCnt;                       /* Loop counters */
79         q63_t acc;                                     /* Accumulator */
80         q31_t energy;                                  /* Energy of the input */
81         q15_t e = 0, d = 0;                            /* Error, reference data sample */
82         q15_t w = 0, in;                               /* Weight factor and state */
83         q15_t x0;                                      /* Temporary variable to hold input sample */
84         q15_t errorXmu, oneByEnergy;                   /* Temporary variables to store error and mu product and reciprocal of energy */
85         q15_t postShift;                               /* Post shift to be applied to weight after reciprocal calculation */
86         q31_t coef;                                    /* Temporary variable for coefficient */
87         q31_t acc_l, acc_h;                            /* Temporary input */
88         int32_t lShift = (15 - (int32_t) S->postShift);       /*  Post shift  */
89         int32_t uShift = (32 - lShift);
90 
91   energy = S->energy;
92   x0 = S->x0;
93 
94   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
95   /* pStateCurnt points to the location where the new input data should be written */
96   pStateCurnt = &(S->pState[(numTaps - 1U)]);
97 
98   /* initialise loop count */
99   blkCnt = blockSize;
100 
101   while (blkCnt > 0U)
102   {
103     /* Copy the new input sample into the state buffer */
104     *pStateCurnt++ = *pSrc;
105 
106     /* Initialize pState pointer */
107     px = pState;
108 
109     /* Initialize coefficient pointer */
110     pb = pCoeffs;
111 
112     /* Read the sample from input buffer */
113     in = *pSrc++;
114 
115     /* Update the energy calculation */
116     energy -= (((q31_t) x0 * (x0)) >> 15);
117     energy += (((q31_t) in * (in)) >> 15);
118     energy = (q15_t) __SSAT(energy, 16);
119 
120     /* Set the accumulator to zero */
121     acc = 0;
122 
123 #if defined (ARM_MATH_LOOPUNROLL)
124 
125     /* Loop unrolling: Compute 4 taps at a time. */
126     tapCnt = numTaps >> 2U;
127 
128     while (tapCnt > 0U)
129     {
130       /* Perform the multiply-accumulate */
131       /* acc +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
132       acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
133       acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
134 
135       /* Decrement loop counter */
136       tapCnt--;
137     }
138 
139     /* Loop unrolling: Compute remaining taps */
140     tapCnt = numTaps % 0x4U;
141 
142 #else
143 
144     /* Initialize tapCnt with number of samples */
145     tapCnt = numTaps;
146 
147 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
148 
149     while (tapCnt > 0U)
150     {
151       /* Perform the multiply-accumulate */
152       acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
153 
154       /* Decrement the loop counter */
155       tapCnt--;
156     }
157 
158     /* Calc lower part of acc */
159     acc_l = acc & 0xffffffff;
160 
161     /* Calc upper part of acc */
162     acc_h = (acc >> 32) & 0xffffffff;
163 
164     /* Apply shift for lower part of acc and upper part of acc */
165     acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
166 
167     /* Converting the result to 1.15 format and saturate the output */
168     acc = __SSAT(acc, 16U);
169 
170     /* Store the result from accumulator into the destination buffer. */
171     *pOut++ = (q15_t) acc;
172 
173     /* Compute and store error */
174     d = *pRef++;
175     e = d - (q15_t) acc;
176     *pErr++ = e;
177 
178     /* Calculation of 1/energy */
179     postShift = arm_recip_q15((q15_t) energy + DELTA_Q15, &oneByEnergy, S->recipTable);
180 
181     /* Calculation of e * mu value */
182     errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
183 
184     /* Calculation of (e * mu) * (1/energy) value */
185     acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
186 
187     /* Weighting factor for the normalized version */
188     w = (q15_t) __SSAT((q31_t) acc, 16);
189 
190     /* Initialize pState pointer */
191     px = pState;
192 
193     /* Initialize coefficient pointer */
194     pb = pCoeffs;
195 
196 #if defined (ARM_MATH_LOOPUNROLL)
197 
198     /* Loop unrolling: Compute 4 taps at a time. */
199     tapCnt = numTaps >> 2U;
200 
201     /* Update filter coefficients */
202     while (tapCnt > 0U)
203     {
204       coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
205       *pb++ = (q15_t) __SSAT(coef, 16);
206 
207       coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
208       *pb++ = (q15_t) __SSAT(coef, 16);
209 
210       coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
211       *pb++ = (q15_t) __SSAT(coef, 16);
212 
213       coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
214       *pb++ = (q15_t) __SSAT(coef, 16);
215 
216       /* Decrement loop counter */
217       tapCnt--;
218     }
219 
220     /* Loop unrolling: Compute remaining taps */
221     tapCnt = numTaps % 0x4U;
222 
223 #else
224 
225     /* Initialize tapCnt with number of samples */
226     tapCnt = numTaps;
227 
228 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
229 
230     while (tapCnt > 0U)
231     {
232       /* Perform the multiply-accumulate */
233       coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
234       *pb++ = (q15_t) __SSAT(coef, 16);
235 
236       /* Decrement loop counter */
237       tapCnt--;
238     }
239 
240     x0 = *pState;
241 
242     /* Advance state pointer by 1 for the next sample */
243     pState = pState + 1;
244 
245     /* Decrement loop counter */
246     blkCnt--;
247   }
248 
249   /* Save energy and x0 values for the next frame */
250   S->energy = (q15_t) energy;
251   S->x0 = x0;
252 
253   /* Processing is complete.
254      Now copy the last numTaps - 1 samples to the start of the state buffer.
255      This prepares the state buffer for the next function call. */
256 
257   /* Points to the start of the pState buffer */
258   pStateCurnt = S->pState;
259 
260   /* copy data */
261 #if defined (ARM_MATH_LOOPUNROLL)
262 
263   /* Loop unrolling: Compute 4 taps at a time. */
264   tapCnt = (numTaps - 1U) >> 2U;
265 
266   while (tapCnt > 0U)
267   {
268     write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
269     write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
270 
271     /* Decrement loop counter */
272     tapCnt--;
273   }
274 
275   /* Loop unrolling: Compute remaining taps */
276   tapCnt = (numTaps - 1U) % 0x4U;
277 
278 #else
279 
280   /* Initialize tapCnt with number of samples */
281   tapCnt = (numTaps - 1U);
282 
283 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
284 
285   while (tapCnt > 0U)
286   {
287     *pStateCurnt++ = *pState++;
288 
289     /* Decrement loop counter */
290     tapCnt--;
291   }
292 
293 }
294 
295 /**
296   @} end of LMS_NORM group
297  */
298