1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_lms_q15.c
4  * Description:  Processing function for Q15 LMS filter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup LMS
37   @{
38  */
39 
40 /**
41   @brief         Processing function for Q15 LMS filter.
42   @param[in]     S         points to an instance of the Q15 LMS filter structure
43   @param[in]     pSrc      points to the block of input data
44   @param[in]     pRef      points to the block of reference data
45   @param[out]    pOut      points to the block of output data
46   @param[out]    pErr      points to the block of error data
47   @param[in]     blockSize number of samples to process
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using an internal 64-bit accumulator.
51                    Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
52                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
53                    There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
54                    After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
55                    Lastly, the accumulator is saturated to yield a result in 1.15 format.
56   @par
57                    In this filter, filter coefficients are updated for each sample and
58                    the updation of filter cofficients are saturted.
59  */
60 
arm_lms_q15(const arm_lms_instance_q15 * S,const q15_t * pSrc,q15_t * pRef,q15_t * pOut,q15_t * pErr,uint32_t blockSize)61 void arm_lms_q15(
62   const arm_lms_instance_q15 * S,
63   const q15_t * pSrc,
64         q15_t * pRef,
65         q15_t * pOut,
66         q15_t * pErr,
67         uint32_t blockSize)
68 {
69         q15_t *pState = S->pState;                     /* State pointer */
70         q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
71         q15_t *pStateCurnt;                            /* Points to the current sample of the state */
72         q15_t *px, *pb;                                /* Temporary pointers for state and coefficient buffers */
73         q15_t mu = S->mu;                              /* Adaptive factor */
74         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
75         uint32_t tapCnt, blkCnt;                       /* Loop counters */
76         q63_t acc;                                     /* Accumulator */
77         q15_t e = 0;                                   /* Error of data sample */
78         q15_t alpha;                                   /* Intermediate constant for taps update */
79         q31_t coef;                                    /* Temporary variable for coefficient */
80         q31_t acc_l, acc_h;                            /* Temporary input */
81         int32_t lShift = (15 - (int32_t) S->postShift);       /*  Post shift  */
82         int32_t uShift = (32 - lShift);
83 
84   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
85   /* pStateCurnt points to the location where the new input data should be written */
86   pStateCurnt = &(S->pState[(numTaps - 1U)]);
87 
88   /* initialise loop count */
89   blkCnt = blockSize;
90 
91   while (blkCnt > 0U)
92   {
93     /* Copy the new input sample into the state buffer */
94     *pStateCurnt++ = *pSrc++;
95 
96     /* Initialize pState pointer */
97     px = pState;
98 
99     /* Initialize coefficient pointer */
100     pb = pCoeffs;
101 
102     /* Set the accumulator to zero */
103     acc = 0;
104 
105 #if defined (ARM_MATH_LOOPUNROLL)
106 
107     /* Loop unrolling: Compute 4 taps at a time. */
108     tapCnt = numTaps >> 2U;
109 
110     while (tapCnt > 0U)
111     {
112       /* Perform the multiply-accumulate */
113       /* acc +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
114       acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
115       acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
116 
117       /* Decrement loop counter */
118       tapCnt--;
119     }
120 
121     /* Loop unrolling: Compute remaining taps */
122     tapCnt = numTaps % 0x4U;
123 
124 #else
125 
126     /* Initialize tapCnt with number of samples */
127     tapCnt = numTaps;
128 
129 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
130 
131     while (tapCnt > 0U)
132     {
133       /* Perform the multiply-accumulate */
134       acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
135 
136       /* Decrement the loop counter */
137       tapCnt--;
138     }
139 
140     /* Calc lower part of acc */
141     acc_l = acc & 0xffffffff;
142 
143     /* Calc upper part of acc */
144     acc_h = (acc >> 32) & 0xffffffff;
145 
146     /* Apply shift for lower part of acc and upper part of acc */
147     acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
148 
149     /* Converting the result to 1.15 format and saturate the output */
150     acc = __SSAT(acc, 16U);
151 
152     /* Store the result from accumulator into the destination buffer. */
153     *pOut++ = (q15_t) acc;
154 
155     /* Compute and store error */
156     e = *pRef++ - (q15_t) acc;
157     *pErr++ = (q15_t) e;
158 
159     /* Compute alpha i.e. intermediate constant for taps update */
160     alpha = (q15_t) (((q31_t) e * (mu)) >> 15);
161 
162     /* Initialize pState pointer */
163     /* Advance state pointer by 1 for the next sample */
164     px = pState++;
165 
166     /* Initialize coefficient pointer */
167     pb = pCoeffs;
168 
169 #if defined (ARM_MATH_LOOPUNROLL)
170 
171     /* Loop unrolling: Compute 4 taps at a time. */
172     tapCnt = numTaps >> 2U;
173 
174     /* Update filter coefficients */
175     while (tapCnt > 0U)
176     {
177       coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
178       *pb++ = (q15_t) __SSAT((coef), 16);
179 
180       coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
181       *pb++ = (q15_t) __SSAT((coef), 16);
182 
183       coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
184       *pb++ = (q15_t) __SSAT((coef), 16);
185 
186       coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
187       *pb++ = (q15_t) __SSAT((coef), 16);
188 
189       /* Decrement loop counter */
190       tapCnt--;
191     }
192 
193     /* Loop unrolling: Compute remaining taps */
194     tapCnt = numTaps % 0x4U;
195 
196 #else
197 
198     /* Initialize tapCnt with number of samples */
199     tapCnt = numTaps;
200 
201 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
202 
203     while (tapCnt > 0U)
204     {
205       /* Perform the multiply-accumulate */
206       coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
207       *pb++ = (q15_t) __SSAT((coef), 16);
208 
209       /* Decrement loop counter */
210       tapCnt--;
211     }
212 
213     /* Decrement loop counter */
214     blkCnt--;
215   }
216 
217   /* Processing is complete.
218      Now copy the last numTaps - 1 samples to the start of the state buffer.
219      This prepares the state buffer for the next function call. */
220 
221   /* Points to the start of the pState buffer */
222   pStateCurnt = S->pState;
223 
224   /* copy data */
225 #if defined (ARM_MATH_LOOPUNROLL)
226 
227   /* Loop unrolling: Compute 4 taps at a time. */
228   tapCnt = (numTaps - 1U) >> 2U;
229 
230   while (tapCnt > 0U)
231   {
232     write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
233     write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
234 
235     /* Decrement loop counter */
236     tapCnt--;
237   }
238 
239   /* Loop unrolling: Compute remaining taps */
240   tapCnt = (numTaps - 1U) % 0x4U;
241 
242 #else
243 
244   /* Initialize tapCnt with number of samples */
245   tapCnt = (numTaps - 1U);
246 
247 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
248 
249   while (tapCnt > 0U)
250   {
251     *pStateCurnt++ = *pState++;
252 
253     /* Decrement loop counter */
254     tapCnt--;
255   }
256 
257 }
258 
259 /**
260   @} end of LMS group
261  */
262