1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_fir_fast_q15.c
4  * Description:  Q15 Fast FIR filter processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup FIR
37   @{
38  */
39 
40 /**
41   @brief         Processing function for the Q15 FIR filter (fast version).
42   @param[in]     S          points to an instance of the Q15 FIR filter structure
43   @param[in]     pSrc       points to the block of input data
44   @param[out]    pDst       points to the block of output data
45   @param[in]     blockSize  number of samples to process
46 
47   @par           Scaling and Overflow Behavior
48                    This fast version uses a 32-bit accumulator with 2.30 format.
49                    The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
50                    Thus, if the accumulator result overflows it wraps around and distorts the result.
51                    In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
52                    The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
53 
54   @remark
55                    Refer to \ref arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  Both the slow and the fast versions use the same instance structure.
56                    Use function \ref arm_fir_init_q15() to initialize the filter structure.
57  */
58 
arm_fir_fast_q15(const arm_fir_instance_q15 * S,const q15_t * pSrc,q15_t * pDst,uint32_t blockSize)59 ARM_DSP_ATTRIBUTE void arm_fir_fast_q15(
60   const arm_fir_instance_q15 * S,
61   const q15_t * pSrc,
62         q15_t * pDst,
63         uint32_t blockSize)
64 {
65         q15_t *pState = S->pState;                     /* State pointer */
66   const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
67         q15_t *pStateCurnt;                            /* Points to the current sample of the state */
68         q15_t *px;                                     /* Temporary pointer for state buffer */
69   const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
70         q31_t acc0;                                    /* Accumulators */
71         uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
72         uint32_t tapCnt, blkCnt;                       /* Loop counters */
73 
74 #if defined (ARM_MATH_LOOPUNROLL)
75         q31_t acc1, acc2, acc3;                        /* Accumulators */
76         q31_t x0, x1, x2, c0;                          /* Temporary variables to hold state and coefficient values */
77 #endif
78 
79   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
80   /* pStateCurnt points to the location where the new input data should be written */
81   pStateCurnt = &(S->pState[(numTaps - 1U)]);
82 
83 #if defined (ARM_MATH_LOOPUNROLL)
84 
85   /* Loop unrolling: Compute 4 output values simultaneously.
86    * The variables acc0 ... acc3 hold output values that are being computed:
87    *
88    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
89    *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
90    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
91    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
92    */
93   blkCnt = blockSize >> 2U;
94 
95   while (blkCnt > 0U)
96   {
97     /* Copy 4 new input samples into the state buffer. */
98     *pStateCurnt++ = *pSrc++;
99     *pStateCurnt++ = *pSrc++;
100     *pStateCurnt++ = *pSrc++;
101     *pStateCurnt++ = *pSrc++;
102 
103     /* Set all accumulators to zero */
104     acc0 = 0;
105     acc1 = 0;
106     acc2 = 0;
107     acc3 = 0;
108 
109     /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
110     px = pState;
111 
112     /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
113     pb = pCoeffs;
114 
115     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
116     x0 = read_q15x2_ia (&px);
117 
118     /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
119     x2 = read_q15x2_ia (&px);
120 
121     /* Loop over the number of taps.  Unroll by a factor of 4.
122        Repeat until we've computed numTaps-(numTaps%4) coefficients. */
123     tapCnt = numTaps >> 2U;
124 
125     while (tapCnt > 0U)
126     {
127       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
128       c0 = read_q15x2_ia ((q15_t **) &pb);
129 
130       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
131       acc0 = __SMLAD(x0, c0, acc0);
132 
133       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
134       acc2 = __SMLAD(x2, c0, acc2);
135 
136       /* pack  x[n-N-1] and x[n-N-2] */
137 #ifndef ARM_MATH_BIG_ENDIAN
138       x1 = __PKHBT(x2, x0, 0);
139 #else
140       x1 = __PKHBT(x0, x2, 0);
141 #endif
142 
143       /* Read state x[n-N-4], x[n-N-5] */
144       x0 = read_q15x2_ia (&px);
145 
146       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
147       acc1 = __SMLADX(x1, c0, acc1);
148 
149       /* pack  x[n-N-3] and x[n-N-4] */
150 #ifndef ARM_MATH_BIG_ENDIAN
151       x1 = __PKHBT(x0, x2, 0);
152 #else
153       x1 = __PKHBT(x2, x0, 0);
154 #endif
155 
156       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
157       acc3 = __SMLADX(x1, c0, acc3);
158 
159       /* Read coefficients b[N-2], b[N-3] */
160       c0 = read_q15x2_ia ((q15_t **) &pb);
161 
162       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
163       acc0 = __SMLAD(x2, c0, acc0);
164 
165       /* Read state x[n-N-6], x[n-N-7] with offset */
166       x2 = read_q15x2_ia (&px);
167 
168       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
169       acc2 = __SMLAD(x0, c0, acc2);
170 
171       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
172       acc1 = __SMLADX(x1, c0, acc1);
173 
174       /* pack  x[n-N-5] and x[n-N-6] */
175 #ifndef ARM_MATH_BIG_ENDIAN
176       x1 = __PKHBT(x2, x0, 0);
177 #else
178       x1 = __PKHBT(x0, x2, 0);
179 #endif
180 
181       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
182       acc3 = __SMLADX(x1, c0, acc3);
183 
184       /* Decrement tap count */
185       tapCnt--;
186     }
187 
188     /* If the filter length is not a multiple of 4, compute the remaining filter taps.
189        This is always be 2 taps since the filter length is even. */
190     if ((numTaps & 0x3U) != 0U)
191     {
192       /* Read last two coefficients */
193       c0 = read_q15x2_ia ((q15_t **) &pb);
194 
195       /* Perform the multiply-accumulates */
196       acc0 = __SMLAD(x0, c0, acc0);
197       acc2 = __SMLAD(x2, c0, acc2);
198 
199       /* pack state variables */
200 #ifndef ARM_MATH_BIG_ENDIAN
201       x1 = __PKHBT(x2, x0, 0);
202 #else
203       x1 = __PKHBT(x0, x2, 0);
204 #endif
205 
206       /* Read last state variables */
207       x0 = read_q15x2 (px);
208 
209       /* Perform the multiply-accumulates */
210       acc1 = __SMLADX(x1, c0, acc1);
211 
212       /* pack state variables */
213 #ifndef ARM_MATH_BIG_ENDIAN
214       x1 = __PKHBT(x0, x2, 0);
215 #else
216       x1 = __PKHBT(x2, x0, 0);
217 #endif
218 
219       /* Perform the multiply-accumulates */
220       acc3 = __SMLADX(x1, c0, acc3);
221     }
222 
223     /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
224        Then store the 4 outputs in the destination buffer. */
225 #ifndef ARM_MATH_BIG_ENDIAN
226     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
227     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
228 #else
229     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
230     write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
231 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
232 
233     /* Advance the state pointer by 4 to process the next group of 4 samples */
234     pState = pState + 4U;
235 
236     /* Decrement loop counter */
237     blkCnt--;
238   }
239 
240   /* Loop unrolling: Compute remaining output samples */
241   blkCnt = blockSize % 0x4U;
242 
243 #else
244 
245   /* Initialize blkCnt with number of taps */
246   blkCnt = blockSize;
247 
248 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
249 
250   while (blkCnt > 0U)
251   {
252     /* Copy two samples into state buffer */
253     *pStateCurnt++ = *pSrc++;
254 
255     /* Set the accumulator to zero */
256     acc0 = 0;
257 
258     /* Use SIMD to hold states and coefficients */
259     px = pState;
260     pb = pCoeffs;
261 
262     tapCnt = numTaps >> 1U;
263 
264     do
265     {
266       acc0 += (q31_t) *px++ * *pb++;
267 	  acc0 += (q31_t) *px++ * *pb++;
268 
269       tapCnt--;
270     }
271     while (tapCnt > 0U);
272 
273     /* The result is in 2.30 format. Convert to 1.15 with saturation.
274        Then store the output in the destination buffer. */
275     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
276 
277     /* Advance state pointer by 1 for the next sample */
278     pState = pState + 1U;
279 
280     /* Decrement loop counter */
281     blkCnt--;
282   }
283 
284   /* Processing is complete.
285      Now copy the last numTaps - 1 samples to the start of the state buffer.
286      This prepares the state buffer for the next function call. */
287 
288   /* Points to the start of the state buffer */
289   pStateCurnt = S->pState;
290 
291 #if defined (ARM_MATH_LOOPUNROLL)
292 
293   /* Loop unrolling: Compute 4 taps at a time */
294   tapCnt = (numTaps - 1U) >> 2U;
295 
296   /* Copy data */
297   while (tapCnt > 0U)
298   {
299     *pStateCurnt++ = *pState++;
300     *pStateCurnt++ = *pState++;
301     *pStateCurnt++ = *pState++;
302     *pStateCurnt++ = *pState++;
303 
304     /* Decrement loop counter */
305     tapCnt--;
306   }
307 
308   /* Calculate remaining number of copies */
309   tapCnt = (numTaps - 1U) % 0x4U;
310 
311 #else
312 
313   /* Initialize tapCnt with number of taps */
314   tapCnt = (numTaps - 1U);
315 
316 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
317 
318   /* Copy remaining data */
319   while (tapCnt > 0U)
320   {
321     *pStateCurnt++ = *pState++;
322 
323     /* Decrement loop counter */
324     tapCnt--;
325   }
326 
327 }
328 
329 /**
330   @} end of FIR group
331  */
332