1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_rfft_q15.c
4  * Description:  RFFT & RIFFT Q15 process function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 
31 /* ----------------------------------------------------------------------
32  * Internal functions prototypes
33  * -------------------------------------------------------------------- */
34 
35 void arm_split_rfft_q15(
36         q15_t * pSrc,
37         uint32_t fftLen,
38   const q15_t * pATable,
39   const q15_t * pBTable,
40         q15_t * pDst,
41         uint32_t modifier);
42 
43 void arm_split_rifft_q15(
44         q15_t * pSrc,
45         uint32_t fftLen,
46   const q15_t * pATable,
47   const q15_t * pBTable,
48         q15_t * pDst,
49         uint32_t modifier);
50 
51 /**
52   @addtogroup RealFFTQ15
53   @{
54  */
55 
56 /**
57   @brief         Processing function for the Q15 RFFT/RIFFT.
58   @param[in]     S     points to an instance of the Q15 RFFT/RIFFT structure
59   @param[in]     pSrc  points to input buffer (Source buffer is modified by this function.)
60   @param[out]    pDst  points to output buffer
61 
  @par           Input and output formats
63                    Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
64                    Hence the output format is different for different RFFT sizes.
65                    The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
66   @par             Input and Output formats for RFFT Q15
67 
68 | RFFT Size  | Input Format  | Output Format  | Number of bits to upscale |
69 | ---------: | ------------: | -------------: | ------------------------: |
70 | 32         | 1.15          | 6.10           | 5                         |
71 | 64         | 1.15          | 7.9            | 6                         |
72 | 128        | 1.15          | 8.8            | 7                         |
73 | 256        | 1.15          | 9.7            | 8                         |
74 | 512        | 1.15          | 10.6           | 9                         |
75 | 1024       | 1.15          | 11.5           | 10                        |
76 | 2048       | 1.15          | 12.4           | 11                        |
77 | 4096       | 1.15          | 13.3           | 12                        |
78 | 8192       | 1.15          | 14.2           | 13                        |
79 
80   @par             Input and Output formats for RIFFT Q15
81 
82 | RIFFT Size  | Input Format  | Output Format  | Number of bits to upscale |
83 | ----------: | ------------: | -------------: | ------------------------: |
84 | 32          | 1.15          | 6.10           | 0                         |
85 | 64          | 1.15          | 7.9            | 0                         |
86 | 128         | 1.15          | 8.8            | 0                         |
87 | 256         | 1.15          | 9.7            | 0                         |
88 | 512         | 1.15          | 10.6           | 0                         |
89 | 1024        | 1.15          | 11.5           | 0                         |
90 | 2048        | 1.15          | 12.4           | 0                         |
91 | 4096        | 1.15          | 13.3           | 0                         |
92 | 8192        | 1.15          | 14.2           | 0                         |
93 
  @par
                   If the input buffer is of length N (fftLenReal), the output buffer must have length 2N
                   since it contains the conjugate part (except for the MVE version, where N+2 is enough).
                   The input buffer is modified by this function.
  @par
                   For the RIFFT, the source buffer must have length N+2 since the Nyquist frequency value
                   is needed, but the conjugate part is ignored.
                   It does not use the packing trick of the float version.
102  */
103 
void arm_rfft_q15(
  const arm_rfft_instance_q15 * S,
        q15_t * pSrc,
        q15_t * pDst)
{
  /* Complex FFT instance: embedded in S for the MVE build, reached through S->pCfft otherwise */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
  const arm_cfft_instance_q15 *S_CFFT = &(S->cfftInst);
#else
  const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
#endif
  /* Length handed to the split stages: half of the real FFT length */
  const uint32_t L2 = S->fftLenReal >> 1U;

  if (S->ifftFlagR == 1U)
  {
     /* RIFFT: split stage first (pSrc -> pDst) ... */
     arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);

     /* ... then the complex IFFT, in place on pDst */
     arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);

     /* Final in-place arm_shift_q15 by 1 bit over the fftLenReal output samples */
     arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
  }
  else
  {
     /* RFFT: complex FFT in place on pSrc (this is why the source buffer is modified) ... */
     arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);

     /* ... then the split stage (pSrc -> pDst) */
     arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
  }

}
139 
140 /**
141   @} end of RealFFTQ15 group
142  */
143 
144 /**
145   @brief         Core Real FFT process
146   @param[in]     pSrc      points to input buffer
147   @param[in]     fftLen    length of FFT
148   @param[in]     pATable   points to twiddle Coef A buffer
149   @param[in]     pBTable   points to twiddle Coef B buffer
150   @param[out]    pDst      points to output buffer
151   @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
152 
153   @par
154                    The function implements a Real FFT
155  */
156 
157 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
158 
159 #include "arm_helium_utils.h"
160 #include "arm_vec_fft.h"
161 
#if defined(__CMSIS_GCC_H)
/*
 * GCC builds: Q15 complex products spelled with the explicitly typed _s16
 * intrinsics. As the names indicate, the first helper is A x B and the second
 * is A x conj(B); both seed the accumulator with vuninitializedq_s16().
 */
#define MVE_CMPLX_MULT_FX_AxB_S16(A,B) \
    vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B) \
    vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)

#endif
167 
void arm_split_rfft_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pATable,
  const q15_t * pBTable,
        q15_t * pDst,
        uint32_t modifier)
{
    /*
     * Gather-offset seeds for the twiddle tables, four complex values per vector:
     *   first row  - complex index pattern, scaled by modifier below
     *   second row - selects the real (0) / imaginary (1) half of each pair
     */
    const uint16_t  offsetCoefArr[16] = {
        0, 0, 2, 2, 4, 4, 6, 6,
        0, 1, 0, 1, 0, 1, 0, 1
    };
    /* Twiddle tables are entered at bin 1; bin 0 is handled separately after the loop */
    const q15_t    *pCoefA = &pATable[modifier * 2];
    const q15_t    *pCoefB = &pBTable[modifier * 2];
    q15_t          *pIn1 = &pSrc[2];    /* ascending input, starting at bin 1 */
    q15_t          *pOut1 = &pDst[2];   /* ascending output, starting at bin 1 */
    /* Pair order is reversed so that in2 walks the bins downwards while in1 walks upwards */
    uint16x8_t      offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
    uint16x8_t      offsetCoef;
    uint32_t        i;                  /* Loop counter */

    offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
    /* First gather covers pSrc[2*fftLen-8 .. 2*fftLen-1], i.e. bins fftLen-1 down to fftLen-4 */
    offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));

    /*
     * Four bins per iteration.
     * NOTE(review): when fftLen is a multiple of 4 the last iteration also
     * processes bin fftLen: it loads pSrc[2*fftLen] / pSrc[2*fftLen+1] and
     * stores to pDst[2*fftLen] / pDst[2*fftLen+1]; those two outputs are
     * overwritten by the Nyquist assignment below. Confirm that the caller's
     * buffers and twiddle tables tolerate the extra read.
     */
    for (i = (fftLen - 1U) / 4 + 1; i > 0U; i--) {
        q15x8_t         in1 = vld1q_s16(pIn1);
        q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
        q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefA, offsetCoef);
        q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefB, offsetCoef);

        /* out = (in1 x coefA + coefB x conj(in2)) / 2, the halving coming from vhaddq */
#if defined(__CMSIS_GCC_H)
        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA),
                                     MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2));
#else
        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA, q15x8_t),
                                         MVE_CMPLX_MULT_FX_AxConjB(coefB, in2, q15x8_t));
#endif
        vst1q_s16(pOut1, out);
        pOut1 += 8;

        /* Advance four complex twiddles and move the mirrored input window four bins down */
        offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
        offsetIn -= 8;
        pIn1 += 8;
    }

    /* Bin fftLen (Nyquist) and bin 0 (DC) are purely real */
    pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
    pDst[2 * fftLen + 1] = 0;

    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
    pDst[1] = 0;
}
230 #else
void arm_split_rfft_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pATable,
  const q15_t * pBTable,
        q15_t * pDst,
        uint32_t modifier)
{
        uint32_t i;                                    /* Loop counter */
        q31_t outR, outI;                              /* Accumulators for the real / imaginary output */
  /* Twiddle tables are entered at bin 1; bin 0 is handled separately after the loop */
  const q15_t *pCoefA = &pATable[modifier * 2];
  const q15_t *pCoefB = &pBTable[modifier * 2];
        q15_t *pSrc1 = &pSrc[2];                       /* ascending input:  bin i          */
        q15_t *pSrc2 = &pSrc[(2U * fftLen) - 2U];      /* descending input: bin fftLen - i */

  /*
    For every bin i in 1 .. fftLen-1 (n = fftLen):

      outR = (  pSrc[2 * i]             * pATable[2 * i]
              - pSrc[2 * i + 1]         * pATable[2 * i + 1]
              + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
              + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);

      outI = (  pSrc[2 * i + 1]         * pATable[2 * i]
              + pSrc[2 * i]             * pATable[2 * i + 1]
              + pSrc[2 * n - 2 * i]     * pBTable[2 * i + 1]
              - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i]);

    The upper 16 bits of each sum are stored at bin i, and the complex
    conjugate is stored at the mirrored position 2 * fftLen - i.
   */

#if defined (ARM_MATH_DSP)

        q15_t *pD1 = pDst + 2;                         /* ascending output: bin i            */
        q15_t *pD2 = pDst + (4U * fftLen) - 2;         /* descending output: conjugate mirror */

    for (i = fftLen - 1; i > 0; i--)
    {
#ifndef ARM_MATH_BIG_ENDIAN
        /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
        outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
#else
        /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
        outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* + pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
        outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;

        /* pSrc[2 * n - 2 * i] * pBTable[2 * i + 1] - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i];
           the _da read also steps pSrc2 to the next lower bin */
#ifndef ARM_MATH_BIG_ENDIAN
        outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
#else
        outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* + pSrc[2 * i + 1] * pATable[2 * i] + pSrc[2 * i] * pATable[2 * i + 1];
           the _ia read also steps pSrc1 to the next higher bin */
        outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);

        /* write output */
        *pD1++ = (q15_t) outR;
        *pD1++ = outI >> 16U;

        /* write complex conjugate output */
        pD2[0] = (q15_t) outR;
        pD2[1] = -(outI >> 16U);
        pD2 -= 2;

        /* update coefficient pointers */
        pCoefB = pCoefB + (2U * modifier);
        pCoefA = pCoefA + (2U * modifier);
    }

#else

    for (i = 1U; i < fftLen; i++)
    {
        outR = pSrc1[0] * pCoefA[0];
        outR = outR - (pSrc1[1] * pCoefA[1]);
        outR = outR + (pSrc2[0] * pCoefB[0]);
        outR = (outR + (pSrc2[1] * pCoefB[1])) >> 16;

        outI = pSrc2[0] * pCoefB[1];
        outI = outI - (pSrc2[1] * pCoefB[0]);
        outI = outI + (pSrc1[1] * pCoefA[0]);
        outI = outI + (pSrc1[0] * pCoefA[1]);

        /* update input pointers */
        pSrc1 += 2U;
        pSrc2 -= 2U;

        /* write output */
        pDst[2U * i] = (q15_t) outR;
        pDst[2U * i + 1U] = outI >> 16U;

        /* write complex conjugate output */
        pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
        pDst[((4U * fftLen) - (2U * i)) + 1U] = -(outI >> 16U);

        /* update coefficient pointers */
        pCoefB = pCoefB + (2U * modifier);
        pCoefA = pCoefA + (2U * modifier);
    }

#endif /* #if defined (ARM_MATH_DSP) */

    /* Bin fftLen (Nyquist) and bin 0 (DC) are purely real */
    pDst[2U * fftLen]      = (pSrc[0] - pSrc[1]) >> 1U;
    pDst[2U * fftLen + 1U] = 0;

    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
    pDst[1] = 0;
}
373 #endif /* defined(ARM_MATH_MVEI) */
374 
375 /**
376   @brief         Core Real IFFT process
377   @param[in]     pSrc      points to input buffer
378   @param[in]     fftLen    length of FFT
379   @param[in]     pATable   points to twiddle Coef A buffer
380   @param[in]     pBTable   points to twiddle Coef B buffer
381   @param[out]    pDst      points to output buffer
382   @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
383 
384   @par
385                    The function implements a Real IFFT
386  */
387 
388 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
389 
390 #include "arm_helium_utils.h"
391 #include "arm_vec_fft.h"
392 
void arm_split_rifft_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pATable,
  const q15_t * pBTable,
        q15_t * pDst,
        uint32_t modifier)
{
    /*
     * Gather-offset seeds for the twiddle tables, four complex values per vector:
     *   first row  - complex index pattern, scaled by modifier below
     *   second row - selects the real (0) / imaginary (1) half of each pair
     */
    const uint16_t  offsetCoefArr[16] = {
        0, 0, 2, 2, 4, 4, 6, 6,
        0, 1, 0, 1, 0, 1, 0, 1
    };
    /* Both twiddle tables are walked from their first entry (bin 0) */
    const q15_t    *pCoefA = &pATable[0];
    const q15_t    *pCoefB = &pBTable[0];
    q15_t          *pIn1 = &pSrc[0];    /* ascending input, starting at bin 0 */
    /* Pair order is reversed so that in2 walks the bins downwards while in1 walks upwards */
    uint16x8_t      offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
    uint16x8_t      offsetCoef;
    int16x8_t       conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* negates the imaginary lanes */
    uint32_t        i;                  /* Loop counter */

    offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);

    /* First gather covers pSrc[2*fftLen-6 .. 2*fftLen+1], i.e. bins fftLen down to fftLen-3 */
    offset = vaddq_n_u16(offset, (2 * fftLen - 6));

    /* Four bins per iteration */
    for (i = fftLen / 4; i > 0U; i--) {
        q15x8_t         in1 = vld1q_s16(pIn1);
        q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
        q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefA, offsetCoef);
        q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefB, offsetCoef);

        /* out = (in1 x conj(coefA) + conj(in2 x coefB)) / 2, the halving coming from vhaddq */
        /* can we avoid the conjugate here ? */
#if defined(__CMSIS_GCC_H)
        /* Same explicitly typed helpers the forward split routine uses for GCC builds */
        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB_S16(in1, coefA),
                                     vmulq(conj, MVE_CMPLX_MULT_FX_AxB_S16(in2, coefB)));
#else
        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA, q15x8_t),
                                         vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB, q15x8_t)));
#endif

        vst1q_s16(pDst, out);
        pDst += 8;

        /* Advance four complex twiddles and move the mirrored input window four bins down */
        offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
        offset -= 8;

        pIn1 += 8;
    }
}
449 #else
void arm_split_rifft_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pATable,
  const q15_t * pBTable,
        q15_t * pDst,
        uint32_t modifier)
{
        uint32_t i;                                    /* Loop counter */
        q31_t outR, outI;                              /* Accumulators for the real / imaginary output */
  /* Both twiddle tables are walked from their first entry (bin 0) */
  const q15_t *pCoefA = &pATable[0];
  const q15_t *pCoefB = &pBTable[0];
        q15_t *pSrc1 = &pSrc[0];                       /* ascending input:  bin i                  */
        q15_t *pSrc2 = &pSrc[2 * fftLen];              /* descending input: starts at bin fftLen   */
        q15_t *pDst1 = &pDst[0];                       /* ascending output                         */

  /*
    For every bin i in 0 .. fftLen-1 (n = fftLen):

      outR = (  pSrc[2 * i]             * pATable[2 * i]
              + pSrc[2 * i + 1]         * pATable[2 * i + 1]
              + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
              - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);

      outI = (  pSrc[2 * i + 1]         * pATable[2 * i]
              - pSrc[2 * i]             * pATable[2 * i + 1]
              - pSrc[2 * n - 2 * i]     * pBTable[2 * i + 1]
              - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i]);

    The upper 16 bits of each sum are stored as the complex output for bin i.
   */
  for (i = fftLen; i > 0U; i--)
  {
#if defined (ARM_MATH_DSP)

#ifndef ARM_MATH_BIG_ENDIAN
      /* pSrc[2 * n - 2 * i] * pBTable[2 * i] - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
      outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
#else
      /* -(-pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
      outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* + pSrc[2 * i] * pATable[2 * i] + pSrc[2 * i + 1] * pATable[2 * i + 1] */
      outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;

      /* pSrc[2 * n - 2 * i] * pBTable[2 * i + 1] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i],
         negated when it is fed into the accumulate below;
         the _da read also steps pSrc2 to the next lower bin */
      outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));

      /* + pSrc[2 * i + 1] * pATable[2 * i] - pSrc[2 * i] * pATable[2 * i + 1];
         the _ia read also steps pSrc1 to the next higher bin.
         The output pair is then packed and stored with a single 32-bit write. */
#ifndef ARM_MATH_BIG_ENDIAN
      outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
      write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
#else
      outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
      write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

#else  /* #if defined (ARM_MATH_DSP) */

      outR = pSrc2[0] * pCoefB[0];
      outR = outR - (pSrc2[1] * pCoefB[1]);
      outR = outR + (pSrc1[0] * pCoefA[0]);
      outR = (outR + (pSrc1[1] * pCoefA[1])) >> 16;

      outI = pSrc1[1] * pCoefA[0];
      outI = outI - (pSrc1[0] * pCoefA[1]);
      outI = outI - (pSrc2[0] * pCoefB[1]);
      outI = outI - (pSrc2[1] * pCoefB[0]);

      /* update input pointers */
      pSrc1 += 2U;
      pSrc2 -= 2U;

      /* write output */
      *pDst1++ = (q15_t) outR;
      *pDst1++ = (q15_t) (outI >> 16);

#endif /* #if defined (ARM_MATH_DSP) */

      /* update coefficient pointers */
      pCoefB = pCoefB + (2 * modifier);
      pCoefA = pCoefA + (2 * modifier);
  }

}
546 #endif /* defined(ARM_MATH_MVEI) */
547