1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_dct4_q31.c
4  * Description:  Processing function of DCT4 & IDCT4 Q31
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 
31 /**
32   @addtogroup DCT4Q31
33   @{
34  */
35 
36 /**
37   @brief         Processing function for the Q31 DCT4/IDCT4.
38   @deprecated    Do not use this function. It will be removed in future versions.
39   @param[in]     S             points to an instance of the Q31 DCT4 structure.
40   @param[in]     pState        points to state buffer.
41   @param[in,out] pInlineBuffer points to the in-place input and output buffer.
42 
43   @par           Input an output formats
44                    Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
45                    as the conversion from DCT2 to DCT4 involves one subtraction.
46                    Internally inputs are downscaled in the RFFT process function to avoid overflows.
47                    Number of bits downscaled, depends on the size of the transform.
48                    The input and output formats for different DCT sizes and number of bits to upscale are
49                    mentioned in the table below:
50 
51 | DCT Size  | Input format  | Output format | Number of bits to upscale |
52 | --------: | ------------: | ------------: | ------------------------: |
53 | 2048      | 2.30          | 12.20         | 11                        |
54 | 512       | 2.30          | 10.22         | 9                         |
55 | 128       | 2.30          | 8.24          | 7                         |
56 
57  */
58 
arm_dct4_q31(const arm_dct4_instance_q31 * S,q31_t * pState,q31_t * pInlineBuffer)59 void arm_dct4_q31(
60   const arm_dct4_instance_q31 * S,
61         q31_t * pState,
62         q31_t * pInlineBuffer)
63 {
64   const q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
65   const q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
66         q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
67         q31_t in;                                      /* Temporary variable */
68         uint32_t i;                                    /* Loop counter */
69 
70 
71   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
72    * along with some pre-processing and post-processing.
73    * Computational procedure is explained as follows:
74    * (a) Pre-processing involves multiplying input with cos factor,
75    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
76    *              where,
77    *                 r(n) -- output of preprocessing
78    *                 u(n) -- input to preprocessing(actual Source buffer)
79    * (b) Calculation of DCT2 using FFT is divided into three steps:
80    *                  Step1: Re-ordering of even and odd elements of input.
81    *                  Step2: Calculating FFT of the re-ordered input.
82    *                  Step3: Taking the real part of the product of FFT output and weights.
83    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
84    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
85    *                        where,
86    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
87    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
88    */
89 
90   /*-------- Pre-processing ------------*/
91   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
92   arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
93   arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
94 
95   /* ----------------------------------------------------------------
96    * Step1: Re-ordering of even and odd elements as
97    *             pState[i] =  pInlineBuffer[2*i] and
98    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
99    ---------------------------------------------------------------------*/
100 
101   /* pS1 initialized to pState */
102   pS1 = pState;
103 
104   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
105   pS2 = pState + (S->N - 1U);
106 
107   /* pbuff initialized to input buffer */
108   pbuff = pInlineBuffer;
109 
110 
111 #if defined (ARM_MATH_LOOPUNROLL)
112 
113   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
114   i = S->Nby2 >> 2U;
115 
116   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
117    ** a second loop below computes the remaining 1 to 3 samples. */
118   do
119   {
120     /* Re-ordering of even and odd elements */
121     /* pState[i] =  pInlineBuffer[2*i] */
122     *pS1++ = *pbuff++;
123     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
124     *pS2-- = *pbuff++;
125 
126     *pS1++ = *pbuff++;
127     *pS2-- = *pbuff++;
128 
129     *pS1++ = *pbuff++;
130     *pS2-- = *pbuff++;
131 
132     *pS1++ = *pbuff++;
133     *pS2-- = *pbuff++;
134 
135     /* Decrement loop counter */
136     i--;
137   } while (i > 0U);
138 
139   /* pbuff initialized to input buffer */
140   pbuff = pInlineBuffer;
141 
142   /* pS1 initialized to pState */
143   pS1 = pState;
144 
145   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
146   i = S->N >> 2U;
147 
148   /* Processing with loop unrolling 4 times as N is always multiple of 4.
149    * Compute 4 outputs at a time */
150   do
151   {
152     /* Writing the re-ordered output back to inplace input buffer */
153     *pbuff++ = *pS1++;
154     *pbuff++ = *pS1++;
155     *pbuff++ = *pS1++;
156     *pbuff++ = *pS1++;
157 
158     /* Decrement the loop counter */
159     i--;
160   } while (i > 0U);
161 
162 
163   /* ---------------------------------------------------------
164    *     Step2: Calculate RFFT for N-point input
165    * ---------------------------------------------------------- */
166   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
167   arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
168 
169   /*----------------------------------------------------------------------
170    *  Step3: Multiply the FFT output with the weights.
171    *----------------------------------------------------------------------*/
172   arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
173 
174   /* The output of complex multiplication is in 3.29 format.
175    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
176   arm_shift_q31 (pState, 2, pState, S->N * 2);
177 
178   /* ----------- Post-processing ---------- */
179   /* DCT-IV can be obtained from DCT-II by the equation,
180    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
181    *       Hence, Y4(0) = Y2(0)/2  */
182   /* Getting only real part from the output and Converting to DCT-IV */
183 
184   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
185   i = (S->N - 1U) >> 2U;
186 
187   /* pbuff initialized to input buffer. */
188   pbuff = pInlineBuffer;
189 
190   /* pS1 initialized to pState */
191   pS1 = pState;
192 
193   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
194   in = *pS1++ >> 1U;
195   /* input buffer acts as inplace, so output values are stored in the input itself. */
196   *pbuff++ = in;
197 
198   /* pState pointer is incremented twice as the real values are located alternatively in the array */
199   pS1++;
200 
201   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
202    ** a second loop below computes the remaining 1 to 3 samples. */
203   do
204   {
205     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
206     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
207     in = *pS1++ - in;
208     *pbuff++ = in;
209     /* points to the next real value */
210     pS1++;
211 
212     in = *pS1++ - in;
213     *pbuff++ = in;
214     pS1++;
215 
216     in = *pS1++ - in;
217     *pbuff++ = in;
218     pS1++;
219 
220     in = *pS1++ - in;
221     *pbuff++ = in;
222     pS1++;
223 
224     /* Decrement the loop counter */
225     i--;
226   } while (i > 0U);
227 
228   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
229    ** No loop unrolling is used. */
230   i = (S->N - 1U) % 0x4U;
231 
232   while (i > 0U)
233   {
234     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
235     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
236     in = *pS1++ - in;
237     *pbuff++ = in;
238 
239     /* points to the next real value */
240     pS1++;
241 
242     /* Decrement loop counter */
243     i--;
244   }
245 
246 
247   /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
248 
249   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
250   i = S->N >> 2U;
251 
252   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
253   pbuff = pInlineBuffer;
254 
255   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
256   do
257   {
258     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
259     in = *pbuff;
260     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
261 
262     in = *pbuff;
263     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
264 
265     in = *pbuff;
266     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
267 
268     in = *pbuff;
269     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
270 
271     /* Decrement loop counter */
272     i--;
273   } while (i > 0U);
274 
275 
276 #else
277 
278   /* Initializing the loop counter to N/2 */
279   i = S->Nby2;
280 
281   do
282   {
283     /* Re-ordering of even and odd elements */
284     /* pState[i] =  pInlineBuffer[2*i] */
285     *pS1++ = *pbuff++;
286     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
287     *pS2-- = *pbuff++;
288 
289     /* Decrement the loop counter */
290     i--;
291   } while (i > 0U);
292 
293   /* pbuff initialized to input buffer */
294   pbuff = pInlineBuffer;
295 
296   /* pS1 initialized to pState */
297   pS1 = pState;
298 
299   /* Initializing the loop counter */
300   i = S->N;
301 
302   do
303   {
304     /* Writing the re-ordered output back to inplace input buffer */
305     *pbuff++ = *pS1++;
306 
307     /* Decrement the loop counter */
308     i--;
309   } while (i > 0U);
310 
311 
312   /* ---------------------------------------------------------
313    *     Step2: Calculate RFFT for N-point input
314    * ---------------------------------------------------------- */
315   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
316   arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
317 
318   /*----------------------------------------------------------------------
319    *  Step3: Multiply the FFT output with the weights.
320    *----------------------------------------------------------------------*/
321   arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
322 
323   /* The output of complex multiplication is in 3.29 format.
324    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
325   arm_shift_q31(pState, 2, pState, S->N * 2);
326 
327   /* ----------- Post-processing ---------- */
328   /* DCT-IV can be obtained from DCT-II by the equation,
329    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
330    *       Hence, Y4(0) = Y2(0)/2  */
331   /* Getting only real part from the output and Converting to DCT-IV */
332 
333   /* pbuff initialized to input buffer. */
334   pbuff = pInlineBuffer;
335 
336   /* pS1 initialized to pState */
337   pS1 = pState;
338 
339   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
340   in = *pS1++ >> 1U;
341   /* input buffer acts as inplace, so output values are stored in the input itself. */
342   *pbuff++ = in;
343 
344   /* pState pointer is incremented twice as the real values are located alternatively in the array */
345   pS1++;
346 
347   /* Initializing the loop counter */
348   i = (S->N - 1U);
349 
350   while (i > 0U)
351   {
352     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
353     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
354     in = *pS1++ - in;
355     *pbuff++ = in;
356 
357     /* points to the next real value */
358     pS1++;
359 
360     /* Decrement loop counter */
361     i--;
362   }
363 
364   /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
365 
366   /* Initializing loop counter */
367   i = S->N;
368 
369   /* pbuff initialized to the pInlineBuffer (now contains the output values) */
370   pbuff = pInlineBuffer;
371 
372   do
373   {
374     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
375     in = *pbuff;
376     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
377 
378     /* Decrement loop counter */
379     i--;
380   } while (i > 0U);
381 
382 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
383 
384 }
385 
386 /**
387   @} end of DCT4Q31 group
388  */
389