1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_dct4_q31.c
4 * Description: Processing function of DCT4 & IDCT4 Q31
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions.h"
30
31 /**
32 @addtogroup DCT4Q31
33 @{
34 */
35
36 /**
37 @brief Processing function for the Q31 DCT4/IDCT4.
38 @deprecated Do not use this function. It will be removed in future versions.
39 @param[in] S points to an instance of the Q31 DCT4 structure.
40 @param[in] pState points to state buffer.
41 @param[in,out] pInlineBuffer points to the in-place input and output buffer.
42
43 @par Input an output formats
44 Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
45 as the conversion from DCT2 to DCT4 involves one subtraction.
46 Internally inputs are downscaled in the RFFT process function to avoid overflows.
47 Number of bits downscaled, depends on the size of the transform.
48 The input and output formats for different DCT sizes and number of bits to upscale are
49 mentioned in the table below:
50
51 | DCT Size | Input format | Output format | Number of bits to upscale |
52 | --------: | ------------: | ------------: | ------------------------: |
53 | 2048 | 2.30 | 12.20 | 11 |
54 | 512 | 2.30 | 10.22 | 9 |
55 | 128 | 2.30 | 8.24 | 7 |
56
57 */
58
arm_dct4_q31(const arm_dct4_instance_q31 * S,q31_t * pState,q31_t * pInlineBuffer)59 void arm_dct4_q31(
60 const arm_dct4_instance_q31 * S,
61 q31_t * pState,
62 q31_t * pInlineBuffer)
63 {
64 const q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */
65 const q31_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
66 q31_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
67 q31_t in; /* Temporary variable */
68 uint32_t i; /* Loop counter */
69
70
71 /* DCT4 computation involves DCT2 (which is calculated using RFFT)
72 * along with some pre-processing and post-processing.
73 * Computational procedure is explained as follows:
74 * (a) Pre-processing involves multiplying input with cos factor,
75 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
76 * where,
77 * r(n) -- output of preprocessing
78 * u(n) -- input to preprocessing(actual Source buffer)
79 * (b) Calculation of DCT2 using FFT is divided into three steps:
80 * Step1: Re-ordering of even and odd elements of input.
81 * Step2: Calculating FFT of the re-ordered input.
82 * Step3: Taking the real part of the product of FFT output and weights.
83 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
84 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
85 * where,
86 * Y4 -- DCT4 output, Y2 -- DCT2 output
87 * (d) Multiplying the output with the normalizing factor sqrt(2/N).
88 */
89
90 /*-------- Pre-processing ------------*/
91 /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
92 arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
93 arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
94
95 /* ----------------------------------------------------------------
96 * Step1: Re-ordering of even and odd elements as
97 * pState[i] = pInlineBuffer[2*i] and
98 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
99 ---------------------------------------------------------------------*/
100
101 /* pS1 initialized to pState */
102 pS1 = pState;
103
104 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
105 pS2 = pState + (S->N - 1U);
106
107 /* pbuff initialized to input buffer */
108 pbuff = pInlineBuffer;
109
110
111 #if defined (ARM_MATH_LOOPUNROLL)
112
113 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
114 i = S->Nby2 >> 2U;
115
116 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
117 ** a second loop below computes the remaining 1 to 3 samples. */
118 do
119 {
120 /* Re-ordering of even and odd elements */
121 /* pState[i] = pInlineBuffer[2*i] */
122 *pS1++ = *pbuff++;
123 /* pState[N-i-1] = pInlineBuffer[2*i+1] */
124 *pS2-- = *pbuff++;
125
126 *pS1++ = *pbuff++;
127 *pS2-- = *pbuff++;
128
129 *pS1++ = *pbuff++;
130 *pS2-- = *pbuff++;
131
132 *pS1++ = *pbuff++;
133 *pS2-- = *pbuff++;
134
135 /* Decrement loop counter */
136 i--;
137 } while (i > 0U);
138
139 /* pbuff initialized to input buffer */
140 pbuff = pInlineBuffer;
141
142 /* pS1 initialized to pState */
143 pS1 = pState;
144
145 /* Initializing the loop counter to N/4 instead of N for loop unrolling */
146 i = S->N >> 2U;
147
148 /* Processing with loop unrolling 4 times as N is always multiple of 4.
149 * Compute 4 outputs at a time */
150 do
151 {
152 /* Writing the re-ordered output back to inplace input buffer */
153 *pbuff++ = *pS1++;
154 *pbuff++ = *pS1++;
155 *pbuff++ = *pS1++;
156 *pbuff++ = *pS1++;
157
158 /* Decrement the loop counter */
159 i--;
160 } while (i > 0U);
161
162
163 /* ---------------------------------------------------------
164 * Step2: Calculate RFFT for N-point input
165 * ---------------------------------------------------------- */
166 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
167 arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
168
169 /*----------------------------------------------------------------------
170 * Step3: Multiply the FFT output with the weights.
171 *----------------------------------------------------------------------*/
172 arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
173
174 /* The output of complex multiplication is in 3.29 format.
175 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
176 arm_shift_q31 (pState, 2, pState, S->N * 2);
177
178 /* ----------- Post-processing ---------- */
179 /* DCT-IV can be obtained from DCT-II by the equation,
180 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
181 * Hence, Y4(0) = Y2(0)/2 */
182 /* Getting only real part from the output and Converting to DCT-IV */
183
184 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
185 i = (S->N - 1U) >> 2U;
186
187 /* pbuff initialized to input buffer. */
188 pbuff = pInlineBuffer;
189
190 /* pS1 initialized to pState */
191 pS1 = pState;
192
193 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
194 in = *pS1++ >> 1U;
195 /* input buffer acts as inplace, so output values are stored in the input itself. */
196 *pbuff++ = in;
197
198 /* pState pointer is incremented twice as the real values are located alternatively in the array */
199 pS1++;
200
201 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
202 ** a second loop below computes the remaining 1 to 3 samples. */
203 do
204 {
205 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
206 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
207 in = *pS1++ - in;
208 *pbuff++ = in;
209 /* points to the next real value */
210 pS1++;
211
212 in = *pS1++ - in;
213 *pbuff++ = in;
214 pS1++;
215
216 in = *pS1++ - in;
217 *pbuff++ = in;
218 pS1++;
219
220 in = *pS1++ - in;
221 *pbuff++ = in;
222 pS1++;
223
224 /* Decrement the loop counter */
225 i--;
226 } while (i > 0U);
227
228 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
229 ** No loop unrolling is used. */
230 i = (S->N - 1U) % 0x4U;
231
232 while (i > 0U)
233 {
234 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
235 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
236 in = *pS1++ - in;
237 *pbuff++ = in;
238
239 /* points to the next real value */
240 pS1++;
241
242 /* Decrement loop counter */
243 i--;
244 }
245
246
247 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
248
249 /* Initializing the loop counter to N/4 instead of N for loop unrolling */
250 i = S->N >> 2U;
251
252 /* pbuff initialized to the pInlineBuffer(now contains the output values) */
253 pbuff = pInlineBuffer;
254
255 /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
256 do
257 {
258 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
259 in = *pbuff;
260 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
261
262 in = *pbuff;
263 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
264
265 in = *pbuff;
266 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
267
268 in = *pbuff;
269 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
270
271 /* Decrement loop counter */
272 i--;
273 } while (i > 0U);
274
275
276 #else
277
278 /* Initializing the loop counter to N/2 */
279 i = S->Nby2;
280
281 do
282 {
283 /* Re-ordering of even and odd elements */
284 /* pState[i] = pInlineBuffer[2*i] */
285 *pS1++ = *pbuff++;
286 /* pState[N-i-1] = pInlineBuffer[2*i+1] */
287 *pS2-- = *pbuff++;
288
289 /* Decrement the loop counter */
290 i--;
291 } while (i > 0U);
292
293 /* pbuff initialized to input buffer */
294 pbuff = pInlineBuffer;
295
296 /* pS1 initialized to pState */
297 pS1 = pState;
298
299 /* Initializing the loop counter */
300 i = S->N;
301
302 do
303 {
304 /* Writing the re-ordered output back to inplace input buffer */
305 *pbuff++ = *pS1++;
306
307 /* Decrement the loop counter */
308 i--;
309 } while (i > 0U);
310
311
312 /* ---------------------------------------------------------
313 * Step2: Calculate RFFT for N-point input
314 * ---------------------------------------------------------- */
315 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
316 arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
317
318 /*----------------------------------------------------------------------
319 * Step3: Multiply the FFT output with the weights.
320 *----------------------------------------------------------------------*/
321 arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
322
323 /* The output of complex multiplication is in 3.29 format.
324 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
325 arm_shift_q31(pState, 2, pState, S->N * 2);
326
327 /* ----------- Post-processing ---------- */
328 /* DCT-IV can be obtained from DCT-II by the equation,
329 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
330 * Hence, Y4(0) = Y2(0)/2 */
331 /* Getting only real part from the output and Converting to DCT-IV */
332
333 /* pbuff initialized to input buffer. */
334 pbuff = pInlineBuffer;
335
336 /* pS1 initialized to pState */
337 pS1 = pState;
338
339 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
340 in = *pS1++ >> 1U;
341 /* input buffer acts as inplace, so output values are stored in the input itself. */
342 *pbuff++ = in;
343
344 /* pState pointer is incremented twice as the real values are located alternatively in the array */
345 pS1++;
346
347 /* Initializing the loop counter */
348 i = (S->N - 1U);
349
350 while (i > 0U)
351 {
352 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
353 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
354 in = *pS1++ - in;
355 *pbuff++ = in;
356
357 /* points to the next real value */
358 pS1++;
359
360 /* Decrement loop counter */
361 i--;
362 }
363
364 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
365
366 /* Initializing loop counter */
367 i = S->N;
368
369 /* pbuff initialized to the pInlineBuffer (now contains the output values) */
370 pbuff = pInlineBuffer;
371
372 do
373 {
374 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
375 in = *pbuff;
376 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
377
378 /* Decrement loop counter */
379 i--;
380 } while (i > 0U);
381
382 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
383
384 }
385
386 /**
387 @} end of DCT4Q31 group
388 */
389