1 // SPDX-License-Identifier: BSD-3-Clause
2 //
3 // Copyright(c) 2017 Intel Corporation. All rights reserved.
4 //
5 // Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
6
7 #include <sof/math/fir_config.h>
8
9 #if FIR_HIFIEP
10
11 #include <sof/audio/format.h>
12 #include <sof/math/fir_hifi2ep.h>
13 #include <user/fir.h>
14 #include <xtensa/config/defs.h>
15 #include <xtensa/tie/xt_hifi2.h>
16 #include <errno.h>
17 #include <stddef.h>
18 #include <stdint.h>
19
20 /*
21 * EQ FIR algorithm code
22 */
23
fir_reset(struct fir_state_32x16 * fir)24 void fir_reset(struct fir_state_32x16 *fir)
25 {
26 fir->taps = 0;
27 fir->length = 0;
28 fir->out_shift = 0;
29 fir->coef = NULL;
30 /* There may need to know the beginning of dynamic allocation after
31 * reset so omitting setting also fir->delay to NULL.
32 */
33 }
34
fir_delay_size(struct sof_fir_coef_data * config)35 int fir_delay_size(struct sof_fir_coef_data *config)
36 {
37 /* Check FIR tap count for implementation specific constraints */
38 if (config->length > SOF_FIR_MAX_LENGTH || config->length < 4)
39 return -EINVAL;
40
41 if (config->length & 0x3)
42 return -EINVAL;
43
44 /* The dual sample version needs one more delay entry. To preserve
45 * align for 64 bits need to add two.
46 */
47 return (config->length + 2) * sizeof(int32_t);
48 }
49
fir_init_coef(struct fir_state_32x16 * fir,struct sof_fir_coef_data * config)50 int fir_init_coef(struct fir_state_32x16 *fir,
51 struct sof_fir_coef_data *config)
52 {
53 /* The length is taps plus two since the filter computes two
54 * samples per call. Length plus one would be minimum but the add
55 * must be even. The even length is needed for 64 bit loads from delay
56 * lines with 32 bit samples.
57 */
58 fir->taps = (int)config->length;
59 fir->length = fir->taps + 2;
60 fir->out_shift = (int)config->out_shift;
61 fir->coef = (ae_p16x2s *)&config->coef[0];
62 return 0;
63 }
64
fir_init_delay(struct fir_state_32x16 * fir,int32_t ** data)65 void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data)
66 {
67 fir->delay = (ae_p24f *)*data;
68 fir->delay_end = fir->delay + fir->length;
69 fir->rwp = (ae_p24x2f *)(fir->delay + fir->length - 1);
70 *data += fir->length; /* Point to next delay line start */
71 }
72
fir_get_lrshifts(struct fir_state_32x16 * fir,int * lshift,int * rshift)73 void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
74 int *rshift)
75 {
76 *lshift = (fir->out_shift < 0) ? -fir->out_shift : 0;
77 *rshift = (fir->out_shift > 0) ? fir->out_shift : 0;
78 }
79
/* HiFi EP has the following number of registers that should not be exceeded:
 * 4x 56 bit registers in register file Q
 * 8x 48 bit registers in register file P
 */
84
/* Process one sample with the FIR filter.
 *
 * fir:    Filter state. The tap count, the coefficient pointer and the
 *         delay line read/write pointer rwp are read from it, and the
 *         input sample is stored to the delay line via rwp.
 * x:      Input sample.
 * y:      Address where the output sample is stored.
 * lshift: Left shift count applied to the accumulator before rshift.
 * rshift: Right shift count applied to the accumulator after lshift.
 *
 * If fir->taps >> 2 is zero the input is copied to the output as such and
 * the delay line is not written.
 */
void fir_32x16_hifiep(struct fir_state_32x16 *fir, int32_t x, int32_t *y, int lshift, int rshift)
{
	/* This function uses
	 * 1x 56 bit registers Q,
	 * 4x 48 bit registers P
	 * 3x integers
	 * 2x address pointers,
	 */
	ae_q56s a;
	ae_p24x2f data2;
	ae_p24x2f coef2;
	ae_p24x2f d0;
	ae_p24x2f d1;
	int i;
	/* The data read pointer is taken from rwp before the store below,
	 * so reading starts from the slot that receives x.
	 */
	ae_p24x2f *dp = fir->rwp;
	ae_p16x2s *coefp = fir->coef;
	/* The loop below handles four taps per iteration */
	const int taps_div_4 = fir->taps >> 2;
	/* Data pointer step in bytes, one 32 bit sample per load */
	const int inc = sizeof(int32_t);

	/* Bypass samples if taps >> 2 is zero, e.g. with zero taps count. */
	if (!taps_div_4) {
		*y = x;
		return;
	}

	/* Write sample to delay. NOTE(review): the _C store is presumably
	 * the circular addressing variant that also updates rwp by the
	 * given negative step - confirm from the HiFi ISA reference.
	 */
	a = AE_CVTQ48A32S(x);
	AE_SQ32F_C(a, (ae_q32s *)fir->rwp, -sizeof(int32_t));

	/* Note: In a version of this function that handles two samples
	 * per call the data load can be done with single instruction
	 * AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
	 */
	a = AE_ZEROQ56();
	for (i = 0; i < taps_div_4; i++) {
		/* Load two coefficients. Coef2_h contains tap coefp[n]
		 * and coef2_l contains coefp[n+1].
		 */
		coef2 = AE_LP16X2F_I(coefp, 0);

		/* Load two data samples and pack to d0 to data2_h and
		 * d1 to data2_l.
		 */
		AE_LP24F_C(d0, dp, inc);
		AE_LP24F_C(d1, dp, inc);
		data2 = AE_SELP24_LL(d0, d1);

		/* Accumulate
		 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
		 * data and Q1.15 coefficients are used as 24 bits as
		 * Q1.23 values.
		 */
		AE_MULAAFP24S_HH_LL(a, data2, coef2);

		/* Repeat the same for next two taps and increase coefp.
		 * The second coefficient pair is read with an offset from
		 * coefp, then coefp is advanced by two pairs (four taps).
		 */
		coef2 = AE_LP16X2F_I(coefp, sizeof(ae_p16x2s));
		AE_LP24F_C(d0, dp, inc);
		AE_LP24F_C(d1, dp, inc);
		data2 = AE_SELP24_LL(d0, d1);
		AE_MULAAFP24S_HH_LL(a, data2, coef2);
		coefp += 2;
	}

	/* Do scaling shifts and store sample. The left shift is applied
	 * first and the right shift after it.
	 */
	a = AE_SRAAQ56(AE_SLLASQ56S(a, lshift), rshift);
	AE_SQ32F_I(AE_ROUNDSQ32SYM(a), (ae_q32s *)y, 0);
}
152
/* HiFi EP has the following number of registers that should not be exceeded:
 * 4x 56 bit registers in register file Q
 * 8x 48 bit registers in register file P
 */
157
/* Process two samples with the FIR filter in one call.
 *
 * fir:    Filter state. The tap count, the coefficient pointer and the
 *         delay line read/write pointer rwp are read from it, and both
 *         input samples are stored to the delay line via rwp.
 * x0:     First input sample, stored to the delay line before x1.
 * x1:     Second input sample.
 * y0:     Address where the output from accumulator a is stored.
 * y1:     Address where the output from accumulator b is stored.
 * lshift: Left shift count applied to the accumulators before rshift.
 * rshift: Right shift count applied to the accumulators after lshift.
 *
 * If fir->taps >> 2 is zero the inputs are copied to the outputs as such
 * and the delay line is not written.
 */
void fir_32x16_2x_hifiep(struct fir_state_32x16 *fir, int32_t x0, int32_t x1,
			 int32_t *y0, int32_t *y1, int lshift, int rshift)
{
	/* This function uses
	 * 2x 56 bit registers Q,
	 * 4x 48 bit registers P
	 * 3x integers
	 * 2x address pointers,
	 */
	ae_q56s a;
	ae_q56s b;
	ae_p24x2f d0;
	ae_p24x2f d1;
	ae_p24x2f d3;
	ae_p24x2f coefs;
	int i;
	ae_p24x2f *dp;
	ae_p16x2s *coefp = fir->coef;
	/* The loop below handles four taps per iteration */
	const int taps_div_4 = fir->taps >> 2;
	/* Data pointer step in bytes, two 32 bit samples per load */
	const int inc = 2 * sizeof(int32_t);

	/* Bypass samples if taps >> 2 is zero, e.g. with zero taps count. */
	if (!taps_div_4) {
		*y0 = x0;
		*y1 = x1;
		return;
	}

	/* Write samples to delay. The data read pointer dp is taken from
	 * rwp after x0 has been stored and before x1 is stored.
	 * NOTE(review): the _C store is presumably the circular addressing
	 * variant that also updates rwp by the given negative step -
	 * confirm from the HiFi ISA reference.
	 */
	a = AE_CVTQ48A32S(x0);
	AE_SQ32F_C(a, (ae_q32s *)fir->rwp, -sizeof(int32_t));
	a = AE_CVTQ48A32S(x1);
	dp = fir->rwp;
	AE_SQ32F_C(a, (ae_q32s *)fir->rwp, -sizeof(int32_t));

	/* Note: Since this function handles two samples per call the
	 * delay line data is loaded as sample pairs with the single
	 * instruction AE_LP24X2F_C().
	 */
	a = AE_ZEROQ56();
	b = AE_ZEROQ56();
	/* Preload the first pair of data samples to d0 before the loop. */
	AE_LP24X2F_C(d0, dp, inc);
	for (i = 0; i < taps_div_4; i++) {
		/* Load two coefficients. Coefs_h contains tap coefp[n]
		 * and coefs_l contains coefp[n+1].
		 */
		coefs = AE_LP16X2F_I(coefp, 0);

		/* Load two data samples. Upper part d1_h is x[n+1] and
		 * lower part d1_l is x[n].
		 */
		AE_LP24X2F_C(d1, dp, inc);

		/* Accumulate
		 * b += d0_h * coefs_h + d0_l * coefs_l. The Q1.31 data
		 * and Q1.15 coefficients are converted to 24 bits as
		 * Q1.23 values.
		 */
		AE_MULAAFP24S_HH_LL(b, d0, coefs);

		/* Pack d0_l and d1_h to d3. Then accumulate
		 * a += d3_h * coefs_h + d3_l * coefs_l. Pass d1 to d0 for
		 * next unrolled iteration.
		 */
		d3 = AE_SELP24_LH(d0, d1);
		AE_MULAAFP24S_HH_LL(a, d3, coefs);
		d0 = d1;

		/* Repeat the same for next two taps and increase coefp.
		 * The second coefficient pair is read with an offset from
		 * coefp, then coefp is advanced by two pairs (four taps).
		 */
		coefs = AE_LP16X2F_I(coefp, sizeof(ae_p16x2s));
		AE_LP24X2F_C(d1, dp, inc);
		AE_MULAAFP24S_HH_LL(b, d0, coefs);
		d3 = AE_SELP24_LH(d0, d1);
		AE_MULAAFP24S_HH_LL(a, d3, coefs);
		d0 = d1;
		coefp += 2;
	}

	/* Do scaling shifts and store samples. The left shift is applied
	 * first and the right shift after it. Accumulator b is stored to
	 * y1 and accumulator a to y0.
	 */
	b = AE_SRAAQ56(AE_SLLASQ56S(b, lshift), rshift);
	a = AE_SRAAQ56(AE_SLLASQ56S(a, lshift), rshift);
	AE_SQ32F_I(AE_ROUNDSQ32SYM(b), (ae_q32s *)y1, 0);
	AE_SQ32F_I(AE_ROUNDSQ32SYM(a), (ae_q32s *)y0, 0);
}
245
246 #endif
247