1 // SPDX-License-Identifier: BSD-3-Clause
2 //
3 // Copyright(c) 2017 Intel Corporation. All rights reserved.
4 //
5 // Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
6 
7 #include <sof/math/fir_config.h>
8 
9 #if FIR_HIFIEP
10 
11 #include <sof/audio/format.h>
12 #include <sof/math/fir_hifi2ep.h>
13 #include <user/fir.h>
14 #include <xtensa/config/defs.h>
15 #include <xtensa/tie/xt_hifi2.h>
16 #include <errno.h>
17 #include <stddef.h>
18 #include <stdint.h>
19 
20 /*
21  * EQ FIR algorithm code
22  */
23 
fir_reset(struct fir_state_32x16 * fir)24 void fir_reset(struct fir_state_32x16 *fir)
25 {
26 	fir->taps = 0;
27 	fir->length = 0;
28 	fir->out_shift = 0;
29 	fir->coef = NULL;
30 	/* There may need to know the beginning of dynamic allocation after
31 	 * reset so omitting setting also fir->delay to NULL.
32 	 */
33 }
34 
fir_delay_size(struct sof_fir_coef_data * config)35 int fir_delay_size(struct sof_fir_coef_data *config)
36 {
37 	/* Check FIR tap count for implementation specific constraints */
38 	if (config->length > SOF_FIR_MAX_LENGTH || config->length < 4)
39 		return -EINVAL;
40 
41 	if (config->length & 0x3)
42 		return -EINVAL;
43 
44 	/* The dual sample version needs one more delay entry. To preserve
45 	 * align for 64 bits need to add two.
46 	 */
47 	return (config->length + 2) * sizeof(int32_t);
48 }
49 
fir_init_coef(struct fir_state_32x16 * fir,struct sof_fir_coef_data * config)50 int fir_init_coef(struct fir_state_32x16 *fir,
51 		  struct sof_fir_coef_data *config)
52 {
53 	/* The length is taps plus two since the filter computes two
54 	 * samples per call. Length plus one would be minimum but the add
55 	 * must be even. The even length is needed for 64 bit loads from delay
56 	 * lines with 32 bit samples.
57 	 */
58 	fir->taps = (int)config->length;
59 	fir->length = fir->taps + 2;
60 	fir->out_shift = (int)config->out_shift;
61 	fir->coef = (ae_p16x2s *)&config->coef[0];
62 	return 0;
63 }
64 
fir_init_delay(struct fir_state_32x16 * fir,int32_t ** data)65 void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data)
66 {
67 	fir->delay = (ae_p24f *)*data;
68 	fir->delay_end = fir->delay + fir->length;
69 	fir->rwp = (ae_p24x2f *)(fir->delay + fir->length - 1);
70 	*data += fir->length; /* Point to next delay line start */
71 }
72 
fir_get_lrshifts(struct fir_state_32x16 * fir,int * lshift,int * rshift)73 void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
74 		      int *rshift)
75 {
76 	*lshift = (fir->out_shift < 0) ? -fir->out_shift : 0;
77 	*rshift = (fir->out_shift > 0) ? fir->out_shift : 0;
78 }
79 
/* HiFi EP has the following number of registers that should not be exceeded
 * 4x 56 bit registers in register file Q
 * 8x 48 bit registers in register file P
 */
84 
fir_32x16_hifiep(struct fir_state_32x16 * fir,int32_t x,int32_t * y,int lshift,int rshift)85 void fir_32x16_hifiep(struct fir_state_32x16 *fir, int32_t x, int32_t *y, int lshift, int rshift)
86 {
87 	/* This function uses
88 	 * 1x 56 bit registers Q,
89 	 * 4x 48 bit registers P
90 	 * 3x integers
91 	 * 2x address pointers,
92 	 */
93 	ae_q56s a;
94 	ae_p24x2f data2;
95 	ae_p24x2f coef2;
96 	ae_p24x2f d0;
97 	ae_p24x2f d1;
98 	int i;
99 	ae_p24x2f *dp = fir->rwp;
100 	ae_p16x2s *coefp = fir->coef;
101 	const int taps_div_4 = fir->taps >> 2;
102 	const int inc = sizeof(int32_t);
103 
104 	/* Bypass samples if taps count is zero. */
105 	if (!taps_div_4) {
106 		*y = x;
107 		return;
108 	}
109 
110 	/* Write sample to delay */
111 	a = AE_CVTQ48A32S(x);
112 	AE_SQ32F_C(a, (ae_q32s *)fir->rwp, -sizeof(int32_t));
113 
114 	/* Note: If the next function is converted to handle two samples
115 	 * per call the data load can be done with single instruction
116 	 * AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
117 	 */
118 	a = AE_ZEROQ56();
119 	for (i = 0; i < taps_div_4; i++) {
120 		/* Load two coefficients. Coef2_h contains tap coefp[n]
121 		 * and coef2_l contains coef[n+1].
122 		 */
123 		coef2 = AE_LP16X2F_I(coefp, 0);
124 
125 		/* Load two data samples and pack to d0 to data2_h and
126 		 * d1 to data2_l.
127 		 */
128 		AE_LP24F_C(d0, dp, inc);
129 		AE_LP24F_C(d1, dp, inc);
130 		data2 = AE_SELP24_LL(d0, d1);
131 
132 		/* Accumulate
133 		 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
134 		 * data and Q1.15 coefficients are used as 24 bits as
135 		 * Q1.23 values.
136 		 */
137 		AE_MULAAFP24S_HH_LL(a, data2, coef2);
138 
139 		/* Repeat the same for next two taps and increase coefp. */
140 		coef2 = AE_LP16X2F_I(coefp, sizeof(ae_p16x2s));
141 		AE_LP24F_C(d0, dp, inc);
142 		AE_LP24F_C(d1, dp, inc);
143 		data2 = AE_SELP24_LL(d0, d1);
144 		AE_MULAAFP24S_HH_LL(a, data2, coef2);
145 		coefp += 2;
146 	}
147 
148 	/* Do scaling shifts and store sample. */
149 	a = AE_SRAAQ56(AE_SLLASQ56S(a, lshift), rshift);
150 	AE_SQ32F_I(AE_ROUNDSQ32SYM(a), (ae_q32s *)y, 0);
151 }
152 
/* HiFi EP has the following number of registers that should not be exceeded
 * 4x 56 bit registers in register file Q
 * 8x 48 bit registers in register file P
 */
157 
fir_32x16_2x_hifiep(struct fir_state_32x16 * fir,int32_t x0,int32_t x1,int32_t * y0,int32_t * y1,int lshift,int rshift)158 void fir_32x16_2x_hifiep(struct fir_state_32x16 *fir, int32_t x0, int32_t x1,
159 			 int32_t *y0, int32_t *y1, int lshift, int rshift)
160 {
161 	/* This function uses
162 	 * 2x 56 bit registers Q,
163 	 * 4x 48 bit registers P
164 	 * 3x integers
165 	 * 2x address pointers,
166 	 */
167 	ae_q56s a;
168 	ae_q56s b;
169 	ae_p24x2f d0;
170 	ae_p24x2f d1;
171 	ae_p24x2f d3;
172 	ae_p24x2f coefs;
173 	int i;
174 	ae_p24x2f *dp;
175 	ae_p16x2s *coefp = fir->coef;
176 	const int taps_div_4 = fir->taps >> 2;
177 	const int inc = 2 * sizeof(int32_t);
178 
179 	/* Bypass samples if taps count is zero. */
180 	if (!taps_div_4) {
181 		*y0 = x0;
182 		*y1 = x1;
183 		return;
184 	}
185 
186 	/* Write samples to delay */
187 	a = AE_CVTQ48A32S(x0);
188 	AE_SQ32F_C(a, (ae_q32s *)fir->rwp, -sizeof(int32_t));
189 	a = AE_CVTQ48A32S(x1);
190 	dp = fir->rwp;
191 	AE_SQ32F_C(a, (ae_q32s *)fir->rwp, -sizeof(int32_t));
192 
193 	/* Note: If the next function is converted to handle two samples
194 	 * per call the data load can be done with single instruction
195 	 * AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
196 	 */
197 	a = AE_ZEROQ56();
198 	b = AE_ZEROQ56();
199 	/* Load two data samples and pack to d0 to data2_h and
200 	 * d1 to data2_l.
201 	 */
202 	AE_LP24X2F_C(d0, dp, inc);
203 	for (i = 0; i < taps_div_4; i++) {
204 		/* Load two coefficients. Coef2_h contains tap coefp[n]
205 		 * and coef2_l contains coef[n+1].
206 		 */
207 		coefs = AE_LP16X2F_I(coefp, 0);
208 
209 		/* Load two data samples. Upper part d1_h is x[n+1] and
210 		 * lower part d1_l is x[n].
211 		 */
212 		AE_LP24X2F_C(d1, dp, inc);
213 
214 		/* Accumulate
215 		 * b += d0_h * coefs_h + d0_l * coefs_l. The Q1.31 data
216 		 * and Q1.15 coefficients are converted to 24 bits as
217 		 * Q1.23 values.
218 		 */
219 		AE_MULAAFP24S_HH_LL(b, d0, coefs);
220 
221 		/* Pack d0_l and d1_h to d3. Then accumulate
222 		 * a += d3_h * coefs_h + d3_l * coefs_l. Pass d1 to d1 for
223 		 * next unrolled iteration.
224 		 */
225 		d3 = AE_SELP24_LH(d0, d1);
226 		AE_MULAAFP24S_HH_LL(a, d3, coefs);
227 		d0 = d1;
228 
229 		/* Repeat the same for next two taps and increase coefp. */
230 		coefs = AE_LP16X2F_I(coefp, sizeof(ae_p16x2s));
231 		AE_LP24X2F_C(d1, dp, inc);
232 		AE_MULAAFP24S_HH_LL(b, d0, coefs);
233 		d3 = AE_SELP24_LH(d0, d1);
234 		AE_MULAAFP24S_HH_LL(a, d3, coefs);
235 		d0 = d1;
236 		coefp += 2;
237 	}
238 
239 	/* Do scaling shifts and store sample. */
240 	b = AE_SRAAQ56(AE_SLLASQ56S(b, lshift), rshift);
241 	a = AE_SRAAQ56(AE_SLLASQ56S(a, lshift), rshift);
242 	AE_SQ32F_I(AE_ROUNDSQ32SYM(b), (ae_q32s *)y1, 0);
243 	AE_SQ32F_I(AE_ROUNDSQ32SYM(a), (ae_q32s *)y0, 0);
244 }
245 
246 #endif
247