1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright 2016 Tom aan de Wiel
4  * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *
6  * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7  *
8  * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9  * R.D. Brown, 1977
10  */
11 
12 #include <linux/string.h>
13 #include "vicodec-codec.h"
14 
15 #define ALL_ZEROS 15
16 #define DEADZONE_WIDTH 20
17 
/*
 * Zigzag scan order for an 8x8 block: entry i is the raster-order
 * offset of the i-th coefficient visited when walking the block along
 * its anti-diagonals (low-frequency coefficients first).
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
35 
36 
/*
 * Run-length encode one quantized 8x8 coefficient block into the
 * big-endian output stream. The first word is a header (PFRAME_BIT set
 * for P-blocks); each following word packs a 4-bit zero-run length in
 * the low nibble and the next coefficient in the upper 12 bits.
 * Returns the number of 16-bit words written.
 */
static int rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	/* block header: tells the decoder which dequantizer to use */
	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	/* a long enough trailing run is encoded as a single marker word */
	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	/* emit the ALL_ZEROS marker instead of the skipped trailing run */
	if (lastzero_run > 14) {
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
91 
/*
 * Decode one run-length encoded 8x8 block from *rlc_in into dwht_out
 * (raster order). Returns the block's header word, which carries
 * PFRAME_BIT and the duplicate count (DUPS_MASK) for the caller.
 *
 * This function will worst-case increase rlc_in by 65*2 bytes:
 * one s16 value for the header and 8 * 8 coefficients of type s16.
 */
static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
{
	/* header */
	const __be16 *input = *rlc_in;
	s16 ret = ntohs(*input++);
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	/*
	 * Now de-compress, it expands one byte to up to 15 bytes
	 * (or fills the remainder of the 64 bytes with zeroes if it
	 * is the last byte to expand).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in = ntohs(*input++);
		int length = in & 0xf;		/* zero-run length */
		int coeff = in >> 4;		/* coefficient value */

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	/* undo the zigzag scan: scatter back into raster order */
	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return ret;
}
144 
/*
 * Per-coefficient quantization shift amounts for intra blocks, in
 * raster order: low frequencies (top-left) keep more precision than
 * high frequencies (bottom-right).
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
155 
/*
 * Quantization shift amounts for inter (P) blocks: coarser than the
 * intra table since delta blocks carry twice the value range.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};
166 
/*
 * Quantize an intra block in place: shift each coefficient down by its
 * per-position amount from quant_table, zero it when it falls inside
 * the dead zone, and store the corresponding de-quantized value in
 * de_coeff for local reconstruction.
 */
static void quantize_intra(s16 *coeff, s16 *de_coeff)
{
	int idx;

	for (idx = 0; idx < 64; idx++) {
		int q = quant_table[idx];

		coeff[idx] >>= q;
		if (coeff[idx] >= -DEADZONE_WIDTH &&
		    coeff[idx] <= DEADZONE_WIDTH)
			coeff[idx] = de_coeff[idx] = 0;
		else
			de_coeff[idx] = coeff[idx] << q;
	}
}
183 
/* Undo intra quantization: shift every coefficient back up in place. */
static void dequantize_intra(s16 *coeff)
{
	int idx;

	for (idx = 0; idx < 64; idx++)
		coeff[idx] <<= quant_table[idx];
}
193 
/*
 * Quantize an inter (P) block in place using the coarser quant_table_p,
 * zeroing dead-zone coefficients and writing the de-quantized values to
 * de_coeff for local reconstruction.
 */
static void quantize_inter(s16 *coeff, s16 *de_coeff)
{
	int idx;

	for (idx = 0; idx < 64; idx++) {
		int q = quant_table_p[idx];

		coeff[idx] >>= q;
		if (coeff[idx] >= -DEADZONE_WIDTH &&
		    coeff[idx] <= DEADZONE_WIDTH)
			coeff[idx] = de_coeff[idx] = 0;
		else
			de_coeff[idx] = coeff[idx] << q;
	}
}
210 
/* Undo inter quantization: shift every coefficient back up in place. */
static void dequantize_inter(s16 *coeff)
{
	int idx;

	for (idx = 0; idx < 64; idx++)
		coeff[idx] <<= quant_table_p[idx];
}
220 
/*
 * Forward 8x8 Walsh-Hadamard transform of an unsigned 8-bit pixel
 * block into output_block. stride is the line width of the input
 * plane; input_step is the distance between horizontally adjacent
 * samples (>1 for interleaved chroma). When intra is set, 256 is
 * subtracted once per butterfly pair so the DC term is centered.
 */
static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
		 unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	stride *= input_step;

	/* horizontal pass: one row of butterflies per iteration */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		if (input_step == 1) {
			workspace1[0]  = tmp[0] + tmp[1] - add;
			workspace1[1]  = tmp[0] - tmp[1];

			workspace1[2]  = tmp[2] + tmp[3] - add;
			workspace1[3]  = tmp[2] - tmp[3];

			workspace1[4]  = tmp[4] + tmp[5] - add;
			workspace1[5]  = tmp[4] - tmp[5];

			workspace1[6]  = tmp[6] + tmp[7] - add;
			workspace1[7]  = tmp[6] - tmp[7];
		} else {
			/* input_step == 2: skip the interleaved samples */
			workspace1[0]  = tmp[0] + tmp[2] - add;
			workspace1[1]  = tmp[0] - tmp[2];

			workspace1[2]  = tmp[4] + tmp[6] - add;
			workspace1[3]  = tmp[4] - tmp[6];

			workspace1[4]  = tmp[8] + tmp[10] - add;
			workspace1[5]  = tmp[8] - tmp[10];

			workspace1[6]  = tmp[12] + tmp[14] - add;
			workspace1[7]  = tmp[12] - tmp[14];
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass: same butterflies over the columns, in place */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
320 
/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
 * Furthermore values can be negative... This is just a version that
 * works with 16 signed data
 *
 * NOTE(review): the 'intra' parameter is never read in this body —
 * confirm whether it is kept only for signature symmetry with fwht().
 */
static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* horizontal pass: one row of butterflies per iteration */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass: same butterflies over the columns, in place */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1*8];
		workspace1[1]  = out[0] - out[1*8];

		workspace1[2]  = out[2*8] + out[3*8];
		workspace1[3]  = out[2*8] - out[3*8];

		workspace1[4]  = out[4*8] + out[5*8];
		workspace1[5]  = out[4*8] - out[5*8];

		workspace1[6]  = out[6*8] + out[7*8];
		workspace1[7]  = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}
409 
/*
 * Inverse 8x8 Walsh-Hadamard transform. The transform is its own
 * inverse up to scaling: after both passes each sample is divided by
 * 64 (>> 6). For intra blocks the 128 level offset removed by the
 * forward transform is added back; inter blocks stay as deltas.
 */
static void ifwht(const s16 *block, s16 *output_block, int intra)
{
	/*
	 * we'll need more than 8 bits for the transformed coefficients
	 * use native unit of cpu
	 */
	int workspace1[8], workspace2[8];
	int inter = intra ? 0 : 1;
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* horizontal pass: one row of butterflies per iteration */
	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass: butterflies over the columns, then normalize */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		if (inter) {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			/* scale back by 64: the transform gains 8x per pass */
			for (d = 0; d < 8; d++)
				out[8 * d] >>= 6;
		} else {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			/* scale back and restore the 128 intra level offset */
			for (d = 0; d < 8; d++) {
				out[8 * d] >>= 6;
				out[8 * d] += 128;
			}
		}
	}
}
519 
/*
 * Copy an 8x8 block of 8-bit samples from a plane (line width 'stride',
 * horizontal sample distance 'input_step') into a dense s16 array.
 */
static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	unsigned int row, col;

	for (row = 0; row < 8; row++) {
		const u8 *src = input + row * stride * input_step;

		for (col = 0; col < 8; col++)
			dst[row * 8 + col] = src[col * input_step];
	}
}
531 
var_intra(const s16 * input)532 static int var_intra(const s16 *input)
533 {
534 	int32_t mean = 0;
535 	int32_t ret = 0;
536 	const s16 *tmp = input;
537 	int i;
538 
539 	for (i = 0; i < 8 * 8; i++, tmp++)
540 		mean += *tmp;
541 	mean /= 64;
542 	tmp = input;
543 	for (i = 0; i < 8 * 8; i++, tmp++)
544 		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
545 	return ret;
546 }
547 
var_inter(const s16 * old,const s16 * new)548 static int var_inter(const s16 *old, const s16 *new)
549 {
550 	int32_t ret = 0;
551 	int i;
552 
553 	for (i = 0; i < 8 * 8; i++, old++, new++)
554 		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
555 	return ret;
556 }
557 
/*
 * Choose between intra and inter coding for one 8x8 block.
 * Also fills deltablock with (current - reference) so the caller can
 * feed it straight to fwht16() when PBLOCK is chosen.
 * Returns IBLOCK or PBLOCK.
 */
static int decide_blocktype(const u8 *cur, const u8 *reference,
			    s16 *deltablock, unsigned int stride,
			    unsigned int input_step)
{
	s16 tmp[64];
	s16 old[64];
	unsigned int k;
	int vari;
	int vard;

	fill_encoder_block(cur, tmp, stride, input_step);
	/* reference blocks are stored densely: stride 8, step 1 */
	fill_encoder_block(reference, old, 8, 1);
	vari = var_intra(tmp);

	/*
	 * Indexed loop replaces the old pointer walk; the dead
	 * 'deltablock -= 64;' store on the local pointer is dropped.
	 */
	for (k = 0; k < 64; k++)
		deltablock[k] = tmp[k] - reference[k];
	vard = var_inter(old, tmp);
	return vari <= vard ? IBLOCK : PBLOCK;
}
585 
/*
 * Write a reconstructed 8x8 block of s16 samples back into an 8-bit
 * plane with line width 'stride'. Values are assumed to already fit
 * in 0..255 (clamped by add_deltas() or bounded by ifwht()).
 */
static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
{
	int row, col;

	for (row = 0; row < 8; row++, dst += stride)
		for (col = 0; col < 8; col++)
			dst[col] = *input++;
}
596 
/*
 * Add the reference samples to a decoded delta block in place,
 * clamping the result to the valid 0..255 sample range.
 */
static void add_deltas(s16 *deltas, const u8 *ref, int stride)
{
	int row, col;

	for (row = 0; row < 8; row++, ref += stride - 8) {
		for (col = 0; col < 8; col++, deltas++, ref++) {
			*deltas += *ref;
			/*
			 * Due to quantizing, it might possible that the
			 * decoded coefficients are slightly out of range
			 */
			if (*deltas < 0)
				*deltas = 0;
			else if (*deltas > 255)
				*deltas = 255;
		}
	}
}
617 
/*
 * Compress one plane (height x width, sample distance input_step) into
 * the RLC stream at *rlco, advancing *rlco past the data written.
 * refp is updated with the locally reconstructed plane so the next
 * frame can be inter coded against it. If the stream would exceed
 * rlco_max, the plane is stored raw instead and FRAME_UNENCODED is set
 * in the returned encoding flags.
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct cframe *cf, u32 height, u32 width,
			unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, width, input_step);
			if (is_intra || blocktype == IBLOCK) {
				fwht(input, cf->coeffs, width, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs);
				blocktype = IBLOCK;
			} else {
				/* inter code */
				encoding |= FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs);
			}
			/*
			 * Reconstruct the block into the reference plane,
			 * mirroring what the decoder will produce. Skipped
			 * when the next frame is intra and won't use it.
			 */
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8);
				fill_decoder_block(refp, cf->de_fwht, 8);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			/* overflow already detected: keep updating refp only */
			if (encoding & FRAME_UNENCODED)
				continue;

			size = rlc(cf->coeffs, *rlco, blocktype);
			/*
			 * If this block's RLC data is identical to the
			 * previous block's (same payload and same frame
			 * type bit), bump the duplicate counter in the
			 * previous header instead of emitting it again.
			 */
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max)
				encoding |= FRAME_UNENCODED;
			last_size = size;
		}
		/* advance to the next row of 8x8 blocks (7 more lines) */
		input += width * 7 * input_step;
	}
	if (encoding & FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (i = 0; i < height * width; i++, input += input_step)
			*out++ = (*input == 0xff) ? 0xfe : *input;
		*rlco = (__be16 *)out;
	}
	return encoding;
}
700 
/*
 * Compress a full frame (luma + both chroma planes) into cf->rlc_data
 * and set cf->size. Returns the per-plane encoding flags, remapping
 * each plane's FRAME_UNENCODED result to its plane-specific bit
 * (LUMA/CB/CR_UNENCODED).
 */
u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
		 struct cframe *cf, bool is_intra, bool next_is_intra)
{
	unsigned int size = frm->height * frm->width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	/* leave 256 words of headroom so a block never overruns the buffer */
	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				  frm->height, frm->width,
				  1, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= LUMA_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	/* chroma planes are half size in both dimensions (size / 8 words) */
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
				   frm->height / 2, frm->width / 2,
				   frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CB_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
				   frm->height / 2, frm->width / 2,
				   frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CR_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}
733 
/*
 * Decompress one plane from the RLC stream at *rlco into 'ref',
 * advancing *rlco past the consumed data. When 'uncompressed' is set
 * the plane was stored raw and is simply copied.
 */
static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
			 u32 height, u32 width, bool uncompressed)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	/*
	 * Header of the last decoded block; always assigned before use
	 * since 'copies' starts at 0 and derlc() runs on the first block.
	 */
	s16 stat;
	unsigned int i, j;

	if (uncompressed) {
		memcpy(ref, *rlco, width * height);
		/* advance by half the byte count: *rlco is a 16-bit pointer */
		*rlco += width * height / 2;
		return;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			u8 *refp = ref + j * 8 * width + i * 8;

			/* repeat the previously decoded block */
			if (copies) {
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if (stat & PFRAME_BIT)
					add_deltas(cf->de_fwht, refp, width);
				fill_decoder_block(refp, cf->de_fwht, width);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs);

			if (stat & PFRAME_BIT)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      (stat & PFRAME_BIT) ? 0 : 1);

			/* duplicate count encoded in the block header */
			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if (stat & PFRAME_BIT)
				add_deltas(cf->de_fwht, refp, width);
			fill_decoder_block(refp, cf->de_fwht, width);
		}
	}
}
786 
/*
 * Decompress a full frame from cf->rlc_data into the reference frame:
 * luma first, then the half-resolution cb and cr planes, each of which
 * may independently be stored uncompressed per the header flags.
 */
void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
{
	const __be16 *rlco = cf->rlc_data;
	u32 w = cf->width;
	u32 h = cf->height;

	decode_plane(cf, &rlco, ref->luma, h, w,
		     hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cb, h / 2, w / 2,
		     hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cr, h / 2, w / 2,
		     hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);
}
798