1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3 * Copyright 2016 Tom aan de Wiel
4 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5 *
6 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7 *
8 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9 * R.D. Brown, 1977
10 */
11
12 #include <linux/string.h>
13 #include "vicodec-codec.h"
14
15 #define ALL_ZEROS 15
16 #define DEADZONE_WIDTH 20
17
/*
 * Zigzag scan order for an 8x8 block: entry i is the row-major
 * position of the i-th coefficient when walking the anti-diagonals,
 * so low-frequency coefficients come first and trailing zeros cluster
 * at the end for the run-length encoder.
 */
static const uint8_t zigzag[64] = {
	0,
	1, 8,
	2, 9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
35
36
/*
 * Run-length encode one quantized 8x8 coefficient block.
 *
 * The coefficients are scanned in zigzag order.  Each emitted __be16
 * packs a zero-run length in the low 4 bits (at most 14) and the next
 * coefficient in the upper 12 bits.  A run value of 15 (ALL_ZEROS)
 * terminates the block, meaning all remaining coefficients are zero.
 * The first word is a block header: PFRAME_BIT for predicted blocks,
 * 0 for intra blocks.
 *
 * Returns the number of __be16 words written to output.
 */
static int rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	/* block header */
	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	/*
	 * Only chop off the trailing zeros when there are more than 14
	 * of them; a shorter run fits in a normal run/coefficient word.
	 */
	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				/*
				 * Data ends in a short zero run: the last
				 * zero goes out as the coefficient itself
				 * (tmp == 0), so it must not also be
				 * counted in the run length.
				 */
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	/* long trailing zero run: emit the ALL_ZEROS terminator */
	if (lastzero_run > 14) {
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
91
92 /*
93 * This function will worst-case increase rlc_in by 65*2 bytes:
94 * one s16 value for the header and 8 * 8 coefficients of type s16.
95 */
/*
 * Run-length decode one block from *rlc_in into dwht_out (8x8,
 * row-major), undoing the zigzag scan done by rlc().  *rlc_in is
 * advanced past the consumed words.
 *
 * Returns the block header word (carries PFRAME_BIT and the
 * DUPS_MASK repeat count).
 */
static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
{
	/* header */
	const __be16 *input = *rlc_in;
	s16 ret = ntohs(*input++);
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	/*
	 * Now de-compress, it expands one byte to up to 15 bytes
	 * (or fills the remainder of the 64 bytes with zeroes if it
	 * is the last byte to expand).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in = ntohs(*input++);
		int length = in & 0xf;	/* zero-run length */
		int coeff = in >> 4;	/* coefficient in the upper 12 bits */

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	/* undo the zigzag scan: scatter each value to its 8x8 slot */
	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return ret;
}
144
/*
 * Per-coefficient right-shift amounts for intra blocks (row-major,
 * matching the fwht() output order).  High-frequency coefficients
 * (towards the bottom right) are quantized more coarsely.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
155
/*
 * Per-coefficient right-shift amounts for predicted (P) blocks;
 * coarser than quant_table since the deltas carry less energy.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9, 10,
};
166
/*
 * Quantize an intra block in place: shift each coefficient down by its
 * quant_table entry and kill values inside the dead zone.  de_coeff
 * receives the dequantized (shifted-back) value the decoder will see.
 */
static void quantize_intra(s16 *coeff, s16 *de_coeff)
{
	const int *quant = quant_table;
	int idx;

	for (idx = 0; idx < 64; idx++, quant++, coeff++, de_coeff++) {
		s16 q = *coeff >> *quant;

		if (q >= -DEADZONE_WIDTH && q <= DEADZONE_WIDTH) {
			*coeff = 0;
			*de_coeff = 0;
		} else {
			*coeff = q;
			*de_coeff = q << *quant;
		}
	}
}
183
/* Undo intra quantization: shift every coefficient back up. */
static void dequantize_intra(s16 *coeff)
{
	int idx;

	for (idx = 0; idx < 64; idx++)
		coeff[idx] <<= quant_table[idx];
}
193
/*
 * Quantize a predicted (P) block in place using quant_table_p; same
 * dead-zone scheme as quantize_intra().  de_coeff receives the
 * dequantized value the decoder will reconstruct.
 */
static void quantize_inter(s16 *coeff, s16 *de_coeff)
{
	const int *quant = quant_table_p;
	int idx;

	for (idx = 0; idx < 64; idx++, quant++, coeff++, de_coeff++) {
		s16 q = *coeff >> *quant;

		if (q >= -DEADZONE_WIDTH && q <= DEADZONE_WIDTH) {
			*coeff = 0;
			*de_coeff = 0;
		} else {
			*coeff = q;
			*de_coeff = q << *quant;
		}
	}
}
210
/* Undo P-block quantization: shift every coefficient back up. */
static void dequantize_inter(s16 *coeff)
{
	int idx;

	for (idx = 0; idx < 64; idx++)
		coeff[idx] <<= quant_table_p[idx];
}
220
/*
 * Forward 8x8 Walsh-Hadamard transform of a block of 8-bit samples.
 *
 * @block: top-left sample of the 8x8 input block
 * @output_block: receives the 64 coefficients, row-major
 * @stride: distance between vertically adjacent samples, in samples
 * @input_step: distance between horizontally adjacent samples; only 1
 *	and 2 are supported (the two-sample step is hardcoded in the
 *	else branch below)
 * @intra: when true, subtract a bias of 256 per pair of samples
 *	(i.e. 128 each) so intra coefficients are centred around zero
 *
 * Each 1-D transform is three butterfly stages; the first loop
 * transforms the rows, the second the columns (in place).
 */
static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
		 unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	stride *= input_step;

	/* row pass */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		if (input_step == 1) {
			workspace1[0] = tmp[0] + tmp[1] - add;
			workspace1[1] = tmp[0] - tmp[1];

			workspace1[2] = tmp[2] + tmp[3] - add;
			workspace1[3] = tmp[2] - tmp[3];

			workspace1[4] = tmp[4] + tmp[5] - add;
			workspace1[5] = tmp[4] - tmp[5];

			workspace1[6] = tmp[6] + tmp[7] - add;
			workspace1[7] = tmp[6] - tmp[7];
		} else {
			/* input_step == 2: interleaved samples */
			workspace1[0] = tmp[0] + tmp[2] - add;
			workspace1[1] = tmp[0] - tmp[2];

			workspace1[2] = tmp[4] + tmp[6] - add;
			workspace1[3] = tmp[4] - tmp[6];

			workspace1[4] = tmp[8] + tmp[10] - add;
			workspace1[5] = tmp[8] - tmp[10];

			workspace1[6] = tmp[12] + tmp[14] - add;
			workspace1[7] = tmp[12] - tmp[14];
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* column pass, in place on the row-pass output */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
320
321 /*
322 * Not the nicest way of doing it, but P-blocks get twice the range of
323 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
324 * Furthermore values can be negative... This is just a version that
325 * works with 16 signed data
326 */
/*
 * Forward 8x8 Walsh-Hadamard transform of s16 input, used for the
 * predicted-block delta values.
 *
 * @block: top-left value of the 8x8 input block
 * @output_block: receives the 64 coefficients, row-major
 * @stride: distance between vertically adjacent input values
 * @intra: unused (the body never reads it)
 */
static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* row pass */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0] = tmp[0] + tmp[1];
		workspace1[1] = tmp[0] - tmp[1];

		workspace1[2] = tmp[2] + tmp[3];
		workspace1[3] = tmp[2] - tmp[3];

		workspace1[4] = tmp[4] + tmp[5];
		workspace1[5] = tmp[4] - tmp[5];

		workspace1[6] = tmp[6] + tmp[7];
		workspace1[7] = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* column pass, in place on the row-pass output */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1*8];
		workspace1[1] = out[0] - out[1*8];

		workspace1[2] = out[2*8] + out[3*8];
		workspace1[3] = out[2*8] - out[3*8];

		workspace1[4] = out[4*8] + out[5*8];
		workspace1[5] = out[4*8] - out[5*8];

		workspace1[6] = out[6*8] + out[7*8];
		workspace1[7] = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}
409
/*
 * Inverse 8x8 Walsh-Hadamard transform.
 *
 * @block: 64 dequantized coefficients, row-major
 * @output_block: receives the reconstructed samples (intra) or
 *	deltas (inter)
 * @intra: non-zero for intra blocks; the result is then shifted back
 *	into the 0..255 sample range by adding 128
 *
 * The rows are transformed first, then the columns; the column pass
 * also scales the result down by 64 (two 8-point passes).  The stage-3
 * stores used to be duplicated in the inter and intra branches even
 * though only the final scaling differs; they are now shared.
 */
static void ifwht(const s16 *block, s16 *output_block, int intra)
{
	/*
	 * we'll need more than 8 bits for the transformed coefficients
	 * use native unit of cpu
	 */
	int workspace1[8], workspace2[8];
	int inter = intra ? 0 : 1;
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* row pass */
	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
		/* stage 1 */
		workspace1[0] = tmp[0] + tmp[1];
		workspace1[1] = tmp[0] - tmp[1];

		workspace1[2] = tmp[2] + tmp[3];
		workspace1[3] = tmp[2] - tmp[3];

		workspace1[4] = tmp[4] + tmp[5];
		workspace1[5] = tmp[4] - tmp[5];

		workspace1[6] = tmp[6] + tmp[7];
		workspace1[7] = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* column pass, in place, with final scaling */
	for (i = 0; i < 8; i++, out++) {
		int d;

		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];

		for (d = 0; d < 8; d++) {
			/* scale down by 64 (two 8-point passes) */
			out[8 * d] >>= 6;
			/* intra blocks: shift back into 0..255 range */
			if (!inter)
				out[8 * d] += 128;
		}
	}
}
519
/*
 * Gather one 8x8 block of 8-bit samples from a plane into a packed
 * s16 array.  input_step is the distance between horizontally
 * adjacent samples, stride the distance between rows (in samples).
 */
static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	unsigned int row, col;

	for (row = 0; row < 8; row++)
		for (col = 0; col < 8; col++)
			dst[row * 8 + col] =
				input[(row * stride + col) * input_step];
}
531
/*
 * Intra cost estimate: sum of absolute deviations of the block's
 * samples from the block mean.
 */
static int var_intra(const s16 *input)
{
	int32_t mean = 0;
	int32_t sum = 0;
	int i;

	for (i = 0; i < 8 * 8; i++)
		mean += input[i];
	mean /= 64;

	for (i = 0; i < 8 * 8; i++) {
		int32_t diff = input[i] - mean;

		sum += diff < 0 ? -diff : diff;
	}
	return sum;
}
547
/*
 * Inter cost estimate: sum of absolute differences between the block
 * and its reference.
 */
static int var_inter(const s16 *old, const s16 *new)
{
	int32_t sum = 0;
	int i;

	for (i = 0; i < 8 * 8; i++) {
		int32_t diff = old[i] - new[i];

		sum += diff < 0 ? -diff : diff;
	}
	return sum;
}
557
/*
 * Decide whether the current 8x8 block should be intra coded (IBLOCK)
 * or predicted from the reference (PBLOCK).
 *
 * deltablock is always filled with cur - reference; the caller only
 * uses it when PBLOCK is returned.  The decision compares the sum of
 * absolute deviations of the block itself (intra cost, var_intra)
 * against the sum of absolute differences with the reference (inter
 * cost, var_inter).
 *
 * The old 'deltablock -= 64;' rewind was a dead store to a by-value
 * local pointer and has been dropped.
 */
static int decide_blocktype(const u8 *cur, const u8 *reference,
			    s16 *deltablock, unsigned int stride,
			    unsigned int input_step)
{
	s16 tmp[64];
	s16 old[64];
	s16 *work = tmp;
	unsigned int k, l;
	int vari;
	int vard;

	fill_encoder_block(cur, tmp, stride, input_step);
	fill_encoder_block(reference, old, 8, 1);
	vari = var_intra(tmp);

	for (k = 0; k < 8; k++) {
		for (l = 0; l < 8; l++) {
			*deltablock = *work - *reference;
			deltablock++;
			work++;
			reference++;
		}
	}
	vard = var_inter(old, tmp);
	return vari <= vard ? IBLOCK : PBLOCK;
}
585
/*
 * Scatter a packed 8x8 s16 block into the destination plane;
 * stride is the plane's row pitch in bytes/samples.
 */
static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
{
	int row, col;

	for (row = 0; row < 8; row++, dst += stride)
		for (col = 0; col < 8; col++)
			dst[col] = *input++;
}
596
/*
 * Add the reference samples to the decoded deltas in place and clamp
 * to the valid 0..255 sample range.  Due to quantizing, the decoded
 * coefficients may be slightly out of range.
 */
static void add_deltas(s16 *deltas, const u8 *ref, int stride)
{
	int row, col;

	for (row = 0; row < 8; row++, ref += stride - 8) {
		for (col = 0; col < 8; col++, deltas++) {
			s16 v = *deltas + *ref++;

			if (v < 0)
				v = 0;
			else if (v > 255)
				v = 255;
			*deltas = v;
		}
	}
}
617
/*
 * Encode one plane (luma or one chroma component) into the RLE stream.
 *
 * For every 8x8 macroblock: decide intra vs. predicted coding (unless
 * the whole frame is intra), transform and quantize, reconstruct the
 * reference plane the decoder will see (skipped when the next frame is
 * intra anyway), and run-length encode the coefficients.  A block that
 * is identical to the previous encoded block is folded into a repeat
 * count kept in the DUPS_MASK bits of the previous block header.
 *
 * @rlco: in/out pointer into the compressed stream
 * @rlco_max: limit; once *rlco passes it the plane is stored raw
 *
 * Returns flags: FRAME_PCODED if at least one block was predicted,
 * FRAME_UNENCODED if the plane was stored uncompressed.
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct cframe *cf, u32 height, u32 width,
			unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, width, input_step);
			if (is_intra || blocktype == IBLOCK) {
				fwht(input, cf->coeffs, width, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs);
				blocktype = IBLOCK;
			} else {
				/* inter code */
				encoding |= FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs);
			}
			if (!next_is_intra) {
				/* reconstruct what the decoder will see */
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8);
				fill_decoder_block(refp, cf->de_fwht, 8);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			/* already overflowed: keep reconstructing only */
			if (encoding & FRAME_UNENCODED)
				continue;

			size = rlc(cf->coeffs, *rlco, blocktype);
			/*
			 * Same size and same payload as the previous
			 * block: try to bump the repeat count in the
			 * previous header instead of storing it again.
			 */
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				/*
				 * Only fold when the block types match and
				 * the DUPS_MASK counter is not saturated.
				 */
				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max)
				encoding |= FRAME_UNENCODED;
			last_size = size;
		}
		input += width * 7 * input_step;
	}
	if (encoding & FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (i = 0; i < height * width; i++, input += input_step)
			*out++ = (*input == 0xff) ? 0xfe : *input;
		*rlco = (__be16 *)out;
	}
	return encoding;
}
700
/*
 * Compress one raw YUV frame into cf->rlc_data.
 *
 * The three planes are encoded back to back.  When a plane overflows
 * its share of the buffer it is stored uncompressed and the matching
 * LUMA/CB/CR_UNENCODED flag is set in the returned flags; the
 * per-plane FRAME_UNENCODED bit is cleared after each translation.
 * cf->size is set to the total compressed size in bytes.
 */
u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
		 struct cframe *cf, bool is_intra, bool next_is_intra)
{
	unsigned int size = frm->height * frm->width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	/* size / 2: the limit is in __be16 units; 256 words of headroom */
	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				frm->height, frm->width,
				1, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= LUMA_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	/* chroma planes are subsampled in both directions */
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
				 frm->height / 2, frm->width / 2,
				 frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CB_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
				 frm->height / 2, frm->width / 2,
				 frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CR_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}
733
/*
 * Decode one plane of the frame from the RLE stream at *rlco into ref.
 *
 * Repeated blocks are expanded from the DUPS_MASK repeat count in the
 * block header; blocks with PFRAME_BIT set are added as deltas to the
 * reference samples already present in ref.
 */
static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
			 u32 height, u32 width, bool uncompressed)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	s16 stat;	/* header of the most recently decoded block */
	unsigned int i, j;

	if (uncompressed) {
		memcpy(ref, *rlco, width * height);
		/* advance in __be16 units, hence the division by 2 */
		*rlco += width * height / 2;
		return;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			u8 *refp = ref + j * 8 * width + i * 8;

			if (copies) {
				/* repeat of the previously decoded block */
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if (stat & PFRAME_BIT)
					add_deltas(cf->de_fwht, refp, width);
				fill_decoder_block(refp, cf->de_fwht, width);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs);

			if (stat & PFRAME_BIT)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      (stat & PFRAME_BIT) ? 0 : 1);

			/* keep a copy if the block is marked as repeated */
			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if (stat & PFRAME_BIT)
				add_deltas(cf->de_fwht, refp, width);
			fill_decoder_block(refp, cf->de_fwht, width);
		}
	}
}
786
/*
 * Decompress one frame from cf->rlc_data into the raw frame ref.
 * Per-plane "uncompressed" flags in hdr_flags select raw copy instead
 * of RLE decoding for that plane.
 */
void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
{
	const __be16 *rlco = cf->rlc_data;
	u32 w = cf->width;
	u32 h = cf->height;

	decode_plane(cf, &rlco, ref->luma, h, w,
		     hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cb, h / 2, w / 2,
		     hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cr, h / 2, w / 2,
		     hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);
}
798