/* codec-fwht.c */
  1. // SPDX-License-Identifier: LGPL-2.1+
  2. /*
  3. * Copyright 2016 Tom aan de Wiel
  4. * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
  5. *
  6. * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
  7. *
  8. * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
  9. * R.D. Brown, 1977
  10. */
  11. #include <linux/string.h>
  12. #include <linux/kernel.h>
  13. #include <linux/videodev2.h>
  14. #include "codec-fwht.h"
  15. #define OVERFLOW_BIT BIT(14)
  16. /*
  17. * Note: bit 0 of the header must always be 0. Otherwise it cannot
  18. * be guaranteed that the magic 8 byte sequence (see below) can
  19. * never occur in the rlc output.
  20. */
  21. #define PFRAME_BIT BIT(15)
  22. #define DUPS_MASK 0x1ffe
  23. #define PBLOCK 0
  24. #define IBLOCK 1
  25. #define ALL_ZEROS 15
  26. static const uint8_t zigzag[64] = {
  27. 0,
  28. 1, 8,
  29. 2, 9, 16,
  30. 3, 10, 17, 24,
  31. 4, 11, 18, 25, 32,
  32. 5, 12, 19, 26, 33, 40,
  33. 6, 13, 20, 27, 34, 41, 48,
  34. 7, 14, 21, 28, 35, 42, 49, 56,
  35. 15, 22, 29, 36, 43, 50, 57,
  36. 23, 30, 37, 44, 51, 58,
  37. 31, 38, 45, 52, 59,
  38. 39, 46, 53, 60,
  39. 47, 54, 61,
  40. 55, 62,
  41. 63,
  42. };
  43. /*
  44. * noinline_for_stack to work around
  45. * https://bugs.llvm.org/show_bug.cgi?id=38809
  46. */
  47. static int noinline_for_stack
  48. rlc(const s16 *in, __be16 *output, int blocktype)
  49. {
  50. s16 block[8 * 8];
  51. s16 *wp = block;
  52. int i = 0;
  53. int x, y;
  54. int ret = 0;
  55. /* read in block from framebuffer */
  56. int lastzero_run = 0;
  57. int to_encode;
  58. for (y = 0; y < 8; y++) {
  59. for (x = 0; x < 8; x++) {
  60. *wp = in[x + y * 8];
  61. wp++;
  62. }
  63. }
  64. /* keep track of amount of trailing zeros */
  65. for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
  66. lastzero_run++;
  67. *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
  68. ret++;
  69. to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
  70. i = 0;
  71. while (i < to_encode) {
  72. int cnt = 0;
  73. int tmp;
  74. /* count leading zeros */
  75. while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
  76. cnt++;
  77. i++;
  78. if (i == to_encode) {
  79. cnt--;
  80. break;
  81. }
  82. }
  83. /* 4 bits for run, 12 for coefficient (quantization by 4) */
  84. *output++ = htons((cnt | tmp << 4));
  85. i++;
  86. ret++;
  87. }
  88. if (lastzero_run > 14) {
  89. *output = htons(ALL_ZEROS | 0);
  90. ret++;
  91. }
  92. return ret;
  93. }
  94. /*
  95. * This function will worst-case increase rlc_in by 65*2 bytes:
  96. * one s16 value for the header and 8 * 8 coefficients of type s16.
  97. */
  98. static noinline_for_stack u16
  99. derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
  100. {
  101. /* header */
  102. const __be16 *input = *rlc_in;
  103. u16 stat;
  104. int dec_count = 0;
  105. s16 block[8 * 8 + 16];
  106. s16 *wp = block;
  107. int i;
  108. if (input > end_of_input)
  109. return OVERFLOW_BIT;
  110. stat = ntohs(*input++);
  111. /*
  112. * Now de-compress, it expands one byte to up to 15 bytes
  113. * (or fills the remainder of the 64 bytes with zeroes if it
  114. * is the last byte to expand).
  115. *
  116. * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
  117. * allow for overflow if the incoming data was malformed.
  118. */
  119. while (dec_count < 8 * 8) {
  120. s16 in;
  121. int length;
  122. int coeff;
  123. if (input > end_of_input)
  124. return OVERFLOW_BIT;
  125. in = ntohs(*input++);
  126. length = in & 0xf;
  127. coeff = in >> 4;
  128. /* fill remainder with zeros */
  129. if (length == 15) {
  130. for (i = 0; i < 64 - dec_count; i++)
  131. *wp++ = 0;
  132. break;
  133. }
  134. for (i = 0; i < length; i++)
  135. *wp++ = 0;
  136. *wp++ = coeff;
  137. dec_count += length + 1;
  138. }
  139. wp = block;
  140. for (i = 0; i < 64; i++) {
  141. int pos = zigzag[i];
  142. int y = pos / 8;
  143. int x = pos % 8;
  144. dwht_out[x + y * 8] = *wp++;
  145. }
  146. *rlc_in = input;
  147. return stat;
  148. }
  149. static const int quant_table[] = {
  150. 2, 2, 2, 2, 2, 2, 2, 2,
  151. 2, 2, 2, 2, 2, 2, 2, 2,
  152. 2, 2, 2, 2, 2, 2, 2, 3,
  153. 2, 2, 2, 2, 2, 2, 3, 6,
  154. 2, 2, 2, 2, 2, 3, 6, 6,
  155. 2, 2, 2, 2, 3, 6, 6, 6,
  156. 2, 2, 2, 3, 6, 6, 6, 6,
  157. 2, 2, 3, 6, 6, 6, 6, 8,
  158. };
  159. static const int quant_table_p[] = {
  160. 3, 3, 3, 3, 3, 3, 3, 3,
  161. 3, 3, 3, 3, 3, 3, 3, 3,
  162. 3, 3, 3, 3, 3, 3, 3, 3,
  163. 3, 3, 3, 3, 3, 3, 3, 6,
  164. 3, 3, 3, 3, 3, 3, 6, 6,
  165. 3, 3, 3, 3, 3, 6, 6, 9,
  166. 3, 3, 3, 3, 6, 6, 9, 9,
  167. 3, 3, 3, 6, 6, 9, 9, 10,
  168. };
  169. static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
  170. {
  171. const int *quant = quant_table;
  172. int i, j;
  173. for (j = 0; j < 8; j++) {
  174. for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
  175. *coeff >>= *quant;
  176. if (*coeff >= -qp && *coeff <= qp)
  177. *coeff = *de_coeff = 0;
  178. else
  179. *de_coeff = *coeff << *quant;
  180. }
  181. }
  182. }
  183. static void dequantize_intra(s16 *coeff)
  184. {
  185. const int *quant = quant_table;
  186. int i, j;
  187. for (j = 0; j < 8; j++)
  188. for (i = 0; i < 8; i++, quant++, coeff++)
  189. *coeff <<= *quant;
  190. }
  191. static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
  192. {
  193. const int *quant = quant_table_p;
  194. int i, j;
  195. for (j = 0; j < 8; j++) {
  196. for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
  197. *coeff >>= *quant;
  198. if (*coeff >= -qp && *coeff <= qp)
  199. *coeff = *de_coeff = 0;
  200. else
  201. *de_coeff = *coeff << *quant;
  202. }
  203. }
  204. }
  205. static void dequantize_inter(s16 *coeff)
  206. {
  207. const int *quant = quant_table_p;
  208. int i, j;
  209. for (j = 0; j < 8; j++)
  210. for (i = 0; i < 8; i++, quant++, coeff++)
  211. *coeff <<= *quant;
  212. }
  213. static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
  214. unsigned int stride,
  215. unsigned int input_step, bool intra)
  216. {
  217. /* we'll need more than 8 bits for the transformed coefficients */
  218. s32 workspace1[8], workspace2[8];
  219. const u8 *tmp = block;
  220. s16 *out = output_block;
  221. int add = intra ? 256 : 0;
  222. unsigned int i;
  223. /* stage 1 */
  224. for (i = 0; i < 8; i++, tmp += stride, out += 8) {
  225. switch (input_step) {
  226. case 1:
  227. workspace1[0] = tmp[0] + tmp[1] - add;
  228. workspace1[1] = tmp[0] - tmp[1];
  229. workspace1[2] = tmp[2] + tmp[3] - add;
  230. workspace1[3] = tmp[2] - tmp[3];
  231. workspace1[4] = tmp[4] + tmp[5] - add;
  232. workspace1[5] = tmp[4] - tmp[5];
  233. workspace1[6] = tmp[6] + tmp[7] - add;
  234. workspace1[7] = tmp[6] - tmp[7];
  235. break;
  236. case 2:
  237. workspace1[0] = tmp[0] + tmp[2] - add;
  238. workspace1[1] = tmp[0] - tmp[2];
  239. workspace1[2] = tmp[4] + tmp[6] - add;
  240. workspace1[3] = tmp[4] - tmp[6];
  241. workspace1[4] = tmp[8] + tmp[10] - add;
  242. workspace1[5] = tmp[8] - tmp[10];
  243. workspace1[6] = tmp[12] + tmp[14] - add;
  244. workspace1[7] = tmp[12] - tmp[14];
  245. break;
  246. case 3:
  247. workspace1[0] = tmp[0] + tmp[3] - add;
  248. workspace1[1] = tmp[0] - tmp[3];
  249. workspace1[2] = tmp[6] + tmp[9] - add;
  250. workspace1[3] = tmp[6] - tmp[9];
  251. workspace1[4] = tmp[12] + tmp[15] - add;
  252. workspace1[5] = tmp[12] - tmp[15];
  253. workspace1[6] = tmp[18] + tmp[21] - add;
  254. workspace1[7] = tmp[18] - tmp[21];
  255. break;
  256. default:
  257. workspace1[0] = tmp[0] + tmp[4] - add;
  258. workspace1[1] = tmp[0] - tmp[4];
  259. workspace1[2] = tmp[8] + tmp[12] - add;
  260. workspace1[3] = tmp[8] - tmp[12];
  261. workspace1[4] = tmp[16] + tmp[20] - add;
  262. workspace1[5] = tmp[16] - tmp[20];
  263. workspace1[6] = tmp[24] + tmp[28] - add;
  264. workspace1[7] = tmp[24] - tmp[28];
  265. break;
  266. }
  267. /* stage 2 */
  268. workspace2[0] = workspace1[0] + workspace1[2];
  269. workspace2[1] = workspace1[0] - workspace1[2];
  270. workspace2[2] = workspace1[1] - workspace1[3];
  271. workspace2[3] = workspace1[1] + workspace1[3];
  272. workspace2[4] = workspace1[4] + workspace1[6];
  273. workspace2[5] = workspace1[4] - workspace1[6];
  274. workspace2[6] = workspace1[5] - workspace1[7];
  275. workspace2[7] = workspace1[5] + workspace1[7];
  276. /* stage 3 */
  277. out[0] = workspace2[0] + workspace2[4];
  278. out[1] = workspace2[0] - workspace2[4];
  279. out[2] = workspace2[1] - workspace2[5];
  280. out[3] = workspace2[1] + workspace2[5];
  281. out[4] = workspace2[2] + workspace2[6];
  282. out[5] = workspace2[2] - workspace2[6];
  283. out[6] = workspace2[3] - workspace2[7];
  284. out[7] = workspace2[3] + workspace2[7];
  285. }
  286. out = output_block;
  287. for (i = 0; i < 8; i++, out++) {
  288. /* stage 1 */
  289. workspace1[0] = out[0] + out[1 * 8];
  290. workspace1[1] = out[0] - out[1 * 8];
  291. workspace1[2] = out[2 * 8] + out[3 * 8];
  292. workspace1[3] = out[2 * 8] - out[3 * 8];
  293. workspace1[4] = out[4 * 8] + out[5 * 8];
  294. workspace1[5] = out[4 * 8] - out[5 * 8];
  295. workspace1[6] = out[6 * 8] + out[7 * 8];
  296. workspace1[7] = out[6 * 8] - out[7 * 8];
  297. /* stage 2 */
  298. workspace2[0] = workspace1[0] + workspace1[2];
  299. workspace2[1] = workspace1[0] - workspace1[2];
  300. workspace2[2] = workspace1[1] - workspace1[3];
  301. workspace2[3] = workspace1[1] + workspace1[3];
  302. workspace2[4] = workspace1[4] + workspace1[6];
  303. workspace2[5] = workspace1[4] - workspace1[6];
  304. workspace2[6] = workspace1[5] - workspace1[7];
  305. workspace2[7] = workspace1[5] + workspace1[7];
  306. /* stage 3 */
  307. out[0 * 8] = workspace2[0] + workspace2[4];
  308. out[1 * 8] = workspace2[0] - workspace2[4];
  309. out[2 * 8] = workspace2[1] - workspace2[5];
  310. out[3 * 8] = workspace2[1] + workspace2[5];
  311. out[4 * 8] = workspace2[2] + workspace2[6];
  312. out[5 * 8] = workspace2[2] - workspace2[6];
  313. out[6 * 8] = workspace2[3] - workspace2[7];
  314. out[7 * 8] = workspace2[3] + workspace2[7];
  315. }
  316. }
  317. /*
  318. * Not the nicest way of doing it, but P-blocks get twice the range of
  319. * that of the I-blocks. Therefore we need a type bigger than 8 bits.
  320. * Furthermore values can be negative... This is just a version that
  321. * works with 16 signed data
  322. */
  323. static void noinline_for_stack
  324. fwht16(const s16 *block, s16 *output_block, int stride, int intra)
  325. {
  326. /* we'll need more than 8 bits for the transformed coefficients */
  327. s32 workspace1[8], workspace2[8];
  328. const s16 *tmp = block;
  329. s16 *out = output_block;
  330. int i;
  331. for (i = 0; i < 8; i++, tmp += stride, out += 8) {
  332. /* stage 1 */
  333. workspace1[0] = tmp[0] + tmp[1];
  334. workspace1[1] = tmp[0] - tmp[1];
  335. workspace1[2] = tmp[2] + tmp[3];
  336. workspace1[3] = tmp[2] - tmp[3];
  337. workspace1[4] = tmp[4] + tmp[5];
  338. workspace1[5] = tmp[4] - tmp[5];
  339. workspace1[6] = tmp[6] + tmp[7];
  340. workspace1[7] = tmp[6] - tmp[7];
  341. /* stage 2 */
  342. workspace2[0] = workspace1[0] + workspace1[2];
  343. workspace2[1] = workspace1[0] - workspace1[2];
  344. workspace2[2] = workspace1[1] - workspace1[3];
  345. workspace2[3] = workspace1[1] + workspace1[3];
  346. workspace2[4] = workspace1[4] + workspace1[6];
  347. workspace2[5] = workspace1[4] - workspace1[6];
  348. workspace2[6] = workspace1[5] - workspace1[7];
  349. workspace2[7] = workspace1[5] + workspace1[7];
  350. /* stage 3 */
  351. out[0] = workspace2[0] + workspace2[4];
  352. out[1] = workspace2[0] - workspace2[4];
  353. out[2] = workspace2[1] - workspace2[5];
  354. out[3] = workspace2[1] + workspace2[5];
  355. out[4] = workspace2[2] + workspace2[6];
  356. out[5] = workspace2[2] - workspace2[6];
  357. out[6] = workspace2[3] - workspace2[7];
  358. out[7] = workspace2[3] + workspace2[7];
  359. }
  360. out = output_block;
  361. for (i = 0; i < 8; i++, out++) {
  362. /* stage 1 */
  363. workspace1[0] = out[0] + out[1*8];
  364. workspace1[1] = out[0] - out[1*8];
  365. workspace1[2] = out[2*8] + out[3*8];
  366. workspace1[3] = out[2*8] - out[3*8];
  367. workspace1[4] = out[4*8] + out[5*8];
  368. workspace1[5] = out[4*8] - out[5*8];
  369. workspace1[6] = out[6*8] + out[7*8];
  370. workspace1[7] = out[6*8] - out[7*8];
  371. /* stage 2 */
  372. workspace2[0] = workspace1[0] + workspace1[2];
  373. workspace2[1] = workspace1[0] - workspace1[2];
  374. workspace2[2] = workspace1[1] - workspace1[3];
  375. workspace2[3] = workspace1[1] + workspace1[3];
  376. workspace2[4] = workspace1[4] + workspace1[6];
  377. workspace2[5] = workspace1[4] - workspace1[6];
  378. workspace2[6] = workspace1[5] - workspace1[7];
  379. workspace2[7] = workspace1[5] + workspace1[7];
  380. /* stage 3 */
  381. out[0*8] = workspace2[0] + workspace2[4];
  382. out[1*8] = workspace2[0] - workspace2[4];
  383. out[2*8] = workspace2[1] - workspace2[5];
  384. out[3*8] = workspace2[1] + workspace2[5];
  385. out[4*8] = workspace2[2] + workspace2[6];
  386. out[5*8] = workspace2[2] - workspace2[6];
  387. out[6*8] = workspace2[3] - workspace2[7];
  388. out[7*8] = workspace2[3] + workspace2[7];
  389. }
  390. }
  391. static noinline_for_stack void
  392. ifwht(const s16 *block, s16 *output_block, int intra)
  393. {
  394. /*
  395. * we'll need more than 8 bits for the transformed coefficients
  396. * use native unit of cpu
  397. */
  398. int workspace1[8], workspace2[8];
  399. int inter = intra ? 0 : 1;
  400. const s16 *tmp = block;
  401. s16 *out = output_block;
  402. int i;
  403. for (i = 0; i < 8; i++, tmp += 8, out += 8) {
  404. /* stage 1 */
  405. workspace1[0] = tmp[0] + tmp[1];
  406. workspace1[1] = tmp[0] - tmp[1];
  407. workspace1[2] = tmp[2] + tmp[3];
  408. workspace1[3] = tmp[2] - tmp[3];
  409. workspace1[4] = tmp[4] + tmp[5];
  410. workspace1[5] = tmp[4] - tmp[5];
  411. workspace1[6] = tmp[6] + tmp[7];
  412. workspace1[7] = tmp[6] - tmp[7];
  413. /* stage 2 */
  414. workspace2[0] = workspace1[0] + workspace1[2];
  415. workspace2[1] = workspace1[0] - workspace1[2];
  416. workspace2[2] = workspace1[1] - workspace1[3];
  417. workspace2[3] = workspace1[1] + workspace1[3];
  418. workspace2[4] = workspace1[4] + workspace1[6];
  419. workspace2[5] = workspace1[4] - workspace1[6];
  420. workspace2[6] = workspace1[5] - workspace1[7];
  421. workspace2[7] = workspace1[5] + workspace1[7];
  422. /* stage 3 */
  423. out[0] = workspace2[0] + workspace2[4];
  424. out[1] = workspace2[0] - workspace2[4];
  425. out[2] = workspace2[1] - workspace2[5];
  426. out[3] = workspace2[1] + workspace2[5];
  427. out[4] = workspace2[2] + workspace2[6];
  428. out[5] = workspace2[2] - workspace2[6];
  429. out[6] = workspace2[3] - workspace2[7];
  430. out[7] = workspace2[3] + workspace2[7];
  431. }
  432. out = output_block;
  433. for (i = 0; i < 8; i++, out++) {
  434. /* stage 1 */
  435. workspace1[0] = out[0] + out[1 * 8];
  436. workspace1[1] = out[0] - out[1 * 8];
  437. workspace1[2] = out[2 * 8] + out[3 * 8];
  438. workspace1[3] = out[2 * 8] - out[3 * 8];
  439. workspace1[4] = out[4 * 8] + out[5 * 8];
  440. workspace1[5] = out[4 * 8] - out[5 * 8];
  441. workspace1[6] = out[6 * 8] + out[7 * 8];
  442. workspace1[7] = out[6 * 8] - out[7 * 8];
  443. /* stage 2 */
  444. workspace2[0] = workspace1[0] + workspace1[2];
  445. workspace2[1] = workspace1[0] - workspace1[2];
  446. workspace2[2] = workspace1[1] - workspace1[3];
  447. workspace2[3] = workspace1[1] + workspace1[3];
  448. workspace2[4] = workspace1[4] + workspace1[6];
  449. workspace2[5] = workspace1[4] - workspace1[6];
  450. workspace2[6] = workspace1[5] - workspace1[7];
  451. workspace2[7] = workspace1[5] + workspace1[7];
  452. /* stage 3 */
  453. if (inter) {
  454. int d;
  455. out[0 * 8] = workspace2[0] + workspace2[4];
  456. out[1 * 8] = workspace2[0] - workspace2[4];
  457. out[2 * 8] = workspace2[1] - workspace2[5];
  458. out[3 * 8] = workspace2[1] + workspace2[5];
  459. out[4 * 8] = workspace2[2] + workspace2[6];
  460. out[5 * 8] = workspace2[2] - workspace2[6];
  461. out[6 * 8] = workspace2[3] - workspace2[7];
  462. out[7 * 8] = workspace2[3] + workspace2[7];
  463. for (d = 0; d < 8; d++)
  464. out[8 * d] >>= 6;
  465. } else {
  466. int d;
  467. out[0 * 8] = workspace2[0] + workspace2[4];
  468. out[1 * 8] = workspace2[0] - workspace2[4];
  469. out[2 * 8] = workspace2[1] - workspace2[5];
  470. out[3 * 8] = workspace2[1] + workspace2[5];
  471. out[4 * 8] = workspace2[2] + workspace2[6];
  472. out[5 * 8] = workspace2[2] - workspace2[6];
  473. out[6 * 8] = workspace2[3] - workspace2[7];
  474. out[7 * 8] = workspace2[3] + workspace2[7];
  475. for (d = 0; d < 8; d++) {
  476. out[8 * d] >>= 6;
  477. out[8 * d] += 128;
  478. }
  479. }
  480. }
  481. }
  482. static void fill_encoder_block(const u8 *input, s16 *dst,
  483. unsigned int stride, unsigned int input_step)
  484. {
  485. int i, j;
  486. for (i = 0; i < 8; i++) {
  487. for (j = 0; j < 8; j++, input += input_step)
  488. *dst++ = *input;
  489. input += stride - 8 * input_step;
  490. }
  491. }
  492. static int var_intra(const s16 *input)
  493. {
  494. int32_t mean = 0;
  495. int32_t ret = 0;
  496. const s16 *tmp = input;
  497. int i;
  498. for (i = 0; i < 8 * 8; i++, tmp++)
  499. mean += *tmp;
  500. mean /= 64;
  501. tmp = input;
  502. for (i = 0; i < 8 * 8; i++, tmp++)
  503. ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
  504. return ret;
  505. }
  506. static int var_inter(const s16 *old, const s16 *new)
  507. {
  508. int32_t ret = 0;
  509. int i;
  510. for (i = 0; i < 8 * 8; i++, old++, new++)
  511. ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
  512. return ret;
  513. }
  514. static noinline_for_stack int
  515. decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
  516. unsigned int stride, unsigned int input_step)
  517. {
  518. s16 tmp[64];
  519. s16 old[64];
  520. s16 *work = tmp;
  521. unsigned int k, l;
  522. int vari;
  523. int vard;
  524. fill_encoder_block(cur, tmp, stride, input_step);
  525. fill_encoder_block(reference, old, 8, 1);
  526. vari = var_intra(tmp);
  527. for (k = 0; k < 8; k++) {
  528. for (l = 0; l < 8; l++) {
  529. *deltablock = *work - *reference;
  530. deltablock++;
  531. work++;
  532. reference++;
  533. }
  534. }
  535. deltablock -= 64;
  536. vard = var_inter(old, tmp);
  537. return vari <= vard ? IBLOCK : PBLOCK;
  538. }
  539. static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
  540. unsigned int dst_step)
  541. {
  542. int i, j;
  543. for (i = 0; i < 8; i++) {
  544. for (j = 0; j < 8; j++, input++, dst += dst_step) {
  545. if (*input < 0)
  546. *dst = 0;
  547. else if (*input > 255)
  548. *dst = 255;
  549. else
  550. *dst = *input;
  551. }
  552. dst += stride - (8 * dst_step);
  553. }
  554. }
  555. static void add_deltas(s16 *deltas, const u8 *ref, int stride,
  556. unsigned int ref_step)
  557. {
  558. int k, l;
  559. for (k = 0; k < 8; k++) {
  560. for (l = 0; l < 8; l++) {
  561. *deltas += *ref;
  562. ref += ref_step;
  563. /*
  564. * Due to quantizing, it might possible that the
  565. * decoded coefficients are slightly out of range
  566. */
  567. if (*deltas < 0)
  568. *deltas = 0;
  569. else if (*deltas > 255)
  570. *deltas = 255;
  571. deltas++;
  572. }
  573. ref += stride - (8 * ref_step);
  574. }
  575. }
  576. static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
  577. struct fwht_cframe *cf, u32 height, u32 width,
  578. u32 stride, unsigned int input_step,
  579. bool is_intra, bool next_is_intra)
  580. {
  581. u8 *input_start = input;
  582. __be16 *rlco_start = *rlco;
  583. s16 deltablock[64];
  584. __be16 pframe_bit = htons(PFRAME_BIT);
  585. u32 encoding = 0;
  586. unsigned int last_size = 0;
  587. unsigned int i, j;
  588. width = round_up(width, 8);
  589. height = round_up(height, 8);
  590. for (j = 0; j < height / 8; j++) {
  591. input = input_start + j * 8 * stride;
  592. for (i = 0; i < width / 8; i++) {
  593. /* intra code, first frame is always intra coded. */
  594. int blocktype = IBLOCK;
  595. unsigned int size;
  596. if (!is_intra)
  597. blocktype = decide_blocktype(input, refp,
  598. deltablock, stride, input_step);
  599. if (blocktype == IBLOCK) {
  600. fwht(input, cf->coeffs, stride, input_step, 1);
  601. quantize_intra(cf->coeffs, cf->de_coeffs,
  602. cf->i_frame_qp);
  603. } else {
  604. /* inter code */
  605. encoding |= FWHT_FRAME_PCODED;
  606. fwht16(deltablock, cf->coeffs, 8, 0);
  607. quantize_inter(cf->coeffs, cf->de_coeffs,
  608. cf->p_frame_qp);
  609. }
  610. if (!next_is_intra) {
  611. ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
  612. if (blocktype == PBLOCK)
  613. add_deltas(cf->de_fwht, refp, 8, 1);
  614. fill_decoder_block(refp, cf->de_fwht, 8, 1);
  615. }
  616. input += 8 * input_step;
  617. refp += 8 * 8;
  618. size = rlc(cf->coeffs, *rlco, blocktype);
  619. if (last_size == size &&
  620. !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
  621. __be16 *last_rlco = *rlco - size;
  622. s16 hdr = ntohs(*last_rlco);
  623. if (!((*last_rlco ^ **rlco) & pframe_bit) &&
  624. (hdr & DUPS_MASK) < DUPS_MASK)
  625. *last_rlco = htons(hdr + 2);
  626. else
  627. *rlco += size;
  628. } else {
  629. *rlco += size;
  630. }
  631. if (*rlco >= rlco_max) {
  632. encoding |= FWHT_FRAME_UNENCODED;
  633. goto exit_loop;
  634. }
  635. last_size = size;
  636. }
  637. }
  638. exit_loop:
  639. if (encoding & FWHT_FRAME_UNENCODED) {
  640. u8 *out = (u8 *)rlco_start;
  641. u8 *p;
  642. input = input_start;
  643. /*
  644. * The compressed stream should never contain the magic
  645. * header, so when we copy the YUV data we replace 0xff
  646. * by 0xfe. Since YUV is limited range such values
  647. * shouldn't appear anyway.
  648. */
  649. for (j = 0; j < height; j++) {
  650. for (i = 0, p = input; i < width; i++, p += input_step)
  651. *out++ = (*p == 0xff) ? 0xfe : *p;
  652. input += stride;
  653. }
  654. *rlco = (__be16 *)out;
  655. encoding &= ~FWHT_FRAME_PCODED;
  656. }
  657. return encoding;
  658. }
  659. u32 fwht_encode_frame(struct fwht_raw_frame *frm,
  660. struct fwht_raw_frame *ref_frm,
  661. struct fwht_cframe *cf,
  662. bool is_intra, bool next_is_intra,
  663. unsigned int width, unsigned int height,
  664. unsigned int stride, unsigned int chroma_stride)
  665. {
  666. unsigned int size = height * width;
  667. __be16 *rlco = cf->rlc_data;
  668. __be16 *rlco_max;
  669. u32 encoding;
  670. rlco_max = rlco + size / 2 - 256;
  671. encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
  672. height, width, stride,
  673. frm->luma_alpha_step, is_intra, next_is_intra);
  674. if (encoding & FWHT_FRAME_UNENCODED)
  675. encoding |= FWHT_LUMA_UNENCODED;
  676. encoding &= ~FWHT_FRAME_UNENCODED;
  677. if (frm->components_num >= 3) {
  678. u32 chroma_h = height / frm->height_div;
  679. u32 chroma_w = width / frm->width_div;
  680. unsigned int chroma_size = chroma_h * chroma_w;
  681. rlco_max = rlco + chroma_size / 2 - 256;
  682. encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
  683. cf, chroma_h, chroma_w,
  684. chroma_stride, frm->chroma_step,
  685. is_intra, next_is_intra);
  686. if (encoding & FWHT_FRAME_UNENCODED)
  687. encoding |= FWHT_CB_UNENCODED;
  688. encoding &= ~FWHT_FRAME_UNENCODED;
  689. rlco_max = rlco + chroma_size / 2 - 256;
  690. encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
  691. cf, chroma_h, chroma_w,
  692. chroma_stride, frm->chroma_step,
  693. is_intra, next_is_intra);
  694. if (encoding & FWHT_FRAME_UNENCODED)
  695. encoding |= FWHT_CR_UNENCODED;
  696. encoding &= ~FWHT_FRAME_UNENCODED;
  697. }
  698. if (frm->components_num == 4) {
  699. rlco_max = rlco + size / 2 - 256;
  700. encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
  701. rlco_max, cf, height, width,
  702. stride, frm->luma_alpha_step,
  703. is_intra, next_is_intra);
  704. if (encoding & FWHT_FRAME_UNENCODED)
  705. encoding |= FWHT_ALPHA_UNENCODED;
  706. encoding &= ~FWHT_FRAME_UNENCODED;
  707. }
  708. cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
  709. return encoding;
  710. }
  711. static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
  712. u32 height, u32 width, const u8 *ref, u32 ref_stride,
  713. unsigned int ref_step, u8 *dst,
  714. unsigned int dst_stride, unsigned int dst_step,
  715. bool uncompressed, const __be16 *end_of_rlco_buf)
  716. {
  717. unsigned int copies = 0;
  718. s16 copy[8 * 8];
  719. u16 stat;
  720. unsigned int i, j;
  721. bool is_intra = !ref;
  722. width = round_up(width, 8);
  723. height = round_up(height, 8);
  724. if (uncompressed) {
  725. int i;
  726. if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
  727. return false;
  728. for (i = 0; i < height; i++) {
  729. memcpy(dst, *rlco, width);
  730. dst += dst_stride;
  731. *rlco += width / 2;
  732. }
  733. return true;
  734. }
  735. /*
  736. * When decoding each macroblock the rlco pointer will be increased
  737. * by 65 * 2 bytes worst-case.
  738. * To avoid overflow the buffer has to be 65/64th of the actual raw
  739. * image size, just in case someone feeds it malicious data.
  740. */
  741. for (j = 0; j < height / 8; j++) {
  742. for (i = 0; i < width / 8; i++) {
  743. const u8 *refp = ref + j * 8 * ref_stride +
  744. i * 8 * ref_step;
  745. u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;
  746. if (copies) {
  747. memcpy(cf->de_fwht, copy, sizeof(copy));
  748. if ((stat & PFRAME_BIT) && !is_intra)
  749. add_deltas(cf->de_fwht, refp,
  750. ref_stride, ref_step);
  751. fill_decoder_block(dstp, cf->de_fwht,
  752. dst_stride, dst_step);
  753. copies--;
  754. continue;
  755. }
  756. stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
  757. if (stat & OVERFLOW_BIT)
  758. return false;
  759. if ((stat & PFRAME_BIT) && !is_intra)
  760. dequantize_inter(cf->coeffs);
  761. else
  762. dequantize_intra(cf->coeffs);
  763. ifwht(cf->coeffs, cf->de_fwht,
  764. ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);
  765. copies = (stat & DUPS_MASK) >> 1;
  766. if (copies)
  767. memcpy(copy, cf->de_fwht, sizeof(copy));
  768. if ((stat & PFRAME_BIT) && !is_intra)
  769. add_deltas(cf->de_fwht, refp,
  770. ref_stride, ref_step);
  771. fill_decoder_block(dstp, cf->de_fwht, dst_stride,
  772. dst_step);
  773. }
  774. }
  775. return true;
  776. }
  777. bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
  778. unsigned int components_num, unsigned int width,
  779. unsigned int height, const struct fwht_raw_frame *ref,
  780. unsigned int ref_stride, unsigned int ref_chroma_stride,
  781. struct fwht_raw_frame *dst, unsigned int dst_stride,
  782. unsigned int dst_chroma_stride)
  783. {
  784. const __be16 *rlco = cf->rlc_data;
  785. const __be16 *end_of_rlco_buf = cf->rlc_data +
  786. (cf->size / sizeof(*rlco)) - 1;
  787. if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
  788. ref->luma_alpha_step, dst->luma, dst_stride,
  789. dst->luma_alpha_step,
  790. hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
  791. end_of_rlco_buf))
  792. return false;
  793. if (components_num >= 3) {
  794. u32 h = height;
  795. u32 w = width;
  796. if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT))
  797. h /= 2;
  798. if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH))
  799. w /= 2;
  800. if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
  801. ref->chroma_step, dst->cb, dst_chroma_stride,
  802. dst->chroma_step,
  803. hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
  804. end_of_rlco_buf))
  805. return false;
  806. if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
  807. ref->chroma_step, dst->cr, dst_chroma_stride,
  808. dst->chroma_step,
  809. hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
  810. end_of_rlco_buf))
  811. return false;
  812. }
  813. if (components_num == 4)
  814. if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
  815. ref->luma_alpha_step, dst->alpha, dst_stride,
  816. dst->luma_alpha_step,
  817. hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
  818. end_of_rlco_buf))
  819. return false;
  820. return true;
  821. }