drivers/media/platform/rockchip-vpu/rk3399_vpu_hw_vp8e.c - chromiumos/third_party/kernel - Git at Google

 /*
  * Rockchip VPU codec driver
  *
  * Copyright (C) 2016 Rockchip Electronics Co., Ltd.
  *	Alpha Lin <Alpha.Lin@rock-chips.com>
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
  * may be copied, distributed, and modified under those terms.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  */

 #include "rockchip_vpu_common.h"

 #include <linux/types.h>
 #include <linux/sort.h>

 #include "rk3399_vpu_regs.h"
 #include "rockchip_vpu_hw.h"

 /*
  * Various parameters specific to VP8 encoder.
  *
  * The private source buffer is addressed as follows by
  * rk3399_vpu_vp8e_set_buffers(): the table handed to the hardware through
  * VEPU_REG_ADDR_CABAC_TBL starts at VP8_CABAC_CTX_OFFSET and the segment map
  * follows it at VP8_CABAC_CTX_OFFSET + VP8_CABAC_CTX_SIZE.
  */
 #define VP8_CABAC_CTX_OFFSET	192
 #define VP8_CABAC_CTX_SIZE	((55 + 96) << 3)
 /*
  * Number of bytes of segment map cleared before each run.
  * Documented as ((1920 * 1088 * 4 / 256 + 63) / 64 * 8).
  * NOTE(review): with integer division that formula evaluates to 4080, not
  * 4087 - confirm which value matches the size of the private buffer.
  */
 #define VP8_SEGMENT_MAP_SIZE	4087

 /*
  * Threshold of MBs count to disable quarter pixel mv for encode speed:
  * frames with more macroblocks than this get
  * VEPU_REG_DISABLE_QUARTER_PIXEL_MV set.
  */
 #define MAX_MB_COUNT_TO_DISABLE_QUARTER_PIXEL_MV	3600

 /*
  * Threshold of MBs count to disable multi mv in one macro block: split MV
  * mode is only enabled for frames with fewer macroblocks than this.
  */
 #define MAX_MB_COUNT_TO_DISABLE_SPLIT_MV		1584

 /* Number of entries in every table below that is indexed by qp. */
 #define QINDEX_RANGE 128

 /*
  * Split motion vector penalty, indexed by qp.
  *
  * Experimentally fitted, 24.893*exp(0.02545*qp).
  *
  * rk3399_vpu_vp8e_set_params() scales and clamps these values to fill the
  * 16x8/8x16, 8x8 and 4x4 MV penalty register fields.
  */
 static const int32_t split_penalty[QINDEX_RANGE] = {
 	24, 25, 26, 26, 27, 28, 29, 29,
 	30, 31, 32, 32, 33, 34, 35, 36,
 	37, 38, 39, 40, 41, 42, 43, 44,
 	45, 47, 48, 49, 50, 52, 53, 54,
 	56, 57, 59, 60, 62, 63, 65, 67,
 	68, 70, 72, 74, 76, 78, 80, 82,
 	84, 86, 88, 91, 93, 95, 98, 100,
 	103, 106, 108, 111, 114, 117, 120, 123,
 	126, 130, 133, 136, 140, 144, 147, 151,
 	155, 159, 163, 167, 172, 176, 181, 185,
 	190, 195, 200, 205, 211, 216, 222, 227,
 	233, 239, 245, 252, 258, 265, 272, 279,
 	286, 293, 301, 309, 317, 325, 333, 342,
 	351, 360, 369, 379, 388, 398, 409, 419,
 	430, 441, 453, 464, 476, 488, 501, 514,
 	527, 541, 555, 569, 584, 599, 614, 630
 };

 /*
  * DC quantizer lookup, indexed by qp.
  *
  * rk3399_vpu_vp8e_set_params() uses the entry as the divisor/multiplier for
  * the segment 0 DC quantization fields (Y1 as is, Y2 doubled, chroma capped
  * at 132) and writes the same values to the DQUT dequantization fields.
  */
 static const int dc_q_lookup[QINDEX_RANGE] = {
 	4,   5,   6,   7,   8,   9,   10,  10,  11,  12,
 	13,  14,  15,  16,  17,  17,  18,  19,  20,  20,
 	21,  21,  22,  22,  23,  23,  24,  25,  25,  26,
 	27,  28,  29,  30,  31,  32,  33,  34,  35,  36,
 	37,  37,  38,  39,  40,  41,  42,  43,  44,  45,
 	46,  46,  47,  48,  49,  50,  51,  52,  53,  54,
 	55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
 	65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
 	75,  76,  76,  77,  78,  79,  80,  81,  82,  83,
 	84,  85,  86,  87,  88,  89,  91,  93,  95,  96,
 	98,  100, 101, 102, 104, 106, 108, 110, 112, 114,
 	116, 118, 122, 124, 126, 128, 130, 132, 134, 136,
 	138, 140, 143, 145, 148, 151, 154, 157
 };

 /*
  * AC quantizer lookup, indexed by qp.
  *
  * rk3399_vpu_vp8e_set_params() uses the entry for the segment 0 AC
  * quantization fields (Y1 and chroma as is, Y2 scaled by 155 / 100) and
  * writes the same values to the DQUT dequantization fields.
  */
 static const int ac_q_lookup[QINDEX_RANGE] = {
 	4,   5,   6,   7,   8,   9,   10,  11,  12,  13,
 	14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
 	24,  25,  26,  27,  28,  29,  30,  31,  32,  33,
 	34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
 	44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
 	54,  55,  56,  57,  58,  60,  62,  64,  66,  68,
 	70,  72,  74,  76,  78,  80,  82,  84,  86,  88,
 	90,  92,  94,  96,  98,  100, 102, 104, 106, 108,
 	110, 112, 114, 116, 119, 122, 125, 128, 131, 134,
 	137, 140, 143, 146, 149, 152, 155, 158, 161, 164,
 	167, 170, 173, 177, 181, 185, 189, 193, 197, 201,
 	205, 209, 213, 217, 221, 225, 229, 234, 239, 245,
 	249, 254, 259, 264, 269, 274, 279, 284
 };

 /*
  * Quantizer rounding factor, indexed by qp.
  *
  * rk3399_vpu_vp8e_set_params() programs each segment 0 RND field with
  * (qrounding_factors[qp] * deq) >> 7, where deq is the quantizer step of
  * the plane being configured.
  */
 static const int32_t qrounding_factors[QINDEX_RANGE] = {
 	56, 56, 56, 56, 56, 56, 56, 56, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
 	48, 48, 48, 48, 48, 48, 48, 48
 };

 /*
  * Quantizer zero-bin factor, indexed by qp.
  *
  * rk3399_vpu_vp8e_set_params() programs each segment 0 ZBIN field with
  * (qzbin_factors[qp] * deq + 64) >> 7, where deq is the quantizer step of
  * the plane being configured.
  */
 static const int32_t qzbin_factors[QINDEX_RANGE] = {
 	64, 64, 64, 64, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
 	80, 80, 80, 80, 80, 80, 80, 80
 };

 /*
  * Sixteen zero-bin boost values.
  * NOTE(review): nothing in the visible part of this file reads this table -
  * confirm whether it is still needed.
  */
 static const int32_t zbin_boost[16] = {
 	 0,  0,  8, 10, 12, 14, 16, 20,
 	24, 28, 32, 36, 40, 44, 44, 44
 };

 /*
  * Intra 16x16 mode tree penalty values.
  *
  * Read-only: rk3399_vpu_vp8e_set_params() scales each entry by
  * (qp * 2 + 64) >> 8 and packs two entries per penalty register.
  */
 static const s32 intra_16_tree_penalty[] = {
 	305, 841, 914, 1082
 };


 /*
  * Intra 4x4 mode tree penalty values.
  *
  * Read-only: rk3399_vpu_vp8e_set_params() scales each entry by
  * (qp * 2 + 8) >> 8 and packs two entries per penalty register.
  */
 static const s32 intra_4_tree_penalty[] = {
 	280, 622, 832, 1177, 1240, 1341, 1085, 1259, 1357, 1495
 };

 /*
  * Per-qp weight applied to the motion vector bit cost when filling the
  * quarter pixel DMV penalty table in rk3399_vpu_vp8e_set_params().
  *
  * ~round((2*(2+exp((x+22)/39)) + (2+exp((x+15)/32)))/3)
  *
  * Read-only, hence const.
  */
 static const s32 weight[QINDEX_RANGE] = {
 	4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
 	4,  4,  4,  4,  4,  5,  5,  5,  5,  5,
 	5,  5,  5,  5,  5,  5,  5,  6,  6,  6,
 	6,  6,  6,  6,  6,  6,  6,  7,  7,  7,
 	7,  7,  7,  7,  7,  8,  8,  8,  8,  8,
 	8,  8,  9,  9,  9,  9,  9, 10, 10, 10,
 	10, 11, 11, 11, 12, 12, 13, 13, 13, 13,
 	14, 14, 14, 14, 15, 15, 15, 16, 16, 17,
 	17, 18, 18, 19, 19, 20, 20, 20, 21, 22,
 	23, 23, 24, 24, 25, 25, 26, 27, 28, 28,
 	29, 30, 31, 32, 32, 33, 34, 35, 36, 37,
 	38, 39, 40, 41, 42, 44, 44, 46, 47, 48,
 	50, 51, 52, 54, 55, 57, 58, 61
 };

 /*
  * One leaf of a bool-coded tree, as walked by cost_tree(): the path to the
  * leaf is @number bits taken from @value, most significant of those bits
  * first, and the i-th bit is costed with probability number index[i].
  */
 struct tree {
 	int32_t value;       /* Path bits of the leaf in the bool tree */
 	int32_t number;      /* Count of valid (low-order) bits in value */
 	int32_t index[9];    /* Probability table index for each path bit */
 };

 /*
  * Motion vector short tree: one entry per magnitude 0..7, each a three bit
  * path together with the probability index used for every bit of the path.
  */
 static struct tree mv_tree[] = {
 	/* mv_0 000 */
 	{ .value = 0, .number = 3, .index = { 0, 1, 2 } },
 	/* mv_1 001 */
 	{ .value = 1, .number = 3, .index = { 0, 1, 2 } },
 	/* mv_2 010 */
 	{ .value = 2, .number = 3, .index = { 0, 1, 3 } },
 	/* mv_3 011 */
 	{ .value = 3, .number = 3, .index = { 0, 1, 3 } },
 	/* mv_4 100 */
 	{ .value = 4, .number = 3, .index = { 0, 4, 5 } },
 	/* mv_5 101 */
 	{ .value = 5, .number = 3, .index = { 0, 4, 5 } },
 	/* mv_6 110 */
 	{ .value = 6, .number = 3, .index = { 0, 4, 6 } },
 	/* mv_7 111 */
 	{ .value = 7, .number = 3, .index = { 0, 4, 6 } },
 };

 /* If probability being zero is p, then average number of bits used to encode 0
  * is log2(1/p), to encode 1 is log2(1/(1-p)).
  *
  * For example, if the probability of being zero is 0.5
  * bin = 0 -> average bits used is log2(1/0.5)      = 1 bits/bin
  * bin = 1 -> average bits used is log2(1/(1 - 0.5) = 1 bits/bin
  *
  * For example, if the probability of being zero is 0.95
  * bin = 0 -> average bits used is log2(1/0.95)      = 0.074 bits/bin
  * bin = 1 -> average bits used is log2(1/(1 - 0.95) = 4.321 bits/bin
  *
  * The cost[p] is average number of bits used to encode 0 if the probability is
  * p / 256, scaled by a magic number 256,
  * i.e., cost[p] = round(log2(256 / p) * 256).
  */
 static const s32 const vp8_prob_cost[] = {
 	2048, 2048, 1792, 1642, 1536, 1454, 1386, 1329, 1280, 1236,
 	1198, 1162, 1130, 1101, 1073, 1048, 1024, 1002,  980,  961,
 	942,  924,  906,  890,  874,  859,  845,  831,  817,  804,
 	792,  780,  768,  757,  746,  735,  724,  714,  705,  695,
 	686,  676,  668,  659,  650,  642,  634,  626,  618,  611,
 	603,  596,  589,  582,  575,  568,  561,  555,  548,  542,
 	536,  530,  524,  518,  512,  506,  501,  495,  490,  484,
 	479,  474,  468,  463,  458,  453,  449,  444,  439,  434,
 	430,  425,  420,  416,  412,  407,  403,  399,  394,  390,
 	386,  382,  378,  374,  370,  366,  362,  358,  355,  351,
 	347,  343,  340,  336,  333,  329,  326,  322,  319,  315,
 	312,  309,  305,  302,  299,  296,  292,  289,  286,  283,
 	280,  277,  274,  271,  268,  265,  262,  259,  256,  253,
 	250,  247,  245,  242,  239,  236,  234,  231,  228,  226,
 	223,  220,  218,  215,  212,  210,  207,  205,  202,  200,
 	197,  195,  193,  190,  188,  185,  183,  181,  178,  176,
 	174,  171,  169,  167,  164,  162,  160,  158,  156,  153,
 	151,  149,  147,  145,  143,  140,  138,  136,  134,  132,
 	130,  128,  126,  124,  122,  120,  118,  116,  114,  112,
 	110,  108,  106,  104,  102,  101,   99,   97,   95,   93,
 	91,   89,   87,   86,   84,   82,   80,   78,   77,   75,
 	73,   71,   70,   68,   66,   64,   63,   61,   59,   58,
 	56,   54,   53,   51,   49,   48,   46,   44,   43,   41,
 	40,   38,   36,   35,   33,   32,   30,   28,   27,   25,
 	24,   22,   21,   19,   18,   16,   15,   13,   12,   10,
 	9,    7,    6,    4,    3,    1
 };

 /*
  * Approximate bit cost of bin at given probability prob.
  *
  * prob is the probability (out of 256) of the bin being zero: a zero bin
  * costs vp8_prob_cost[prob], a one bin costs vp8_prob_cost[255 - prob].
  * Only one arm of the conditional is evaluated, so a prob argument with
  * side effects is evaluated exactly once.
  */
 #define COST_BOOL(prob, bin)	(vp8_prob_cost[(bin) ? 255 - (prob) : (prob)])

 /**
  * struct rk3399_vpu_vp8e_ctrl_buf - hardware control buffer layout
  * @ext_hdr_size:	Ext header size in bytes (written by hardware).
  * @dct_size:		DCT partition size (written by hardware).
  * @rsvd:		Reserved for hardware.
  *
  * The buffer is allocated with this size in rk3399_vpu_vp8e_init(), zeroed
  * before every run in rk3399_vpu_vp8e_run() and read back in
  * rk3399_vpu_vp8e_done().
  */
 struct rk3399_vpu_vp8e_ctrl_buf {
 	u32 ext_hdr_size;
 	u32 dct_size;
 	u8 rsvd[1016];	/* 2 * 4 + 1016 = 1024 bytes in total */
 };

 /**
  * rk3399_vpu_vp8e_init() - allocate the auxiliary buffers of a VP8 encoder
  * @ctx:	encoder context; the source format width and height size the
  *		buffers.
  *
  * Three buffers are allocated in this order: the hardware control buffer,
  * the motion vector buffer (four bytes per 16x16 macroblock) and the ext
  * buffer, which holds two frames of rounded luma size * 3 / 2 each.
  *
  * Return: 0 on success, otherwise the error returned by
  * rockchip_vpu_aux_buf_alloc(); buffers allocated before the failure are
  * released again.
  */
 int rk3399_vpu_vp8e_init(struct rockchip_vpu_ctx *ctx)
 {
 	struct rockchip_vpu_dev *vpu = ctx->dev;
 	size_t width = ctx->src_fmt.width;
 	size_t height = ctx->src_fmt.height;
 	size_t mv_bytes;
 	size_t frame_bytes;
 	int err;

 	err = rockchip_vpu_aux_buf_alloc(vpu, &ctx->hw.vp8e.ctrl_buf,
 				sizeof(struct rk3399_vpu_vp8e_ctrl_buf));
 	if (err) {
 		vpu_err("failed to allocate ctrl buffer\n");
 		return err;
 	}

 	mv_bytes = DIV_ROUND_UP(width, 16) * DIV_ROUND_UP(height, 16) * 4;
 	err = rockchip_vpu_aux_buf_alloc(vpu, &ctx->hw.vp8e.mv_buf, mv_bytes);
 	if (err) {
 		vpu_err("failed to allocate MV buffer\n");
 		goto free_ctrl_buf;
 	}

 	frame_bytes = rockchip_vpu_rounded_luma_size(width, height) * 3 / 2;
 	err = rockchip_vpu_aux_buf_alloc(vpu, &ctx->hw.vp8e.ext_buf,
 					2 * frame_bytes);
 	if (!err)
 		return 0;

 	/* Unwind in reverse order of allocation. */
 	vpu_err("failed to allocate ext buffer\n");
 	rockchip_vpu_aux_buf_free(vpu, &ctx->hw.vp8e.mv_buf);
free_ctrl_buf:
 	rockchip_vpu_aux_buf_free(vpu, &ctx->hw.vp8e.ctrl_buf);

 	return err;
 }

 /**
  * rk3399_vpu_vp8e_exit() - release the auxiliary buffers of a VP8 encoder
  * @ctx:	encoder context whose buffers were set up by
  *		rk3399_vpu_vp8e_init().
  *
  * The buffers are freed in the reverse order of their allocation.
  */
 void rk3399_vpu_vp8e_exit(struct rockchip_vpu_ctx *ctx)
 {
 	rockchip_vpu_aux_buf_free(ctx->dev, &ctx->hw.vp8e.ext_buf);
 	rockchip_vpu_aux_buf_free(ctx->dev, &ctx->hw.vp8e.mv_buf);
 	rockchip_vpu_aux_buf_free(ctx->dev, &ctx->hw.vp8e.ctrl_buf);
 }

 /*
  * Build the value of the over-fill / stream start offset register: the
  * start offset of the stream in bits, the right and bottom over-fill of the
  * source image and the skip macroblock penalty.
  */
 static inline u32 enc_in_img_ctrl(struct rockchip_vpu_ctx *ctx)
 {
 	const struct rk3399_vp8e_reg_params *params =
 		(struct rk3399_vp8e_reg_params *)ctx->run.vp8e.reg_params;
 	struct v4l2_pix_format_mplane *fmt = &ctx->src_fmt;
 	struct v4l2_rect *visible = &ctx->src_crop;
 	unsigned int fill_right_d4;
 	unsigned int fill_bottom;
 	u32 start_offset_bits;
 	u32 reg;

 	/*
 	 * Only the luma plane over-fill is programmed; the hardware derives
 	 * the values of the other planes internally from the format setting.
 	 * The right over-fill is expressed in units of four pixels.
 	 */
 	fill_right_d4 = (fmt->width - visible->width) / 4;
 	fill_bottom = fmt->height - visible->height;

 	/*
 	 * Bytes of frame header beyond the last 8 byte boundary, in bits.
 	 * TODO: finish first free bit when assembling the frame header is
 	 * done.
 	 */
 	start_offset_bits = (params->frm_hdr_size & 7) * 8;

 	reg = VEPU_REG_STREAM_START_OFFSET(start_offset_bits);
 	reg |= VEPU_REG_IN_IMG_CTRL_OVRFLR_D4(fill_right_d4);
 	reg |= VEPU_REG_IN_IMG_CTRL_OVRFLB(fill_bottom);
 	/* A skip penalty of 3/4 qp is only applied from qp 100 upwards. */
 	reg |= VEPU_REG_SKIP_MACROBLOCK_PENALTY(params->qp >= 100 ?
 			(3 * params->qp / 4) : 0);

 	return reg;
 }

 /*
  * Program every buffer address the hardware needs for one VP8 encode run:
  * the destination stream and DCT partition, the auxiliary buffers, the
  * reference and reconstruction frames and the source planes. It also fills
  * in the header size, key frame flag and DCT offset of the destination
  * buffer and clears the segment map in the private source buffer.
  */
 static void rk3399_vpu_vp8e_set_buffers(struct rockchip_vpu_dev *vpu,
 					struct rockchip_vpu_ctx *ctx)
 {
 	const u32 src_addr_regs[] = { VEPU_REG_ADDR_IN_LUMA,
 				      VEPU_REG_ADDR_IN_CB,
 				      VEPU_REG_ADDR_IN_CR };
 	const struct rk3399_vp8e_reg_params *params =
 		(struct rk3399_vp8e_reg_params *)ctx->run.vp8e.reg_params;
 	struct v4l2_pix_format_mplane *src_fmt = &ctx->src_fmt;
 	dma_addr_t ref_buf_dma, rec_buf_dma;
 	dma_addr_t stream_dma;
 	size_t rounded_size;
 	dma_addr_t dst_dma;
 	size_t dst_size;
 	int i;

 	vpu_debug_enter();

 	rounded_size = rockchip_vpu_rounded_luma_size(ctx->src_fmt.width,
 						      ctx->src_fmt.height);

 	/*
 	 * The ext buffer holds two frames of rounded_size * 3 / 2 bytes.
 	 * ref_rec_ptr selects which half is the reference and which the
 	 * reconstruction; it is toggled so the halves swap on the next run.
 	 */
 	ref_buf_dma = rec_buf_dma = ctx->hw.vp8e.ext_buf.dma;
 	if (ctx->hw.vp8e.ref_rec_ptr)
 		ref_buf_dma += rounded_size * 3 / 2;
 	else
 		rec_buf_dma += rounded_size * 3 / 2;
 	ctx->hw.vp8e.ref_rec_ptr ^= 1;

 	dst_dma = vb2_dma_contig_plane_dma_addr(&ctx->run.dst->b.vb2_buf, 0);
 	dst_size = vb2_plane_size(&ctx->run.dst->b.vb2_buf, 0);

 	/*
 	 * The stream address is the end of the frame header rounded down to
 	 * 64 bits; the remainder is passed as the start offset (see
 	 * enc_in_img_ctrl()).
 	 *
 	 * stream addr-->|
 	 * align 64bits->|<-start offset->|
 	 * |<---------header size-------->|<---dst buf---
 	 */
 	stream_dma = round_down(dst_dma + params->frm_hdr_size, 8);

 	/*
 	 * Userspace will pass 8 bytes aligned size(round_down) to us,
 	 * so we need to plus start offset to get real header size.
 	 *
 	 * |<-aligned size->|<-start offset->|
 	 * |<----------header size---------->|
 	 *
 	 * NOTE(review): the code below stores frm_hdr_size unchanged, no
 	 * start offset is added here - confirm whether this description is
 	 * still accurate.
 	 */
 	vpu_debug(0, "frame header size %u\n", params->frm_hdr_size);
 	ctx->run.dst->vp8e.hdr_size = params->frm_hdr_size;

 	if (params->is_intra)
 		ctx->run.dst->b.flags |= V4L2_BUF_FLAG_KEYFRAME;
 	else
 		ctx->run.dst->b.flags &= ~V4L2_BUF_FLAG_KEYFRAME;

 	/*
 	 * We assume here that 1/10 of the buffer is enough for headers.
 	 * DCT partition will be placed in remaining 9/10 of the buffer.
 	 */
 	ctx->run.dst->vp8e.dct_offset = round_up(dst_size / 10, 8);

 	/* Destination buffer. */
 	vepu_write_relaxed(vpu, stream_dma, VEPU_REG_ADDR_OUTPUT_STREAM);
 	vepu_write_relaxed(vpu, dst_dma + ctx->run.dst->vp8e.dct_offset,
 				VEPU_REG_ADDR_VP8_DCT_PART(0));
 	vepu_write_relaxed(vpu, dst_size - ctx->run.dst->vp8e.dct_offset,
 				VEPU_REG_STR_BUF_LIMIT);

 	/* Auxiliary buffers. */
 	vepu_write_relaxed(vpu, ctx->hw.vp8e.ctrl_buf.dma,
 				VEPU_REG_ADDR_OUTPUT_CTRL);
 	vepu_write_relaxed(vpu, ctx->hw.vp8e.mv_buf.dma,
 				VEPU_REG_ADDR_MV_OUT);
 	vepu_write_relaxed(vpu, ctx->run.priv_dst.dma,
 				VEPU_REG_ADDR_VP8_PROB_CNT);
 	vepu_write_relaxed(vpu, ctx->run.priv_src.dma + VP8_CABAC_CTX_OFFSET,
 				VEPU_REG_ADDR_CABAC_TBL);

 	/* The segment map follows the CABAC table and is cleared each run. */
 	memset(ctx->run.priv_src.cpu + VP8_CABAC_CTX_OFFSET +
 		VP8_CABAC_CTX_SIZE, 0, VP8_SEGMENT_MAP_SIZE);
 	vepu_write_relaxed(vpu, ctx->run.priv_src.dma
 				+ VP8_CABAC_CTX_OFFSET + VP8_CABAC_CTX_SIZE,
 				VEPU_REG_ADDR_VP8_SEG_MAP);

 	/* Reference buffers: chroma follows rounded_size bytes of luma. */
 	vepu_write_relaxed(vpu, ref_buf_dma,
 				VEPU_REG_ADDR_REF_LUMA);
 	vepu_write_relaxed(vpu, ref_buf_dma + rounded_size,
 				VEPU_REG_ADDR_REF_CHROMA);

 	/* Reconstruction buffers, laid out like the reference buffers. */
 	vepu_write_relaxed(vpu, rec_buf_dma,
 				VEPU_REG_ADDR_REC_LUMA);
 	vepu_write_relaxed(vpu, rec_buf_dma + rounded_size,
 				VEPU_REG_ADDR_REC_CHROMA);

 	/* Source buffer. */
 	/*
 	 * TODO(crbug.com/901264): The way to pass an offset within a DMA-buf
 	 * is not defined in V4L2 specification, so we abuse data_offset
 	 * for now. Fix it when we have the right interface, including
 	 * any necessary validation and potential alignment issues.
 	 *
 	 * NOTE(review): src_addr_regs has three entries; this assumes
 	 * num_planes never exceeds three - confirm against the supported
 	 * source formats.
 	 */
 	for (i = 0; i < src_fmt->num_planes; ++i)
 		vepu_write_relaxed(vpu, vb2_dma_contig_plane_dma_addr(
 				   &ctx->run.src->b.vb2_buf, i) +
 				ctx->run.src->b.vb2_buf.planes[i].data_offset,
 				   src_addr_regs[i]);

 	/* Source parameters. */
 	vepu_write_relaxed(vpu, enc_in_img_ctrl(ctx),
 			   VEPU_REG_ENC_OVER_FILL_STRM_OFFSET);

 	vpu_debug_leave();
 }

 static s32 cost_tree(struct tree *tree, const s32 *prob)
 {
 	s32 value = tree->value;
 	s32 number = tree->number;
 	s32 *index = tree->index;
 	s32 bit_cost = 0;

 	while (number--)
 		bit_cost += COST_BOOL(prob[*index++], (value >> number) & 1);

 	return bit_cost;
 }

 /*
  * Approximate bit cost of coding one motion vector component.
  *
  * @mvd:     motion vector component; must be even.
  * @mv_prob: probability table of the component: [0] selects short or long
  *           coding, [1] is the sign, [2..8] belong to the short tree and
  *           [9..18] to bits 0..9 of the long form.
  */
 static s32 cost_mv(s32 mvd, const s32 *mv_prob)
 {
 	s32 magnitude, bit, cost;

 	/*
 	 * Luma motion vectors are doubled, see 18.1 in "VP8 Data Format and
 	 * Decoding Guide".
 	 */
 	BUG_ON(mvd & 1);
 	magnitude = abs(mvd >> 1);

 	if (magnitude < 8) {
 		/* Short form: tree coded magnitude, sign only if non-zero. */
 		cost = COST_BOOL(mv_prob[0], 0);
 		cost += cost_tree(&mv_tree[magnitude], mv_prob + 2);
 		if (magnitude)
 			cost += COST_BOOL(mv_prob[1], mvd < 0);

 		return cost;
 	}

 	/* Long form. */
 	cost = COST_BOOL(mv_prob[0], 1);

 	/* Low bits first: 0, 1, 2. */
 	for (bit = 0; bit < 3; bit++)
 		cost += COST_BOOL(mv_prob[9 + bit], (magnitude >> bit) & 1);

 	/* Then the high bits downwards: 9, 8, 7, 6, 5, 4. */
 	for (bit = 9; bit > 3; bit--)
 		cost += COST_BOOL(mv_prob[9 + bit], (magnitude >> bit) & 1);

 	/*
 	 * Bit 3 comes last and only when it carries information: magnitudes
 	 * below 8 use the short form, so for 8..15 bit 3 is known to be one
 	 * and is not explicitly coded.
 	 */
 	if (magnitude > 15)
 		cost += COST_BOOL(mv_prob[9 + 3], (magnitude >> 3) & 1);

 	/* Sign. */
 	cost += COST_BOOL(mv_prob[1], mvd < 0);

 	return cost;
 }

 static void rk3399_vpu_vp8e_set_params(struct rockchip_vpu_dev *vpu,
 				       struct rockchip_vpu_ctx *ctx)
 {
 	const struct rk3399_vp8e_reg_params *params =
 		(struct rk3399_vp8e_reg_params *)ctx->run.vp8e.reg_params;
 	int i;
 	u32 reg;
 	u32 mbs_in_row = MB_WIDTH(ctx->src_fmt.width);
 	u32 mbs_in_col = MB_HEIGHT(ctx->src_fmt.height);
 	u32 deq;
 	u32 tmp;
 	u32 qp = params->qp;
 	s32 inter_favor = 0;

 	vpu_debug_enter();

 	reg = VEPU_REG_OUTPUT_SWAP32
 		| VEPU_REG_OUTPUT_SWAP16
 		| VEPU_REG_OUTPUT_SWAP8
 		| VEPU_REG_INPUT_SWAP8
 		| VEPU_REG_INPUT_SWAP16
 		| VEPU_REG_INPUT_SWAP32;
 	vepu_write_relaxed(vpu, reg, VEPU_REG_DATA_ENDIAN);

 	reg = VEPU_REG_SIZE_TABLE_PRESENT
 		| VEPU_REG_IN_IMG_CTRL_FMT(ctx->vpu_src_fmt->enc_fmt)
 		| VEPU_REG_IN_IMG_ROTATE_MODE(0);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_ENC_CTRL1);

 	reg = VEPU_REG_INTERRUPT_TIMEOUT_EN
 		| VEPU_REG_MV_WRITE_EN;
 	vepu_write_relaxed(vpu, reg, VEPU_REG_INTERRUPT);

 	reg = VEPU_REG_IN_IMG_CHROMA_OFFSET(0)
 		| VEPU_REG_IN_IMG_LUMA_OFFSET(0)
 		| VEPU_REG_IN_IMG_CTRL_ROW_LEN(mbs_in_row * 16);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_INPUT_LUMA_INFO);

 	vepu_write_relaxed(vpu, 0, VEPU_REG_STR_HDR_REM_MSB);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_STR_HDR_REM_LSB);

 	reg = 0;
 	if (mbs_in_row * mbs_in_col > MAX_MB_COUNT_TO_DISABLE_QUARTER_PIXEL_MV)
 		reg = VEPU_REG_DISABLE_QUARTER_PIXEL_MV;
 	reg |= VEPU_REG_ENTROPY_CODING_MODE;
 	vepu_write_relaxed(vpu, reg, VEPU_REG_ENC_CTRL0);

 	inter_favor = 128 - params->intra_prob;
 	if (inter_favor >= 0)
 		inter_favor = max(0u, qp * 2 - 40);

 	reg = VEPU_REG_INTRA16X16_MODE(qp * 1024 / 128)
 		| VEPU_REG_INTER_MODE(inter_favor);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_INTRA_INTER_MODE);

 	reg = VEPU_REG_1MV_PENALTY(60 / 2 * 32)
 		| VEPU_REG_QMV_PENALTY(8)
 		| VEPU_REG_4MV_PENALTY(64 / 2);
 	if (mbs_in_row * mbs_in_col < MAX_MB_COUNT_TO_DISABLE_SPLIT_MV)
 		reg |= VEPU_REG_SPLIT_MV_MODE_EN;
 	vepu_write_relaxed(vpu, reg, VEPU_REG_MV_PENALTY);

 	reg = VEPU_REG_MV_PENALTY_16X8_8X16(
 			min(1023, split_penalty[qp] / 2))
 		| VEPU_REG_MV_PENALTY_8X8(
 			min(1023, (2 * split_penalty[qp] + 40) / 4))
 		| VEPU_REG_MV_PENALTY_8X4_4X8(0x3ff);
 	/* no 8x4 or 4x8 block define in vp8 */
 	vepu_write_relaxed(vpu, reg, VEPU_REG_ENC_CTRL4);

 	reg = VEPU_REG_PENALTY_4X4MV(min(511,
 					 (8 * split_penalty[qp] + 500) / 16))
 		| VEPU_REG_ZERO_MV_FAVOR_D2(0);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_MVC_RELATE);

 	/* initialize quant table for segment0 */
 	deq = dc_q_lookup[qp];
 	reg = VEPU_REG_VP8_SEG0_QUT_DC_Y1(min((1 << 16) / deq, 0x3FFFu));
 	reg |= VEPU_REG_VP8_SEG0_ZBIN_DC_Y1(((qzbin_factors[qp] * deq) + 64) >>
 					    7);
 	reg |= VEPU_REG_VP8_SEG0_RND_DC_Y1((qrounding_factors[qp] * deq) >> 7);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_DC_Y1);

 	deq = ac_q_lookup[qp];
 	reg = VEPU_REG_VP8_SEG0_QUT_AC_Y1(min((1 << 16) / deq, 0x3FFFu));
 	reg |= VEPU_REG_VP8_SEG0_ZBIN_AC_Y1(((qzbin_factors[qp] * deq) + 64) >>
 					    7);
 	reg |= VEPU_REG_VP8_SEG0_RND_AC_Y1((qrounding_factors[qp] * deq) >> 7);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_AC_Y1);

 	deq = dc_q_lookup[qp] * 2;
 	reg = VEPU_REG_VP8_SEG0_QUT_DC_Y2(min((1 << 16) / deq, 0x3FFFu));
 	reg |= VEPU_REG_VP8_SEG0_ZBIN_DC_Y2((qzbin_factors[qp] * deq + 64) >>
 					    7);
 	reg |= VEPU_REG_VP8_SEG0_RND_DC_Y2((qrounding_factors[qp] * deq) >> 7);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_DC_Y2);

 	deq = ac_q_lookup[qp] * 155 / 100;
 	reg = VEPU_REG_VP8_SEG0_QUT_AC_Y2(min((1 << 16) / deq, 0x3FFFu));
 	reg |= VEPU_REG_VP8_SEG0_ZBIN_AC_Y2((qzbin_factors[qp] * deq + 64) >>
 					    7);
 	reg |= VEPU_REG_VP8_SEG0_RND_AC_Y2((qrounding_factors[qp] * deq) >> 7);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_AC_Y2);

 	deq = min(dc_q_lookup[qp], 132);
 	reg = VEPU_REG_VP8_SEG0_QUT_DC_CHR(min((1 << 16) / deq, 0x3FFFu));
 	reg |= VEPU_REG_VP8_SEG0_ZBIN_DC_CHR((qzbin_factors[qp] * deq + 64) >>
 					     7);
 	reg |= VEPU_REG_VP8_SEG0_RND_DC_CHR((qrounding_factors[qp] * deq) >> 7);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_DC_CHR);

 	deq = ac_q_lookup[qp];
 	reg = VEPU_REG_VP8_SEG0_QUT_AC_CHR(min((1 << 16) / deq, 0x3FFFu));
 	reg |= VEPU_REG_VP8_SEG0_ZBIN_AC_CHR((qzbin_factors[qp] * deq + 64) >>
 					     7);
 	reg |= VEPU_REG_VP8_SEG0_RND_AC_CHR((qrounding_factors[qp] * deq) >> 7);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_AC_CHR);

 	reg = VEPU_REG_VP8_MV_REF_IDX1(0);
 	reg |= VEPU_REG_VP8_SEG0_DQUT_DC_Y1(dc_q_lookup[qp]);
 	reg |= VEPU_REG_VP8_SEG0_DQUT_AC_Y1(ac_q_lookup[qp]);
 	reg |= VEPU_REG_VP8_SEG0_DQUT_DC_Y2(dc_q_lookup[qp] * 2);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_DQUT);

 	reg = VEPU_REG_VP8_MV_REF_IDX2(0);
 	reg |= VEPU_REG_VP8_SEG0_DQUT_DC_CHR(min(dc_q_lookup[qp], 132));
 	reg |= VEPU_REG_VP8_SEG0_DQUT_AC_CHR(ac_q_lookup[qp]);
 	reg |= VEPU_REG_VP8_SEG0_DQUT_AC_Y2(ac_q_lookup[qp] * 155 / 100);
 	if (params->is_intra)
 		reg |= VEPU_REG_VP8_SEGMENT_MAP_UPDATE;
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_SEG0_QUANT_DQUT_1);
 	vepu_write_relaxed(vpu, params->bool_enc_value,
 		VEPU_REG_VP8_BOOL_ENC_VALUE);

 	reg = VEPU_REG_VP8_DCT_PARTITION_CNT(0);
 	reg |= VEPU_REG_VP8_FILTER_LEVEL(params->filter_level);
 	reg |= VEPU_REG_VP8_FILTER_SHARPNESS(params->filter_sharpness);
 	reg |= VEPU_REG_VP8_ZERO_MV_PENALTY_FOR_REF2(0);
 	reg |= VEPU_REG_VP8_BOOL_ENC_VALUE_BITS(params->bool_enc_value_bits);
 	reg |= VEPU_REG_VP8_BOOL_ENC_RANGE(params->bool_enc_range);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_ENC_CTRL2);

 	vepu_write_relaxed(vpu, 0, VEPU_REG_ROI1);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_ROI2);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_STABILIZATION_OUTPUT);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_RGB2YUV_CONVERSION_COEF1);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_RGB2YUV_CONVERSION_COEF2);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_RGB2YUV_CONVERSION_COEF3);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_RGB_MASK_MSB);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_CIR_INTRA_CTRL);
 	vepu_write_relaxed(vpu, 0, VEPU_REG_INTRA_AREA_CTRL);

 	/* Intra 4x4 mode */
 	tmp = qp * 2 + 8;
 	for (i = 0; i < 5; i++) {
 		reg = VEPU_REG_VP8_INTRA_4X4_PENALTY_0
 			((intra_4_tree_penalty[i * 2] * tmp) >> 8);
 		reg |= VEPU_REG_VP8_INTRA_4x4_PENALTY_1
 			((intra_4_tree_penalty[i * 2 + 1] * tmp) >> 8);
 		vepu_write_relaxed(vpu, reg,
 				   VEPU_REG_VP8_INTRA_4X4_PENALTY(i));
 	}

 	/* Intra 16x16 mode */
 	tmp = qp * 2 + 64;
 	for (i = 0; i < 2; i++) {
 		reg = VEPU_REG_VP8_INTRA_16X16_PENALTY_0
 			((intra_16_tree_penalty[2 * i] * tmp) >> 8);
 		reg |= VEPU_REG_VP8_INTRA_16X16_PENALTY_1
 			((intra_16_tree_penalty[2 * i + 1] * tmp) >> 8);
 		vepu_write_relaxed(vpu, reg,
 				   VEPU_REG_VP8_INTRA_16X16_PENALTY(i));
 	}

 	reg = VEPU_REG_VP8_LF_REF_DELTA_INTRA_MB(params->intra_frm_delta);
 	reg |= VEPU_REG_VP8_LF_MODE_DELTA_BPRED(params->bpred_mode_delta);
 	reg |= VEPU_REG_VP8_INTER_TYPE_BIT_COST(0);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_CONTROL);

 	reg = VEPU_REG_VP8_LF_REF_DELTA_ALT_REF(params->altref_frm_delta)
 		| VEPU_REG_VP8_LF_REF_DELTA_LAST_REF(params->last_frm_delta)
 		| VEPU_REG_VP8_LF_REF_DELTA_GOLDEN(params->golden_frm_delta);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_LOOP_FILTER_REF_DELTA);

 	reg = VEPU_REG_VP8_LF_MODE_DELTA_SPLITMV(params->splitmv_mode_delta)
 		| VEPU_REG_VP8_LF_MODE_DELTA_ZEROMV(params->zero_mode_delta)
 		| VEPU_REG_VP8_LF_MODE_DELTA_NEWMV(params->newmv_mode_delta);
 	vepu_write_relaxed(vpu, reg, VEPU_REG_VP8_LOOP_FILTER_MODE_DELTA);

 	for (i = 0; i < 128; i += 4) {
 		u32 x;
 		u32 y;

 		reg = VEPU_REG_DMV_PENALTY_TABLE_BIT(i * 2, 3);
 		reg |= VEPU_REG_DMV_PENALTY_TABLE_BIT((i + 1) * 2, 2);
 		reg |= VEPU_REG_DMV_PENALTY_TABLE_BIT((i + 2) * 2, 1);
 		reg |= VEPU_REG_DMV_PENALTY_TABLE_BIT((i + 3) * 2, 0);
 		vepu_write_relaxed(vpu, reg, VEPU_REG_DMV_PENALTY_TBL(i / 4));

 		y = cost_mv(i * 2, params->mv_prob[0]);	/* mv y */
 		x = cost_mv(i * 2, params->mv_prob[1]);	/* mv x */

 		reg = VEPU_REG_DMV_Q_PIXEL_PENALTY_TABLE_BIT(
 			min(255u, (y + x + 1) / 2 * weight[qp] >> 8), 3);

 		y = cost_mv((i + 1) * 2, params->mv_prob[0]); /* mv y */
 		x = cost_mv((i + 1) * 2, params->mv_prob[1]); /* mv x */
 		reg |= VEPU_REG_DMV_Q_PIXEL_PENALTY_TABLE_BIT(
 			min(255u, (y + x + 1) / 2 * weight[qp] >> 8), 2);

 		y = cost_mv((i + 2) * 2, params->mv_prob[0]); /* mv y */
 		x = cost_mv((i + 2) * 2, params->mv_prob[1]); /* mv x */
 		reg |= VEPU_REG_DMV_Q_PIXEL_PENALTY_TABLE_BIT(
 			min(255u, (y + x + 1) / 2 * weight[qp] >> 8), 1);

 		y = cost_mv((i + 3) * 2, params->mv_prob[0]); /* mv y */
 		x = cost_mv((i + 3) * 2, params->mv_prob[1]); /* mv x */
 		reg |= VEPU_REG_DMV_Q_PIXEL_PENALTY_TABLE_BIT(
 			min(255u, (y + x + 1) / 2 * weight[qp] >> 8), 0);

 		vepu_write_relaxed(vpu, reg,
 				   VEPU_REG_DMV_Q_PIXEL_PENALTY_TBL(i / 4));
 	}

 	vpu_debug_leave();
 }

 /**
  * rk3399_vpu_vp8e_run() - program the hardware and start one VP8 encode run
  * @ctx:	encoder context to run.
  *
  * The control buffer is cleared, the device is powered on, all parameter
  * and buffer registers are written, the watchdog is armed and finally the
  * encoder is enabled.
  */
 void rk3399_vpu_vp8e_run(struct rockchip_vpu_ctx *ctx)
 {
 	struct rockchip_vpu_dev *vpu = ctx->dev;
 	const struct rk3399_vp8e_reg_params *params =
 		(struct rk3399_vp8e_reg_params *)ctx->run.vp8e.reg_params;
 	u32 axi_ctrl;
 	u32 start;

 	vpu_debug_enter();

 	/* The hardware expects the control buffer to be zeroed. */
 	memset(ctx->hw.vp8e.ctrl_buf.cpu, 0,
 	       sizeof(struct rk3399_vpu_vp8e_ctrl_buf));

 	/* Everything from here on programs the hardware. */
 	rockchip_vpu_power_on(vpu);

 	/* Select the encode format first, without enabling the encoder. */
 	vepu_write_relaxed(vpu, VEPU_REG_ENCODE_FORMAT(1),
 			   VEPU_REG_ENCODE_START);

 	rk3399_vpu_vp8e_set_params(vpu, ctx);
 	rk3399_vpu_vp8e_set_buffers(vpu, ctx);

 	axi_ctrl = VEPU_REG_AXI_CTRL_READ_ID(0);
 	axi_ctrl |= VEPU_REG_AXI_CTRL_WRITE_ID(0);
 	axi_ctrl |= VEPU_REG_AXI_CTRL_BURST_LEN(16);
 	axi_ctrl |= VEPU_REG_AXI_CTRL_INCREMENT_MODE(0);
 	axi_ctrl |= VEPU_REG_AXI_CTRL_BIRST_DISCARD(0);
 	vepu_write_relaxed(vpu, axi_ctrl, VEPU_REG_AXI_CTRL);

 	/* All relaxed writes above must land before the start bit is set. */
 	wmb();

 	/* Arm the watchdog with a two second timeout. */
 	schedule_delayed_work(&vpu->watchdog_work, msecs_to_jiffies(2000));

 	/* Kick off the hardware: frame size in macroblocks, type, enable. */
 	start = VEPU_REG_MB_HEIGHT(MB_HEIGHT(ctx->src_fmt.height));
 	start |= VEPU_REG_MB_WIDTH(MB_WIDTH(ctx->src_fmt.width));
 	start |= VEPU_REG_PIC_TYPE(params->is_intra);
 	start |= VEPU_REG_ENCODE_FORMAT(1);
 	start |= VEPU_REG_ENCODE_ENABLE;
 	vepu_write_relaxed(vpu, start, VEPU_REG_ENCODE_START);

 	vpu_debug_leave();
 }

 /**
  * rk3399_vpu_vp8e_done() - collect the results of a finished VP8 encode run
  * @ctx:	encoder context that was run.
  * @result:	buffer state passed on to rockchip_vpu_run_done().
  *
  * Copies the sizes reported by the hardware in the control buffer to the
  * destination buffer and completes the run.
  */
 void rk3399_vpu_vp8e_done(struct rockchip_vpu_ctx *ctx,
 			  enum vb2_buffer_state result)
 {
 	const struct rk3399_vp8e_reg_params *reg_params =
 		(struct rk3399_vp8e_reg_params *)ctx->run.vp8e.reg_params;
 	struct rk3399_vpu_vp8e_ctrl_buf *hw_ctrl = ctx->hw.vp8e.ctrl_buf.cpu;

 	vpu_debug_enter();

 	/* Length information of this run comes from the control buffer. */
 	ctx->run.dst->vp8e.dct_size = hw_ctrl->dct_size;

 	/*
 	 * The header bytes beyond the last 8 byte boundary are subtracted
 	 * from the reported size; presumably the hardware counts them
 	 * because they lie between the stream address and the start offset
 	 * (see rk3399_vpu_vp8e_set_buffers()) - confirm.
 	 */
 	ctx->run.dst->vp8e.ext_hdr_size = hw_ctrl->ext_hdr_size -
 		(reg_params->frm_hdr_size & 7);

 	rockchip_vpu_run_done(ctx, result);

 	vpu_debug_leave();
 }