// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
 * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved.
 */

#include "adreno.h"
#include "adreno_a5xx.h"
#include "adreno_pm4types.h"
#include "adreno_ringbuffer.h"
#include "adreno_trace.h"
#include "kgsl_trace.h"

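/*
 * Emit the commands that ask the CP to switch to the new pagetable and
 * record the new TTBR0 in the per-ringbuffer scratch memory; returns the
 * number of dwords written (0 if no switch is required)
 */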
static int a5xx_rb_pagetable_switch(struct kgsl_device *device,
		struct adreno_context *drawctxt,
		struct adreno_ringbuffer *rb,
		struct kgsl_pagetable *pagetable, u32 *cmds)
{
	u64 ttbr0 = kgsl_mmu_pagetable_get_ttbr0(pagetable);
	u32 id = drawctxt ? drawctxt->base.id : 0;

	if (pagetable == device->mmu.defaultpagetable)
		return 0;

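	/* Have the CP update the SMMU TTBR0 for the new pagetable */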
	cmds[0] = cp_type7_packet(CP_SMMU_TABLE_UPDATE, 3);
	cmds[1] = lower_32_bits(ttbr0);
	cmds[2] = upper_32_bits(ttbr0);
	cmds[3] = id;

	cmds[4] = cp_type7_packet(CP_WAIT_FOR_IDLE, 0);
	cmds[5] = cp_type7_packet(CP_WAIT_FOR_ME, 0);
	cmds[6] = cp_type4_packet(A5XX_CP_CNTL, 1);
	cmds[7] = 1;

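	/* Save the new TTBR0 and context id in the ringbuffer scratch memory */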
	cmds[8] = cp_type7_packet(CP_MEM_WRITE, 5);
	cmds[9] = lower_32_bits(SCRATCH_RB_GPU_ADDR(device,
			rb->id, ttbr0));
	cmds[10] = upper_32_bits(SCRATCH_RB_GPU_ADDR(device,
			rb->id, ttbr0));
	cmds[11] = lower_32_bits(ttbr0);
	cmds[12] = upper_32_bits(ttbr0);
	cmds[13] = id;

	cmds[14] = cp_type7_packet(CP_WAIT_FOR_IDLE, 0);
	cmds[15] = cp_type7_packet(CP_WAIT_FOR_ME, 0);
	cmds[16] = cp_type4_packet(A5XX_CP_CNTL, 1);
	cmds[17] = 0;

	return 18;
}

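/*
 * GPU addresses of the start-of-pipeline (sop) and end-of-pipeline (eop)
 * timestamps in the memstore, per ringbuffer and per context
 */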
#define RB_SOPTIMESTAMP(device, rb) \
	       MEMSTORE_RB_GPU_ADDR(device, rb, soptimestamp)
#define CTXT_SOPTIMESTAMP(device, drawctxt) \
	       MEMSTORE_ID_GPU_ADDR(device, (drawctxt)->base.id, soptimestamp)

#define RB_EOPTIMESTAMP(device, rb) \
	       MEMSTORE_RB_GPU_ADDR(device, rb, eoptimestamp)
#define CTXT_EOPTIMESTAMP(device, drawctxt) \
	       MEMSTORE_ID_GPU_ADDR(device, (drawctxt)->base.id, eoptimestamp)

int a5xx_ringbuffer_submit(struct adreno_ringbuffer *rb,
		struct adreno_submit_time *time, bool sync)
{
	struct adreno_device *adreno_dev = ADRENO_RB_DEVICE(rb);
	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
	unsigned long flags;

	adreno_get_submit_time(adreno_dev, rb, time);
	adreno_profile_submit_time(time);

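	/*
	 * If requested, add a CP_WHERE_AM_I packet so the CP writes its
	 * current read pointer into the per-ringbuffer scratch memory
	 */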
	if (sync) {
		u32 *cmds = adreno_ringbuffer_allocspace(rb, 3);

		if (IS_ERR(cmds))
			return PTR_ERR(cmds);

		cmds[0] = cp_type7_packet(CP_WHERE_AM_I, 2);
		cmds[1] = lower_32_bits(SCRATCH_RB_GPU_ADDR(device, rb->id,
				rptr));
		cmds[2] = upper_32_bits(SCRATCH_RB_GPU_ADDR(device, rb->id,
				rptr));
	}

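	/*
	 * Only write the new WPTR to the hardware if this is the current
	 * ringbuffer and no preemption is in flight
	 */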
	spin_lock_irqsave(&rb->preempt_lock, flags);
	if (adreno_in_preempt_state(adreno_dev, ADRENO_PREEMPT_NONE)) {
		if (adreno_dev->cur_rb == rb) {
			kgsl_pwrscale_busy(device);
			kgsl_regwrite(device, A5XX_CP_RB_WPTR, rb->_wptr);
		}
	}

	rb->wptr = rb->_wptr;
	spin_unlock_irqrestore(&rb->preempt_lock, flags);

	return 0;
}

int a5xx_ringbuffer_init(struct adreno_device *adreno_dev)
{
	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
	int i;

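	/*
	 * The scratch buffer holds the per-ringbuffer rptr and ttbr0 shadows;
	 * allocate it once globally if it doesn't already exist
	 */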
	if (IS_ERR_OR_NULL(device->scratch))
		device->scratch = kgsl_allocate_global(device, PAGE_SIZE,
			0, 0, KGSL_MEMDESC_RANDOM | KGSL_MEMDESC_PRIVILEGED,
			"scratch");

	if (IS_ERR(device->scratch))
		return PTR_ERR(device->scratch);

	adreno_dev->cur_rb = &(adreno_dev->ringbuffers[0]);

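	/* Without preemption only one ringbuffer is needed */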
	if (!adreno_preemption_feature_set(adreno_dev)) {
		adreno_dev->num_ringbuffers = 1;
		return adreno_ringbuffer_setup(adreno_dev,
			&adreno_dev->ringbuffers[0], 0);
	}

	adreno_dev->num_ringbuffers = ARRAY_SIZE(adreno_dev->ringbuffers);

	for (i = 0; i < adreno_dev->num_ringbuffers; i++) {
		int ret;

		ret = adreno_ringbuffer_setup(adreno_dev,
			&adreno_dev->ringbuffers[i], i);
		if (ret)
			return ret;
	}

	timer_setup(&adreno_dev->preempt.timer, adreno_preemption_timer, 0);
	a5xx_preemption_init(adreno_dev);
	return 0;
}

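/* Worst case number of dwords addcmds adds around the caller's commands */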
#define A5XX_SUBMIT_MAX 64

int a5xx_ringbuffer_addcmds(struct adreno_device *adreno_dev,
		struct adreno_ringbuffer *rb, struct adreno_context *drawctxt,
		u32 flags, u32 *in, u32 dwords, u32 timestamp,
		struct adreno_submit_time *time)
{
	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
	static u32 sequence;
	u32 size = A5XX_SUBMIT_MAX + dwords;
	u32 *cmds, index = 0;
	u64 profile_gpuaddr;
	u32 profile_dwords;

	if (adreno_drawctxt_detached(drawctxt))
		return -ENOENT;

	if (adreno_gpu_fault(adreno_dev) != 0)
		return -EPROTO;

	rb->timestamp++;

	if (drawctxt)
		drawctxt->internal_timestamp = rb->timestamp;

	cmds = adreno_ringbuffer_allocspace(rb, size);
	if (IS_ERR(cmds))
		return PTR_ERR(cmds);

	/* Identify the start of a command */
	cmds[index++] = cp_type7_packet(CP_NOP, 1);
	cmds[index++] = drawctxt ? CMD_IDENTIFIER : CMD_INTERNAL_IDENTIFIER;

	/* 14 dwords */
	index += a5xx_preemption_pre_ibsubmit(adreno_dev, rb, drawctxt,
		&cmds[index]);

	profile_gpuaddr = adreno_profile_preib_processing(adreno_dev,
		drawctxt, &profile_dwords);

	if (profile_gpuaddr) {
		cmds[index++] = cp_type7_packet(CP_INDIRECT_BUFFER_PFE, 3);
		cmds[index++] = lower_32_bits(profile_gpuaddr);
		cmds[index++] = upper_32_bits(profile_gpuaddr);
		cmds[index++] = profile_dwords;
	}

	if (drawctxt) {
		cmds[index++] = cp_type7_packet(CP_MEM_WRITE, 3);
		cmds[index++] = lower_32_bits(CTXT_SOPTIMESTAMP(device,
					drawctxt));
		cmds[index++] = upper_32_bits(CTXT_SOPTIMESTAMP(device,
					drawctxt));
		cmds[index++] = timestamp;
	}

	cmds[index++] = cp_type7_packet(CP_MEM_WRITE, 3);
	cmds[index++] = lower_32_bits(RB_SOPTIMESTAMP(device, rb));
	cmds[index++] = upper_32_bits(RB_SOPTIMESTAMP(device, rb));
	cmds[index++] = rb->timestamp;

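	/* Switch the GPU into secure mode for a secure command */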
	if (IS_SECURE(flags)) {
		cmds[index++] = cp_type7_packet(CP_SET_SECURE_MODE, 1);
		cmds[index++] = 1;
	}

	if (IS_NOTPROTECTED(flags)) {
		cmds[index++] = cp_type7_packet(CP_SET_PROTECTED_MODE, 1);
		cmds[index++] = 0;
	}

	memcpy(&cmds[index], in, dwords << 2);
	index += dwords;

	if (IS_NOTPROTECTED(flags)) {
		cmds[index++] = cp_type7_packet(CP_SET_PROTECTED_MODE, 1);
		cmds[index++] = 1;
	}

	/* 4 dwords */
	profile_gpuaddr = adreno_profile_postib_processing(adreno_dev,
		drawctxt, &profile_dwords);

	if (profile_gpuaddr) {
		cmds[index++] = cp_type7_packet(CP_INDIRECT_BUFFER_PFE, 3);
		cmds[index++] = lower_32_bits(profile_gpuaddr);
		cmds[index++] = upper_32_bits(profile_gpuaddr);
		cmds[index++] = profile_dwords;
	}

	if (!adreno_is_a510(adreno_dev) &&
		test_bit(KGSL_FT_PAGEFAULT_GPUHALT_ENABLE,
			&device->mmu.pfpolicy))
		cmds[index++] = cp_type7_packet(CP_WAIT_MEM_WRITES, 0);

	/*
	 * Do a unique memory write from the GPU to assist in early detection of
	 * interrupt storms
	 */

	cmds[index++] = cp_type7_packet(CP_MEM_WRITE, 3);
	cmds[index++] = lower_32_bits(MEMSTORE_ID_GPU_ADDR(device,
				KGSL_MEMSTORE_GLOBAL, ref_wait_ts));
	cmds[index++] = upper_32_bits(MEMSTORE_ID_GPU_ADDR(device,
				KGSL_MEMSTORE_GLOBAL, ref_wait_ts));
	cmds[index++] = ++sequence;

	/*
	 * If this is an internal command, just write the ringbuffer timestamp;
	 * otherwise write both the context and the ringbuffer timestamps
	 */
	if (!drawctxt) {
		cmds[index++] = cp_type7_packet(CP_EVENT_WRITE, 4);
		cmds[index++] = CACHE_FLUSH_TS | (1 << 31);
		cmds[index++] = lower_32_bits(RB_EOPTIMESTAMP(device, rb));
		cmds[index++] = upper_32_bits(RB_EOPTIMESTAMP(device, rb));
		cmds[index++] = rb->timestamp;
	} else {
		cmds[index++] = cp_type7_packet(CP_EVENT_WRITE, 4);
		cmds[index++] = CACHE_FLUSH_TS | (1 << 31);
		cmds[index++] = lower_32_bits(CTXT_EOPTIMESTAMP(device,
					drawctxt));
		cmds[index++] = upper_32_bits(CTXT_EOPTIMESTAMP(device,
					drawctxt));
		cmds[index++] = timestamp;

		cmds[index++] = cp_type7_packet(CP_EVENT_WRITE, 4);
		cmds[index++] = CACHE_FLUSH_TS;
		cmds[index++] = lower_32_bits(RB_EOPTIMESTAMP(device, rb));
		cmds[index++] = upper_32_bits(RB_EOPTIMESTAMP(device, rb));
		cmds[index++] = rb->timestamp;
	}

	if (IS_WFI(flags))
		cmds[index++] = cp_type7_packet(CP_WAIT_FOR_IDLE, 0);

	if (IS_SECURE(flags)) {
		cmds[index++] = cp_type7_packet(CP_SET_SECURE_MODE, 1);
		cmds[index++] = 0;
	}

	/* 5 dwords */
	index += a5xx_preemption_post_ibsubmit(adreno_dev, &cmds[index]);

	/* Adjust the wptr for the number of dwords we actually wrote */
	rb->_wptr -= (size - index);

	a5xx_ringbuffer_submit(rb, time,
		!adreno_is_preemption_enabled(adreno_dev));

	return 0;
}

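/*
 * Emit a CP_REG_TO_MEM packet that copies the RBBM always-on counter into
 * the given GPU address; returns the number of dwords written
 */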
static u32 a5xx_get_alwayson_counter(struct adreno_device *adreno_dev,
		u32 *cmds, u64 gpuaddr)
{
	cmds[0] = cp_type7_packet(CP_REG_TO_MEM, 3);
	cmds[1] = A5XX_RBBM_ALWAYSON_COUNTER_LO;

	/*
	 * The upper 32 bits of the counter are only reliable on newer
	 * targets, so do a 64 bit read only on GPUs after the A530
	 */
	if (ADRENO_GPUREV(adreno_dev) > ADRENO_REV_A530)
		cmds[1] |= (1 << 30) | (2 << 18);

	cmds[2] = lower_32_bits(gpuaddr);
	cmds[3] = upper_32_bits(gpuaddr);

	return 4;
}

/* Each profiling IB slot is 4 dwords, the maximum needed on 64 bit targets */
#define PROFILE_IB_DWORDS 4
#define PROFILE_IB_SLOTS (PAGE_SIZE / (PROFILE_IB_DWORDS << 2))

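/*
 * Build a small IB that samples the always-on counter into the user's
 * profiling buffer and add a CP_INDIRECT_BUFFER_PFE packet that points to
 * it; returns the number of dwords added to the command stream
 */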
static u64 a5xx_get_user_profiling_ib(struct adreno_device *adreno_dev,
		struct adreno_ringbuffer *rb, struct kgsl_drawobj_cmd *cmdobj,
		u32 target_offset, u32 *cmds)
{
	u32 offset, *ib, dwords;
	u64 gpuaddr;

	if (IS_ERR(rb->profile_desc))
		return 0;

	offset = rb->profile_index * (PROFILE_IB_DWORDS << 2);
	ib = rb->profile_desc->hostptr + offset;
	gpuaddr = rb->profile_desc->gpuaddr + offset;
	dwords = a5xx_get_alwayson_counter(adreno_dev, ib,
		cmdobj->profiling_buffer_gpuaddr + target_offset);

	cmds[0] = cp_type7_packet(CP_INDIRECT_BUFFER_PFE, 3);
	cmds[1] = lower_32_bits(gpuaddr);
	cmds[2] = upper_32_bits(gpuaddr);
	cmds[3] = dwords;

	rb->profile_index = (rb->profile_index + 1) % PROFILE_IB_SLOTS;

	return 4;
}

static int a5xx_rb_context_switch(struct adreno_device *adreno_dev,
		struct adreno_ringbuffer *rb,
		struct adreno_context *drawctxt)
{
	struct kgsl_pagetable *pagetable =
		adreno_drawctxt_get_pagetable(drawctxt);
	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
	int count = 0;
	u32 cmds[32];

	if (adreno_drawctxt_get_pagetable(rb->drawctxt_active) != pagetable)
		count += a5xx_rb_pagetable_switch(device, drawctxt,
				rb, pagetable, cmds);

	cmds[count++] = cp_type7_packet(CP_NOP, 1);
	cmds[count++] = CONTEXT_TO_MEM_IDENTIFIER;

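	/* Record the new current context in the ringbuffer and global memstore */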
	cmds[count++] = cp_type7_packet(CP_MEM_WRITE, 3);
	cmds[count++] = lower_32_bits(MEMSTORE_RB_GPU_ADDR(device, rb,
				current_context));
	cmds[count++] = upper_32_bits(MEMSTORE_RB_GPU_ADDR(device, rb,
				current_context));
	cmds[count++] = drawctxt->base.id;

	cmds[count++] = cp_type7_packet(CP_MEM_WRITE, 3);
	cmds[count++] = lower_32_bits(MEMSTORE_ID_GPU_ADDR(device,
		KGSL_MEMSTORE_GLOBAL, current_context));
	cmds[count++] = upper_32_bits(MEMSTORE_ID_GPU_ADDR(device,
		KGSL_MEMSTORE_GLOBAL, current_context));
	cmds[count++] = drawctxt->base.id;

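	/* Invalidate the UCHE for the new context */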
	cmds[count++] = cp_type4_packet(A5XX_UCHE_INVALIDATE0, 1);
	cmds[count++] = 0x12;

	return a5xx_ringbuffer_addcmds(adreno_dev, rb, NULL, F_NOTPROTECTED,
			cmds, count, 0, NULL);
}

static int a5xx_drawctxt_switch(struct adreno_device *adreno_dev,
		struct adreno_ringbuffer *rb,
		struct adreno_context *drawctxt)
{
	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
	int ret;

	if (rb->drawctxt_active == drawctxt)
		return 0;

	if (kgsl_context_detached(&drawctxt->base))
		return -ENOENT;

	if (!_kgsl_context_get(&drawctxt->base))
		return -ENOENT;

	trace_adreno_drawctxt_switch(rb, drawctxt);

	ret = a5xx_rb_context_switch(adreno_dev, rb, drawctxt);
	if (ret) {
		/* Drop the reference taken above if the switch failed */
		kgsl_context_put(&drawctxt->base);
		return ret;
	}

	/* Release the current drawctxt as soon as the new one is switched */
	adreno_put_drawctxt_on_timestamp(device, rb->drawctxt_active,
		rb, rb->timestamp);

	rb->drawctxt_active = drawctxt;
	return 0;
}

#define A5XX_USER_PROFILE_IB(dev, rb, cmdobj, cmds, field) \
	a5xx_get_user_profiling_ib((dev), (rb), (cmdobj), \
		offsetof(struct kgsl_drawobj_profiling_buffer, field), \
		(cmds))

#define A5XX_KERNEL_PROFILE(dev, cmdobj, cmds, field) \
	a5xx_get_alwayson_counter((dev), (cmds), \
		(dev)->profile_buffer->gpuaddr + \
			ADRENO_DRAWOBJ_PROFILE_OFFSET((cmdobj)->profile_index, \
				field))

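/* Worst case number of dwords submitcmd needs outside of the IB packets */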
#define A5XX_COMMAND_DWORDS 32

int a5xx_ringbuffer_submitcmd(struct adreno_device *adreno_dev,
		struct kgsl_drawobj_cmd *cmdobj, u32 flags,
		struct adreno_submit_time *time)
{
	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
	struct kgsl_drawobj *drawobj = DRAWOBJ(cmdobj);
	struct adreno_context *drawctxt = ADRENO_CONTEXT(drawobj->context);
	struct adreno_ringbuffer *rb = drawctxt->rb;
	int ret = 0, numibs = 0, index = 0;
	u32 *cmds;

	/* Count the number of IBs (if we are not skipping) */
	if (!IS_SKIP(flags)) {
		struct list_head *tmp;

		list_for_each(tmp, &cmdobj->cmdlist)
			numibs++;
	}

	cmds = kmalloc((A5XX_COMMAND_DWORDS + (numibs * 5)) << 2, GFP_KERNEL);
	if (!cmds) {
		ret = -ENOMEM;
		goto done;
	}

	cmds[index++] = cp_type7_packet(CP_NOP, 1);
	cmds[index++] = START_IB_IDENTIFIER;

	/* Kernel profiling: 4 dwords */
	if (IS_KERNEL_PROFILE(flags))
		index += A5XX_KERNEL_PROFILE(adreno_dev, cmdobj, &cmds[index],
			started);

	/* User profiling: 4 dwords */
	if (IS_USER_PROFILE(flags))
		index += A5XX_USER_PROFILE_IB(adreno_dev, rb, cmdobj,
			&cmds[index], gpu_ticks_submitted);

	if (numibs) {
		struct kgsl_memobj_node *ib;

		list_for_each_entry(ib, &cmdobj->cmdlist, node) {
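			/*
			 * NOP out the IB if it is marked to be skipped or if
			 * it is a context switch preamble that isn't needed
			 * for this submission
			 */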
			if (ib->priv & MEMOBJ_SKIP ||
			    (ib->flags & KGSL_CMDLIST_CTXTSWITCH_PREAMBLE
			     && !IS_PREAMBLE(flags)))
				cmds[index++] = cp_type7_packet(CP_NOP, 4);

			cmds[index++] =
				cp_type7_packet(CP_INDIRECT_BUFFER_PFE, 3);
			cmds[index++] = lower_32_bits(ib->gpuaddr);
			cmds[index++] = upper_32_bits(ib->gpuaddr);

			/* Mask the size so the IB_PRIV bit can never be set */
			cmds[index++] = (ib->size >> 2) & 0xfffff;
		}
	}

	/*
	 * SRM -- set render mode (e.g. binning, direct render etc).
	 * SRM is usually set by the UMD at the start of an IB to tell the CP
	 * which type of preemption to use.
	 * The KMD needs to set SRM back to NULL to tell the CP that the IB
	 * has finished rendering.
	 */
	cmds[index++] = cp_type7_packet(CP_SET_RENDER_MODE, 5);
	cmds[index++] = 0;
	cmds[index++] = 0;
	cmds[index++] = 0;
	cmds[index++] = 0;
	cmds[index++] = 0;

	cmds[index++] = cp_type7_packet(CP_YIELD_ENABLE, 1);
	cmds[index++] = 1;

	/* 4 dwords */
	if (IS_KERNEL_PROFILE(flags))
		index += A5XX_KERNEL_PROFILE(adreno_dev, cmdobj, &cmds[index],
			retired);

	/* 4 dwords */
	if (IS_USER_PROFILE(flags))
		index += A5XX_USER_PROFILE_IB(adreno_dev, rb, cmdobj,
			&cmds[index], gpu_ticks_retired);

	cmds[index++] = cp_type7_packet(CP_NOP, 1);
	cmds[index++] = END_IB_IDENTIFIER;

	ret = a5xx_drawctxt_switch(adreno_dev, rb, drawctxt);

	/*
	 * In the unlikely event of an error in the drawctxt switch,
	 * treat it like a hang
	 */
	if (ret) {
		/*
		 * It is "normal" to get a -ENOSPC or a -ENOENT. Don't log it,
		 * the upper layers know how to handle it
		 */
		if (ret != -ENOSPC && ret != -ENOENT)
			dev_err(device->dev,
				     "Unable to switch draw context: %d\n",
				     ret);
		goto done;
	}

	adreno_drawobj_set_constraint(device, drawobj);

	ret = a5xx_ringbuffer_addcmds(adreno_dev, drawctxt->rb, drawctxt,
		flags, cmds, index, drawobj->timestamp, time);

done:
	trace_kgsl_issueibcmds(device, drawctxt->base.id, numibs,
		drawobj->timestamp, drawobj->flags, ret, drawctxt->type);

	kfree(cmds);
	return ret;
}