229 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
			
		
		
	
	
			229 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
From 8f5722ac3e42a33345bfd82b7ad6a153134a4239 Mon Sep 17 00:00:00 2001
 | 
						|
From: Jonas Pfeil <pfeiljonas@gmx.de>
 | 
						|
Date: Tue, 8 Nov 2016 00:18:39 +0100
 | 
						|
Subject: [PATCH] drm/vc4: Add fragment shader threading support
 | 
						|
 | 
						|
FS threading brings performance improvements of 0-20% in glmark2.
 | 
						|
 | 
						|
The validation code checks for thread switch signals and ensures that
 | 
						|
the registers of the other thread are not touched, and that our clamps
 | 
						|
are not live across thread switches.  It also checks that the
 | 
						|
threading and branching instructions do not interfere.
 | 
						|
 | 
						|
(Original patch by Jonas, changes by anholt for style cleanup,
 | 
						|
removing validation the kernel doesn't need to do, and adding the flag
 | 
						|
for userspace).
 | 
						|
 | 
						|
v2: Minor style fixes from checkpatch.
 | 
						|
 | 
						|
Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de>
 | 
						|
Signed-off-by: Eric Anholt <eric@anholt.net>
 | 
						|
(cherry picked from commit c778cc5df944291dcdb1ca7a6bb781fbc22550c5)
 | 
						|
---
 | 
						|
 drivers/gpu/drm/vc4/vc4_drv.c              |  1 +
 | 
						|
 drivers/gpu/drm/vc4/vc4_drv.h              |  2 +
 | 
						|
 drivers/gpu/drm/vc4/vc4_validate.c         | 17 +++++---
 | 
						|
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 63 ++++++++++++++++++++++++++++++
 | 
						|
 include/uapi/drm/vc4_drm.h                 |  1 +
 | 
						|
 5 files changed, 79 insertions(+), 5 deletions(-)
 | 
						|
 | 
						|
--- a/drivers/gpu/drm/vc4/vc4_drv.c
 | 
						|
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
 | 
						|
@@ -82,6 +82,7 @@ static int vc4_get_param_ioctl(struct dr
 | 
						|
 		break;
 | 
						|
 	case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
 | 
						|
 	case DRM_VC4_PARAM_SUPPORTS_ETC1:
 | 
						|
+	case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
 | 
						|
 		args->value = true;
 | 
						|
 		break;
 | 
						|
 	default:
 | 
						|
--- a/drivers/gpu/drm/vc4/vc4_drv.h
 | 
						|
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
 | 
						|
@@ -385,6 +385,8 @@ struct vc4_validated_shader_info {
 | 
						|
 
 | 
						|
 	uint32_t num_uniform_addr_offsets;
 | 
						|
 	uint32_t *uniform_addr_offsets;
 | 
						|
+
 | 
						|
+	bool is_threaded;
 | 
						|
 };
 | 
						|
 
 | 
						|
 /**
 | 
						|
--- a/drivers/gpu/drm/vc4/vc4_validate.c
 | 
						|
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
 | 
						|
@@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device
 | 
						|
 	exec->shader_rec_v += roundup(packet_size, 16);
 | 
						|
 	exec->shader_rec_size -= packet_size;
 | 
						|
 
 | 
						|
-	if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
 | 
						|
-		DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
 | 
						|
-		return -EINVAL;
 | 
						|
-	}
 | 
						|
-
 | 
						|
 	for (i = 0; i < shader_reloc_count; i++) {
 | 
						|
 		if (src_handles[i] > exec->bo_count) {
 | 
						|
 			DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
 | 
						|
@@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device
 | 
						|
 			return -EINVAL;
 | 
						|
 	}
 | 
						|
 
 | 
						|
+	if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
 | 
						|
+	    to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
 | 
						|
+		DRM_ERROR("Thread mode of CL and FS do not match\n");
 | 
						|
+		return -EINVAL;
 | 
						|
+	}
 | 
						|
+
 | 
						|
+	if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
 | 
						|
+	    to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
 | 
						|
+		DRM_ERROR("cs and vs cannot be threaded\n");
 | 
						|
+		return -EINVAL;
 | 
						|
+	}
 | 
						|
+
 | 
						|
 	for (i = 0; i < shader_reloc_count; i++) {
 | 
						|
 		struct vc4_validated_shader_info *validated_shader;
 | 
						|
 		uint32_t o = shader_reloc_offsets[i];
 | 
						|
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
 | 
						|
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
 | 
						|
@@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
 | 
						|
 	 * basic blocks.
 | 
						|
 	 */
 | 
						|
 	bool needs_uniform_address_for_loop;
 | 
						|
+
 | 
						|
+	/* Set when we find an instruction accessing the top half of the
 | 
						|
+	 * register files.  If we allowed writing the unusable regs in
 | 
						|
+	 * a threaded shader, then the clamp validation of the other
 | 
						|
+	 * shader running on our QPU would be invalid.
 | 
						|
+	 */
 | 
						|
+	bool all_registers_used;
 | 
						|
 };
 | 
						|
 
 | 
						|
 static uint32_t
 | 
						|
@@ -119,6 +126,13 @@ raddr_add_a_to_live_reg_index(uint64_t i
 | 
						|
 }
 | 
						|
 
 | 
						|
 static bool
 | 
						|
+live_reg_is_upper_half(uint32_t lri)
 | 
						|
+{
 | 
						|
+	return	(lri >= 16 && lri < 32) ||
 | 
						|
+		(lri >= 32 + 16 && lri < 32 + 32);
 | 
						|
+}
 | 
						|
+
 | 
						|
+static bool
 | 
						|
 is_tmu_submit(uint32_t waddr)
 | 
						|
 {
 | 
						|
 	return (waddr == QPU_W_TMU0_S ||
 | 
						|
@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_sha
 | 
						|
 		} else {
 | 
						|
 			validation_state->live_immediates[lri] = ~0;
 | 
						|
 		}
 | 
						|
+
 | 
						|
+		if (live_reg_is_upper_half(lri))
 | 
						|
+			validation_state->all_registers_used = true;
 | 
						|
 	}
 | 
						|
 
 | 
						|
 	switch (waddr) {
 | 
						|
@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_valid
 | 
						|
 		}
 | 
						|
 	}
 | 
						|
 
 | 
						|
+	if ((raddr_a >= 16 && raddr_a < 32) ||
 | 
						|
+	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
 | 
						|
+		validation_state->all_registers_used = true;
 | 
						|
+	}
 | 
						|
+
 | 
						|
 	return true;
 | 
						|
 }
 | 
						|
 
 | 
						|
@@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_o
 | 
						|
 {
 | 
						|
 	bool found_shader_end = false;
 | 
						|
 	int shader_end_ip = 0;
 | 
						|
+	uint32_t last_thread_switch_ip = -3;
 | 
						|
 	uint32_t ip;
 | 
						|
 	struct vc4_validated_shader_info *validated_shader = NULL;
 | 
						|
 	struct vc4_shader_validation_state validation_state;
 | 
						|
@@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_o
 | 
						|
 		if (!vc4_handle_branch_target(&validation_state))
 | 
						|
 			goto fail;
 | 
						|
 
 | 
						|
+		if (ip == last_thread_switch_ip + 3) {
 | 
						|
+			/* Reset r0-r3 live clamp data */
 | 
						|
+			int i;
 | 
						|
+
 | 
						|
+			for (i = 64; i < LIVE_REG_COUNT; i++) {
 | 
						|
+				validation_state.live_min_clamp_offsets[i] = ~0;
 | 
						|
+				validation_state.live_max_clamp_regs[i] = false;
 | 
						|
+				validation_state.live_immediates[i] = ~0;
 | 
						|
+			}
 | 
						|
+		}
 | 
						|
+
 | 
						|
 		switch (sig) {
 | 
						|
 		case QPU_SIG_NONE:
 | 
						|
 		case QPU_SIG_WAIT_FOR_SCOREBOARD:
 | 
						|
@@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_o
 | 
						|
 		case QPU_SIG_LOAD_TMU1:
 | 
						|
 		case QPU_SIG_PROG_END:
 | 
						|
 		case QPU_SIG_SMALL_IMM:
 | 
						|
+		case QPU_SIG_THREAD_SWITCH:
 | 
						|
+		case QPU_SIG_LAST_THREAD_SWITCH:
 | 
						|
 			if (!check_instruction_writes(validated_shader,
 | 
						|
 						      &validation_state)) {
 | 
						|
 				DRM_ERROR("Bad write at ip %d\n", ip);
 | 
						|
@@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_o
 | 
						|
 				shader_end_ip = ip;
 | 
						|
 			}
 | 
						|
 
 | 
						|
+			if (sig == QPU_SIG_THREAD_SWITCH ||
 | 
						|
+			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
 | 
						|
+				validated_shader->is_threaded = true;
 | 
						|
+
 | 
						|
+				if (ip < last_thread_switch_ip + 3) {
 | 
						|
+					DRM_ERROR("Thread switch too soon after "
 | 
						|
+						  "last switch at ip %d\n", ip);
 | 
						|
+					goto fail;
 | 
						|
+				}
 | 
						|
+				last_thread_switch_ip = ip;
 | 
						|
+			}
 | 
						|
+
 | 
						|
 			break;
 | 
						|
 
 | 
						|
 		case QPU_SIG_LOAD_IMM:
 | 
						|
@@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_o
 | 
						|
 			if (!check_branch(inst, validated_shader,
 | 
						|
 					  &validation_state, ip))
 | 
						|
 				goto fail;
 | 
						|
+
 | 
						|
+			if (ip < last_thread_switch_ip + 3) {
 | 
						|
+				DRM_ERROR("Branch in thread switch at ip %d",
 | 
						|
+					  ip);
 | 
						|
+				goto fail;
 | 
						|
+			}
 | 
						|
+
 | 
						|
 			break;
 | 
						|
 		default:
 | 
						|
 			DRM_ERROR("Unsupported QPU signal %d at "
 | 
						|
@@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_o
 | 
						|
 		goto fail;
 | 
						|
 	}
 | 
						|
 
 | 
						|
+	/* Might corrupt other thread */
 | 
						|
+	if (validated_shader->is_threaded &&
 | 
						|
+	    validation_state.all_registers_used) {
 | 
						|
+		DRM_ERROR("Shader uses threading, but uses the upper "
 | 
						|
+			  "half of the registers, too\n");
 | 
						|
+		goto fail;
 | 
						|
+	}
 | 
						|
+
 | 
						|
 	/* If we did a backwards branch and we haven't emitted a uniforms
 | 
						|
 	 * reset since then, we still need the uniforms stream to have the
 | 
						|
 	 * uniforms address available so that the backwards branch can do its
 | 
						|
--- a/include/uapi/drm/vc4_drm.h
 | 
						|
+++ b/include/uapi/drm/vc4_drm.h
 | 
						|
@@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state {
 | 
						|
 #define DRM_VC4_PARAM_V3D_IDENT2		2
 | 
						|
 #define DRM_VC4_PARAM_SUPPORTS_BRANCHES		3
 | 
						|
 #define DRM_VC4_PARAM_SUPPORTS_ETC1		4
 | 
						|
+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS	5
 | 
						|
 
 | 
						|
 struct drm_vc4_get_param {
 | 
						|
 	__u32 param;
 |