From 11ecbdddf2f8b6cc2480aff6d877b7a4076e3b7f Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Tue, 17 Mar 2020 15:22:22 +0200 Subject: drm/i915/perf: introduce global sseu pinning On Gen11, powergating half the execution units is a functional requirement when using the VME samplers. Not fulfilling this requirement can lead to hangs. This unfortunately plays fairly poorly with the NOA requirements. NOA requires a stable power configuration to maintain its configuration. As a result using OA (and NOA feeding into it) so far has required us to use a power configuration that can work for all contexts. The only power configuration fulfilling this is powergating half the execution units. This makes performance analysis for 3D workloads somewhat pointless. Failing to find a solution that would work for everybody, this change introduces a new i915-perf stream open parameter that punts the decision off to userspace. If this parameter is omitted, the existing Gen11 behavior remains (half EU array powergating). This change also moves all perf-related SSEU configuration into i915_perf.c. v2: Make parameter privileged if different from default v3: Fix context modifying its sseu config while i915-perf is enabled v4: Always consider global sseu a privileged operation (Tvrtko) Override req_sseu pointer in intel_sseu_make_rpcs() (Tvrtko) Remove unrelated changes (Tvrtko) v5: Some typos (Tvrtko) Process sseu param in read_properties_unlocked() (Tvrtko) v6: Actually commit the bits from v5... Fix up some checkpatch warnings v7: Only compare engine uabi field (Chris) Signed-off-by: Lionel Landwerlin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200317132222.2638719-3-lionel.g.landwerlin@intel.com --- include/uapi/drm/i915_drm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 2813e579b480..db649d03ab52 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1969,6 +1969,17 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_HOLD_PREEMPTION, + /** + * Specifying this pins all contexts to the specified SSEU power + * configuration for the duration of the recording. + * + * This parameter's value is a pointer to a struct + * drm_i915_gem_context_param_sseu. + * + * This property is available in perf revision 4. + */ + DRM_I915_PERF_PROP_GLOBAL_SSEU, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; -- cgit v1.2.3 From 4ef10fe05ba0b08ce7029c07878afe3c8d5754d8 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Tue, 24 Mar 2020 11:54:57 -0700 Subject: drm/i915/perf: add new open param to configure polling of OA buffer This new parameter lets the application choose how often the OA buffer should be checked on the CPU side for data availability. Longer polling periods tend to reduce CPU overhead if the application does not need near-real-time data collection, as illustrated in the sketch below.
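For illustration only, here is a minimal userspace sketch (not part of either patch) of how an application might combine the two properties described above when opening an i915-perf stream. The metrics set id, OA format, OA exponent and SSEU masks are placeholders, and drm_fd is assumed to be an already-open i915 DRM file descriptor:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <drm/i915_drm.h>

    /* Pin a full-EU SSEU configuration on the render engine; the masks are
     * placeholders and would normally come from an I915_CONTEXT_PARAM_SSEU query. */
    struct drm_i915_gem_context_param_sseu sseu = {
            .engine = { .engine_class = I915_ENGINE_CLASS_RENDER, .engine_instance = 0 },
            .slice_mask = 0x1,
            .subslice_mask = 0xff,
            .min_eus_per_subslice = 8,
            .max_eus_per_subslice = 8,
    };
    uint64_t metrics_set_id = 1;  /* placeholder, normally read from the sysfs metrics config */
    uint64_t properties[] = {
            DRM_I915_PERF_PROP_SAMPLE_OA,      1,
            DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
            DRM_I915_PERF_PROP_OA_FORMAT,      I915_OA_FORMAT_A32u40_A4u32_B8_C8,
            DRM_I915_PERF_PROP_OA_EXPONENT,    16,                 /* placeholder */
            DRM_I915_PERF_PROP_GLOBAL_SSEU,    (uintptr_t)&sseu,   /* perf revision >= 4 */
            DRM_I915_PERF_PROP_POLL_OA_PERIOD, 1000000,            /* 1 ms, revision >= 5 */
    };
    struct drm_i915_perf_open_param param = {
            .flags = I915_PERF_FLAG_FD_CLOEXEC,
            .num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
            .properties_ptr = (uintptr_t)properties,
    };
    int stream_fd = ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);

Note that, per the first patch above, a non-default global SSEU configuration is a privileged operation, so such an open may fail unless the process has the required privileges or dev.i915.perf_stream_paranoid is relaxed.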
v2: Allow disabling polling completely with 0 value (Lionel) v3: Version the new parameter (Joonas) v4: Rebase (Umesh) v5: Make poll delay value of 0 invalid (Umesh) v6: - Describe poll_oa_period (Ashutosh) - Fix comment for new poll parameter (Lionel) - Drop open_flags in read_properties_unlocked (Lionel) - Rename uapi parameter (Ashutosh) v7: Reword the comment in uapi (Ashutosh) Signed-off-by: Lionel Landwerlin Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Ashutosh Dixit Signed-off-by: Lionel Landwerlin Link: https://patchwork.freedesktop.org/patch/msgid/20200324185457.14635-4-umesh.nerlige.ramappa@intel.com --- drivers/gpu/drm/i915/i915_perf.c | 32 +++++++++++++++++++++++++------- drivers/gpu/drm/i915/i915_perf_types.h | 6 ++++++ include/uapi/drm/i915_drm.h | 13 +++++++++++++ 3 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 4cadf97f94bc..c74ebac50015 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -248,11 +248,11 @@ #define OA_TAIL_MARGIN_NSEC 100000ULL #define INVALID_TAIL_PTR 0xffffffff -/* frequency for checking whether the OA unit has written new reports to the - * circular OA buffer... +/* The default frequency for checking whether the OA unit has written new + * reports to the circular OA buffer... */ -#define POLL_FREQUENCY 200 -#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) +#define DEFAULT_POLL_FREQUENCY_HZ 200 +#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */ static u32 i915_perf_stream_paranoid = true; @@ -339,6 +339,8 @@ static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = { * @sseu: internal SSEU configuration computed either from the userspace * specified configuration in the opening parameters or a default value * (see get_default_sseu_config()) + * @poll_oa_period: The period in nanoseconds at which the CPU will check for OA + * data availability * * As read_properties_unlocked() enumerates and validates the properties given * to open a stream of metrics the configuration is built up in the structure @@ -361,6 +363,8 @@ struct perf_open_properties { bool has_sseu; struct intel_sseu sseu; + + u64 poll_oa_period; }; struct i915_oa_config_bo { @@ -2600,7 +2604,7 @@ static void i915_oa_stream_enable(struct i915_perf_stream *stream) if (stream->periodic) hrtimer_start(&stream->poll_check_timer, - ns_to_ktime(POLL_PERIOD), + ns_to_ktime(stream->poll_oa_period), HRTIMER_MODE_REL_PINNED); } @@ -3035,7 +3039,8 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer) wake_up(&stream->poll_wq); } - hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD)); + hrtimer_forward_now(hrtimer, + ns_to_ktime(stream->poll_oa_period)); return HRTIMER_RESTART; } @@ -3424,6 +3429,7 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf, stream->perf = perf; stream->ctx = specific_ctx; + stream->poll_oa_period = props->poll_oa_period; ret = i915_oa_stream_init(stream, param, props); if (ret) @@ -3502,6 +3508,7 @@ static int read_properties_unlocked(struct i915_perf *perf, int ret; memset(props, 0, sizeof(struct perf_open_properties)); + props->poll_oa_period = DEFAULT_POLL_PERIOD_NS; if (!n_props) { DRM_DEBUG("No i915 perf properties given\n"); @@ -3634,6 +3641,14 @@ static int read_properties_unlocked(struct i915_perf *perf, props->has_sseu = true; break; } + case DRM_I915_PERF_PROP_POLL_OA_PERIOD: 
+ if (value < 100000 /* 100us */) { + DRM_DEBUG("OA availability timer too small (%lluns < 100us)\n", + value); + return -EINVAL; + } + props->poll_oa_period = value; + break; case DRM_I915_PERF_PROP_MAX: MISSING_CASE(id); return -EINVAL; @@ -4416,8 +4431,11 @@ int i915_perf_ioctl_version(void) * 4: Add DRM_I915_PERF_PROP_ALLOWED_SSEU to limit what contexts can * be run for the duration of the performance recording based on * their SSEU configuration. + * + * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the + * interval for the hrtimer used to check for OA data. */ - return 4; + return 5; } #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h index c3ab184c604a..371dcf243622 100644 --- a/drivers/gpu/drm/i915/i915_perf_types.h +++ b/drivers/gpu/drm/i915/i915_perf_types.h @@ -304,6 +304,12 @@ struct i915_perf_stream { * reprogrammed. */ struct i915_vma *noa_wait; + + /** + * @poll_oa_period: The period in nanoseconds at which the OA + * buffer should be checked for available data. + */ + u64 poll_oa_period; }; /** diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index db649d03ab52..14b67cd6b54b 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1980,6 +1980,19 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_GLOBAL_SSEU, + /** + * This optional parameter specifies the timer interval in nanoseconds + * at which the i915 driver will check the OA buffer for available data. + * Minimum allowed value is 100 microseconds. A default value is used by + * the driver if this parameter is not specified. Note that larger timer + * values will reduce cpu consumption during OA perf captures. However, + * excessively large values would potentially result in OA buffer + * overwrites as captures reach end of the OA buffer. + * + * This property is available in perf revision 5. + */ + DRM_I915_PERF_PROP_POLL_OA_PERIOD, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; -- cgit v1.2.3 From c57053725d9bd7ab3e14fd0f8001714041df1280 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Fri, 17 Apr 2020 20:50:30 -0400 Subject: drm/amdgpu: add tiling flags from Mesa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DCC_INDEPENDENT_128B is needed for displayble DCC on gfx10. SCANOUT is not needed by the kernel, but Mesa uses it. Signed-off-by: Marek Olšák Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 65f69723cbdc..d28b4ce744d5 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -346,6 +346,10 @@ struct drm_amdgpu_gem_userptr { #define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF #define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 #define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 /* Set/Get helpers for tiling flags. 
*/ #define AMDGPU_TILING_SET(field, value) \ -- cgit v1.2.3 From 35ce006004821c5e9ae8fe03d048567cec99c41e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 7 Aug 2019 21:43:24 -0500 Subject: drm/amdgpu: add UAPI for creating encrypted buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a flag to the GEM_CREATE ioctl to create encrypted buffers. Buffers with this flag set will be created with the TMZ bit set in the PTEs or engines accessing them. This is required in order to properly access the data from the engines. Signed-off-by: Alex Deucher Reviewed-by: Huang Rui Reviewed-by: Christian König --- include/uapi/drm/amdgpu_drm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index d28b4ce744d5..4a0829d35e72 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -133,6 +133,11 @@ extern "C" { * releasing the memory */ #define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) +/* Flag that BO will be encrypted and that the TMZ bit should be + * set in the PTEs when mapping this buffer via GPUVM or + * accessing it with various hw blocks + */ +#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) struct drm_amdgpu_gem_create_in { /** the requested memory size */ -- cgit v1.2.3 From e90c2b210bad457aab73d3357465afe4d4475f7e Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 23 Sep 2019 19:02:41 -0400 Subject: drm/amdgpu: add UAPI to create secure commands (v3) Add a flag to the command submission IOCTL structure which when present indicates that this command submission should be treated as secure. The kernel driver uses this flag to determine whether the engine should be transitioned to secure or unsecure, or the work can be submitted to a secure queue depending on the IP. v3: the flag is now at command submission IOCTL Signed-off-by: Luben Tuikov Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 4a0829d35e72..34cd0bae06bc 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -205,6 +205,9 @@ union drm_amdgpu_bo_list { #define AMDGPU_CTX_OP_QUERY_STATE 3 #define AMDGPU_CTX_OP_QUERY_STATE2 4 +/* Flag the command submission as secure */ +#define AMDGPU_CS_FLAGS_SECURE (1 << 0) + /* GPU reset status */ #define AMDGPU_CTX_NO_RESET 0 /* this the context caused it */ @@ -564,7 +567,7 @@ struct drm_amdgpu_cs_in { /** Handle of resource list associated with CS */ __u32 bo_list_handle; __u32 num_chunks; - __u32 _pad; + __u32 flags; /** this points to __u64 * which point to cs chunks */ __u64 chunks; }; -- cgit v1.2.3 From 4baa8ff0690e464443594353d63c989f0ed9f831 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 27 Nov 2019 15:55:35 -0500 Subject: drm/amdgpu: move CS secure flag next the structs where it's used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So it's not mixed up with the CTX stuff. 
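For illustration only, a hedged sketch (not taken from any of these patches) of how userspace might use the AMDGPU_GEM_CREATE_ENCRYPTED flag added two patches above to allocate a TMZ-protected buffer. drm_fd is assumed to be an open amdgpu render node, and the size and alignment values are arbitrary:

    #include <sys/ioctl.h>
    #include <drm/amdgpu_drm.h>

    union drm_amdgpu_gem_create gem = {
            .in = {
                    .bo_size      = 1 << 20,                  /* 1 MiB, arbitrary */
                    .alignment    = 4096,
                    .domains      = AMDGPU_GEM_DOMAIN_VRAM,
                    .domain_flags = AMDGPU_GEM_CREATE_ENCRYPTED,
            },
    };
    int ret = ioctl(drm_fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &gem);
    /* On success gem.out.handle is a GEM handle whose mappings will carry the
     * TMZ bit; the kernel may reject the flag if TMZ is not enabled. */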
Reviewed-by: Zhan Liu Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/uapi/drm/amdgpu_drm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 34cd0bae06bc..bea72eb8c147 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -205,9 +205,6 @@ union drm_amdgpu_bo_list { #define AMDGPU_CTX_OP_QUERY_STATE 3 #define AMDGPU_CTX_OP_QUERY_STATE2 4 -/* Flag the command submission as secure */ -#define AMDGPU_CS_FLAGS_SECURE (1 << 0) - /* GPU reset status */ #define AMDGPU_CTX_NO_RESET 0 /* this the context caused it */ @@ -561,6 +558,9 @@ struct drm_amdgpu_cs_chunk { __u64 chunk_data; }; +/* Flag the command submission as secure */ +#define AMDGPU_CS_FLAGS_SECURE (1 << 0) + struct drm_amdgpu_cs_in { /** Rendering context id */ __u32 ctx_id; -- cgit v1.2.3 From 0bb5d5b03f78aeb5f87d47877eb15532875c64da Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Wed, 22 Apr 2020 17:56:56 -0400 Subject: drm/amdgpu: Move to a per-IB secure flag (TMZ) Move from a per-CS secure flag (TMZ) to a per-IB secure flag. Signed-off-by: Luben Tuikov Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 23 ++++++++++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 3 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 9 ++++----- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 23 +++++++---------------- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 20 ++++++-------------- include/uapi/drm/amdgpu_drm.h | 7 ++++--- 10 files changed, 44 insertions(+), 52 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 99de770a8e9f..3eee5c7d83e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -232,8 +232,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs if (ret) goto free_all_kdata; - p->job->secure = cs->in.flags & AMDGPU_CS_FLAGS_SECURE; - if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) { ret = -ECANCELED; goto free_all_kdata; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 045951d2b46c..cba22039df6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -133,6 +133,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, uint64_t fence_ctx; uint32_t status = 0, alloc_size; unsigned fence_flags = 0; + bool secure; unsigned i; int r = 0; @@ -214,9 +215,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, if (job && ring->funcs->emit_cntxcntl) { status |= job->preamble_status; status |= job->preemption_status; - amdgpu_ring_emit_cntxcntl(ring, status, job->secure); + amdgpu_ring_emit_cntxcntl(ring, status); } + secure = false; for (i = 0; i < num_ibs; ++i) { ib = &ibs[i]; @@ -228,12 +230,27 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, !amdgpu_sriov_vf(adev)) /* for SRIOV preemption, Preamble CE ib must be inserted anyway */ continue; + /* If this IB is TMZ, add frame TMZ start packet, + * else, turn off TMZ. 
+ */ + if (ib->flags & AMDGPU_IB_FLAGS_SECURE && ring->funcs->emit_tmz) { + if (!secure) { + secure = true; + amdgpu_ring_emit_tmz(ring, true); + } + } else if (secure) { + secure = false; + amdgpu_ring_emit_tmz(ring, false); + } + amdgpu_ring_emit_ib(ring, job, ib, status); status &= ~AMDGPU_HAVE_CTX_SWITCH; } - if (ring->funcs->emit_tmz) - amdgpu_ring_emit_tmz(ring, false, job ? job->secure : false); + if (secure) { + secure = false; + amdgpu_ring_emit_tmz(ring, false); + } #ifdef CONFIG_X86_64 if (!(adev->flags & AMD_IS_APU)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 7f5ccee476a4..81caac9b958a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -62,9 +62,6 @@ struct amdgpu_job { /* user fence handling */ uint64_t uf_addr; uint64_t uf_sequence; - - /* the job is due to a secure command submission */ - bool secure; }; int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 5956eff2d784..7d39064f9361 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -168,8 +168,7 @@ struct amdgpu_ring_funcs { void (*begin_use)(struct amdgpu_ring *ring); void (*end_use)(struct amdgpu_ring *ring); void (*emit_switch_buffer) (struct amdgpu_ring *ring); - void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags, - bool trusted); + void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags); void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t reg_val_offs); void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val); @@ -178,7 +177,7 @@ struct amdgpu_ring_funcs { void (*emit_reg_write_reg_wait)(struct amdgpu_ring *ring, uint32_t reg0, uint32_t reg1, uint32_t ref, uint32_t mask); - void (*emit_tmz)(struct amdgpu_ring *ring, bool start, bool trusted); + void (*emit_tmz)(struct amdgpu_ring *ring, bool start); /* Try to soft recover the ring to make the fence signal */ void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid); int (*preempt_ib)(struct amdgpu_ring *ring); @@ -252,12 +251,12 @@ struct amdgpu_ring { #define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as)) #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r)) #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r)) -#define amdgpu_ring_emit_cntxcntl(r, d, s) (r)->funcs->emit_cntxcntl((r), (d), (s)) +#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d)) #define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o)) #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v)) #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m)) #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m)) -#define amdgpu_ring_emit_tmz(r, b, s) (r)->funcs->emit_tmz((r), (b), (s)) +#define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b)) #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib))) #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r)) #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o)) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 473c1c145332..404c6d470515 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3037,8 +3037,7 @@ static int gfx_v10_0_rlc_backdoor_autoload_enable(struct amdgpu_device *adev); static int gfx_v10_0_wait_for_rlc_autoload_complete(struct amdgpu_device *adev); static void gfx_v10_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume); static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume); -static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start, - bool trusted); +static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start); static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t queue_mask) { @@ -7436,8 +7435,7 @@ static void gfx_v10_0_ring_emit_sb(struct amdgpu_ring *ring) } static void gfx_v10_0_ring_emit_cntxcntl(struct amdgpu_ring *ring, - uint32_t flags, - bool trusted) + uint32_t flags) { uint32_t dw2 = 0; @@ -7445,8 +7443,6 @@ static void gfx_v10_0_ring_emit_cntxcntl(struct amdgpu_ring *ring, gfx_v10_0_ring_emit_ce_meta(ring, (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false); - gfx_v10_0_ring_emit_tmz(ring, true, trusted); - dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */ if (flags & AMDGPU_HAVE_CTX_SWITCH) { /* set load_global_config & load_global_uconfig */ @@ -7603,17 +7599,12 @@ static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume) sizeof(de_payload) >> 2); } -static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start, - bool trusted) +static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start) { - amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); - /* - * cmd = 0: frame begin - * cmd = 1: frame end - */ - amdgpu_ring_write(ring, - ((amdgpu_is_tmz(ring->adev) && trusted) ? FRAME_TMZ : 0) - | FRAME_CMD(start ? 0 : 1)); + if (amdgpu_is_tmz(ring->adev)) { + amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); + amdgpu_ring_write(ring, FRAME_TMZ | FRAME_CMD(start ? 
0 : 1)); + } } static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index 283b7fc10f98..aa1e1be852dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -2969,8 +2969,7 @@ static uint64_t gfx_v6_0_get_gpu_clock_counter(struct amdgpu_device *adev) return clock; } -static void gfx_v6_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v6_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { if (flags & AMDGPU_HAVE_CTX_SWITCH) gfx_v6_0_ring_emit_vgt_flush(ring); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index f26e91354ba8..e5a88cad44cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -2320,8 +2320,7 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring, amdgpu_ring_write(ring, control); } -static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { uint32_t dw2 = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index d1312d829252..2fcf6865abba 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -6329,8 +6329,7 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring) amdgpu_ring_write(ring, 0); } -static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { uint32_t dw2 = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index bae5dd6ea348..4e042e974983 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5442,29 +5442,21 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring) amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2); } -static void gfx_v9_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start, - bool trusted) +static void gfx_v9_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start) { - amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); - /* - * cmd = 0: frame begin - * cmd = 1: frame end - */ - amdgpu_ring_write(ring, - ((amdgpu_is_tmz(ring->adev) && trusted) ? FRAME_TMZ : 0) - | FRAME_CMD(start ? 0 : 1)); + if (amdgpu_is_tmz(ring->adev)) { + amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0)); + amdgpu_ring_write(ring, FRAME_TMZ | FRAME_CMD(start ? 
0 : 1)); + } } -static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags, - bool trusted) +static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) { uint32_t dw2 = 0; if (amdgpu_sriov_vf(ring->adev)) gfx_v9_0_ring_emit_ce_meta(ring); - gfx_v9_0_ring_emit_tmz(ring, true, trusted); - dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */ if (flags & AMDGPU_HAVE_CTX_SWITCH) { /* set load_global_config & load_global_uconfig */ diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index bea72eb8c147..e01b673f0449 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -558,9 +558,6 @@ struct drm_amdgpu_cs_chunk { __u64 chunk_data; }; -/* Flag the command submission as secure */ -#define AMDGPU_CS_FLAGS_SECURE (1 << 0) - struct drm_amdgpu_cs_in { /** Rendering context id */ __u32 ctx_id; @@ -601,6 +598,10 @@ union drm_amdgpu_cs { */ #define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) +/* Flag the IB as secure (TMZ) + */ +#define AMDGPU_IB_FLAGS_SECURE (1 << 5) + struct drm_amdgpu_cs_chunk_ib { __u32 _pad; /** AMDGPU_IB_FLAG_* */ -- cgit v1.2.3 From 5bb4b78be9c67b02a7f138850e9e89825181f555 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Mon, 6 May 2019 22:11:14 -0500 Subject: drm/amdkfd: New IOCTL to allocate queue GWS (v2) Add a new kfd ioctl to allocate queue GWS. Queue GWS is released on queue destroy. v2: re-introduce this API with the following fixes squashed in: - drm/amdkfd: fix null pointer dereference on dev - drm/amdkfd: Return proper error code for gws alloc API - drm/amdkfd: Remove GPU ID in GWS queue creation Signed-off-by: Oak Zeng Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 39 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 9 +++++ include/uapi/linux/kfd_ioctl.h | 19 ++++++++++- 4 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 0ec5f25adf56..5eb1314f500b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1584,6 +1584,43 @@ copy_from_user_failed: return err; } +static int kfd_ioctl_alloc_queue_gws(struct file *filep, + struct kfd_process *p, void *data) +{ + int retval; + struct kfd_ioctl_alloc_queue_gws_args *args = data; + struct queue *q; + struct kfd_dev *dev; + + if (!hws_gws_support) + return -ENODEV; + + mutex_lock(&p->mutex); + q = pqm_get_user_queue(&p->pqm, args->queue_id); + + if (q) { + dev = q->device; + } else { + retval = -EINVAL; + goto out_unlock; + } + + if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { + retval = -ENODEV; + goto out_unlock; + } + + retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? 
dev->gws : NULL); + mutex_unlock(&p->mutex); + + args->first_gws = 0; + return retval; + +out_unlock: + mutex_unlock(&p->mutex); + return retval; +} + static int kfd_ioctl_get_dmabuf_info(struct file *filep, struct kfd_process *p, void *data) { @@ -1786,6 +1823,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, kfd_ioctl_import_dmabuf, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, + kfd_ioctl_alloc_queue_gws, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 4a3049841086..5e7f1fb6761b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -923,6 +923,8 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws); struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, unsigned int qid); +struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, + unsigned int qid); int pqm_get_wave_state(struct process_queue_manager *pqm, unsigned int qid, void __user *ctl_stack, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 084c35f55d59..eb1635ac8988 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -476,6 +476,15 @@ struct kernel_queue *pqm_get_kernel_queue( return NULL; } +struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, + unsigned int qid) +{ + struct process_queue_node *pqn; + + pqn = get_queue_by_qid(pqm, qid); + return pqn ? pqn->q : NULL; +} + int pqm_get_wave_state(struct process_queue_manager *pqm, unsigned int qid, void __user *ctl_stack, diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 20917c59f39c..4f6676428c5c 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -410,6 +410,20 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { __u32 n_success; /* to/from KFD */ }; +/* Allocate GWS for specific queue + * + * @queue_id: queue's id that GWS is allocated for + * @num_gws: how many GWS to allocate + * @first_gws: index of the first GWS allocated. + * only support contiguous GWS allocation + */ +struct kfd_ioctl_alloc_queue_gws_args { + __u32 queue_id; /* to KFD */ + __u32 num_gws; /* to KFD */ + __u32 first_gws; /* from KFD */ + __u32 pad; +}; + struct kfd_ioctl_get_dmabuf_info_args { __u64 size; /* from KFD */ __u64 metadata_ptr; /* to KFD */ @@ -529,7 +543,10 @@ enum kfd_mmio_remap { #define AMDKFD_IOC_IMPORT_DMABUF \ AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) +#define AMDKFD_IOC_ALLOC_QUEUE_GWS \ + AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x1E +#define AMDKFD_COMMAND_END 0x1F #endif -- cgit v1.2.3 From 0aeaaf64e6d06e353de15dcf9973312ae0672ca1 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 29 Apr 2020 19:36:06 -0400 Subject: drm/amdkfd: Fix comment formatting Corrected two function names. Added a missing space. 
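For illustration only, a minimal sketch (not part of these patches) of how userspace might call the GWS allocation ioctl added in the previous patch. kfd_fd is assumed to be an open /dev/kfd, queue_id a queue previously created with AMDKFD_IOC_CREATE_QUEUE, and the GWS count of 64 is an assumption rather than a value taken from the patch:

    #include <sys/ioctl.h>
    #include <linux/kfd_ioctl.h>

    struct kfd_ioctl_alloc_queue_gws_args gws = {
            .queue_id = queue_id,   /* queue that will own the GWS entries */
            .num_gws  = 64,         /* how many GWS entries to allocate */
    };
    int ret = ioctl(kfd_fd, AMDKFD_IOC_ALLOC_QUEUE_GWS, &gws);
    /* On success gws.first_gws holds the index of the first allocated entry
     * (always 0 with this implementation); -ENODEV means GWS is unsupported
     * or HWS is disabled. */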
Signed-off-by: Felix Kuehling Reviewed-by: Kent Russell Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- include/uapi/linux/kfd_ioctl.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 598296034b43..d27221ddcdeb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1122,7 +1122,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) return p; } -/* process_evict_queues - Evict all user queues of a process +/* kfd_process_evict_queues - Evict all user queues of a process * * Eviction is reference-counted per process-device. This means multiple * evictions from different sources can be nested safely. @@ -1162,7 +1162,7 @@ fail: return r; } -/* process_restore_queues - Restore all user queues of a process */ +/* kfd_process_restore_queues - Restore all user queues of a process */ int kfd_process_restore_queues(struct kfd_process *p) { struct kfd_process_device *pdd; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 4f6676428c5c..b6be62356d34 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -251,7 +251,7 @@ struct kfd_memory_exception_failure { __u32 imprecise; /* Can't determine the exact fault address */ }; -/* memory exception data*/ +/* memory exception data */ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; __u64 va; -- cgit v1.2.3 From 50b6f619a099af6dfe06e958bfa08d46440ea788 Mon Sep 17 00:00:00 2001 From: Mika Kahola Date: Wed, 6 May 2020 15:08:27 +0300 Subject: uapi/drm/drm_fourcc.h: Note on platform specificity for format modifiers Make an additional note on the DRM format modifiers for X and Y tiling. These format modifiers are only accurately defined for BDW+ platforms, so the definition is not valid for older gens. This is because address swizzling for tiled surfaces is no longer used on newer platforms, where the main memory controller has a more effective address swizzling algorithm. v2: Rephrase comment (Daniel) Signed-off-by: Mika Kahola Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200506120827.12250-1-mika.kahola@intel.com --- include/uapi/drm/drm_fourcc.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 8bc0b31597d8..9e488d10f8b4 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -354,9 +354,12 @@ extern "C" { * a platform-dependent stride. On top of that the memory can apply * platform-depending swizzling of some higher address bits into bit6. * - * This format is highly platforms specific and not useful for cross-driver - * sharing. It exists since on a given platform it does uniquely identify the - * layout in a simple way for i915-specific userspace. + * Note that this layout is only accurate on intel gen 8+ or valleyview chipsets. + * On earlier platforms the is highly platforms specific and not useful for + * cross-driver sharing. It exists since on a given platform it does uniquely + * identify the layout in a simple way for i915-specific userspace, which + * facilitated conversion of userspace to modifiers. Additionally the exact + * format on some really old platforms is not known.
*/ #define I915_FORMAT_MOD_X_TILED fourcc_mod_code(INTEL, 1) @@ -369,9 +372,12 @@ extern "C" { * memory can apply platform-depending swizzling of some higher address bits * into bit6. * - * This format is highly platforms specific and not useful for cross-driver - * sharing. It exists since on a given platform it does uniquely identify the - * layout in a simple way for i915-specific userspace. + * Note that this layout is only accurate on intel gen 8+ or valleyview chipsets. + * On earlier platforms the is highly platforms specific and not useful for + * cross-driver sharing. It exists since on a given platform it does uniquely + * identify the layout in a simple way for i915-specific userspace, which + * facilitated conversion of userspace to modifiers. Additionally the exact + * format on some really old platforms is not known. */ #define I915_FORMAT_MOD_Y_TILED fourcc_mod_code(INTEL, 2) -- cgit v1.2.3 From 43c8546bcd854806736d8a635a0d696504dd4c21 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 28 Apr 2020 01:28:43 -0400 Subject: drm/amdgpu: Add a UAPI flag for user to call mem_sync When this flag is set in the CS IB flags, it causes a memory cache flush of the GFX. v2: Move new flag to drm_amdgpu_cs_chunk_ib.flags Bump up UAPI version Remove condition on job != null to emit mem_sync Signed-off-by: Andrey Grodzovsky Reviewed-by: Luben Tuikov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 3 +++ include/uapi/drm/amdgpu_drm.h | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index beb35dd12964..a0e5b54b6e47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -86,9 +86,10 @@ * - 3.35.0 - Add drm_amdgpu_info_device::tcc_disabled_mask * - 3.36.0 - Allow reading more status registers on si/cik * - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness + * - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 37 +#define KMS_DRIVER_MINOR 38 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index c24366aacf3a..b91853fd66d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -189,6 +189,9 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, dma_fence_put(tmp); } + if ((ib->flags & AMDGPU_IB_FLAG_EMIT_MEM_SYNC) && ring->funcs->emit_mem_sync) + ring->funcs->emit_mem_sync(ring); + if (ring->funcs->insert_start) ring->funcs->insert_start(ring); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index e01b673f0449..4e873dcbe68f 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -602,6 +602,10 @@ union drm_amdgpu_cs { */ #define AMDGPU_IB_FLAGS_SECURE (1 << 5) +/* Tell KMD to flush and invalidate caches + */ +#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) + struct drm_amdgpu_cs_chunk_ib { __u32 _pad; /** AMDGPU_IB_FLAG_* */ -- cgit v1.2.3 From 82c8c4ddcae74460f4ea484ab9ad97ddb466e469 Mon Sep 17 00:00:00 2001 From: James Jones Date: Wed, 11 Dec 2019 12:55:47 -0800 Subject: drm: Generalized NV Block Linear DRM format mod Builds upon the existing NVIDIA 16Bx2 block linear format modifiers by adding more "fields" to the existing parameterized 
DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK format modifier macro that allow fully defining a unique-across- all-NVIDIA-hardware bit layout using a minimal set of fields and values. The new modifier macro DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D is effectively backwards compatible with the existing macro, introducing a superset of the previously definable format modifiers. Backwards compatibility has two quirks. First, the zero value for the "kind" field, which is implied by the DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK macro, must be special cased in drivers and assumed to map to the pre-Turing generic kind of 0xfe, since a kind of "zero" is reserved for linear buffer layouts on all GPUs. Second, it is assumed backwards compatibility is only needed when running on Tegra GPUs, and specifically Tegra GPUs prior to Xavier. This is based on two assertions: -Tegra GPUs prior to Xavier used a slightly different raw bit layout than desktop GPUs, making it impossible to directly share block linear buffers between the two. -Support for the existing block linear modifiers was incomplete, making them useful only for exporting buffers created by nouveau and importing them to Tegra DRM as framebuffers for scan out. There was no support for adding framebuffers using format modifiers in nouveau, nor importing dma-buf/PRIME GEM objects into nouveau userspace drivers with modifiers in Mesa. Hence it is assumed the prior modifiers were not intended for use on desktop GPUs, and as a corollary, were not intended to support sharing block linear buffers across two different NVIDIA GPUs. v2: - Added canonicalize helper function v3: - Added additional bit to compression field to support Tesla (NV5x,G8x,G9x,GT1xx,GT2xx) class chips. Signed-off-by: James Jones Signed-off-by: Ben Skeggs --- include/uapi/drm/drm_fourcc.h | 122 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 9e488d10f8b4..490143500a50 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -527,7 +527,113 @@ extern "C" { #define DRM_FORMAT_MOD_NVIDIA_TEGRA_TILED fourcc_mod_code(NVIDIA, 1) /* - * 16Bx2 Block Linear layout, used by desktop GPUs, and Tegra K1 and later + * Generalized Block Linear layout, used by desktop GPUs starting with NV50/G80, + * and Tegra GPUs starting with Tegra K1. + * + * Pixels are arranged in Groups of Bytes (GOBs). GOB size and layout varies + * based on the architecture generation. GOBs themselves are then arranged in + * 3D blocks, with the block dimensions (in terms of GOBs) always being a power + * of two, and hence expressible as their log2 equivalent (E.g., "2" represents + * a block depth or height of "4"). + * + * Chapter 20 "Pixel Memory Formats" of the Tegra X1 TRM describes this format + * in full detail. + * + * Macro + * Bits Param Description + * ---- ----- ----------------------------------------------------------------- + * + * 3:0 h log2(height) of each block, in GOBs. Placed here for + * compatibility with the existing + * DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK()-based modifiers. + * + * 4:4 - Must be 1, to indicate block-linear layout. Necessary for + * compatibility with the existing + * DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK()-based modifiers. + * + * 8:5 - Reserved (To support 3D-surfaces with variable log2(depth) block + * size). Must be zero. + * + * Note there is no log2(width) parameter. 
Some portions of the + * hardware support a block width of two gobs, but it is impractical + * to use due to lack of support elsewhere, and has no known + * benefits. + * + * 11:9 - Reserved (To support 2D-array textures with variable array stride + * in blocks, specified via log2(tile width in blocks)). Must be + * zero. + * + * 19:12 k Page Kind. This value directly maps to a field in the page + * tables of all GPUs >= NV50. It affects the exact layout of bits + * in memory and can be derived from the tuple + * + * (format, GPU model, compression type, samples per pixel) + * + * Where compression type is defined below. If GPU model were + * implied by the format modifier, format, or memory buffer, page + * kind would not need to be included in the modifier itself, but + * since the modifier should define the layout of the associated + * memory buffer independent from any device or other context, it + * must be included here. + * + * 21:20 g GOB Height and Page Kind Generation. The height of a GOB changed + * starting with Fermi GPUs. Additionally, the mapping between page + * kind and bit layout has changed at various points. + * + * 0 = Gob Height 8, Fermi - Volta, Tegra K1+ Page Kind mapping + * 1 = Gob Height 4, G80 - GT2XX Page Kind mapping + * 2 = Gob Height 8, Turing+ Page Kind mapping + * 3 = Reserved for future use. + * + * 22:22 s Sector layout. On Tegra GPUs prior to Xavier, there is a further + * bit remapping step that occurs at an even lower level than the + * page kind and block linear swizzles. This causes the layout of + * surfaces mapped in those SOC's GPUs to be incompatible with the + * equivalent mapping on other GPUs in the same system. + * + * 0 = Tegra K1 - Tegra Parker/TX2 Layout. + * 1 = Desktop GPU and Tegra Xavier+ Layout + * + * 25:23 c Lossless Framebuffer Compression type. + * + * 0 = none + * 1 = ROP/3D, layout 1, exact compression format implied by Page + * Kind field + * 2 = ROP/3D, layout 2, exact compression format implied by Page + * Kind field + * 3 = CDE horizontal + * 4 = CDE vertical + * 5 = Reserved for future use + * 6 = Reserved for future use + * 7 = Reserved for future use + * + * 55:25 - Reserved for future use. Must be zero. + */ +#define DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(c, s, g, k, h) \ + fourcc_mod_code(NVIDIA, (0x10 | \ + ((h) & 0xf) | \ + (((k) & 0xff) << 12) | \ + (((g) & 0x3) << 20) | \ + (((s) & 0x1) << 22) | \ + (((c) & 0x7) << 23))) + +/* To grandfather in prior block linear format modifiers to the above layout, + * the page kind "0", which corresponds to "pitch/linear" and hence is unusable + * with block-linear layouts, is remapped within drivers to the value 0xfe, + * which corresponds to the "generic" kind used for simple single-sample + * uncompressed color formats on Fermi - Volta GPUs. + */ +static inline __u64 +drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier) +{ + if (!(modifier & 0x10) || (modifier & (0xff << 12))) + return modifier; + else + return modifier | (0xfe << 12); +} + +/* + * 16Bx2 Block Linear layout, used by Tegra K1 and later * * Pixels are arranged in 64x8 Groups Of Bytes (GOBs). GOBs are then stacked * vertically by a power of 2 (1 to 32 GOBs) to form a block. @@ -548,20 +654,20 @@ extern "C" { * in full detail. 
*/ #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(v) \ - fourcc_mod_code(NVIDIA, 0x10 | ((v) & 0xf)) + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 0, 0, 0, (v)) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_ONE_GOB \ - fourcc_mod_code(NVIDIA, 0x10) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(0) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_TWO_GOB \ - fourcc_mod_code(NVIDIA, 0x11) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(1) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_FOUR_GOB \ - fourcc_mod_code(NVIDIA, 0x12) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(2) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_EIGHT_GOB \ - fourcc_mod_code(NVIDIA, 0x13) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(3) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_SIXTEEN_GOB \ - fourcc_mod_code(NVIDIA, 0x14) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(4) #define DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_THIRTYTWO_GOB \ - fourcc_mod_code(NVIDIA, 0x15) + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(5) /* * Some Broadcom modifiers take parameters, for example the number of -- cgit v1.2.3
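To close, a hedged usage sketch (not part of the patch above) of the generalized NVIDIA block-linear modifier macro and the canonicalization helper it adds. The page kind 0x06 is a placeholder; real page-kind values depend on the GPU's page tables and the surface format:

    #include <drm/drm_fourcc.h>

    /* Canonicalize a legacy 16Bx2 modifier: the implied page kind 0 is
     * remapped to the generic kind 0xfe, as described in the comment above. */
    __u64 legacy = DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK_FOUR_GOB;
    __u64 canon  = drm_fourcc_canonicalize_nvidia_format_mod(legacy);

    /* Build a modifier for an uncompressed surface on a Turing-class GPU:
     * no compression (c=0), desktop/Xavier sector layout (s=1), GOB height 8
     * with Turing page-kind mapping (g=2), placeholder page kind (k=0x06),
     * and a block height of 2^4 = 16 GOBs (h=4). */
    __u64 turing = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 1, 2, 0x06, 4);

Such a modifier would then typically be passed in the modifier[] array of struct drm_mode_fb_cmd2 when adding a framebuffer, or advertised through a plane's IN_FORMATS property.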