From 5c08c5acdc6ce46296a0f98d06896c818b8b34e0 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 22 Jun 2021 00:36:34 +0800 Subject: iommu/arm-smmu-v3: Remove some unneeded init in arm_smmu_cmdq_issue_cmdlist() Members of struct "llq" will be zero-inited, apart from member max_n_shift. But we write llq.val straight after the init, so it was pointless to zero init those other members. As such, separately init member max_n_shift only. In addition, struct "head" is initialised to "llq" only so that member max_n_shift is set. But that member is never referenced for "head", so remove any init there. Removing these initializations is seen as a small performance optimisation, as this code is (very) hot path. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1624293394-202509-1-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 235f9bdaeaf2..952291840c76 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -733,11 +733,11 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, unsigned long flags; bool owner; struct arm_smmu_cmdq *cmdq = &smmu->cmdq; - struct arm_smmu_ll_queue llq = { - .max_n_shift = cmdq->q.llq.max_n_shift, - }, head = llq; + struct arm_smmu_ll_queue llq, head; int ret = 0; + llq.max_n_shift = cmdq->q.llq.max_n_shift; + /* 1. Allocate some space in the queue */ local_irq_save(flags); llq.val = READ_ONCE(cmdq->q.llq.val); -- cgit v1.2.3 From afefe67e0893325d75eb7b816dd394eef2eac628 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Tue, 10 Aug 2021 12:18:08 +0530 Subject: iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak Link: https://lore.kernel.org/r/20210810064808.32486-1-saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f22dbeb1e510..fc8b932b47d4 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2281,18 +2281,38 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; - return arm_smmu_runtime_resume(dev); + ret = arm_smmu_runtime_resume(dev); + if (ret) + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + + return ret; } static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + if (pm_runtime_suspended(dev)) - return 0; + goto clk_unprepare; - return arm_smmu_runtime_suspend(dev); + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { -- cgit v1.2.3 From 211ff31b3d33b56aa12937e898c9280d07daf0d9 Mon Sep 17 00:00:00 2001 From: Ashish Mhetre Date: Tue, 10 Aug 2021 10:14:00 +0530 Subject: iommu: Fix race condition during default domain allocation When two devices with same SID are getting probed concurrently through iommu_probe_device(), the iommu_domain sometimes is getting allocated more than once as call to iommu_alloc_default_domain() is not protected for concurrency. Furthermore, it leads to each device holding a different iommu_domain pointer, separate IOVA space and only one of the devices' domain is used for translations from IOMMU. This causes accesses from other device to fault or see incorrect translations. Fix this by protecting iommu_alloc_default_domain() call with group->mutex and let all devices with same SID share same iommu_domain. Signed-off-by: Ashish Mhetre Link: https://lore.kernel.org/r/1628570641-9127-2-git-send-email-amhetre@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5419c4b9f27a..80c5a1c57216 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -273,7 +273,9 @@ int iommu_probe_device(struct device *dev) * support default domains, so the return value is not yet * checked. */ + mutex_lock(&group->mutex); iommu_alloc_default_domain(group, dev); + mutex_unlock(&group->mutex); if (group->default_domain) { ret = __iommu_attach_device(group->default_domain, dev); -- cgit v1.2.3 From b1a1347912a742a4e1fcdc9df6302dd9dd2c3405 Mon Sep 17 00:00:00 2001 From: Krishna Reddy Date: Tue, 10 Aug 2021 10:14:01 +0530 Subject: iommu/arm-smmu: Fix race condition during iommu_group creation When two devices with same SID are getting probed concurrently through iommu_probe_device(), the iommu_group sometimes is getting allocated more than once as call to arm_smmu_device_group() is not protected for concurrency. Furthermore, it leads to each device holding a different iommu_group and domain pointer, separate IOVA space and only one of the devices' domain is used for translations from IOMMU. This causes accesses from other device to fault or see incorrect translations. Fix this by protecting iommu_group allocation from concurrency in arm_smmu_device_group(). Signed-off-by: Krishna Reddy Signed-off-by: Ashish Mhetre Link: https://lore.kernel.org/r/1628570641-9127-3-git-send-email-amhetre@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index fc8b932b47d4..f7da8953afbe 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1478,6 +1478,7 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) struct iommu_group *group = NULL; int i, idx; + mutex_lock(&smmu->stream_map_mutex); for_each_cfg_sme(cfg, fwspec, i, idx) { if (group && smmu->s2crs[idx].group && group != smmu->s2crs[idx].group) @@ -1486,8 +1487,10 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) group = smmu->s2crs[idx].group; } - if (group) + if (group) { + mutex_unlock(&smmu->stream_map_mutex); return iommu_group_ref_get(group); + } if (dev_is_pci(dev)) group = pci_device_group(dev); @@ -1501,6 +1504,7 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) for_each_cfg_sme(cfg, fwspec, i, idx) smmu->s2crs[idx].group = group; + mutex_unlock(&smmu->stream_map_mutex); return group; } -- cgit v1.2.3 From ef75702d6d65a1993dc0d6dd423fefaa08901f24 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Wed, 11 Aug 2021 21:34:26 +0530 Subject: iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan Link: https://lore.kernel.org/r/20210811160426.10312-1-saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 13 ++++++++++--- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 9b9d13ec5a88..55690af1b25d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -193,6 +193,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -235,6 +237,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -358,6 +368,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..67b660b0551d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9 +327,16 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size, static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size, size_t granule, void *cookie) { - arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, - ARM_SMMU_CB_S1_TLBIVA); - arm_smmu_tlb_sync_context(cookie); + struct arm_smmu_domain *smmu_domain = cookie; + struct arm_smmu_cfg *cfg = &smmu_domain->cfg; + + if (cfg->flush_walk_prefer_tlbiasid) { + arm_smmu_tlb_inv_context_s1(cookie); + } else { + arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, + ARM_SMMU_CB_S1_TLBIVA); + arm_smmu_tlb_sync_context(cookie); + } } static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index a50271595960..432de2f742c3 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -346,6 +346,7 @@ struct arm_smmu_cfg { }; enum arm_smmu_cbar_type cbar; enum arm_smmu_context_fmt fmt; + bool flush_walk_prefer_tlbiasid; }; #define ARM_SMMU_INVALID_IRPTNDX 0xff -- cgit v1.2.3 From eff19474b1bd60286213e5052ccf246b6a6c7199 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:49 +0800 Subject: iommu/arm-smmu-v3: Use command queue batching helpers to improve performance The obvious key to the performance optimization of commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is to allow multiple cores to insert commands in parallel after a brief mutex contention. Obviously, inserting as many commands at a time as possible can reduce the number of times the mutex contention participates, thereby improving the overall performance. At least it reduces the number of calls to function arm_smmu_cmdq_issue_cmdlist(). Therefore, use command queue batching helpers to insert multiple commands at a time. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-2-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 952291840c76..ac0c4b21cfa8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1747,15 +1747,16 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) { int i; struct arm_smmu_cmdq_ent cmd; + struct arm_smmu_cmdq_batch cmds = {}; arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd); for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; - arm_smmu_cmdq_issue_cmd(master->smmu, &cmd); + arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd); } - return arm_smmu_cmdq_issue_sync(master->smmu); + return arm_smmu_cmdq_batch_submit(master->smmu, &cmds); } int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, -- cgit v1.2.3 From 4537f6f1e2d8d22ec6f9c6bd3844fdccb931f46e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:50 +0800 Subject: iommu/arm-smmu-v3: Add and use static helper function arm_smmu_cmdq_issue_cmd_with_sync() The obvious key to the performance optimization of commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is to allow multiple cores to insert commands in parallel after a brief mutex contention. Obviously, inserting as many commands at a time as possible can reduce the number of times the mutex contention participates, thereby improving the overall performance. At least it reduces the number of calls to function arm_smmu_cmdq_issue_cmdlist(). Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert the 'cmd+sync' commands at a time. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-3-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 35 ++++++++++++++++------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ac0c4b21cfa8..3cdb01f93f1b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -845,8 +845,9 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, return ret; } -static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq_ent *ent) +static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent, + bool sync) { u64 cmd[CMDQ_ENT_DWORDS]; @@ -856,12 +857,19 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, return -EINVAL; } - return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); + return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync); } -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) +static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { - return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); + return __arm_smmu_cmdq_issue_cmd(smmu, ent, false); +} + +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) +{ + return __arm_smmu_cmdq_issue_cmd(smmu, ent, true); } static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, @@ -929,8 +937,7 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) .tlbi.asid = asid, }; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, @@ -1211,8 +1218,7 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) }, }; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, @@ -1824,8 +1830,7 @@ static void arm_smmu_tlb_inv_context(void *cookie) } else { cmd.opcode = CMDQ_OP_TLBI_S12_VMALL; cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0); } @@ -3339,18 +3344,16 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) /* Invalidate any cached configuration */ cmd.opcode = CMDQ_OP_CFGI_ALL; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); /* Invalidate any stale TLB entries */ if (smmu->features & ARM_SMMU_FEAT_HYP) { cmd.opcode = CMDQ_OP_TLBI_EL2_ALL; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } cmd.opcode = CMDQ_OP_TLBI_NSNH_ALL; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); /* Event queue */ writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE); -- cgit v1.2.3 From 8639cc83aac5dd1a197ed05b8f717acc35bb0248 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:51 +0800 Subject: iommu/arm-smmu-v3: Add and use static helper function arm_smmu_get_cmdq() One SMMU has only one normal CMDQ. Therefore, this CMDQ is used regardless of the core on which the command is inserted. It can be referenced directly through "smmu->cmdq". However, one SMMU has multiple ECMDQs, and the ECMDQ used by the core on which the command insertion is executed may be different. So the helper function arm_smmu_get_cmdq() is added, which returns the CMDQ/ECMDQ that the current core should use. Currently, the code that supports ECMDQ is not added. just simply returns "&smmu->cmdq". Many subfunctions of arm_smmu_cmdq_issue_cmdlist() use "&smmu->cmdq" or "&smmu->cmdq.q" directly. To support ECMDQ, they need to call the newly added function arm_smmu_get_cmdq() instead. Note that normal CMDQ is still required until ECMDQ is available. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-4-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3cdb01f93f1b..c1cd2adab3ef 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -335,10 +335,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) return 0; } +static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) +{ + return &smmu->cmdq; +} + static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, - u32 prod) + struct arm_smmu_queue *q, u32 prod) { - struct arm_smmu_queue *q = &smmu->cmdq.q; struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC, }; @@ -579,7 +583,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, { unsigned long flags; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); int ret = 0; /* @@ -595,7 +599,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, queue_poll_init(smmu, &qp); do { - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + llq->val = READ_ONCE(cmdq->q.llq.val); if (!queue_full(llq)) break; @@ -614,7 +618,7 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, { int ret = 0; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); queue_poll_init(smmu, &qp); @@ -637,12 +641,12 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, struct arm_smmu_ll_queue *llq) { struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 prod = llq->prod; int ret = 0; queue_poll_init(smmu, &qp); - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + llq->val = READ_ONCE(cmdq->q.llq.val); do { if (queue_consumed(llq, prod)) break; @@ -732,7 +736,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u32 prod; unsigned long flags; bool owner; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); struct arm_smmu_ll_queue llq, head; int ret = 0; @@ -772,7 +776,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); if (sync) { prod = queue_inc_prod_n(&llq, n); - arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, &cmdq->q, prod); queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); /* -- cgit v1.2.3 From 2cbeaf3f36eb300cae1f344e125162c42fb091a3 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:52 +0800 Subject: iommu/arm-smmu-v3: Extract reusable function __arm_smmu_cmdq_skip_err() When SMMU_GERROR.CMDQP_ERR is different to SMMU_GERRORN.CMDQP_ERR, it indicates that one or more errors have been encountered on a command queue control page interface. We need to traverse all ECMDQs in that control page to find all errors. For each ECMDQ error handling, it is much the same as the CMDQ error handling. This common processing part is extracted as a new function __arm_smmu_cmdq_skip_err(). Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-5-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c1cd2adab3ef..283bfad5f28f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -359,7 +359,8 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, arm_smmu_cmdq_build_cmd(cmd, &ent); } -static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) +static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, + struct arm_smmu_queue *q) { static const char * const cerror_str[] = { [CMDQ_ERR_CERROR_NONE_IDX] = "No error", @@ -370,7 +371,6 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) int i; u64 cmd[CMDQ_ENT_DWORDS]; - struct arm_smmu_queue *q = &smmu->cmdq.q; u32 cons = readl_relaxed(q->cons_reg); u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons); struct arm_smmu_cmdq_ent cmd_sync = { @@ -417,6 +417,11 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } +static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) +{ + __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq.q); +} + /* * Command queue locking. * This is a form of bastardised rwlock with the following major changes: -- cgit v1.2.3 From fac956710ab0812f9e395e9f7a27da551412830f Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 11 Aug 2021 23:49:26 +0800 Subject: iommu/arm-smmu-v3: Stop pre-zeroing batch commands Pre-zeroing the batched commands structure is inefficient, as individual commands are zeroed later in arm_smmu_cmdq_build_cmd(). The size is quite large and commonly most commands won't even be used: struct arm_smmu_cmdq_batch cmds = {}; 345c: 52800001 mov w1, #0x0 // #0 3460: d2808102 mov x2, #0x408 // #1032 3464: 910143a0 add x0, x29, #0x50 3468: 94000000 bl 0 Stop pre-zeroing the complete structure and only zero the num member. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1628696966-88386-1-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 283bfad5f28f..3216e746c28a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -955,7 +955,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, size_t i; unsigned long flags; struct arm_smmu_master *master; - struct arm_smmu_cmdq_batch cmds = {}; + struct arm_smmu_cmdq_batch cmds; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_CD, @@ -965,6 +965,8 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, }, }; + cmds.num = 0; + spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { for (i = 0; i < master->num_streams; i++) { @@ -1781,7 +1783,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, unsigned long flags; struct arm_smmu_cmdq_ent cmd; struct arm_smmu_master *master; - struct arm_smmu_cmdq_batch cmds = {}; + struct arm_smmu_cmdq_batch cmds; if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) return 0; @@ -1805,6 +1807,8 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); + cmds.num = 0; + spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { if (!master->ats_enabled) @@ -1852,7 +1856,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, struct arm_smmu_device *smmu = smmu_domain->smmu; unsigned long end = iova + size, num_pages = 0, tg = 0; size_t inv_range = granule; - struct arm_smmu_cmdq_batch cmds = {}; + struct arm_smmu_cmdq_batch cmds; if (!size) return; @@ -1870,6 +1874,8 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, num_pages = size >> tg; } + cmds.num = 0; + while (iova < end) { if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { /* -- cgit v1.2.3