summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/apei/ghes.c27
-rw-r--r--drivers/acpi/numa/hmat.c11
-rw-r--r--drivers/amba/Kconfig2
-rw-r--r--drivers/ata/libata-core.c4
-rw-r--r--drivers/ata/libata-sff.c3
-rw-r--r--drivers/ata/pata_it821x.c5
-rw-r--r--drivers/ata/pata_pcmcia.c1
-rw-r--r--drivers/char/tpm/tpm-chip.c37
-rw-r--r--drivers/char/tpm/tpm-dev-common.c3
-rw-r--r--drivers/char/tpm/tpm-interface.c20
-rw-r--r--drivers/char/tpm/tpm.h1
-rw-r--r--drivers/char/tpm/tpm1-cmd.c5
-rw-r--r--drivers/char/tpm/tpm2-cmd.c8
-rw-r--r--drivers/char/tpm/tpm_crb.c4
-rw-r--r--drivers/char/tpm/tpm_tis_core.c3
-rw-r--r--drivers/crypto/hisilicon/qm.c27
-rw-r--r--drivers/crypto/intel/qat/qat_common/adf_aer.c2
-rw-r--r--drivers/cxl/acpi.c73
-rw-r--r--drivers/cxl/core/cdat.c4
-rw-r--r--drivers/cxl/core/hdm.c3
-rw-r--r--drivers/cxl/core/pci.c87
-rw-r--r--drivers/cxl/core/port.c1
-rw-r--r--drivers/cxl/core/region.c311
-rw-r--r--drivers/cxl/cxl.h29
-rw-r--r--drivers/cxl/cxlpci.h1
-rw-r--r--drivers/cxl/pci.c2
-rw-r--r--drivers/dma-buf/Makefile2
-rw-r--r--drivers/dma-buf/dma-buf-mapping.c248
-rw-r--r--drivers/dma/ioat/init.c1
-rw-r--r--drivers/firmware/efi/cper-arm.c52
-rw-r--r--drivers/firmware/efi/cper.c62
-rw-r--r--drivers/firmware/efi/libstub/efi-stub.c2
-rw-r--r--drivers/firmware/efi/libstub/efistub.h31
-rw-r--r--drivers/firmware/efi/libstub/gop.c137
-rw-r--r--drivers/firmware/efi/libstub/x86-stub.c104
-rw-r--r--drivers/firmware/efi/memattr.c7
-rw-r--r--drivers/firmware/efi/riscv-runtime.c10
-rw-r--r--drivers/firmware/efi/stmm/mm_communication.h6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c20
-rw-r--r--drivers/gpu/drm/i915/Makefile2
-rw-r--r--drivers/gpu/drm/i915/display/intel_color.c335
-rw-r--r--drivers/gpu/drm/i915/display/intel_color.h8
-rw-r--r--drivers/gpu/drm/i915/display/intel_color_pipeline.c99
-rw-r--r--drivers/gpu/drm/i915/display/intel_color_pipeline.h14
-rw-r--r--drivers/gpu/drm/i915/display/intel_color_regs.h29
-rw-r--r--drivers/gpu/drm/i915/display/intel_colorop.c35
-rw-r--r--drivers/gpu/drm/i915/display/intel_colorop.h15
-rw-r--r--drivers/gpu/drm/i915/display/intel_display.c5
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_limits.h9
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_types.h9
-rw-r--r--drivers/gpu/drm/i915/display/intel_plane.c55
-rw-r--r--drivers/gpu/drm/i915/display/skl_universal_plane.c21
-rw-r--r--drivers/gpu/drm/i915/display/skl_universal_plane_regs.h115
-rw-r--r--drivers/gpu/drm/i915/gt/intel_region_lmem.c24
-rw-r--r--drivers/gpu/drm/i915/gvt/kvmgt.c263
-rw-r--r--drivers/gpu/drm/xe/Makefile6
-rw-r--r--drivers/gpu/drm/xe/xe_gpu_scheduler.h5
-rw-r--r--drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c2
-rw-r--r--drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c9
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.c47
-rw-r--r--drivers/gpu/drm/xe/xe_pagefault.c1
-rw-r--r--drivers/gpu/drm/xe/xe_pci.c17
-rw-r--r--drivers/gpu/drm/xe/xe_pci.h3
-rw-r--r--drivers/gpu/drm/xe/xe_pm.c21
-rw-r--r--drivers/gpu/drm/xe/xe_pm.h17
-rw-r--r--drivers/gpu/drm/xe/xe_sched_job_types.h4
-rw-r--r--drivers/gpu/drm/xe/xe_sriov_pf_migration.c35
-rw-r--r--drivers/gpu/drm/xe/xe_sriov_pf_migration.h1
-rw-r--r--drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h4
-rw-r--r--drivers/gpu/drm/xe/xe_sriov_vfio.c80
-rw-r--r--drivers/gpu/drm/xe/xe_vram.c58
-rw-r--r--drivers/infiniband/Kconfig1
-rw-r--r--drivers/infiniband/core/cm.c9
-rw-r--r--drivers/infiniband/core/cma.c2
-rw-r--r--drivers/infiniband/core/device.c4
-rw-r--r--drivers/infiniband/core/restrack.c4
-rw-r--r--drivers/infiniband/core/ucma.c2
-rw-r--r--drivers/infiniband/core/umem.c8
-rw-r--r--drivers/infiniband/core/verbs.c3
-rw-r--r--drivers/infiniband/hw/Makefile1
-rw-r--r--drivers/infiniband/hw/bng_re/Kconfig10
-rw-r--r--drivers/infiniband/hw/bng_re/Makefile8
-rw-r--r--drivers/infiniband/hw/bng_re/bng_debugfs.c39
-rw-r--r--drivers/infiniband/hw/bng_re/bng_debugfs.h12
-rw-r--r--drivers/infiniband/hw/bng_re/bng_dev.c534
-rw-r--r--drivers/infiniband/hw/bng_re/bng_fw.c767
-rw-r--r--drivers/infiniband/hw/bng_re/bng_fw.h211
-rw-r--r--drivers/infiniband/hw/bng_re/bng_re.h85
-rw-r--r--drivers/infiniband/hw/bng_re/bng_res.c279
-rw-r--r--drivers/infiniband/hw/bng_re/bng_res.h215
-rw-r--r--drivers/infiniband/hw/bng_re/bng_sp.c131
-rw-r--r--drivers/infiniband/hw/bng_re/bng_sp.h47
-rw-r--r--drivers/infiniband/hw/bng_re/bng_tlv.h128
-rw-r--r--drivers/infiniband/hw/bnxt_re/bnxt_re.h2
-rw-r--r--drivers/infiniband/hw/bnxt_re/debugfs.c128
-rw-r--r--drivers/infiniband/hw/bnxt_re/debugfs.h19
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.c8
-rw-r--r--drivers/infiniband/hw/bnxt_re/main.c1
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.c3
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.h1
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_sp.c8
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_sp.h2
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c2
-rw-r--r--drivers/infiniband/hw/hfi1/init.c4
-rw-r--r--drivers/infiniband/hw/hfi1/opfn.c4
-rw-r--r--drivers/infiniband/hw/hns/Makefile4
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_ah.c1
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_bond.c1012
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_bond.h95
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_device.h16
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.c141
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.h20
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_main.c185
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_pd.c1
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_qp.c5
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_srq.c1
-rw-r--r--drivers/infiniband/hw/irdma/cm.c2
-rw-r--r--drivers/infiniband/hw/irdma/ctrl.c107
-rw-r--r--drivers/infiniband/hw/irdma/hw.c3
-rw-r--r--drivers/infiniband/hw/irdma/icrdma_if.c6
-rw-r--r--drivers/infiniband/hw/irdma/ig3rdma_if.c4
-rw-r--r--drivers/infiniband/hw/irdma/main.h3
-rw-r--r--drivers/infiniband/hw/irdma/pble.c6
-rw-r--r--drivers/infiniband/hw/irdma/puda.c20
-rw-r--r--drivers/infiniband/hw/irdma/type.h5
-rw-r--r--drivers/infiniband/hw/irdma/uk.c67
-rw-r--r--drivers/infiniband/hw/irdma/user.h6
-rw-r--r--drivers/infiniband/hw/irdma/utils.c58
-rw-r--r--drivers/infiniband/hw/irdma/verbs.c49
-rw-r--r--drivers/infiniband/hw/irdma/verbs.h3
-rw-r--r--drivers/infiniband/hw/mlx4/cm.c2
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c14
-rw-r--r--drivers/infiniband/hw/mlx5/fs.c65
-rw-r--r--drivers/infiniband/hw/mlx5/ib_rep.c74
-rw-r--r--drivers/infiniband/hw/mlx5/main.c4
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c93
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c5
-rw-r--r--drivers/infiniband/sw/rdmavt/cq.c3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mr.c1
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.c49
-rw-r--r--drivers/infiniband/sw/rxe/rxe_odp.c1
-rw-r--r--drivers/infiniband/sw/rxe/rxe_qp.c49
-rw-r--r--drivers/infiniband/sw/rxe/rxe_srq.c7
-rw-r--r--drivers/infiniband/sw/siw/siw_cm.c51
-rw-r--r--drivers/infiniband/ulp/iser/iscsi_iser.c2
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.c2
-rw-r--r--drivers/infiniband/ulp/rtrs/rtrs-srv.c2
-rw-r--r--drivers/iommu/Kconfig15
-rw-r--r--drivers/iommu/Makefile2
-rw-r--r--drivers/iommu/amd/Kconfig5
-rw-r--r--drivers/iommu/amd/Makefile2
-rw-r--r--drivers/iommu/amd/amd_iommu.h1
-rw-r--r--drivers/iommu/amd/amd_iommu_types.h114
-rw-r--r--drivers/iommu/amd/debugfs.c2
-rw-r--r--drivers/iommu/amd/init.c15
-rw-r--r--drivers/iommu/amd/io_pgtable.c577
-rw-r--r--drivers/iommu/amd/io_pgtable_v2.c370
-rw-r--r--drivers/iommu/amd/iommu.c572
-rw-r--r--drivers/iommu/apple-dart.c11
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c18
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c33
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c28
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.c9
-rw-r--r--drivers/iommu/arm/arm-smmu/qcom_iommu.c21
-rw-r--r--drivers/iommu/dma-iommu.c4
-rw-r--r--drivers/iommu/exynos-iommu.c20
-rw-r--r--drivers/iommu/fsl_pamu_domain.c12
-rw-r--r--drivers/iommu/generic_pt/.kunitconfig14
-rw-r--r--drivers/iommu/generic_pt/Kconfig79
-rw-r--r--drivers/iommu/generic_pt/fmt/Makefile28
-rw-r--r--drivers/iommu/generic_pt/fmt/amdv1.h411
-rw-r--r--drivers/iommu/generic_pt/fmt/defs_amdv1.h21
-rw-r--r--drivers/iommu/generic_pt/fmt/defs_vtdss.h21
-rw-r--r--drivers/iommu/generic_pt/fmt/defs_x86_64.h21
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_amdv1.c15
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_mock.c10
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_template.h48
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_vtdss.c10
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_x86_64.c11
-rw-r--r--drivers/iommu/generic_pt/fmt/vtdss.h285
-rw-r--r--drivers/iommu/generic_pt/fmt/x86_64.h279
-rw-r--r--drivers/iommu/generic_pt/iommu_pt.h1289
-rw-r--r--drivers/iommu/generic_pt/kunit_generic_pt.h823
-rw-r--r--drivers/iommu/generic_pt/kunit_iommu.h184
-rw-r--r--drivers/iommu/generic_pt/kunit_iommu_pt.h487
-rw-r--r--drivers/iommu/generic_pt/pt_common.h389
-rw-r--r--drivers/iommu/generic_pt/pt_defs.h332
-rw-r--r--drivers/iommu/generic_pt/pt_fmt_defaults.h295
-rw-r--r--drivers/iommu/generic_pt/pt_iter.h636
-rw-r--r--drivers/iommu/generic_pt/pt_log2.h122
-rw-r--r--drivers/iommu/intel/Kconfig6
-rw-r--r--drivers/iommu/intel/iommu.c931
-rw-r--r--drivers/iommu/intel/iommu.h99
-rw-r--r--drivers/iommu/intel/nested.c7
-rw-r--r--drivers/iommu/intel/pasid.c44
-rw-r--r--drivers/iommu/intel/pasid.h1
-rw-r--r--drivers/iommu/intel/svm.c1
-rw-r--r--drivers/iommu/io-pgtable-arm-selftests.c214
-rw-r--r--drivers/iommu/io-pgtable-arm.c203
-rw-r--r--drivers/iommu/io-pgtable.c4
-rw-r--r--drivers/iommu/iommu-pages.c136
-rw-r--r--drivers/iommu/iommu-pages.h51
-rw-r--r--drivers/iommu/iommu.c44
-rw-r--r--drivers/iommu/iommufd/Kconfig1
-rw-r--r--drivers/iommu/iommufd/io_pagetable.c78
-rw-r--r--drivers/iommu/iommufd/io_pagetable.h54
-rw-r--r--drivers/iommu/iommufd/ioas.c8
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h14
-rw-r--r--drivers/iommu/iommufd/iommufd_test.h21
-rw-r--r--drivers/iommu/iommufd/main.c10
-rw-r--r--drivers/iommu/iommufd/pages.c414
-rw-r--r--drivers/iommu/iommufd/selftest.c569
-rw-r--r--drivers/iommu/ipmmu-vmsa.c12
-rw-r--r--drivers/iommu/msm_iommu.c11
-rw-r--r--drivers/iommu/mtk_iommu.c174
-rw-r--r--drivers/iommu/mtk_iommu_v1.c35
-rw-r--r--drivers/iommu/omap-iommu.c19
-rw-r--r--drivers/iommu/omap-iommu.h2
-rw-r--r--drivers/iommu/riscv/iommu.c9
-rw-r--r--drivers/iommu/rockchip-iommu.c20
-rw-r--r--drivers/iommu/s390-iommu.c13
-rw-r--r--drivers/iommu/sprd-iommu.c3
-rw-r--r--drivers/iommu/sun50i-iommu.c10
-rw-r--r--drivers/iommu/tegra-smmu.c15
-rw-r--r--drivers/iommu/virtio-iommu.c6
-rw-r--r--drivers/net/ethernet/broadcom/bnge/Makefile3
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge.h10
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge_auxr.c258
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge_auxr.h84
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge_core.c18
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c40
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h2
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge_resc.c12
-rw-r--r--drivers/net/ethernet/broadcom/bnge/bnge_resc.h1
-rw-r--r--drivers/net/ethernet/broadcom/bnx2.c2
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c1
-rw-r--r--drivers/net/ethernet/broadcom/tg3.c1
-rw-r--r--drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c1
-rw-r--r--drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c2
-rw-r--r--drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c1
-rw-r--r--drivers/net/ethernet/intel/e1000e/netdev.c1
-rw-r--r--drivers/net/ethernet/intel/fm10k/fm10k_pci.c6
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c1
-rw-r--r--drivers/net/ethernet/intel/ice/ice_main.c2
-rw-r--r--drivers/net/ethernet/intel/igb/igb_main.c2
-rw-r--r--drivers/net/ethernet/intel/igc/igc_main.c2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/main.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c1
-rw-r--r--drivers/net/ethernet/meta/fbnic/fbnic_pci.c1
-rw-r--r--drivers/net/ethernet/microchip/lan743x_main.c1
-rw-r--r--drivers/net/ethernet/myricom/myri10ge/myri10ge.c4
-rw-r--r--drivers/net/ethernet/neterion/s2io.c1
-rw-r--r--drivers/pci/Makefile2
-rw-r--r--drivers/pci/bus.c3
-rw-r--r--drivers/pci/controller/Kconfig18
-rw-r--r--drivers/pci/controller/Makefile1
-rw-r--r--drivers/pci/controller/cadence/Kconfig21
-rw-r--r--drivers/pci/controller/cadence/Makefile11
-rw-r--r--drivers/pci/controller/cadence/pci-j721e.c33
-rw-r--r--drivers/pci/controller/cadence/pci-sky1.c238
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-host-common.c288
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-host-common.h46
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-host-hpa.c368
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-host.c278
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h193
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-hpa.c167
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-lga-regs.h230
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence-plat.c9
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence.c12
-rw-r--r--drivers/pci/controller/cadence/pcie-cadence.h409
-rw-r--r--drivers/pci/controller/cadence/pcie-sg2042.c3
-rw-r--r--drivers/pci/controller/dwc/Kconfig38
-rw-r--r--drivers/pci/controller/dwc/Makefile5
-rw-r--r--drivers/pci/controller/dwc/pci-keystone.c80
-rw-r--r--drivers/pci/controller/dwc/pci-meson.c18
-rw-r--r--drivers/pci/controller/dwc/pcie-designware-ep.c1
-rw-r--r--drivers/pci/controller/dwc/pcie-designware-host.c12
-rw-r--r--drivers/pci/controller/dwc/pcie-designware.c36
-rw-r--r--drivers/pci/controller/dwc/pcie-designware.h21
-rw-r--r--drivers/pci/controller/dwc/pcie-dw-rockchip.c63
-rw-r--r--drivers/pci/controller/dwc/pcie-nxp-s32g.c406
-rw-r--r--drivers/pci/controller/dwc/pcie-qcom.c32
-rw-r--r--drivers/pci/controller/dwc/pcie-spacemit-k1.c357
-rw-r--r--drivers/pci/controller/dwc/pcie-stm32-ep.c43
-rw-r--r--drivers/pci/controller/dwc/pcie-stm32.c14
-rw-r--r--drivers/pci/controller/dwc/pcie-stm32.h3
-rw-r--r--drivers/pci/controller/dwc/pcie-tegra194.c48
-rw-r--r--drivers/pci/controller/pci-host-common.c13
-rw-r--r--drivers/pci/controller/pci-host-common.h1
-rw-r--r--drivers/pci/controller/pci-hyperv.c62
-rw-r--r--drivers/pci/controller/pci-ixp4xx.c6
-rw-r--r--drivers/pci/controller/pcie-apple.c43
-rw-r--r--drivers/pci/controller/pcie-brcmstb.c209
-rw-r--r--drivers/pci/controller/pcie-mediatek.c113
-rw-r--r--drivers/pci/controller/pcie-rzg3s-host.c1761
-rw-r--r--drivers/pci/controller/vmd.c40
-rw-r--r--drivers/pci/endpoint/functions/pci-epf-test.c5
-rw-r--r--drivers/pci/endpoint/functions/pci-epf-vntb.c153
-rw-r--r--drivers/pci/endpoint/pci-epf-core.c159
-rw-r--r--drivers/pci/host-bridge.c1
-rw-r--r--drivers/pci/iov.c25
-rw-r--r--drivers/pci/p2pdma.c186
-rw-r--r--drivers/pci/pci-driver.c6
-rw-r--r--drivers/pci/pci-sysfs.c19
-rw-r--r--drivers/pci/pci.c172
-rw-r--r--drivers/pci/pci.h14
-rw-r--r--drivers/pci/pcie/portdrv.c1
-rw-r--r--drivers/pci/pcie/ptm.c23
-rw-r--r--drivers/pci/probe.c13
-rw-r--r--drivers/pci/pwrctrl/Kconfig15
-rw-r--r--drivers/pci/pwrctrl/Makefile2
-rw-r--r--drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c648
-rw-r--r--drivers/pci/rebar.c328
-rw-r--r--drivers/pci/setup-bus.c126
-rw-r--r--drivers/pci/setup-res.c78
-rw-r--r--drivers/power/reset/Kconfig9
-rw-r--r--drivers/power/reset/Makefile1
-rw-r--r--drivers/power/reset/spacemit-p1-reboot.c88
-rw-r--r--drivers/power/supply/Kconfig24
-rw-r--r--drivers/power/supply/Makefile2
-rw-r--r--drivers/power/supply/apm_power.c3
-rw-r--r--drivers/power/supply/bd71828-power.c1049
-rw-r--r--drivers/power/supply/cw2015_battery.c8
-rw-r--r--drivers/power/supply/max17040_battery.c6
-rw-r--r--drivers/power/supply/max77705_charger.c56
-rw-r--r--drivers/power/supply/qcom_battmgr.c14
-rw-r--r--drivers/power/supply/rt5033_charger.c2
-rw-r--r--drivers/power/supply/rt9467-charger.c6
-rw-r--r--drivers/power/supply/rt9756.c955
-rw-r--r--drivers/power/supply/wm831x_power.c10
-rw-r--r--drivers/ras/ras.c40
-rw-r--r--drivers/s390/cio/vfio_ccw_ops.c47
-rw-r--r--drivers/scsi/bfa/bfad.c1
-rw-r--r--drivers/scsi/csiostor/csio_init.c1
-rw-r--r--drivers/scsi/ipr.c1
-rw-r--r--drivers/scsi/lpfc/lpfc_init.c6
-rw-r--r--drivers/scsi/qla2xxx/qla_os.c5
-rw-r--r--drivers/scsi/qla4xxx/ql4_os.c5
-rw-r--r--drivers/tty/serial/8250/8250_pci.c1
-rw-r--r--drivers/tty/serial/jsm/jsm_driver.c1
-rw-r--r--drivers/vdpa/mlx5/net/mlx5_vnet.c2
-rw-r--r--drivers/vdpa/octeon_ep/octep_vdpa_main.c1
-rw-r--r--drivers/vdpa/pds/vdpa_dev.c2
-rw-r--r--drivers/vdpa/vdpa_user/vduse_dev.c3
-rw-r--r--drivers/vfio/cdx/main.c29
-rw-r--r--drivers/vfio/device_cdev.c2
-rw-r--r--drivers/vfio/fsl-mc/vfio_fsl_mc.c43
-rw-r--r--drivers/vfio/pci/Kconfig5
-rw-r--r--drivers/vfio/pci/Makefile3
-rw-r--r--drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c171
-rw-r--r--drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h23
-rw-r--r--drivers/vfio/pci/mlx5/main.c1
-rw-r--r--drivers/vfio/pci/nvgrace-gpu/main.c342
-rw-r--r--drivers/vfio/pci/pds/vfio_dev.c1
-rw-r--r--drivers/vfio/pci/qat/main.c1
-rw-r--r--drivers/vfio/pci/vfio_pci.c6
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c23
-rw-r--r--drivers/vfio/pci/vfio_pci_core.c300
-rw-r--r--drivers/vfio/pci/vfio_pci_dmabuf.c350
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c52
-rw-r--r--drivers/vfio/pci/vfio_pci_priv.h28
-rw-r--r--drivers/vfio/pci/virtio/common.h5
-rw-r--r--drivers/vfio/pci/virtio/legacy_io.c38
-rw-r--r--drivers/vfio/pci/virtio/main.c5
-rw-r--r--drivers/vfio/pci/xe/Kconfig12
-rw-r--r--drivers/vfio/pci/xe/Makefile3
-rw-r--r--drivers/vfio/pci/xe/main.c573
-rw-r--r--drivers/vfio/platform/vfio_amba.c1
-rw-r--r--drivers/vfio/platform/vfio_platform.c1
-rw-r--r--drivers/vfio/platform/vfio_platform_common.c40
-rw-r--r--drivers/vfio/platform/vfio_platform_private.h3
-rw-r--r--drivers/vfio/vfio_main.c51
-rw-r--r--drivers/vhost/net.c29
-rw-r--r--drivers/vhost/scsi.c9
-rw-r--r--drivers/vhost/test.c10
-rw-r--r--drivers/vhost/vhost.c4
-rw-r--r--drivers/vhost/vhost.h42
-rw-r--r--drivers/vhost/vsock.c10
-rw-r--r--drivers/virtio/virtio.c12
-rw-r--r--drivers/virtio/virtio_balloon.c3
-rw-r--r--drivers/virtio/virtio_debug.c10
-rw-r--r--drivers/virtio/virtio_pci_modern_dev.c6
-rw-r--r--drivers/virtio/virtio_ring.c7
-rw-r--r--drivers/virtio/virtio_vdpa.c2
385 files changed, 25519 insertions, 5785 deletions
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 97ee19f2cae0..56107aa00274 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -22,6 +22,7 @@
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/acpi.h>
+#include <linux/bitfield.h>
#include <linux/io.h>
#include <linux/interrupt.h>
#include <linux/timer.h>
@@ -552,26 +553,25 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
}
static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
- int sev, bool sync)
+ int sev, bool sync)
{
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
int flags = sync ? MF_ACTION_REQUIRED : 0;
+ char error_type[120];
bool queued = false;
int sec_sev, i;
char *p;
- log_arm_hw_error(err);
-
sec_sev = ghes_severity(gdata->error_severity);
+ log_arm_hw_error(err, sec_sev);
if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
return false;
p = (char *)(err + 1);
for (i = 0; i < err->err_info_num; i++) {
struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
- bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
+ bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR;
bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
- const char *error_type = "unknown error";
/*
* The field (err_info->error_info & BIT(26)) is fixed to set to
@@ -585,12 +585,15 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
continue;
}
- if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
- error_type = cper_proc_error_type_strs[err_info->type];
+ cper_bits_to_str(error_type, sizeof(error_type),
+ FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type),
+ cper_proc_error_type_strs,
+ ARRAY_SIZE(cper_proc_error_type_strs));
pr_warn_ratelimited(FW_WARN GHES_PFX
- "Unhandled processor error type: %s\n",
- error_type);
+ "Unhandled processor error type 0x%02x: %s%s\n",
+ err_info->type, error_type,
+ (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : "");
p += err_info->length;
}
@@ -895,11 +898,9 @@ static void ghes_do_proc(struct ghes *ghes,
arch_apei_report_mem_error(sev, mem_err);
queued = ghes_handle_memory_failure(gdata, sev, sync);
- }
- else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
+ } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
ghes_handle_aer(gdata);
- }
- else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
+ } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev, sync);
} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 11e4483685c9..77a81627aaef 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -910,12 +910,13 @@ static void hmat_register_target(struct memory_target *target)
* Register generic port perf numbers. The nid may not be
* initialized and is still NUMA_NO_NODE.
*/
- mutex_lock(&target_lock);
- if (*(u16 *)target->gen_port_device_handle) {
- hmat_update_generic_target(target);
- target->registered = true;
+ scoped_guard(mutex, &target_lock) {
+ if (*(u16 *)target->gen_port_device_handle) {
+ hmat_update_generic_target(target);
+ target->registered = true;
+ return;
+ }
}
- mutex_unlock(&target_lock);
hmat_hotplug_target(target);
}
diff --git a/drivers/amba/Kconfig b/drivers/amba/Kconfig
index fb6c7e0b4cce..14bb61ff801e 100644
--- a/drivers/amba/Kconfig
+++ b/drivers/amba/Kconfig
@@ -5,7 +5,7 @@ config ARM_AMBA
if ARM_AMBA
config TEGRA_AHB
- bool
+ bool "Enable AHB driver for NVIDIA Tegra SoCs" if COMPILE_TEST
default y if ARCH_TEGRA
help
Adds AHB configuration functionality for NVIDIA Tegra SoCs,
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index f48fb63d7e85..462217935558 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4216,6 +4216,10 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = {
/* Apacer models with LPM issues */
{ "Apacer AS340*", NULL, ATA_QUIRK_NOLPM },
+ /* Silicon Motion models with LPM issues */
+ { "MD619HXCLDE3TC", "TCVAID", ATA_QUIRK_NOLPM },
+ { "MD619GXCLDE3TC", "TCV35D", ATA_QUIRK_NOLPM },
+
/* These specific Samsung models/firmware-revs do not handle LPM well */
{ "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_QUIRK_NOLPM },
{ "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_QUIRK_NOLPM },
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 1e2a2c33cdc8..785b6e371abf 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -3191,7 +3191,8 @@ void ata_sff_port_init(struct ata_port *ap)
int __init ata_sff_init(void)
{
- ata_sff_wq = alloc_workqueue("ata_sff", WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
+ ata_sff_wq = alloc_workqueue("ata_sff", WQ_MEM_RECLAIM | WQ_PERCPU,
+ WQ_MAX_ACTIVE);
if (!ata_sff_wq)
return -ENOMEM;
diff --git a/drivers/ata/pata_it821x.c b/drivers/ata/pata_it821x.c
index 042f6ad1f7c6..fc762dcc61bf 100644
--- a/drivers/ata/pata_it821x.c
+++ b/drivers/ata/pata_it821x.c
@@ -75,6 +75,7 @@
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <scsi/scsi_host.h>
#include <linux/libata.h>
@@ -632,9 +633,9 @@ static void it821x_display_disk(struct ata_port *ap, int n, u8 *buf)
cbl = "";
if (mode)
- snprintf(mbuf, 8, "%5s%d", mtype, mode - 1);
+ snprintf(mbuf, sizeof(mbuf), "%5s%d", mtype, mode - 1);
else
- strcpy(mbuf, "PIO");
+ strscpy(mbuf, "PIO");
if (buf[52] == 4)
ata_port_info(ap, "%d: %-6s %-8s %s %s\n",
n, mbuf, types[buf[52]], id, cbl);
diff --git a/drivers/ata/pata_pcmcia.c b/drivers/ata/pata_pcmcia.c
index cf3810933a27..caefcd8c4b3c 100644
--- a/drivers/ata/pata_pcmcia.c
+++ b/drivers/ata/pata_pcmcia.c
@@ -344,6 +344,7 @@ static const struct pcmcia_device_id pcmcia_devices[] = {
PCMCIA_DEVICE_PROD_ID2("NinjaATA-", 0xebe0bd79),
PCMCIA_DEVICE_PROD_ID12("PCMCIA", "CD-ROM", 0x281f1c5d, 0x66536591),
PCMCIA_DEVICE_PROD_ID12("PCMCIA", "PnPIDE", 0x281f1c5d, 0x0c694728),
+ PCMCIA_DEVICE_PROD_ID2("PCMCIA ATA/ATAPI Adapter", 0x888d7b73),
PCMCIA_DEVICE_PROD_ID12("SHUTTLE TECHNOLOGY LTD.", "PCCARD-IDE/ATAPI Adapter", 0x4a3f0ba0, 0x322560e1),
PCMCIA_DEVICE_PROD_ID12("SEAGATE", "ST1", 0x87c1b330, 0xe1f30883),
PCMCIA_DEVICE_PROD_ID12("SAMSUNG", "04/05/06", 0x43d74cb4, 0x6a22777d),
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index e25daf2396d3..082b910ddf0d 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -231,42 +231,6 @@ struct tpm_chip *tpm_default_chip(void)
EXPORT_SYMBOL_GPL(tpm_default_chip);
/**
- * tpm_find_get_ops() - find and reserve a TPM chip
- * @chip: a &struct tpm_chip instance, %NULL for the default chip
- *
- * Finds a TPM chip and reserves its class device and operations. The chip must
- * be released with tpm_put_ops() after use.
- * This function is for internal use only. It supports existing TPM callers
- * by accepting NULL, but those callers should be converted to pass in a chip
- * directly.
- *
- * Return:
- * A reserved &struct tpm_chip instance.
- * %NULL if a chip is not found.
- * %NULL if the chip is not available.
- */
-struct tpm_chip *tpm_find_get_ops(struct tpm_chip *chip)
-{
- int rc;
-
- if (chip) {
- if (!tpm_try_get_ops(chip))
- return chip;
- return NULL;
- }
-
- chip = tpm_default_chip();
- if (!chip)
- return NULL;
- rc = tpm_try_get_ops(chip);
- /* release additional reference we got from tpm_default_chip() */
- put_device(&chip->dev);
- if (rc)
- return NULL;
- return chip;
-}
-
-/**
* tpm_dev_release() - free chip memory and the device number
* @dev: the character device for the TPM chip
*
@@ -282,7 +246,6 @@ static void tpm_dev_release(struct device *dev)
kfree(chip->work_space.context_buf);
kfree(chip->work_space.session_buf);
- kfree(chip->allocated_banks);
#ifdef CONFIG_TCG_TPM2_HMAC
kfree(chip->auth);
#endif
diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c
index f2a5e09257dd..f942c0c8e402 100644
--- a/drivers/char/tpm/tpm-dev-common.c
+++ b/drivers/char/tpm/tpm-dev-common.c
@@ -275,7 +275,8 @@ void tpm_common_release(struct file *file, struct file_priv *priv)
int __init tpm_dev_common_init(void)
{
- tpm_dev_wq = alloc_workqueue("tpm_dev_wq", WQ_MEM_RECLAIM, 0);
+ tpm_dev_wq = alloc_workqueue("tpm_dev_wq", WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
return !tpm_dev_wq ? -ENOMEM : 0;
}
diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index c9f173001d0e..f745a098908b 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -313,10 +313,13 @@ int tpm_is_tpm2(struct tpm_chip *chip)
{
int rc;
- chip = tpm_find_get_ops(chip);
if (!chip)
return -ENODEV;
+ rc = tpm_try_get_ops(chip);
+ if (rc)
+ return rc;
+
rc = (chip->flags & TPM_CHIP_FLAG_TPM2) != 0;
tpm_put_ops(chip);
@@ -338,10 +341,13 @@ int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
{
int rc;
- chip = tpm_find_get_ops(chip);
if (!chip)
return -ENODEV;
+ rc = tpm_try_get_ops(chip);
+ if (rc)
+ return rc;
+
if (chip->flags & TPM_CHIP_FLAG_TPM2)
rc = tpm2_pcr_read(chip, pcr_idx, digest, NULL);
else
@@ -369,10 +375,13 @@ int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
int rc;
int i;
- chip = tpm_find_get_ops(chip);
if (!chip)
return -ENODEV;
+ rc = tpm_try_get_ops(chip);
+ if (rc)
+ return rc;
+
for (i = 0; i < chip->nr_allocated_banks; i++) {
if (digests[i].alg_id != chip->allocated_banks[i].alg_id) {
rc = -EINVAL;
@@ -492,10 +501,13 @@ int tpm_get_random(struct tpm_chip *chip, u8 *out, size_t max)
if (!out || max > TPM_MAX_RNG_DATA)
return -EINVAL;
- chip = tpm_find_get_ops(chip);
if (!chip)
return -ENODEV;
+ rc = tpm_try_get_ops(chip);
+ if (rc)
+ return rc;
+
if (chip->flags & TPM_CHIP_FLAG_TPM2)
rc = tpm2_get_random(chip, out, max);
else
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 2726bd38e5ac..02c07fef41ba 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -267,7 +267,6 @@ static inline void tpm_msleep(unsigned int delay_msec)
int tpm_chip_bootstrap(struct tpm_chip *chip);
int tpm_chip_start(struct tpm_chip *chip);
void tpm_chip_stop(struct tpm_chip *chip);
-struct tpm_chip *tpm_find_get_ops(struct tpm_chip *chip);
struct tpm_chip *tpm_chip_alloc(struct device *dev,
const struct tpm_class_ops *ops);
diff --git a/drivers/char/tpm/tpm1-cmd.c b/drivers/char/tpm/tpm1-cmd.c
index cf64c7385105..b49a790f1bd5 100644
--- a/drivers/char/tpm/tpm1-cmd.c
+++ b/drivers/char/tpm/tpm1-cmd.c
@@ -799,11 +799,6 @@ int tpm1_pm_suspend(struct tpm_chip *chip, u32 tpm_suspend_pcr)
*/
int tpm1_get_pcr_allocation(struct tpm_chip *chip)
{
- chip->allocated_banks = kcalloc(1, sizeof(*chip->allocated_banks),
- GFP_KERNEL);
- if (!chip->allocated_banks)
- return -ENOMEM;
-
chip->allocated_banks[0].alg_id = TPM_ALG_SHA1;
chip->allocated_banks[0].digest_size = hash_digest_size[HASH_ALGO_SHA1];
chip->allocated_banks[0].crypto_id = HASH_ALGO_SHA1;
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 5532e53a2dd3..dd502322f499 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -550,11 +550,9 @@ ssize_t tpm2_get_pcr_allocation(struct tpm_chip *chip)
nr_possible_banks = be32_to_cpup(
(__be32 *)&buf.data[TPM_HEADER_SIZE + 5]);
-
- chip->allocated_banks = kcalloc(nr_possible_banks,
- sizeof(*chip->allocated_banks),
- GFP_KERNEL);
- if (!chip->allocated_banks) {
+ if (nr_possible_banks > TPM2_MAX_PCR_BANKS) {
+ pr_err("tpm: out of bank capacity: %u > %u\n",
+ nr_possible_banks, TPM2_MAX_PCR_BANKS);
rc = -ENOMEM;
goto out;
}
diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c
index c75a531cfb98..6c25305c256e 100644
--- a/drivers/char/tpm/tpm_crb.c
+++ b/drivers/char/tpm/tpm_crb.c
@@ -179,6 +179,7 @@ static int crb_try_pluton_doorbell(struct crb_priv *priv, bool wait_for_complete
*
* @dev: crb device
* @priv: crb private data
+ * @loc: locality
*
* Write CRB_CTRL_REQ_GO_IDLE to TPM_CRB_CTRL_REQ
* The device should respond within TIMEOUT_C by clearing the bit.
@@ -233,6 +234,7 @@ static int crb_go_idle(struct tpm_chip *chip)
*
* @dev: crb device
* @priv: crb private data
+ * @loc: locality
*
* Write CRB_CTRL_REQ_CMD_READY to TPM_CRB_CTRL_REQ
* and poll till the device acknowledge it by clearing the bit.
@@ -412,7 +414,7 @@ static int crb_do_acpi_start(struct tpm_chip *chip)
#ifdef CONFIG_ARM64
/*
* This is a TPM Command Response Buffer start method that invokes a
- * Secure Monitor Call to requrest the firmware to execute or cancel
+ * Secure Monitor Call to request the firmware to execute or cancel
* a TPM 2.0 command.
*/
static int tpm_crb_smc_start(struct device *dev, unsigned long func_id)
diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index 8954a8660ffc..e2a1769081b1 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -265,8 +265,7 @@ static u8 tpm_tis_status(struct tpm_chip *chip)
/*
* Dump stack for forensics, as invalid TPM_STS.x could be
- * potentially triggered by impaired tpm_try_get_ops() or
- * tpm_find_get_ops().
+ * potentially triggered by impaired tpm_try_get_ops().
*/
dump_stack();
}
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index be25ecbdba69..f8bfff5dd0bd 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3032,11 +3032,36 @@ static void qm_put_pci_res(struct hisi_qm *qm)
pci_release_mem_regions(pdev);
}
+static void hisi_mig_region_clear(struct hisi_qm *qm)
+{
+ u32 val;
+
+ /* Clear migration region set of PF */
+ if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) {
+ val = readl(qm->io_base + QM_MIG_REGION_SEL);
+ val &= ~QM_MIG_REGION_EN;
+ writel(val, qm->io_base + QM_MIG_REGION_SEL);
+ }
+}
+
+static void hisi_mig_region_enable(struct hisi_qm *qm)
+{
+ u32 val;
+
+ /* Select migration region of PF */
+ if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) {
+ val = readl(qm->io_base + QM_MIG_REGION_SEL);
+ val |= QM_MIG_REGION_EN;
+ writel(val, qm->io_base + QM_MIG_REGION_SEL);
+ }
+}
+
static void hisi_qm_pci_uninit(struct hisi_qm *qm)
{
struct pci_dev *pdev = qm->pdev;
pci_free_irq_vectors(pdev);
+ hisi_mig_region_clear(qm);
qm_put_pci_res(qm);
pci_disable_device(pdev);
}
@@ -5752,6 +5777,7 @@ int hisi_qm_init(struct hisi_qm *qm)
goto err_free_qm_memory;
qm_cmd_init(qm);
+ hisi_mig_region_enable(qm);
return 0;
@@ -5890,6 +5916,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm)
}
qm_cmd_init(qm);
+ hisi_mig_region_enable(qm);
hisi_qm_dev_err_init(qm);
/* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */
writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_aer.c b/drivers/crypto/intel/qat/qat_common/adf_aer.c
index 667d5e320f50..11728cf32653 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_aer.c
@@ -105,7 +105,6 @@ void adf_dev_restore(struct adf_accel_dev *accel_dev)
accel_dev->accel_id);
hw_device->reset_device(accel_dev);
pci_restore_state(pdev);
- pci_save_state(pdev);
}
}
@@ -204,7 +203,6 @@ static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev)
if (!pdev->is_busmaster)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
res = adf_dev_up(accel_dev, false);
if (res && res != -EALREADY)
return PCI_ERS_RESULT_DISCONNECT;
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index bd2e282ca93a..77ac940e3013 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -11,25 +11,36 @@
#include "cxlpci.h"
#include "cxl.h"
-struct cxl_cxims_data {
- int nr_maps;
- u64 xormaps[] __counted_by(nr_maps);
-};
-
static const guid_t acpi_cxl_qtg_id_guid =
GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071,
0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52);
-static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr)
+#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1)
+static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = {
+ [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4
+};
+
+static const int valid_hbiw[] = { 1, 2, 3, 4, 6, 8, 12, 16 };
+
+u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw)
{
- struct cxl_cxims_data *cximsd = cxlrd->platform_data;
- int hbiw = cxlrd->cxlsd.nr_targets;
+ int nr_maps_to_apply = -1;
u64 val;
int pos;
- /* No xormaps for host bridge interleave ways of 1 or 3 */
- if (hbiw == 1 || hbiw == 3)
- return addr;
+ /*
+ * Strictly validate hbiw since this function is used for testing and
+ * that nullifies any expectation of trusted parameters from the CXL
+ * Region Driver.
+ */
+ for (int i = 0; i < ARRAY_SIZE(valid_hbiw); i++) {
+ if (valid_hbiw[i] == hbiw) {
+ nr_maps_to_apply = hbiw_to_nr_maps[hbiw];
+ break;
+ }
+ }
+ if (nr_maps_to_apply == -1 || nr_maps_to_apply > cximsd->nr_maps)
+ return ULLONG_MAX;
/*
* In regions using XOR interleave arithmetic the CXL HPA may not
@@ -60,6 +71,14 @@ static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr)
return addr;
}
+EXPORT_SYMBOL_FOR_MODULES(cxl_do_xormap_calc, "cxl_translate");
+
+static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr)
+{
+ struct cxl_cxims_data *cximsd = cxlrd->platform_data;
+
+ return cxl_do_xormap_calc(cximsd, addr, cxlrd->cxlsd.nr_targets);
+}
struct cxl_cxims_context {
struct device *dev;
@@ -353,7 +372,7 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd)
rc = hmat_get_extended_linear_cache_size(&res, nid, &cache_size);
if (rc)
- return rc;
+ return 0;
/*
* The cache range is expected to be within the CFMWS.
@@ -378,21 +397,18 @@ static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd)
int rc;
rc = cxl_acpi_set_cache_size(cxlrd);
- if (!rc)
- return;
-
- if (rc != -EOPNOTSUPP) {
+ if (rc) {
/*
- * Failing to support extended linear cache region resize does not
+ * Failing to retrieve extended linear cache region resize does not
* prevent the region from functioning. Only causes cxl list showing
* incorrect region size.
*/
dev_warn(cxlrd->cxlsd.cxld.dev.parent,
- "Extended linear cache calculation failed rc:%d\n", rc);
- }
+ "Extended linear cache retrieval failed rc:%d\n", rc);
- /* Ignoring return code */
- cxlrd->cache_size = 0;
+ /* Ignoring return code */
+ cxlrd->cache_size = 0;
+ }
}
DEFINE_FREE(put_cxlrd, struct cxl_root_decoder *,
@@ -453,8 +469,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
ig = CXL_DECODER_MIN_GRANULARITY;
cxld->interleave_granularity = ig;
- cxl_setup_extended_linear_cache(cxlrd);
-
if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) {
if (ways != 1 && ways != 3) {
cxims_ctx = (struct cxl_cxims_context) {
@@ -470,18 +484,13 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
return -EINVAL;
}
}
+ cxlrd->ops.hpa_to_spa = cxl_apply_xor_maps;
+ cxlrd->ops.spa_to_hpa = cxl_apply_xor_maps;
}
- cxlrd->qos_class = cfmws->qtg_id;
-
- if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) {
- cxlrd->ops = kzalloc(sizeof(*cxlrd->ops), GFP_KERNEL);
- if (!cxlrd->ops)
- return -ENOMEM;
+ cxl_setup_extended_linear_cache(cxlrd);
- cxlrd->ops->hpa_to_spa = cxl_apply_xor_maps;
- cxlrd->ops->spa_to_hpa = cxl_apply_xor_maps;
- }
+ cxlrd->qos_class = cfmws->qtg_id;
rc = cxl_decoder_add(cxld);
if (rc)
diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index c4bd6e8a0cf0..7120b5f2e31f 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -826,7 +826,7 @@ static struct xarray *cxl_switch_gather_bandwidth(struct cxl_region *cxlr,
cxl_coordinates_combine(coords, coords, ctx->coord);
/*
- * Take the min of the calculated bandwdith and the upstream
+ * Take the min of the calculated bandwidth and the upstream
* switch SSLBIS bandwidth if there's a parent switch
*/
if (!is_root)
@@ -949,7 +949,7 @@ static struct xarray *cxl_hb_gather_bandwidth(struct xarray *xa)
/**
* cxl_region_update_bandwidth - Update the bandwidth access coordinates of a region
* @cxlr: The region being operated on
- * @input_xa: xarray holds cxl_perf_ctx wht calculated bandwidth per ACPI0017 instance
+ * @input_xa: xarray holds cxl_perf_ctx with calculated bandwidth per ACPI0017 instance
*/
static void cxl_region_update_bandwidth(struct cxl_region *cxlr,
struct xarray *input_xa)
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index d3a094ca01ad..1c5d2022c87a 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -905,6 +905,9 @@ static void cxl_decoder_reset(struct cxl_decoder *cxld)
if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
return;
+ if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags))
+ return;
+
if (port->commit_end == id)
cxl_port_commit_reap(cxld);
else
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 18825e1505d6..5b023a0178a4 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -71,85 +71,6 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
}
EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL");
-struct cxl_walk_context {
- struct pci_bus *bus;
- struct cxl_port *port;
- int type;
- int error;
- int count;
-};
-
-static int match_add_dports(struct pci_dev *pdev, void *data)
-{
- struct cxl_walk_context *ctx = data;
- struct cxl_port *port = ctx->port;
- int type = pci_pcie_type(pdev);
- struct cxl_register_map map;
- struct cxl_dport *dport;
- u32 lnkcap, port_num;
- int rc;
-
- if (pdev->bus != ctx->bus)
- return 0;
- if (!pci_is_pcie(pdev))
- return 0;
- if (type != ctx->type)
- return 0;
- if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP,
- &lnkcap))
- return 0;
-
- rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
- if (rc)
- dev_dbg(&port->dev, "failed to find component registers\n");
-
- port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap);
- dport = devm_cxl_add_dport(port, &pdev->dev, port_num, map.resource);
- if (IS_ERR(dport)) {
- ctx->error = PTR_ERR(dport);
- return PTR_ERR(dport);
- }
- ctx->count++;
-
- return 0;
-}
-
-/**
- * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port
- * @port: cxl_port whose ->uport_dev is the upstream of dports to be enumerated
- *
- * Returns a positive number of dports enumerated or a negative error
- * code.
- */
-int devm_cxl_port_enumerate_dports(struct cxl_port *port)
-{
- struct pci_bus *bus = cxl_port_to_pci_bus(port);
- struct cxl_walk_context ctx;
- int type;
-
- if (!bus)
- return -ENXIO;
-
- if (pci_is_root_bus(bus))
- type = PCI_EXP_TYPE_ROOT_PORT;
- else
- type = PCI_EXP_TYPE_DOWNSTREAM;
-
- ctx = (struct cxl_walk_context) {
- .port = port,
- .bus = bus,
- .type = type,
- };
- pci_walk_bus(bus, match_add_dports, &ctx);
-
- if (ctx.count == 0)
- return -ENODEV;
- if (ctx.error)
- return ctx.error;
- return ctx.count;
-}
-EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, "CXL");
-
static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id)
{
struct pci_dev *pdev = to_pci_dev(cxlds->dev);
@@ -1217,6 +1138,14 @@ int cxl_gpf_port_setup(struct cxl_dport *dport)
return 0;
}
+struct cxl_walk_context {
+ struct pci_bus *bus;
+ struct cxl_port *port;
+ int type;
+ int error;
+ int count;
+};
+
static int count_dports(struct pci_dev *pdev, void *data)
{
struct cxl_walk_context *ctx = data;
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 8128fd2b5b31..fef3aa0c6680 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -459,7 +459,6 @@ static void cxl_root_decoder_release(struct device *dev)
if (atomic_read(&cxlrd->region_id) >= 0)
memregion_free(atomic_read(&cxlrd->region_id));
__cxl_decoder_release(&cxlrd->cxlsd.cxld);
- kfree(cxlrd->ops);
kfree(cxlrd);
}
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 41b64d871c5a..82d229c8f9bf 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -245,6 +245,9 @@ static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
struct cxl_region_params *p = &cxlr->params;
int i;
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return;
+
/*
* Before region teardown attempt to flush, evict any data cached for
* this region, or scream loudly about missing arch / platform support
@@ -419,6 +422,9 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
return len;
}
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return -EPERM;
+
rc = queue_reset(cxlr);
if (rc)
return rc;
@@ -461,21 +467,6 @@ static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(commit);
-static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
- int n)
-{
- struct device *dev = kobj_to_dev(kobj);
- struct cxl_region *cxlr = to_cxl_region(dev);
-
- /*
- * Support tooling that expects to find a 'uuid' attribute for all
- * regions regardless of mode.
- */
- if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
- return 0444;
- return a->mode;
-}
-
static ssize_t interleave_ways_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -754,6 +745,21 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(size);
+static ssize_t extended_linear_cache_size_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ return sysfs_emit(buf, "%#llx\n", p->cache_size);
+}
+static DEVICE_ATTR_RO(extended_linear_cache_size);
+
static struct attribute *cxl_region_attrs[] = {
&dev_attr_uuid.attr,
&dev_attr_commit.attr,
@@ -762,9 +768,34 @@ static struct attribute *cxl_region_attrs[] = {
&dev_attr_resource.attr,
&dev_attr_size.attr,
&dev_attr_mode.attr,
+ &dev_attr_extended_linear_cache_size.attr,
NULL,
};
+static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_region *cxlr = to_cxl_region(dev);
+
+ /*
+ * Support tooling that expects to find a 'uuid' attribute for all
+ * regions regardless of mode.
+ */
+ if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
+ return 0444;
+
+ /*
+ * Don't display extended linear cache attribute if there is no
+ * extended linear cache.
+ */
+ if (a == &dev_attr_extended_linear_cache_size.attr &&
+ cxlr->params.cache_size == 0)
+ return 0;
+
+ return a->mode;
+}
+
static const struct attribute_group cxl_region_group = {
.attrs = cxl_region_attrs,
.is_visible = cxl_region_visible,
@@ -838,16 +869,16 @@ static int match_free_decoder(struct device *dev, const void *data)
return 1;
}
-static bool region_res_match_cxl_range(const struct cxl_region_params *p,
- const struct range *range)
+static bool spa_maps_hpa(const struct cxl_region_params *p,
+ const struct range *range)
{
if (!p->res)
return false;
/*
- * If an extended linear cache region then the CXL range is assumed
- * to be fronted by the DRAM range in current known implementation.
- * This assumption will be made until a variant implementation exists.
+ * The extended linear cache region is constructed by a 1:1 ratio
+ * where the SPA maps equal amounts of DRAM and CXL HPA capacity with
+ * CXL decoders at the high end of the SPA range.
*/
return p->res->start + p->cache_size == range->start &&
p->res->end == range->end;
@@ -865,7 +896,7 @@ static int match_auto_decoder(struct device *dev, const void *data)
cxld = to_cxl_decoder(dev);
r = &cxld->hpa_range;
- if (region_res_match_cxl_range(p, r))
+ if (spa_maps_hpa(p, r))
return 1;
return 0;
@@ -1059,6 +1090,16 @@ static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr,
return 0;
}
+static void cxl_region_set_lock(struct cxl_region *cxlr,
+ struct cxl_decoder *cxld)
+{
+ if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags))
+ return;
+
+ set_bit(CXL_REGION_F_LOCK, &cxlr->flags);
+ clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+}
+
/**
* cxl_port_attach_region() - track a region's interest in a port by endpoint
* @port: port to add a new region reference 'struct cxl_region_ref'
@@ -1170,6 +1211,8 @@ static int cxl_port_attach_region(struct cxl_port *port,
}
}
+ cxl_region_set_lock(cxlr, cxld);
+
rc = cxl_rr_ep_add(cxl_rr, cxled);
if (rc) {
dev_dbg(&cxlr->dev,
@@ -1328,7 +1371,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
struct cxl_endpoint_decoder *cxled)
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
- int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
+ int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos;
struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
@@ -1465,7 +1508,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
if (cxld->interleave_ways != iw ||
(iw > 1 && cxld->interleave_granularity != ig) ||
- !region_res_match_cxl_range(p, &cxld->hpa_range) ||
+ !spa_maps_hpa(p, &cxld->hpa_range) ||
((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
dev_err(&cxlr->dev,
"%s:%s %s expected iw: %d ig: %d %pr\n",
@@ -1520,9 +1563,8 @@ add_target:
cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id;
}
- inc = 1;
+ cxl_rr->nr_targets_set++;
out_target_set:
- cxl_rr->nr_targets_set += inc;
dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
dev_name(port->uport_dev), dev_name(&port->dev),
cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev),
@@ -2439,6 +2481,7 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
dev->bus = &cxl_bus_type;
dev->type = &cxl_region_type;
cxlr->id = id;
+ cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld);
return cxlr;
}
@@ -2924,38 +2967,119 @@ static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos)
return false;
}
-static bool has_hpa_to_spa(struct cxl_root_decoder *cxlrd)
+#define CXL_POS_ZERO 0
+/**
+ * cxl_validate_translation_params
+ * @eiw: encoded interleave ways
+ * @eig: encoded interleave granularity
+ * @pos: position in interleave
+ *
+ * Callers pass CXL_POS_ZERO when no position parameter needs validating.
+ *
+ * Returns: 0 on success, -EINVAL on first invalid parameter
+ */
+int cxl_validate_translation_params(u8 eiw, u16 eig, int pos)
{
- return cxlrd->ops && cxlrd->ops->hpa_to_spa;
+ int ways, gran;
+
+ if (eiw_to_ways(eiw, &ways)) {
+ pr_debug("%s: invalid eiw=%u\n", __func__, eiw);
+ return -EINVAL;
+ }
+ if (eig_to_granularity(eig, &gran)) {
+ pr_debug("%s: invalid eig=%u\n", __func__, eig);
+ return -EINVAL;
+ }
+ if (pos < 0 || pos >= ways) {
+ pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos,
+ ways);
+ return -EINVAL;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate");
-static bool has_spa_to_hpa(struct cxl_root_decoder *cxlrd)
+u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig)
{
- return cxlrd->ops && cxlrd->ops->spa_to_hpa;
+ u64 dpa_offset, bits_lower, bits_upper, temp;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ULLONG_MAX;
+
+ /*
+ * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
+ * Lower bits [IG+7:0] pass through unchanged
+ * (eiw < 8)
+ * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
+ * Clear the position bits to isolate upper section, then
+ * reverse the left shift by eiw that occurred during DPA->HPA
+ * (eiw >= 8)
+ * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
+ * Extract upper bits from the correct bit range and divide by 3
+ * to recover the original DPA upper bits
+ */
+ bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
+ if (eiw < 8) {
+ temp = hpa_offset &= ~GENMASK_ULL(eig + eiw + 8 - 1, 0);
+ dpa_offset = temp >> eiw;
+ } else {
+ bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
+ dpa_offset = bits_upper << (eig + 8);
+ }
+ dpa_offset |= bits_lower;
+
+ return dpa_offset;
}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate");
-u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
- u64 dpa)
+int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig)
{
- struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
- u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa;
- struct cxl_region_params *p = &cxlr->params;
- struct cxl_endpoint_decoder *cxled = NULL;
- u16 eig = 0;
- u8 eiw = 0;
- int pos;
+ unsigned int ways = 0;
+ u64 shifted, rem;
+ int pos, ret;
- for (int i = 0; i < p->nr_targets; i++) {
- cxled = p->targets[i];
- if (cxlmd == cxled_to_memdev(cxled))
- break;
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ret;
+
+ if (!eiw)
+ /* position is 0 if no interleaving */
+ return 0;
+
+ /*
+ * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
+ * eiw < 8
+ * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
+ * Per spec "remove IW bits starting with bit position IG+8"
+ * eiw >= 8
+ * Position is not explicitly stored in HPA_OFFSET bits. It is
+ * derived from the modulo operation of the upper bits using
+ * the total number of interleave ways.
+ */
+ if (eiw < 8) {
+ pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
+ } else {
+ shifted = hpa_offset >> (eig + 8);
+ eiw_to_ways(eiw, &ways);
+ div64_u64_rem(shifted, ways, &rem);
+ pos = rem;
}
- if (!cxled || cxlmd != cxled_to_memdev(cxled))
- return ULLONG_MAX;
- pos = cxled->pos;
- ways_to_eiw(p->interleave_ways, &eiw);
- granularity_to_eig(p->interleave_granularity, &eig);
+ return pos;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate");
+
+u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig)
+{
+ u64 mask_upper, hpa_offset, bits_upper;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, pos);
+ if (ret)
+ return ULLONG_MAX;
/*
* The device position in the region interleave set was removed
@@ -2967,9 +3091,6 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
* 8.2.4.19.13 Implementation Note: Device Decode Logic
*/
- /* Remove the dpa base */
- dpa_offset = dpa - cxl_dpa_resource_start(cxled);
-
mask_upper = GENMASK_ULL(51, eig + 8);
if (eiw < 8) {
@@ -2984,12 +3105,43 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
/* The lower bits remain unchanged */
hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
+ return hpa_offset;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate");
+
+u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
+ u64 dpa)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled = NULL;
+ u64 dpa_offset, hpa_offset, hpa;
+ u16 eig = 0;
+ u8 eiw = 0;
+ int pos;
+
+ for (int i = 0; i < p->nr_targets; i++) {
+ if (cxlmd == cxled_to_memdev(p->targets[i])) {
+ cxled = p->targets[i];
+ break;
+ }
+ }
+ if (!cxled)
+ return ULLONG_MAX;
+
+ pos = cxled->pos;
+ ways_to_eiw(p->interleave_ways, &eiw);
+ granularity_to_eig(p->interleave_granularity, &eig);
+
+ dpa_offset = dpa - cxl_dpa_resource_start(cxled);
+ hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig);
+
/* Apply the hpa_offset to the region base address */
hpa = hpa_offset + p->res->start + p->cache_size;
/* Root decoder translation overrides typical modulo decode */
- if (has_hpa_to_spa(cxlrd))
- hpa = cxlrd->ops->hpa_to_spa(cxlrd, hpa);
+ if (cxlrd->ops.hpa_to_spa)
+ hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa);
if (!cxl_resource_contains_addr(p->res, hpa)) {
dev_dbg(&cxlr->dev,
@@ -2998,7 +3150,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
}
/* Simple chunk check, by pos & gran, only applies to modulo decodes */
- if (!has_hpa_to_spa(cxlrd) && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos)))
+ if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos))
return ULLONG_MAX;
return hpa;
@@ -3016,8 +3168,6 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
struct cxl_endpoint_decoder *cxled;
u64 hpa, hpa_offset, dpa_offset;
- u64 bits_upper, bits_lower;
- u64 shifted, rem, temp;
u16 eig = 0;
u8 eiw = 0;
int pos;
@@ -3033,56 +3183,21 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
* If the root decoder has SPA to CXL HPA callback, use it. Otherwise
* CXL HPA is assumed to equal SPA.
*/
- if (has_spa_to_hpa(cxlrd)) {
- hpa = cxlrd->ops->spa_to_hpa(cxlrd, p->res->start + offset);
+ if (cxlrd->ops.spa_to_hpa) {
+ hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
hpa_offset = hpa - p->res->start;
} else {
hpa_offset = offset;
}
- /*
- * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
- * eiw < 8
- * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
- * Per spec "remove IW bits starting with bit position IG+8"
- * eiw >= 8
- * Position is not explicitly stored in HPA_OFFSET bits. It is
- * derived from the modulo operation of the upper bits using
- * the total number of interleave ways.
- */
- if (eiw < 8) {
- pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
- } else {
- shifted = hpa_offset >> (eig + 8);
- div64_u64_rem(shifted, p->interleave_ways, &rem);
- pos = rem;
- }
+
+ pos = cxl_calculate_position(hpa_offset, eiw, eig);
if (pos < 0 || pos >= p->nr_targets) {
dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n",
pos, p->nr_targets);
return -ENXIO;
}
- /*
- * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
- * Lower bits [IG+7:0] pass through unchanged
- * (eiw < 8)
- * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
- * Clear the position bits to isolate upper section, then
- * reverse the left shift by eiw that occurred during DPA->HPA
- * (eiw >= 8)
- * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
- * Extract upper bits from the correct bit range and divide by 3
- * to recover the original DPA upper bits
- */
- bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
- if (eiw < 8) {
- temp = hpa_offset &= ~((u64)GENMASK(eig + eiw + 8 - 1, 0));
- dpa_offset = temp >> eiw;
- } else {
- bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
- dpa_offset = bits_upper << (eig + 8);
- }
- dpa_offset |= bits_lower;
+ dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig);
/* Look-up and return the result: a memdev and a DPA */
for (int i = 0; i < p->nr_targets; i++) {
@@ -3398,7 +3513,7 @@ static int match_region_by_range(struct device *dev, const void *data)
p = &cxlr->params;
guard(rwsem_read)(&cxl_rwsem.region);
- return region_res_match_cxl_range(p, r);
+ return spa_maps_hpa(p, r);
}
static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
@@ -3479,6 +3594,10 @@ static int __construct_region(struct cxl_region *cxlr,
"Extended linear cache calculation failed rc:%d\n", rc);
}
+ rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group);
+ if (rc)
+ return rc;
+
rc = insert_resource(cxlrd->res, res);
if (rc) {
/*
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 231ddccf8977..ba17fa86d249 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -451,7 +451,7 @@ struct cxl_root_decoder {
void *platform_data;
struct mutex range_lock;
int qos_class;
- struct cxl_rd_ops *ops;
+ struct cxl_rd_ops ops;
struct cxl_switch_decoder cxlsd;
};
@@ -517,6 +517,14 @@ enum cxl_partition_mode {
*/
#define CXL_REGION_F_NEEDS_RESET 1
+/*
+ * Indicate whether this region is locked due to 1 or more decoders that have
+ * been locked. The approach of all or nothing is taken with regard to the
+ * locked attribute. CXL_REGION_F_NEEDS_RESET should not be set if this flag is
+ * set.
+ */
+#define CXL_REGION_F_LOCK 2
+
/**
* struct cxl_region - CXL region
* @dev: This region's device
@@ -738,6 +746,25 @@ static inline bool is_cxl_root(struct cxl_port *port)
return port->uport_dev == port->dev.parent;
}
+/* Address translation functions exported to cxl_translate test module only */
+int cxl_validate_translation_params(u8 eiw, u16 eig, int pos);
+u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig);
+u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig);
+int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig);
+struct cxl_cxims_data {
+ int nr_maps;
+ u64 xormaps[] __counted_by(nr_maps);
+};
+
+#if IS_ENABLED(CONFIG_CXL_ACPI)
+u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw);
+#else
+static inline u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw)
+{
+ return ULLONG_MAX;
+}
+#endif
+
int cxl_num_decoders_committed(struct cxl_port *port);
bool is_cxl_port(const struct device *dev);
struct cxl_port *to_cxl_port(const struct device *dev);
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 7ae621e618e7..1d526bea8431 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -127,7 +127,6 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev)
return lnksta2 & PCI_EXP_LNKSTA2_FLIT;
}
-int devm_cxl_port_enumerate_dports(struct cxl_port *port);
struct cxl_dev_state;
void read_cdat_data(struct cxl_port *port);
void cxl_cor_error_detected(struct pci_dev *pdev);
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index bd100ac31672..0be4e508affe 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -136,7 +136,7 @@ static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
if (opcode == CXL_MBOX_OP_SANITIZE) {
mutex_lock(&cxl_mbox->mbox_mutex);
if (mds->security.sanitize_node)
- mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
+ mod_delayed_work(system_percpu_wq, &mds->security.poll_dwork, 0);
mutex_unlock(&cxl_mbox->mbox_mutex);
} else {
/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile
index 70ec901edf2c..2008fb7481b3 100644
--- a/drivers/dma-buf/Makefile
+++ b/drivers/dma-buf/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \
- dma-fence-unwrap.o dma-resv.o
+ dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o
obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o
obj-$(CONFIG_DMABUF_HEAPS) += heaps/
obj-$(CONFIG_SYNC_FILE) += sync_file.o
diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c
new file mode 100644
index 000000000000..b7352e609fbd
--- /dev/null
+++ b/drivers/dma-buf/dma-buf-mapping.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * DMA BUF Mapping Helpers
+ *
+ */
+#include <linux/dma-buf-mapping.h>
+#include <linux/dma-resv.h>
+
+static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length,
+ dma_addr_t addr)
+{
+ unsigned int len, nents;
+ int i;
+
+ nents = DIV_ROUND_UP(length, UINT_MAX);
+ for (i = 0; i < nents; i++) {
+ len = min_t(size_t, length, UINT_MAX);
+ length -= len;
+ /*
+ * DMABUF abuses scatterlist to create a scatterlist
+ * that does not have any CPU list, only the DMA list.
+ * Always set the page related values to NULL to ensure
+ * importers can't use it. The phys_addr based DMA API
+ * does not require the CPU list for mapping or unmapping.
+ */
+ sg_set_page(sgl, NULL, 0, 0);
+ sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX;
+ sg_dma_len(sgl) = len;
+ sgl = sg_next(sgl);
+ }
+
+ return sgl;
+}
+
+static unsigned int calc_sg_nents(struct dma_iova_state *state,
+ struct dma_buf_phys_vec *phys_vec,
+ size_t nr_ranges, size_t size)
+{
+ unsigned int nents = 0;
+ size_t i;
+
+ if (!state || !dma_use_iova(state)) {
+ for (i = 0; i < nr_ranges; i++)
+ nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);
+ } else {
+ /*
+ * In IOVA case, there is only one SG entry which spans
+ * for whole IOVA address space, but we need to make sure
+ * that it fits sg->length, maybe we need more.
+ */
+ nents = DIV_ROUND_UP(size, UINT_MAX);
+ }
+
+ return nents;
+}
+
+/**
+ * struct dma_buf_dma - holds DMA mapping information
+ * @sgt: Scatter-gather table
+ * @state: DMA IOVA state relevant in IOMMU-based DMA
+ * @size: Total size of DMA transfer
+ */
+struct dma_buf_dma {
+ struct sg_table sgt;
+ struct dma_iova_state *state;
+ size_t size;
+};
+
+/**
+ * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment
+ * from arrays of physical vectors. This funciton is intended for MMIO memory
+ * only.
+ * @attach: [in] attachment whose scatterlist is to be returned
+ * @provider: [in] p2pdma provider
+ * @phys_vec: [in] array of physical vectors
+ * @nr_ranges: [in] number of entries in phys_vec array
+ * @size: [in] total size of phys_vec
+ * @dir: [in] direction of DMA transfer
+ *
+ * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR
+ * on error. May return -EINTR if it is interrupted by a signal.
+ *
+ * On success, the DMA addresses and lengths in the returned scatterlist are
+ * PAGE_SIZE aligned.
+ *
+ * A mapping must be unmapped by using dma_buf_free_sgt().
+ *
+ * NOTE: This function is intended for exporters. If direct traffic routing is
+ * mandatory exporter should call routing pci_p2pdma_map_type() before calling
+ * this function.
+ */
+struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
+ struct p2pdma_provider *provider,
+ struct dma_buf_phys_vec *phys_vec,
+ size_t nr_ranges, size_t size,
+ enum dma_data_direction dir)
+{
+ unsigned int nents, mapped_len = 0;
+ struct dma_buf_dma *dma;
+ struct scatterlist *sgl;
+ dma_addr_t addr;
+ size_t i;
+ int ret;
+
+ dma_resv_assert_held(attach->dmabuf->resv);
+
+ if (WARN_ON(!attach || !attach->dmabuf || !provider))
+ /* This function is supposed to work on MMIO memory only */
+ return ERR_PTR(-EINVAL);
+
+ dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+ if (!dma)
+ return ERR_PTR(-ENOMEM);
+
+ switch (pci_p2pdma_map_type(provider, attach->dev)) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ /*
+ * There is no need in IOVA at all for this flow.
+ */
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL);
+ if (!dma->state) {
+ ret = -ENOMEM;
+ goto err_free_dma;
+ }
+
+ dma_iova_try_alloc(attach->dev, dma->state, 0, size);
+ break;
+ default:
+ ret = -EINVAL;
+ goto err_free_dma;
+ }
+
+ nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size);
+ ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO);
+ if (ret)
+ goto err_free_state;
+
+ sgl = dma->sgt.sgl;
+
+ for (i = 0; i < nr_ranges; i++) {
+ if (!dma->state) {
+ addr = pci_p2pdma_bus_addr_map(provider,
+ phys_vec[i].paddr);
+ } else if (dma_use_iova(dma->state)) {
+ ret = dma_iova_link(attach->dev, dma->state,
+ phys_vec[i].paddr, 0,
+ phys_vec[i].len, dir,
+ DMA_ATTR_MMIO);
+ if (ret)
+ goto err_unmap_dma;
+
+ mapped_len += phys_vec[i].len;
+ } else {
+ addr = dma_map_phys(attach->dev, phys_vec[i].paddr,
+ phys_vec[i].len, dir,
+ DMA_ATTR_MMIO);
+ ret = dma_mapping_error(attach->dev, addr);
+ if (ret)
+ goto err_unmap_dma;
+ }
+
+ if (!dma->state || !dma_use_iova(dma->state))
+ sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);
+ }
+
+ if (dma->state && dma_use_iova(dma->state)) {
+ WARN_ON_ONCE(mapped_len != size);
+ ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len);
+ if (ret)
+ goto err_unmap_dma;
+
+ sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr);
+ }
+
+ dma->size = size;
+
+ /*
+ * No CPU list included — set orig_nents = 0 so others can detect
+ * this via SG table (use nents only).
+ */
+ dma->sgt.orig_nents = 0;
+
+
+ /*
+ * SGL must be NULL to indicate that SGL is the last one
+ * and we allocated correct number of entries in sg_alloc_table()
+ */
+ WARN_ON_ONCE(sgl);
+ return &dma->sgt;
+
+err_unmap_dma:
+ if (!i || !dma->state) {
+ ; /* Do nothing */
+ } else if (dma_use_iova(dma->state)) {
+ dma_iova_destroy(attach->dev, dma->state, mapped_len, dir,
+ DMA_ATTR_MMIO);
+ } else {
+ for_each_sgtable_dma_sg(&dma->sgt, sgl, i)
+ dma_unmap_phys(attach->dev, sg_dma_address(sgl),
+ sg_dma_len(sgl), dir, DMA_ATTR_MMIO);
+ }
+ sg_free_table(&dma->sgt);
+err_free_state:
+ kfree(dma->state);
+err_free_dma:
+ kfree(dma);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, "DMA_BUF");
+
+/**
+ * dma_buf_free_sgt- unmaps the buffer
+ * @attach: [in] attachment to unmap buffer from
+ * @sgt: [in] scatterlist info of the buffer to unmap
+ * @dir: [in] direction of DMA transfer
+ *
+ * This unmaps a DMA mapping for @attached obtained
+ * by dma_buf_phys_vec_to_sgt().
+ */
+void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+ struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt);
+ int i;
+
+ dma_resv_assert_held(attach->dmabuf->resv);
+
+ if (!dma->state) {
+ ; /* Do nothing */
+ } else if (dma_use_iova(dma->state)) {
+ dma_iova_destroy(attach->dev, dma->state, dma->size, dir,
+ DMA_ATTR_MMIO);
+ } else {
+ struct scatterlist *sgl;
+
+ for_each_sgtable_dma_sg(sgt, sgl, i)
+ dma_unmap_phys(attach->dev, sg_dma_address(sgl),
+ sg_dma_len(sgl), dir, DMA_ATTR_MMIO);
+ }
+
+ sg_free_table(sgt);
+ kfree(dma->state);
+ kfree(dma);
+
+}
+EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, "DMA_BUF");
diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c
index 02f68b328511..227398673b73 100644
--- a/drivers/dma/ioat/init.c
+++ b/drivers/dma/ioat/init.c
@@ -1286,7 +1286,6 @@ static pci_ers_result_t ioat_pcie_error_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
}
diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c
index f0a63d09d3c4..76542a53e202 100644
--- a/drivers/firmware/efi/cper-arm.c
+++ b/drivers/firmware/efi/cper-arm.c
@@ -93,15 +93,11 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
bool proc_context_corrupt, corrected, precise_pc, restartable_pc;
bool time_out, access_mode;
- /* If the type is unknown, bail. */
- if (type > CPER_ARM_MAX_TYPE)
- return;
-
/*
* Vendor type errors have error information values that are vendor
* specific.
*/
- if (type == CPER_ARM_VENDOR_ERROR)
+ if (type & CPER_ARM_VENDOR_ERROR)
return;
if (error_info & CPER_ARM_ERR_VALID_TRANSACTION_TYPE) {
@@ -116,43 +112,38 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
if (error_info & CPER_ARM_ERR_VALID_OPERATION_TYPE) {
op_type = ((error_info >> CPER_ARM_ERR_OPERATION_SHIFT)
& CPER_ARM_ERR_OPERATION_MASK);
- switch (type) {
- case CPER_ARM_CACHE_ERROR:
+ if (type & CPER_ARM_CACHE_ERROR) {
if (op_type < ARRAY_SIZE(arm_cache_err_op_strs)) {
- printk("%soperation type: %s\n", pfx,
+ printk("%scache error, operation type: %s\n", pfx,
arm_cache_err_op_strs[op_type]);
}
- break;
- case CPER_ARM_TLB_ERROR:
+ }
+ if (type & CPER_ARM_TLB_ERROR) {
if (op_type < ARRAY_SIZE(arm_tlb_err_op_strs)) {
- printk("%soperation type: %s\n", pfx,
+ printk("%sTLB error, operation type: %s\n", pfx,
arm_tlb_err_op_strs[op_type]);
}
- break;
- case CPER_ARM_BUS_ERROR:
+ }
+ if (type & CPER_ARM_BUS_ERROR) {
if (op_type < ARRAY_SIZE(arm_bus_err_op_strs)) {
- printk("%soperation type: %s\n", pfx,
+ printk("%sbus error, operation type: %s\n", pfx,
arm_bus_err_op_strs[op_type]);
}
- break;
}
}
if (error_info & CPER_ARM_ERR_VALID_LEVEL) {
level = ((error_info >> CPER_ARM_ERR_LEVEL_SHIFT)
& CPER_ARM_ERR_LEVEL_MASK);
- switch (type) {
- case CPER_ARM_CACHE_ERROR:
+ if (type & CPER_ARM_CACHE_ERROR)
printk("%scache level: %d\n", pfx, level);
- break;
- case CPER_ARM_TLB_ERROR:
+
+ if (type & CPER_ARM_TLB_ERROR)
printk("%sTLB level: %d\n", pfx, level);
- break;
- case CPER_ARM_BUS_ERROR:
+
+ if (type & CPER_ARM_BUS_ERROR)
printk("%saffinity level at which the bus error occurred: %d\n",
pfx, level);
- break;
- }
}
if (error_info & CPER_ARM_ERR_VALID_PROC_CONTEXT_CORRUPT) {
@@ -240,7 +231,8 @@ void cper_print_proc_arm(const char *pfx,
int i, len, max_ctx_type;
struct cper_arm_err_info *err_info;
struct cper_arm_ctx_info *ctx_info;
- char newpfx[64], infopfx[64];
+ char newpfx[64], infopfx[ARRAY_SIZE(newpfx) + 1];
+ char error_type[120];
printk("%sMIDR: 0x%016llx\n", pfx, proc->midr);
@@ -289,9 +281,15 @@ void cper_print_proc_arm(const char *pfx,
newpfx);
}
- printk("%serror_type: %d, %s\n", newpfx, err_info->type,
- err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
- cper_proc_error_type_strs[err_info->type] : "unknown");
+ cper_bits_to_str(error_type, sizeof(error_type),
+ FIELD_GET(CPER_ARM_ERR_TYPE_MASK, err_info->type),
+ cper_proc_error_type_strs,
+ ARRAY_SIZE(cper_proc_error_type_strs));
+
+ printk("%serror_type: 0x%02x: %s%s\n", newpfx, err_info->type,
+ error_type,
+ (err_info->type & ~CPER_ARM_ERR_TYPE_MASK) ? " with reserved bit(s)" : "");
+
if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO) {
printk("%serror_info: 0x%016llx\n", newpfx,
err_info->error_info);
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 928409199a1a..0232bd040f61 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -12,6 +12,7 @@
* Specification version 2.4.
*/
+#include <linux/bitmap.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/time.h>
@@ -69,7 +70,7 @@ const char *cper_severity_str(unsigned int severity)
}
EXPORT_SYMBOL_GPL(cper_severity_str);
-/*
+/**
* cper_print_bits - print strings for set bits
* @pfx: prefix for each line, including log level and prefix string
* @bits: bit mask
@@ -106,6 +107,65 @@ void cper_print_bits(const char *pfx, unsigned int bits,
printk("%s\n", buf);
}
+/**
+ * cper_bits_to_str - return a string for set bits
+ * @buf: buffer to store the output string
+ * @buf_size: size of the output string buffer
+ * @bits: bit mask
+ * @strs: string array, indexed by bit position
+ * @strs_size: size of the string array: @strs
+ *
+ * Add to @buf the bitmask in hexadecimal. Then, for each set bit in @bits,
+ * add the corresponding string describing the bit in @strs to @buf.
+ *
+ * A typical example is::
+ *
+ * const char * const bits[] = {
+ * "bit 3 name",
+ * "bit 4 name",
+ * "bit 5 name",
+ * };
+ * char str[120];
+ * unsigned int bitmask = BIT(3) | BIT(5);
+ * #define MASK GENMASK(5,3)
+ *
+ * cper_bits_to_str(str, sizeof(str), FIELD_GET(MASK, bitmask),
+ * bits, ARRAY_SIZE(bits));
+ *
+ * The above code fills the string ``str`` with ``bit 3 name|bit 5 name``.
+ *
+ * Return: number of bytes stored or an error code if lower than zero.
+ */
+int cper_bits_to_str(char *buf, int buf_size, unsigned long bits,
+ const char * const strs[], unsigned int strs_size)
+{
+ int len = buf_size;
+ char *str = buf;
+ int i, size;
+
+ *buf = '\0';
+
+ for_each_set_bit(i, &bits, strs_size) {
+ if (!(bits & BIT_ULL(i)))
+ continue;
+
+ if (*buf && len > 0) {
+ *str = '|';
+ len--;
+ str++;
+ }
+
+ size = strscpy(str, strs[i], len);
+ if (size < 0)
+ return size;
+
+ len -= size;
+ str += size;
+ }
+ return len - buf_size;
+}
+EXPORT_SYMBOL_GPL(cper_bits_to_str);
+
static const char * const proc_type_strs[] = {
"IA32/X64",
"IA64",
diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
index 874f63b4a383..9cb814c5ba1b 100644
--- a/drivers/firmware/efi/libstub/efi-stub.c
+++ b/drivers/firmware/efi/libstub/efi-stub.c
@@ -56,7 +56,7 @@ static struct screen_info *setup_graphics(void)
{
struct screen_info *si, tmp = {};
- if (efi_setup_gop(&tmp) != EFI_SUCCESS)
+ if (efi_setup_graphics(&tmp, NULL) != EFI_SUCCESS)
return NULL;
si = alloc_screen_info();
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index f5ba032863a9..b2fb0c3fa721 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -34,6 +34,9 @@
#define EFI_ALLOC_LIMIT ULONG_MAX
#endif
+struct edid_info;
+struct screen_info;
+
extern bool efi_no5lvl;
extern bool efi_nochunk;
extern bool efi_nokaslr;
@@ -578,6 +581,32 @@ union efi_graphics_output_protocol {
} mixed_mode;
};
+typedef union efi_edid_discovered_protocol efi_edid_discovered_protocol_t;
+
+union efi_edid_discovered_protocol {
+ struct {
+ u32 size_of_edid;
+ u8 *edid;
+ };
+ struct {
+ u32 size_of_edid;
+ u32 edid;
+ } mixed_mode;
+};
+
+typedef union efi_edid_active_protocol efi_edid_active_protocol_t;
+
+union efi_edid_active_protocol {
+ struct {
+ u32 size_of_edid;
+ u8 *edid;
+ };
+ struct {
+ u32 size_of_edid;
+ u32 edid;
+ } mixed_mode;
+};
+
typedef union {
struct {
u32 revision;
@@ -1085,7 +1114,7 @@ efi_status_t efi_parse_options(char const *cmdline);
void efi_parse_option_graphics(char *option);
-efi_status_t efi_setup_gop(struct screen_info *si);
+efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid);
efi_status_t handle_cmdline_files(efi_loaded_image_t *image,
const efi_char16_t *optstr,
diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c
index 3785fb4986b4..72d74436a7a4 100644
--- a/drivers/firmware/efi/libstub/gop.c
+++ b/drivers/firmware/efi/libstub/gop.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <asm/efi.h>
#include <asm/setup.h>
+#include <video/edid.h>
#include "efistub.h"
@@ -367,24 +368,31 @@ static void find_bits(u32 mask, u8 *pos, u8 *size)
*size = __fls(mask) - *pos + 1;
}
-static void
-setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
- efi_pixel_bitmask_t pixel_info, int pixel_format)
+static void setup_screen_info(struct screen_info *si, const efi_graphics_output_protocol_t *gop)
{
- if (pixel_format == PIXEL_BIT_MASK) {
- find_bits(pixel_info.red_mask,
- &si->red_pos, &si->red_size);
- find_bits(pixel_info.green_mask,
- &si->green_pos, &si->green_size);
- find_bits(pixel_info.blue_mask,
- &si->blue_pos, &si->blue_size);
- find_bits(pixel_info.reserved_mask,
- &si->rsvd_pos, &si->rsvd_size);
- si->lfb_depth = si->red_size + si->green_size +
- si->blue_size + si->rsvd_size;
- si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
+ const efi_graphics_output_protocol_mode_t *mode = efi_table_attr(gop, mode);
+ const efi_graphics_output_mode_info_t *info = efi_table_attr(mode, info);
+
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+ si->lfb_width = info->horizontal_resolution;
+ si->lfb_height = info->vertical_resolution;
+
+ efi_set_u64_split(efi_table_attr(mode, frame_buffer_base),
+ &si->lfb_base, &si->ext_lfb_base);
+ if (si->ext_lfb_base)
+ si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+ si->pages = 1;
+
+ if (info->pixel_format == PIXEL_BIT_MASK) {
+ find_bits(info->pixel_information.red_mask, &si->red_pos, &si->red_size);
+ find_bits(info->pixel_information.green_mask, &si->green_pos, &si->green_size);
+ find_bits(info->pixel_information.blue_mask, &si->blue_pos, &si->blue_size);
+ find_bits(info->pixel_information.reserved_mask, &si->rsvd_pos, &si->rsvd_size);
+ si->lfb_depth = si->red_size + si->green_size + si->blue_size + si->rsvd_size;
+ si->lfb_linelength = (info->pixels_per_scan_line * si->lfb_depth) / 8;
} else {
- if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
+ if (info->pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
si->red_pos = 0;
si->blue_pos = 16;
} else /* PIXEL_BGR_RESERVED_8BIT_PER_COLOR */ {
@@ -394,20 +402,33 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
si->green_pos = 8;
si->rsvd_pos = 24;
- si->red_size = si->green_size =
- si->blue_size = si->rsvd_size = 8;
-
+ si->red_size = 8;
+ si->green_size = 8;
+ si->blue_size = 8;
+ si->rsvd_size = 8;
si->lfb_depth = 32;
- si->lfb_linelength = pixels_per_scan_line * 4;
+ si->lfb_linelength = info->pixels_per_scan_line * 4;
}
+
+ si->lfb_size = si->lfb_linelength * si->lfb_height;
+ si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
}
-static efi_graphics_output_protocol_t *find_gop(unsigned long num,
- const efi_handle_t handles[])
+static void setup_edid_info(struct edid_info *edid, u32 gop_size_of_edid, u8 *gop_edid)
+{
+ if (!gop_edid || gop_size_of_edid < 128)
+ memset(edid->dummy, 0, sizeof(edid->dummy));
+ else
+ memcpy(edid->dummy, gop_edid, min(gop_size_of_edid, sizeof(edid->dummy)));
+}
+
+static efi_handle_t find_handle_with_primary_gop(unsigned long num, const efi_handle_t handles[],
+ efi_graphics_output_protocol_t **found_gop)
{
efi_graphics_output_protocol_t *first_gop;
- efi_handle_t h;
+ efi_handle_t h, first_gop_handle;
+ first_gop_handle = NULL;
first_gop = NULL;
for_each_efi_handle(h, handles, num) {
@@ -442,21 +463,25 @@ static efi_graphics_output_protocol_t *find_gop(unsigned long num,
*/
status = efi_bs_call(handle_protocol, h,
&EFI_CONSOLE_OUT_DEVICE_GUID, &dummy);
- if (status == EFI_SUCCESS)
- return gop;
-
- if (!first_gop)
+ if (status == EFI_SUCCESS) {
+ if (found_gop)
+ *found_gop = gop;
+ return h;
+ } else if (!first_gop_handle) {
+ first_gop_handle = h;
first_gop = gop;
+ }
}
- return first_gop;
+ if (found_gop)
+ *found_gop = first_gop;
+ return first_gop_handle;
}
-efi_status_t efi_setup_gop(struct screen_info *si)
+efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid)
{
efi_handle_t *handles __free(efi_pool) = NULL;
- efi_graphics_output_protocol_mode_t *mode;
- efi_graphics_output_mode_info_t *info;
+ efi_handle_t handle;
efi_graphics_output_protocol_t *gop;
efi_status_t status;
unsigned long num;
@@ -467,35 +492,41 @@ efi_status_t efi_setup_gop(struct screen_info *si)
if (status != EFI_SUCCESS)
return status;
- gop = find_gop(num, handles);
- if (!gop)
+ handle = find_handle_with_primary_gop(num, handles, &gop);
+ if (!handle)
return EFI_NOT_FOUND;
/* Change mode if requested */
set_mode(gop);
/* EFI framebuffer */
- mode = efi_table_attr(gop, mode);
- info = efi_table_attr(mode, info);
-
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
-
- si->lfb_width = info->horizontal_resolution;
- si->lfb_height = info->vertical_resolution;
-
- efi_set_u64_split(efi_table_attr(mode, frame_buffer_base),
- &si->lfb_base, &si->ext_lfb_base);
- if (si->ext_lfb_base)
- si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
-
- si->pages = 1;
-
- setup_pixel_info(si, info->pixels_per_scan_line,
- info->pixel_information, info->pixel_format);
-
- si->lfb_size = si->lfb_linelength * si->lfb_height;
+ if (si)
+ setup_screen_info(si, gop);
+
+ /* Display EDID for primary GOP */
+ if (edid) {
+ efi_edid_discovered_protocol_t *discovered_edid;
+ efi_edid_active_protocol_t *active_edid;
+ u32 gop_size_of_edid = 0;
+ u8 *gop_edid = NULL;
+
+ status = efi_bs_call(handle_protocol, handle, &EFI_EDID_ACTIVE_PROTOCOL_GUID,
+ (void **)&active_edid);
+ if (status == EFI_SUCCESS) {
+ gop_size_of_edid = active_edid->size_of_edid;
+ gop_edid = active_edid->edid;
+ } else {
+ status = efi_bs_call(handle_protocol, handle,
+ &EFI_EDID_DISCOVERED_PROTOCOL_GUID,
+ (void **)&discovered_edid);
+ if (status == EFI_SUCCESS) {
+ gop_size_of_edid = discovered_edid->size_of_edid;
+ gop_edid = discovered_edid->edid;
+ }
+ }
- si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
+ setup_edid_info(edid, gop_size_of_edid, gop_edid);
+ }
return EFI_SUCCESS;
}
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 761121a77f9e..cef32e2c82d8 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -203,6 +203,104 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
}
}
+struct smbios_entry_point {
+ u8 anchor[4];
+ u8 ep_checksum;
+ u8 ep_length;
+ u8 major_version;
+ u8 minor_version;
+ u16 max_size_entry;
+ u8 ep_rev;
+ u8 reserved[5];
+
+ struct __packed {
+ u8 anchor[5];
+ u8 checksum;
+ u16 st_length;
+ u32 st_address;
+ u16 number_of_entries;
+ u8 bcd_rev;
+ } intm;
+};
+
+static bool verify_ep_checksum(const void *ptr, int length)
+{
+ u8 sum = 0;
+
+ for (int i = 0; i < length; i++)
+ sum += ((u8 *)ptr)[i];
+
+ return sum == 0;
+}
+
+static bool verify_ep_integrity(const struct smbios_entry_point *ep)
+{
+ if (memcmp(ep->anchor, "_SM_", sizeof(ep->anchor)) != 0)
+ return false;
+
+ if (memcmp(ep->intm.anchor, "_DMI_", sizeof(ep->intm.anchor)) != 0)
+ return false;
+
+ if (!verify_ep_checksum(ep, ep->ep_length) ||
+ !verify_ep_checksum(&ep->intm, sizeof(ep->intm)))
+ return false;
+
+ return true;
+}
+
+static const struct efi_smbios_record *search_record(void *table, u32 length,
+ u8 type)
+{
+ const u8 *p, *end;
+
+ p = (u8 *)table;
+ end = p + length;
+
+ while (p + sizeof(struct efi_smbios_record) < end) {
+ const struct efi_smbios_record *hdr =
+ (struct efi_smbios_record *)p;
+ const u8 *next;
+
+ if (hdr->type == type)
+ return hdr;
+
+ /* Type 127 = End-of-Table */
+ if (hdr->type == 0x7F)
+ return NULL;
+
+ /* Jumping to the unformed section */
+ next = p + hdr->length;
+
+ /* Unformed section ends with 0000h */
+ while ((next[0] != 0 || next[1] != 0) && next + 1 < end)
+ next++;
+
+ next += 2;
+ p = next;
+ }
+
+ return NULL;
+}
+
+static const struct efi_smbios_record *get_table_record(u8 type)
+{
+ const struct smbios_entry_point *ep;
+
+ /*
+ * Locate the legacy 32-bit SMBIOS entrypoint in memory, and parse it
+ * directly. Needed by some Macs that do not implement the EFI protocol.
+ */
+ ep = get_efi_config_table(SMBIOS_TABLE_GUID);
+ if (!ep)
+ return NULL;
+
+ if (!verify_ep_integrity(ep))
+ return NULL;
+
+ return search_record((void *)(unsigned long)ep->intm.st_address,
+ ep->intm.st_length, type);
+}
+
static bool apple_match_product_name(void)
{
static const char type1_product_matches[][15] = {
@@ -218,7 +316,8 @@ static bool apple_match_product_name(void)
const struct efi_smbios_type1_record *record;
const u8 *product;
- record = (struct efi_smbios_type1_record *)efi_get_smbios_record(1);
+ record = (struct efi_smbios_type1_record *)
+ (efi_get_smbios_record(1) ?: get_table_record(1));
if (!record)
return false;
@@ -388,8 +487,9 @@ static void setup_quirks(struct boot_params *boot_params)
static void setup_graphics(struct boot_params *boot_params)
{
struct screen_info *si = memset(&boot_params->screen_info, 0, sizeof(*si));
+ struct edid_info *edid = memset(&boot_params->edid_info, 0, sizeof(*edid));
- efi_setup_gop(si);
+ efi_setup_graphics(si, edid);
}
static void __noreturn efi_exit(efi_handle_t handle, efi_status_t status)
diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c
index c38b1a335590..e727cc5909cb 100644
--- a/drivers/firmware/efi/memattr.c
+++ b/drivers/firmware/efi/memattr.c
@@ -19,19 +19,19 @@ unsigned long __ro_after_init efi_mem_attr_table = EFI_INVALID_TABLE_ADDR;
* Reserve the memory associated with the Memory Attributes configuration
* table, if it exists.
*/
-int __init efi_memattr_init(void)
+void __init efi_memattr_init(void)
{
efi_memory_attributes_table_t *tbl;
unsigned long size;
if (efi_mem_attr_table == EFI_INVALID_TABLE_ADDR)
- return 0;
+ return;
tbl = early_memremap(efi_mem_attr_table, sizeof(*tbl));
if (!tbl) {
pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n",
efi_mem_attr_table);
- return -ENOMEM;
+ return;
}
if (tbl->version > 2) {
@@ -61,7 +61,6 @@ int __init efi_memattr_init(void)
unmap:
early_memunmap(tbl, sizeof(*tbl));
- return 0;
}
/*
diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c
index fa71cd898120..4a2588358be2 100644
--- a/drivers/firmware/efi/riscv-runtime.c
+++ b/drivers/firmware/efi/riscv-runtime.c
@@ -36,20 +36,12 @@ static bool __init efi_virtmap_init(void)
init_new_context(NULL, &efi_mm);
for_each_efi_memory_desc(md) {
- phys_addr_t phys = md->phys_addr;
- int ret;
-
if (!(md->attribute & EFI_MEMORY_RUNTIME))
continue;
if (md->virt_addr == U64_MAX)
return false;
- ret = efi_create_mapping(&efi_mm, md);
- if (ret) {
- pr_warn(" EFI remap %pa: failed to create mapping (%d)\n",
- &phys, ret);
- return false;
- }
+ efi_create_mapping(&efi_mm, md);
}
if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions))
diff --git a/drivers/firmware/efi/stmm/mm_communication.h b/drivers/firmware/efi/stmm/mm_communication.h
index 52a1f32cd1eb..06e7663f96dc 100644
--- a/drivers/firmware/efi/stmm/mm_communication.h
+++ b/drivers/firmware/efi/stmm/mm_communication.h
@@ -32,7 +32,7 @@
/**
* struct efi_mm_communicate_header - Header used for SMM variable communication
-
+ *
* @header_guid: header use for disambiguation of content
* @message_len: length of the message. Does not include the size of the
* header
@@ -111,7 +111,7 @@ struct efi_mm_communicate_header {
/**
* struct smm_variable_communicate_header - Used for SMM variable communication
-
+ *
* @function: function to call in Smm.
* @ret_status: return status
* @data: payload
@@ -128,7 +128,7 @@ struct smm_variable_communicate_header {
/**
* struct smm_variable_access - Used to communicate with StMM by
* SetVariable and GetVariable.
-
+ *
* @guid: vendor GUID
* @data_size: size of EFI variable data
* @name_size: size of EFI name
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a1817b4b5173..58c3ffe707d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1678,9 +1678,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
struct pci_bus *root;
struct resource *res;
+ int max_size, r;
unsigned int i;
u16 cmd;
- int r;
if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
return 0;
@@ -1726,30 +1726,28 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
return 0;
/* Limit the BAR size to what is available */
- rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
- rbar_size);
+ max_size = pci_rebar_get_max_size(adev->pdev, 0);
+ if (max_size < 0)
+ return 0;
+ rbar_size = min(max_size, rbar_size);
/* Disable memory decoding while we change the BAR addresses and size */
pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
pci_write_config_word(adev->pdev, PCI_COMMAND,
cmd & ~PCI_COMMAND_MEMORY);
- /* Free the VRAM and doorbell BAR, we most likely need to move both. */
+ /* Tear down doorbell as resizing will release BARs */
amdgpu_doorbell_fini(adev);
- if (adev->asic_type >= CHIP_BONAIRE)
- pci_release_resource(adev->pdev, 2);
- pci_release_resource(adev->pdev, 0);
-
- r = pci_resize_resource(adev->pdev, 0, rbar_size);
+ r = pci_resize_resource(adev->pdev, 0, rbar_size,
+ (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
+ : 1 << 2);
if (r == -ENOSPC)
dev_info(adev->dev,
"Not enough PCI address space for a large BAR.");
else if (r && r != -ENOTSUPP)
dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);
- pci_assign_unassigned_bus_resources(adev->pdev->bus);
-
/* When the doorbell or fb BAR isn't available we have no chance of
* using the device.
*/
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 7c89e5e0a277..4db24050edb0 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -239,6 +239,8 @@ i915-y += \
display/intel_cdclk.o \
display/intel_cmtg.o \
display/intel_color.o \
+ display/intel_colorop.o \
+ display/intel_color_pipeline.o \
display/intel_combo_phy.o \
display/intel_connector.o \
display/intel_crtc.o \
diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c
index a217a67ceb43..e7950655434b 100644
--- a/drivers/gpu/drm/i915/display/intel_color.c
+++ b/drivers/gpu/drm/i915/display/intel_color.c
@@ -32,6 +32,8 @@
#include "intel_display_utils.h"
#include "intel_dsb.h"
#include "intel_vrr.h"
+#include "skl_universal_plane.h"
+#include "skl_universal_plane_regs.h"
struct intel_color_funcs {
int (*color_check)(struct intel_atomic_state *state,
@@ -87,6 +89,14 @@ struct intel_color_funcs {
* Read config other than LUTs and CSCs, before them. Optional.
*/
void (*get_config)(struct intel_crtc_state *crtc_state);
+
+ /* Plane CSC*/
+ void (*load_plane_csc_matrix)(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state);
+
+ /* Plane Pre/Post CSC */
+ void (*load_plane_luts)(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state);
};
#define CTM_COEFF_SIGN (1ULL << 63)
@@ -609,6 +619,8 @@ static u16 ctm_to_twos_complement(u64 coeff, int int_bits, int frac_bits)
if (CTM_COEFF_NEGATIVE(coeff))
c = -c;
+ int_bits = max(int_bits, 1);
+
c = clamp(c, -(s64)BIT(int_bits + frac_bits - 1),
(s64)(BIT(int_bits + frac_bits - 1) - 1));
@@ -3836,6 +3848,266 @@ static void icl_read_luts(struct intel_crtc_state *crtc_state)
}
}
+static void
+xelpd_load_plane_csc_matrix(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ struct intel_display *display = to_intel_display(plane_state);
+ const struct drm_plane_state *state = &plane_state->uapi;
+ enum pipe pipe = to_intel_plane(state->plane)->pipe;
+ enum plane_id plane = to_intel_plane(state->plane)->id;
+ const struct drm_property_blob *blob = plane_state->hw.ctm;
+ struct drm_color_ctm_3x4 *ctm;
+ const u64 *input;
+ u16 coeffs[9] = {};
+ int i, j;
+
+ if (!icl_is_hdr_plane(display, plane) || !blob)
+ return;
+
+ ctm = blob->data;
+ input = ctm->matrix;
+
+ /*
+ * Convert fixed point S31.32 input to format supported by the
+ * hardware.
+ */
+ for (i = 0, j = 0; i < ARRAY_SIZE(coeffs); i++) {
+ u64 abs_coeff = ((1ULL << 63) - 1) & input[j];
+
+ /*
+ * Clamp input value to min/max supported by
+ * hardware.
+ */
+ abs_coeff = clamp_val(abs_coeff, 0, CTM_COEFF_4_0 - 1);
+
+ /* sign bit */
+ if (CTM_COEFF_NEGATIVE(input[j]))
+ coeffs[i] |= 1 << 15;
+
+ if (abs_coeff < CTM_COEFF_0_125)
+ coeffs[i] |= (3 << 12) |
+ ILK_CSC_COEFF_FP(abs_coeff, 12);
+ else if (abs_coeff < CTM_COEFF_0_25)
+ coeffs[i] |= (2 << 12) |
+ ILK_CSC_COEFF_FP(abs_coeff, 11);
+ else if (abs_coeff < CTM_COEFF_0_5)
+ coeffs[i] |= (1 << 12) |
+ ILK_CSC_COEFF_FP(abs_coeff, 10);
+ else if (abs_coeff < CTM_COEFF_1_0)
+ coeffs[i] |= ILK_CSC_COEFF_FP(abs_coeff, 9);
+ else if (abs_coeff < CTM_COEFF_2_0)
+ coeffs[i] |= (7 << 12) |
+ ILK_CSC_COEFF_FP(abs_coeff, 8);
+ else
+ coeffs[i] |= (6 << 12) |
+ ILK_CSC_COEFF_FP(abs_coeff, 7);
+
+ /* Skip postoffs */
+ if (!((j + 2) % 4))
+ j += 2;
+ else
+ j++;
+ }
+
+ intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 0),
+ coeffs[0] << 16 | coeffs[1]);
+ intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 1),
+ coeffs[2] << 16);
+
+ intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 2),
+ coeffs[3] << 16 | coeffs[4]);
+ intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 3),
+ coeffs[5] << 16);
+
+ intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 4),
+ coeffs[6] << 16 | coeffs[7]);
+ intel_de_write_dsb(display, dsb, PLANE_CSC_COEFF(pipe, plane, 5),
+ coeffs[8] << 16);
+
+ intel_de_write_dsb(display, dsb, PLANE_CSC_PREOFF(pipe, plane, 0), 0);
+ intel_de_write_dsb(display, dsb, PLANE_CSC_PREOFF(pipe, plane, 1), 0);
+ intel_de_write_dsb(display, dsb, PLANE_CSC_PREOFF(pipe, plane, 2), 0);
+
+ /*
+ * Conversion from S31.32 to S0.12. BIT[12] is the signed bit
+ */
+ intel_de_write_dsb(display, dsb,
+ PLANE_CSC_POSTOFF(pipe, plane, 0),
+ ctm_to_twos_complement(input[3], 0, 12));
+ intel_de_write_dsb(display, dsb,
+ PLANE_CSC_POSTOFF(pipe, plane, 1),
+ ctm_to_twos_complement(input[7], 0, 12));
+ intel_de_write_dsb(display, dsb,
+ PLANE_CSC_POSTOFF(pipe, plane, 2),
+ ctm_to_twos_complement(input[11], 0, 12));
+}
+
+static void
+xelpd_program_plane_pre_csc_lut(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ struct intel_display *display = to_intel_display(plane_state);
+ const struct drm_plane_state *state = &plane_state->uapi;
+ enum pipe pipe = to_intel_plane(state->plane)->pipe;
+ enum plane_id plane = to_intel_plane(state->plane)->id;
+ const struct drm_color_lut32 *pre_csc_lut = plane_state->hw.degamma_lut->data;
+ u32 i, lut_size;
+
+ if (icl_is_hdr_plane(display, plane)) {
+ lut_size = 128;
+
+ intel_de_write_dsb(display, dsb,
+ PLANE_PRE_CSC_GAMC_INDEX_ENH(pipe, plane, 0),
+ PLANE_PAL_PREC_AUTO_INCREMENT);
+
+ if (pre_csc_lut) {
+ for (i = 0; i < lut_size; i++) {
+ u32 lut_val = drm_color_lut32_extract(pre_csc_lut[i].green, 24);
+
+ intel_de_write_dsb(display, dsb,
+ PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0),
+ lut_val);
+ }
+
+ /* Program the max register to clamp values > 1.0. */
+ /* TODO: Restrict to 0x7ffffff */
+ do {
+ intel_de_write_dsb(display, dsb,
+ PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0),
+ (1 << 24));
+ } while (i++ > 130);
+ } else {
+ for (i = 0; i < lut_size; i++) {
+ u32 v = (i * ((1 << 24) - 1)) / (lut_size - 1);
+
+ intel_de_write_dsb(display, dsb,
+ PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0), v);
+ }
+
+ do {
+ intel_de_write_dsb(display, dsb,
+ PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, 0),
+ 1 << 24);
+ } while (i++ < 130);
+ }
+
+ intel_de_write_dsb(display, dsb, PLANE_PRE_CSC_GAMC_INDEX_ENH(pipe, plane, 0), 0);
+ }
+}
+
+static void
+xelpd_program_plane_post_csc_lut(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ struct intel_display *display = to_intel_display(plane_state);
+ const struct drm_plane_state *state = &plane_state->uapi;
+ enum pipe pipe = to_intel_plane(state->plane)->pipe;
+ enum plane_id plane = to_intel_plane(state->plane)->id;
+ const struct drm_color_lut32 *post_csc_lut = plane_state->hw.gamma_lut->data;
+ u32 i, lut_size, lut_val;
+
+ if (icl_is_hdr_plane(display, plane)) {
+ intel_de_write_dsb(display, dsb, PLANE_POST_CSC_GAMC_INDEX_ENH(pipe, plane, 0),
+ PLANE_PAL_PREC_AUTO_INCREMENT);
+ /* TODO: Add macro */
+ intel_de_write_dsb(display, dsb, PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH(pipe, plane, 0),
+ PLANE_PAL_PREC_AUTO_INCREMENT);
+ if (post_csc_lut) {
+ lut_size = 32;
+ for (i = 0; i < lut_size; i++) {
+ lut_val = drm_color_lut32_extract(post_csc_lut[i].green, 24);
+
+ intel_de_write_dsb(display, dsb,
+ PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0),
+ lut_val);
+ }
+
+ /* Segment 2 */
+ do {
+ intel_de_write_dsb(display, dsb,
+ PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0),
+ (1 << 24));
+ } while (i++ < 34);
+ } else {
+ /*TODO: Add for segment 0 */
+ lut_size = 32;
+ for (i = 0; i < lut_size; i++) {
+ u32 v = (i * ((1 << 24) - 1)) / (lut_size - 1);
+
+ intel_de_write_dsb(display, dsb,
+ PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0), v);
+ }
+
+ do {
+ intel_de_write_dsb(display, dsb,
+ PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, 0),
+ 1 << 24);
+ } while (i++ < 34);
+ }
+
+ intel_de_write_dsb(display, dsb, PLANE_POST_CSC_GAMC_INDEX_ENH(pipe, plane, 0), 0);
+ intel_de_write_dsb(display, dsb,
+ PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH(pipe, plane, 0), 0);
+ }
+}
+
+static void
+xelpd_plane_load_luts(struct intel_dsb *dsb, const struct intel_plane_state *plane_state)
+{
+ if (plane_state->hw.degamma_lut)
+ xelpd_program_plane_pre_csc_lut(dsb, plane_state);
+
+ if (plane_state->hw.gamma_lut)
+ xelpd_program_plane_post_csc_lut(dsb, plane_state);
+}
+
+static u32 glk_3dlut_10(const struct drm_color_lut32 *color)
+{
+ return REG_FIELD_PREP(LUT_3D_DATA_RED_MASK, drm_color_lut32_extract(color->red, 10)) |
+ REG_FIELD_PREP(LUT_3D_DATA_GREEN_MASK, drm_color_lut32_extract(color->green, 10)) |
+ REG_FIELD_PREP(LUT_3D_DATA_BLUE_MASK, drm_color_lut32_extract(color->blue, 10));
+}
+
+static void glk_load_lut_3d(struct intel_dsb *dsb,
+ struct intel_crtc *crtc,
+ const struct drm_property_blob *blob)
+{
+ struct intel_display *display = to_intel_display(crtc->base.dev);
+ const struct drm_color_lut32 *lut = blob->data;
+ int i, lut_size = drm_color_lut32_size(blob);
+ enum pipe pipe = crtc->pipe;
+
+ if (!dsb && intel_de_read(display, LUT_3D_CTL(pipe)) & LUT_3D_READY) {
+ drm_err(display->drm, "[CRTC:%d:%s] 3D LUT not ready, not loading LUTs\n",
+ crtc->base.base.id, crtc->base.name);
+ return;
+ }
+
+ intel_de_write_dsb(display, dsb, LUT_3D_INDEX(pipe), LUT_3D_AUTO_INCREMENT);
+ for (i = 0; i < lut_size; i++)
+ intel_de_write_dsb(display, dsb, LUT_3D_DATA(pipe), glk_3dlut_10(&lut[i]));
+ intel_de_write_dsb(display, dsb, LUT_3D_INDEX(pipe), 0);
+}
+
+static void glk_lut_3d_commit(struct intel_dsb *dsb, struct intel_crtc *crtc, bool enable)
+{
+ struct intel_display *display = to_intel_display(crtc);
+ enum pipe pipe = crtc->pipe;
+ u32 val = 0;
+
+ if (!dsb && intel_de_read(display, LUT_3D_CTL(pipe)) & LUT_3D_READY) {
+ drm_err(display->drm, "[CRTC:%d:%s] 3D LUT not ready, not committing change\n",
+ crtc->base.base.id, crtc->base.name);
+ return;
+ }
+
+ if (enable)
+ val = LUT_3D_ENABLE | LUT_3D_READY | LUT_3D_BIND_PLANE_1;
+
+ intel_de_write_dsb(display, dsb, LUT_3D_CTL(pipe), val);
+}
+
static const struct intel_color_funcs chv_color_funcs = {
.color_check = chv_color_check,
.color_commit_arm = i9xx_color_commit_arm,
@@ -3883,6 +4155,8 @@ static const struct intel_color_funcs tgl_color_funcs = {
.lut_equal = icl_lut_equal,
.read_csc = icl_read_csc,
.get_config = skl_get_config,
+ .load_plane_csc_matrix = xelpd_load_plane_csc_matrix,
+ .load_plane_luts = xelpd_plane_load_luts,
};
static const struct intel_color_funcs icl_color_funcs = {
@@ -3963,6 +4237,67 @@ static const struct intel_color_funcs ilk_color_funcs = {
.get_config = ilk_get_config,
};
+void intel_color_plane_commit_arm(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ struct intel_display *display = to_intel_display(plane_state);
+ struct intel_crtc *crtc = to_intel_crtc(plane_state->uapi.crtc);
+
+ if (crtc && intel_color_crtc_has_3dlut(display, crtc->pipe))
+ glk_lut_3d_commit(dsb, crtc, !!plane_state->hw.lut_3d);
+}
+
+static void
+intel_color_load_plane_csc_matrix(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ struct intel_display *display = to_intel_display(plane_state);
+
+ if (display->funcs.color->load_plane_csc_matrix)
+ display->funcs.color->load_plane_csc_matrix(dsb, plane_state);
+}
+
+static void
+intel_color_load_plane_luts(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ struct intel_display *display = to_intel_display(plane_state);
+
+ if (display->funcs.color->load_plane_luts)
+ display->funcs.color->load_plane_luts(dsb, plane_state);
+}
+
+bool
+intel_color_crtc_has_3dlut(struct intel_display *display, enum pipe pipe)
+{
+ if (DISPLAY_VER(display) >= 12)
+ return pipe == PIPE_A || pipe == PIPE_B;
+ else
+ return false;
+}
+
+static void
+intel_color_load_3dlut(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ struct intel_display *display = to_intel_display(plane_state);
+ struct intel_crtc *crtc = to_intel_crtc(plane_state->uapi.crtc);
+
+ if (crtc && intel_color_crtc_has_3dlut(display, crtc->pipe))
+ glk_load_lut_3d(dsb, crtc, plane_state->hw.lut_3d);
+}
+
+void intel_color_plane_program_pipeline(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state)
+{
+ if (plane_state->hw.ctm)
+ intel_color_load_plane_csc_matrix(dsb, plane_state);
+ if (plane_state->hw.degamma_lut || plane_state->hw.gamma_lut)
+ intel_color_load_plane_luts(dsb, plane_state);
+ if (plane_state->hw.lut_3d)
+ intel_color_load_3dlut(dsb, plane_state);
+}
+
void intel_color_crtc_init(struct intel_crtc *crtc)
{
struct intel_display *display = to_intel_display(crtc);
diff --git a/drivers/gpu/drm/i915/display/intel_color.h b/drivers/gpu/drm/i915/display/intel_color.h
index bf7a12ce9df0..c21b9bdf7bb8 100644
--- a/drivers/gpu/drm/i915/display/intel_color.h
+++ b/drivers/gpu/drm/i915/display/intel_color.h
@@ -13,7 +13,9 @@ struct intel_crtc_state;
struct intel_crtc;
struct intel_display;
struct intel_dsb;
+struct intel_plane_state;
struct drm_property_blob;
+enum pipe;
void intel_color_init_hooks(struct intel_display *display);
int intel_color_init(struct intel_display *display);
@@ -40,5 +42,9 @@ bool intel_color_lut_equal(const struct intel_crtc_state *crtc_state,
const struct drm_property_blob *blob2,
bool is_pre_csc_lut);
void intel_color_assert_luts(const struct intel_crtc_state *crtc_state);
-
+void intel_color_plane_program_pipeline(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state);
+void intel_color_plane_commit_arm(struct intel_dsb *dsb,
+ const struct intel_plane_state *plane_state);
+bool intel_color_crtc_has_3dlut(struct intel_display *display, enum pipe pipe);
#endif /* __INTEL_COLOR_H__ */
diff --git a/drivers/gpu/drm/i915/display/intel_color_pipeline.c b/drivers/gpu/drm/i915/display/intel_color_pipeline.c
new file mode 100644
index 000000000000..942d9b9c93ce
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_color_pipeline.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+#include "intel_color.h"
+#include "intel_colorop.h"
+#include "intel_color_pipeline.h"
+#include "intel_de.h"
+#include "intel_display_types.h"
+#include "skl_universal_plane.h"
+
+#define MAX_COLOR_PIPELINES 1
+#define PLANE_DEGAMMA_SIZE 128
+#define PLANE_GAMMA_SIZE 32
+
+static
+int _intel_color_pipeline_plane_init(struct drm_plane *plane, struct drm_prop_enum_list *list,
+ enum pipe pipe)
+{
+ struct drm_device *dev = plane->dev;
+ struct intel_display *display = to_intel_display(dev);
+ struct drm_colorop *prev_op;
+ struct intel_colorop *colorop;
+ int ret;
+
+ colorop = intel_colorop_create(INTEL_PLANE_CB_PRE_CSC_LUT);
+
+ ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane,
+ PLANE_DEGAMMA_SIZE,
+ DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR,
+ DRM_COLOROP_FLAG_ALLOW_BYPASS);
+
+ if (ret)
+ return ret;
+
+ list->type = colorop->base.base.id;
+ list->name = kasprintf(GFP_KERNEL, "Color Pipeline %d", colorop->base.base.id);
+
+ /* TODO: handle failures and clean up */
+ prev_op = &colorop->base;
+
+ if (DISPLAY_VER(display) >= 35 &&
+ intel_color_crtc_has_3dlut(display, pipe) &&
+ plane->type == DRM_PLANE_TYPE_PRIMARY) {
+ colorop = intel_colorop_create(INTEL_PLANE_CB_3DLUT);
+
+ ret = drm_plane_colorop_3dlut_init(dev, &colorop->base, plane, 17,
+ DRM_COLOROP_LUT3D_INTERPOLATION_TETRAHEDRAL,
+ true);
+ if (ret)
+ return ret;
+
+ drm_colorop_set_next_property(prev_op, &colorop->base);
+
+ prev_op = &colorop->base;
+ }
+
+ colorop = intel_colorop_create(INTEL_PLANE_CB_CSC);
+ ret = drm_plane_colorop_ctm_3x4_init(dev, &colorop->base, plane,
+ DRM_COLOROP_FLAG_ALLOW_BYPASS);
+ if (ret)
+ return ret;
+
+ drm_colorop_set_next_property(prev_op, &colorop->base);
+ prev_op = &colorop->base;
+
+ colorop = intel_colorop_create(INTEL_PLANE_CB_POST_CSC_LUT);
+ ret = drm_plane_colorop_curve_1d_lut_init(dev, &colorop->base, plane,
+ PLANE_GAMMA_SIZE,
+ DRM_COLOROP_LUT1D_INTERPOLATION_LINEAR,
+ DRM_COLOROP_FLAG_ALLOW_BYPASS);
+ if (ret)
+ return ret;
+
+ drm_colorop_set_next_property(prev_op, &colorop->base);
+
+ return 0;
+}
+
+int intel_color_pipeline_plane_init(struct drm_plane *plane, enum pipe pipe)
+{
+ struct drm_device *dev = plane->dev;
+ struct intel_display *display = to_intel_display(dev);
+ struct drm_prop_enum_list pipelines[MAX_COLOR_PIPELINES];
+ int len = 0;
+ int ret;
+
+ /* Currently expose pipeline only for HDR planes */
+ if (!icl_is_hdr_plane(display, to_intel_plane(plane)->id))
+ return 0;
+
+ /* Add pipeline consisting of transfer functions */
+ ret = _intel_color_pipeline_plane_init(plane, &pipelines[len], pipe);
+ if (ret)
+ return ret;
+ len++;
+
+ return drm_plane_create_color_pipeline_property(plane, pipelines, len);
+}
diff --git a/drivers/gpu/drm/i915/display/intel_color_pipeline.h b/drivers/gpu/drm/i915/display/intel_color_pipeline.h
new file mode 100644
index 000000000000..a457d306da7f
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_color_pipeline.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef __INTEL_COLOR_PIPELINE_H__
+#define __INTEL_COLOR_PIPELINE_H__
+
+struct drm_plane;
+enum pipe;
+
+int intel_color_pipeline_plane_init(struct drm_plane *plane, enum pipe pipe);
+
+#endif /* __INTEL_COLOR_PIPELINE_H__ */
diff --git a/drivers/gpu/drm/i915/display/intel_color_regs.h b/drivers/gpu/drm/i915/display/intel_color_regs.h
index 8eb643cfead7..c370b6029369 100644
--- a/drivers/gpu/drm/i915/display/intel_color_regs.h
+++ b/drivers/gpu/drm/i915/display/intel_color_regs.h
@@ -316,4 +316,33 @@
#define SKL_BOTTOM_COLOR_CSC_ENABLE REG_BIT(30)
#define SKL_BOTTOM_COLOR(pipe) _MMIO_PIPE(pipe, _SKL_BOTTOM_COLOR_A, _SKL_BOTTOM_COLOR_B)
+/* 3D LUT */
+#define _LUT_3D_CTL_A 0x490A4
+#define _LUT_3D_CTL_B 0x491A4
+#define LUT_3D_CTL(pipe) _MMIO_PIPE(pipe, _LUT_3D_CTL_A, _LUT_3D_CTL_B)
+#define LUT_3D_ENABLE REG_BIT(31)
+#define LUT_3D_READY REG_BIT(30)
+#define LUT_3D_BINDING_MASK REG_GENMASK(23, 22)
+#define LUT_3D_BIND_PIPE REG_FIELD_PREP(LUT_3D_BINDING_MASK, 0)
+#define LUT_3D_BIND_PLANE_1 REG_FIELD_PREP(LUT_3D_BINDING_MASK, 1)
+#define LUT_3D_BIND_PLANE_2 REG_FIELD_PREP(LUT_3D_BINDING_MASK, 2)
+#define LUT_3D_BIND_PLANE_3 REG_FIELD_PREP(LUT_3D_BINDING_MASK, 3)
+
+#define _LUT_3D_INDEX_A 0x490A8
+#define _LUT_3D_INDEX_B 0x491A8
+#define LUT_3D_INDEX(pipe) _MMIO_PIPE(pipe, _LUT_3D_INDEX_A, _LUT_3D_INDEX_B)
+#define LUT_3D_AUTO_INCREMENT REG_BIT(13)
+#define LUT_3D_INDEX_VALUE_MASK REG_GENMASK(12, 0)
+#define LUT_3D_INDEX_VALUE(x) REG_FIELD_PREP(LUT_3D_INDEX_VALUE_MASK, (x))
+
+#define _LUT_3D_DATA_A 0x490AC
+#define _LUT_3D_DATA_B 0x491AC
+#define LUT_3D_DATA(pipe) _MMIO_PIPE(pipe, _LUT_3D_DATA_A, _LUT_3D_DATA_B)
+#define LUT_3D_DATA_RED_MASK REG_GENMASK(29, 20)
+#define LUT_3D_DATA_GREEN_MASK REG_GENMASK(19, 10)
+#define LUT_3D_DATA_BLUE_MASK REG_GENMASK(9, 0)
+#define LUT_3D_DATA_RED(x) REG_FIELD_PREP(LUT_3D_DATA_RED_MASK, (x))
+#define LUT_3D_DATA_GREEN(x) REG_FIELD_PREP(LUT_3D_DATA_GREEN_MASK, (x))
+#define LUT_3D_DATA_BLUE(x) REG_FIELD_PREP(LUT_3D_DATA_BLUE_MASK, (x))
+
#endif /* __INTEL_COLOR_REGS_H__ */
diff --git a/drivers/gpu/drm/i915/display/intel_colorop.c b/drivers/gpu/drm/i915/display/intel_colorop.c
new file mode 100644
index 000000000000..f2fc0d8780ce
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_colorop.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+#include "intel_colorop.h"
+
+struct intel_colorop *to_intel_colorop(struct drm_colorop *colorop)
+{
+ return container_of(colorop, struct intel_colorop, base);
+}
+
+struct intel_colorop *intel_colorop_alloc(void)
+{
+ struct intel_colorop *colorop;
+
+ colorop = kzalloc(sizeof(*colorop), GFP_KERNEL);
+ if (!colorop)
+ return ERR_PTR(-ENOMEM);
+
+ return colorop;
+}
+
+struct intel_colorop *intel_colorop_create(enum intel_color_block id)
+{
+ struct intel_colorop *colorop;
+
+ colorop = intel_colorop_alloc();
+
+ if (IS_ERR(colorop))
+ return colorop;
+
+ colorop->id = id;
+
+ return colorop;
+}
diff --git a/drivers/gpu/drm/i915/display/intel_colorop.h b/drivers/gpu/drm/i915/display/intel_colorop.h
new file mode 100644
index 000000000000..21d58eb9f3d0
--- /dev/null
+++ b/drivers/gpu/drm/i915/display/intel_colorop.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef __INTEL_COLOROP_H__
+#define __INTEL_COLOROP_H__
+
+#include "intel_display_types.h"
+
+struct intel_colorop *to_intel_colorop(struct drm_colorop *colorop);
+struct intel_colorop *intel_colorop_alloc(void);
+struct intel_colorop *intel_colorop_create(enum intel_color_block id);
+
+#endif /* __INTEL_COLOROP_H__ */
diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c
index 7b4fd18c60e2..095a319f8bc9 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -7304,6 +7304,7 @@ static void intel_atomic_dsb_finish(struct intel_atomic_state *state,
struct intel_display *display = to_intel_display(state);
struct intel_crtc_state *new_crtc_state =
intel_atomic_get_new_crtc_state(state, crtc);
+ unsigned int size = new_crtc_state->plane_color_changed ? 8192 : 1024;
if (!new_crtc_state->use_flipq &&
!new_crtc_state->use_dsb &&
@@ -7314,10 +7315,12 @@ static void intel_atomic_dsb_finish(struct intel_atomic_state *state,
* Rough estimate:
* ~64 registers per each plane * 8 planes = 512
* Double that for pipe stuff and other overhead.
+ * ~4913 registers for 3DLUT
+ * ~200 color registers * 3 HDR planes
*/
new_crtc_state->dsb_commit = intel_dsb_prepare(state, crtc, INTEL_DSB_0,
new_crtc_state->use_dsb ||
- new_crtc_state->use_flipq ? 1024 : 16);
+ new_crtc_state->use_flipq ? size : 16);
if (!new_crtc_state->dsb_commit) {
new_crtc_state->use_flipq = false;
new_crtc_state->use_dsb = false;
diff --git a/drivers/gpu/drm/i915/display/intel_display_limits.h b/drivers/gpu/drm/i915/display/intel_display_limits.h
index f0fa27e365ab..cb3c9c665c44 100644
--- a/drivers/gpu/drm/i915/display/intel_display_limits.h
+++ b/drivers/gpu/drm/i915/display/intel_display_limits.h
@@ -138,4 +138,13 @@ enum hpd_pin {
HPD_NUM_PINS
};
+enum intel_color_block {
+ INTEL_PLANE_CB_PRE_CSC_LUT,
+ INTEL_PLANE_CB_CSC,
+ INTEL_PLANE_CB_POST_CSC_LUT,
+ INTEL_PLANE_CB_3DLUT,
+
+ INTEL_CB_MAX
+};
+
#endif /* __INTEL_DISPLAY_LIMITS_H__ */
diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h
index 38702a9e0f50..06bf8f7c0989 100644
--- a/drivers/gpu/drm/i915/display/intel_display_types.h
+++ b/drivers/gpu/drm/i915/display/intel_display_types.h
@@ -646,6 +646,7 @@ struct intel_plane_state {
enum drm_color_encoding color_encoding;
enum drm_color_range color_range;
enum drm_scaling_filter scaling_filter;
+ struct drm_property_blob *ctm, *degamma_lut, *gamma_lut, *lut_3d;
} hw;
struct i915_vma *ggtt_vma;
@@ -1391,6 +1392,9 @@ struct intel_crtc_state {
u8 silence_period_sym_clocks;
u8 lfps_half_cycle_num_of_syms;
} alpm_state;
+
+ /* to track changes in plane color blocks */
+ bool plane_color_changed;
};
enum intel_pipe_crc_source {
@@ -1985,6 +1989,11 @@ struct intel_dp_mst_encoder {
struct intel_connector *connector;
};
+struct intel_colorop {
+ struct drm_colorop base;
+ enum intel_color_block id;
+};
+
static inline struct intel_encoder *
intel_attached_encoder(struct intel_connector *connector)
{
diff --git a/drivers/gpu/drm/i915/display/intel_plane.c b/drivers/gpu/drm/i915/display/intel_plane.c
index 5105e3278bc4..ab6a58530b39 100644
--- a/drivers/gpu/drm/i915/display/intel_plane.c
+++ b/drivers/gpu/drm/i915/display/intel_plane.c
@@ -49,6 +49,7 @@
#include "i9xx_plane_regs.h"
#include "intel_cdclk.h"
#include "intel_cursor.h"
+#include "intel_colorop.h"
#include "intel_display_rps.h"
#include "intel_display_trace.h"
#include "intel_display_types.h"
@@ -336,6 +337,58 @@ intel_plane_copy_uapi_plane_damage(struct intel_plane_state *new_plane_state,
*damage = drm_plane_state_src(&new_uapi_plane_state->uapi);
}
+static bool
+intel_plane_colorop_replace_blob(struct intel_plane_state *plane_state,
+ struct intel_colorop *intel_colorop,
+ struct drm_property_blob *blob)
+{
+ if (intel_colorop->id == INTEL_PLANE_CB_CSC)
+ return drm_property_replace_blob(&plane_state->hw.ctm, blob);
+ else if (intel_colorop->id == INTEL_PLANE_CB_PRE_CSC_LUT)
+ return drm_property_replace_blob(&plane_state->hw.degamma_lut, blob);
+ else if (intel_colorop->id == INTEL_PLANE_CB_POST_CSC_LUT)
+ return drm_property_replace_blob(&plane_state->hw.gamma_lut, blob);
+ else if (intel_colorop->id == INTEL_PLANE_CB_3DLUT)
+ return drm_property_replace_blob(&plane_state->hw.lut_3d, blob);
+
+ return false;
+}
+
+static void
+intel_plane_color_copy_uapi_to_hw_state(struct intel_plane_state *plane_state,
+ const struct intel_plane_state *from_plane_state,
+ struct intel_crtc *crtc)
+{
+ struct drm_colorop *iter_colorop, *colorop;
+ struct drm_colorop_state *new_colorop_state;
+ struct drm_atomic_state *state = plane_state->uapi.state;
+ struct intel_colorop *intel_colorop;
+ struct drm_property_blob *blob;
+ struct intel_atomic_state *intel_atomic_state = to_intel_atomic_state(state);
+ struct intel_crtc_state *new_crtc_state = intel_atomic_state ?
+ intel_atomic_get_new_crtc_state(intel_atomic_state, crtc) : NULL;
+ bool changed = false;
+ int i = 0;
+
+ iter_colorop = plane_state->uapi.color_pipeline;
+
+ while (iter_colorop) {
+ for_each_new_colorop_in_state(state, colorop, new_colorop_state, i) {
+ if (new_colorop_state->colorop == iter_colorop) {
+ blob = new_colorop_state->bypass ? NULL : new_colorop_state->data;
+ intel_colorop = to_intel_colorop(colorop);
+ changed |= intel_plane_colorop_replace_blob(plane_state,
+ intel_colorop,
+ blob);
+ }
+ }
+ iter_colorop = iter_colorop->next;
+ }
+
+ if (new_crtc_state && changed)
+ new_crtc_state->plane_color_changed = true;
+}
+
void intel_plane_copy_uapi_to_hw_state(struct intel_plane_state *plane_state,
const struct intel_plane_state *from_plane_state,
struct intel_crtc *crtc)
@@ -364,6 +417,8 @@ void intel_plane_copy_uapi_to_hw_state(struct intel_plane_state *plane_state,
plane_state->uapi.src = drm_plane_state_src(&from_plane_state->uapi);
plane_state->uapi.dst = drm_plane_state_dest(&from_plane_state->uapi);
+
+ intel_plane_color_copy_uapi_to_hw_state(plane_state, from_plane_state, crtc);
}
void intel_plane_copy_hw_state(struct intel_plane_state *plane_state,
diff --git a/drivers/gpu/drm/i915/display/skl_universal_plane.c b/drivers/gpu/drm/i915/display/skl_universal_plane.c
index 89c8003ccfe7..ee8e24497d2c 100644
--- a/drivers/gpu/drm/i915/display/skl_universal_plane.c
+++ b/drivers/gpu/drm/i915/display/skl_universal_plane.c
@@ -11,6 +11,8 @@
#include "pxp/intel_pxp.h"
#include "intel_bo.h"
+#include "intel_color.h"
+#include "intel_color_pipeline.h"
#include "intel_de.h"
#include "intel_display_irq.h"
#include "intel_display_regs.h"
@@ -1275,6 +1277,18 @@ static u32 glk_plane_color_ctl(const struct intel_plane_state *plane_state)
if (plane_state->force_black)
plane_color_ctl |= PLANE_COLOR_PLANE_CSC_ENABLE;
+ if (plane_state->hw.degamma_lut)
+ plane_color_ctl |= PLANE_COLOR_PRE_CSC_GAMMA_ENABLE;
+
+ if (plane_state->hw.ctm)
+ plane_color_ctl |= PLANE_COLOR_PLANE_CSC_ENABLE;
+
+ if (plane_state->hw.gamma_lut) {
+ plane_color_ctl &= ~PLANE_COLOR_PLANE_GAMMA_DISABLE;
+ if (drm_color_lut32_size(plane_state->hw.gamma_lut) != 32)
+ plane_color_ctl |= PLANE_COLOR_POST_CSC_GAMMA_MULTSEG_ENABLE;
+ }
+
return plane_color_ctl;
}
@@ -1556,6 +1570,8 @@ icl_plane_update_noarm(struct intel_dsb *dsb,
plane_color_ctl = plane_state->color_ctl |
glk_plane_color_ctl_crtc(crtc_state);
+ intel_color_plane_program_pipeline(dsb, plane_state);
+
/* The scaler will handle the output position */
if (plane_state->scaler_id >= 0) {
crtc_x = 0;
@@ -1657,6 +1673,8 @@ icl_plane_update_arm(struct intel_dsb *dsb,
icl_plane_update_sel_fetch_arm(dsb, plane, crtc_state, plane_state);
+ intel_color_plane_commit_arm(dsb, plane_state);
+
/*
* In order to have FBC for fp16 formats pixel normalizer block must be
* active. Check if pixel normalizer block need to be enabled for FBC.
@@ -3001,6 +3019,9 @@ skl_universal_plane_create(struct intel_display *display,
DRM_COLOR_YCBCR_BT709,
DRM_COLOR_YCBCR_LIMITED_RANGE);
+ if (DISPLAY_VER(display) >= 12)
+ intel_color_pipeline_plane_init(&plane->base, pipe);
+
drm_plane_create_alpha_property(&plane->base);
drm_plane_create_blend_mode_property(&plane->base,
BIT(DRM_MODE_BLEND_PIXEL_NONE) |
diff --git a/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h b/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h
index 6f815b231340..6fd4da9f63cf 100644
--- a/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h
+++ b/drivers/gpu/drm/i915/display/skl_universal_plane_regs.h
@@ -254,6 +254,8 @@
#define PLANE_COLOR_PIPE_CSC_ENABLE REG_BIT(23) /* Pre-ICL */
#define PLANE_COLOR_PLANE_CSC_ENABLE REG_BIT(21) /* ICL+ */
#define PLANE_COLOR_INPUT_CSC_ENABLE REG_BIT(20) /* ICL+ */
+#define PLANE_COLOR_POST_CSC_GAMMA_MULTSEG_ENABLE REG_BIT(15) /* TGL+ */
+#define PLANE_COLOR_PRE_CSC_GAMMA_ENABLE REG_BIT(14)
#define PLANE_COLOR_CSC_MODE_MASK REG_GENMASK(19, 17)
#define PLANE_COLOR_CSC_MODE_BYPASS REG_FIELD_PREP(PLANE_COLOR_CSC_MODE_MASK, 0)
#define PLANE_COLOR_CSC_MODE_YUV601_TO_RGB601 REG_FIELD_PREP(PLANE_COLOR_CSC_MODE_MASK, 1)
@@ -290,6 +292,119 @@
_PLANE_INPUT_CSC_POSTOFF_HI_1_A, _PLANE_INPUT_CSC_POSTOFF_HI_1_B, \
_PLANE_INPUT_CSC_POSTOFF_HI_2_A, _PLANE_INPUT_CSC_POSTOFF_HI_2_B)
+#define _MMIO_PLANE_GAMC(plane, i, a, b) _MMIO(_PIPE(plane, a, b) + (i) * 4)
+
+#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_A 0x70160
+#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_B 0x71160
+#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_A 0x70260
+#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_B 0x71260
+#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_A, \
+ _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1_B)
+#define _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_A, \
+ _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2_B)
+#define PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_1(pipe), \
+ _PLANE_POST_CSC_GAMC_SEG0_INDEX_ENH_2(pipe))
+
+#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_A 0x70164
+#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_B 0x71164
+#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_A 0x70264
+#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_B 0x71264
+#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_A, \
+ _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1_B)
+#define _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_A, \
+ _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2_B)
+#define PLANE_POST_CSC_GAMC_SEG0_DATA_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_1(pipe), \
+ _PLANE_POST_CSC_GAMC_SEG0_DATA_ENH_2(pipe))
+
+#define _PLANE_POST_CSC_GAMC_INDEX_ENH_1_A 0x701d8
+#define _PLANE_POST_CSC_GAMC_INDEX_ENH_1_B 0x711d8
+#define _PLANE_POST_CSC_GAMC_INDEX_ENH_2_A 0x702d8
+#define _PLANE_POST_CSC_GAMC_INDEX_ENH_2_B 0x712d8
+#define _PLANE_POST_CSC_GAMC_INDEX_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_ENH_1_A, \
+ _PLANE_POST_CSC_GAMC_INDEX_ENH_1_B)
+#define _PLANE_POST_CSC_GAMC_INDEX_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_ENH_2_A, \
+ _PLANE_POST_CSC_GAMC_INDEX_ENH_2_B)
+#define PLANE_POST_CSC_GAMC_INDEX_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_INDEX_ENH_1(pipe), \
+ _PLANE_POST_CSC_GAMC_INDEX_ENH_2(pipe))
+
+#define _PLANE_POST_CSC_GAMC_DATA_ENH_1_A 0x701dc
+#define _PLANE_POST_CSC_GAMC_DATA_ENH_1_B 0x711dc
+#define _PLANE_POST_CSC_GAMC_DATA_ENH_2_A 0x702dc
+#define _PLANE_POST_CSC_GAMC_DATA_ENH_2_B 0x712dc
+#define _PLANE_POST_CSC_GAMC_DATA_ENH_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_ENH_1_A, \
+ _PLANE_POST_CSC_GAMC_DATA_ENH_1_B)
+#define _PLANE_POST_CSC_GAMC_DATA_ENH_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_ENH_2_A, \
+ _PLANE_POST_CSC_GAMC_DATA_ENH_2_B)
+#define PLANE_POST_CSC_GAMC_DATA_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_DATA_ENH_1(pipe), \
+ _PLANE_POST_CSC_GAMC_DATA_ENH_2(pipe))
+
+#define _PLANE_POST_CSC_GAMC_INDEX_1_A 0x704d8
+#define _PLANE_POST_CSC_GAMC_INDEX_1_B 0x714d8
+#define _PLANE_POST_CSC_GAMC_INDEX_2_A 0x705d8
+#define _PLANE_POST_CSC_GAMC_INDEX_2_B 0x715d8
+#define _PLANE_POST_CSC_GAMC_INDEX_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_1_A, \
+ _PLANE_POST_CSC_GAMC_INDEX_1_B)
+#define _PLANE_POST_CSC_GAMC_INDEX_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_INDEX_2_A, \
+ _PLANE_POST_CSC_GAMC_INDEX_2_B)
+#define PLANE_POST_CSC_GAMC_INDEX(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_INDEX_1(pipe), \
+ _PLANE_POST_CSC_GAMC_INDEX_2(pipe))
+
+#define _PLANE_POST_CSC_GAMC_DATA_1_A 0x704dc
+#define _PLANE_POST_CSC_GAMC_DATA_1_B 0x714dc
+#define _PLANE_POST_CSC_GAMC_DATA_2_A 0x705dc
+#define _PLANE_POST_CSC_GAMC_DATA_2_B 0x715dc
+#define _PLANE_POST_CSC_GAMC_DATA_1(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_1_A, \
+ _PLANE_POST_CSC_GAMC_DATA_1_B)
+#define _PLANE_POST_CSC_GAMC_DATA_2(pipe) _PIPE(pipe, _PLANE_POST_CSC_GAMC_DATA_2_A, \
+ _PLANE_POST_CSC_GAMC_DATA_2_B)
+#define PLANE_POST_CSC_GAMC_DATA(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_POST_CSC_GAMC_DATA_1(pipe), \
+ _PLANE_POST_CSC_GAMC_DATA_2(pipe))
+
+#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_A 0x701d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_B 0x711d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_A 0x702d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_B 0x712d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_A, \
+ _PLANE_PRE_CSC_GAMC_INDEX_ENH_1_B)
+#define _PLANE_PRE_CSC_GAMC_INDEX_ENH_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_A, \
+ _PLANE_PRE_CSC_GAMC_INDEX_ENH_2_B)
+#define PLANE_PRE_CSC_GAMC_INDEX_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_INDEX_ENH_1(pipe), \
+ _PLANE_PRE_CSC_GAMC_INDEX_ENH_2(pipe))
+#define PLANE_PAL_PREC_AUTO_INCREMENT REG_BIT(10)
+
+#define _PLANE_PRE_CSC_GAMC_DATA_ENH_1_A 0x701d4
+#define _PLANE_PRE_CSC_GAMC_DATA_ENH_1_B 0x711d4
+#define _PLANE_PRE_CSC_GAMC_DATA_ENH_2_A 0x702d4
+#define _PLANE_PRE_CSC_GAMC_DATA_ENH_2_B 0x712d4
+#define _PLANE_PRE_CSC_GAMC_DATA_ENH_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_ENH_1_A, \
+ _PLANE_PRE_CSC_GAMC_DATA_ENH_1_B)
+#define _PLANE_PRE_CSC_GAMC_DATA_ENH_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_ENH_2_A, \
+ _PLANE_PRE_CSC_GAMC_DATA_ENH_2_B)
+#define PLANE_PRE_CSC_GAMC_DATA_ENH(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_DATA_ENH_1(pipe), \
+ _PLANE_PRE_CSC_GAMC_DATA_ENH_2(pipe))
+
+#define _PLANE_PRE_CSC_GAMC_INDEX_1_A 0x704d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_1_B 0x714d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_2_A 0x705d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_2_B 0x715d0
+#define _PLANE_PRE_CSC_GAMC_INDEX_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_1_A, \
+ _PLANE_PRE_CSC_GAMC_INDEX_1_B)
+#define _PLANE_PRE_CSC_GAMC_INDEX_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_INDEX_2_A, \
+ _PLANE_PRE_CSC_GAMC_INDEX_2_B)
+#define PLANE_PRE_CSC_GAMC_INDEX(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_INDEX_1(pipe), \
+ _PLANE_PRE_CSC_GAMC_INDEX_2(pipe))
+
+#define _PLANE_PRE_CSC_GAMC_DATA_1_A 0x704d4
+#define _PLANE_PRE_CSC_GAMC_DATA_1_B 0x714d4
+#define _PLANE_PRE_CSC_GAMC_DATA_2_A 0x705d4
+#define _PLANE_PRE_CSC_GAMC_DATA_2_B 0x715d4
+#define _PLANE_PRE_CSC_GAMC_DATA_1(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_1_A, \
+ _PLANE_PRE_CSC_GAMC_DATA_1_B)
+#define _PLANE_PRE_CSC_GAMC_DATA_2(pipe) _PIPE(pipe, _PLANE_PRE_CSC_GAMC_DATA_2_A, \
+ _PLANE_PRE_CSC_GAMC_DATA_2_B)
+#define PLANE_PRE_CSC_GAMC_DATA(pipe, plane, i) _MMIO_PLANE_GAMC(plane, i, _PLANE_PRE_CSC_GAMC_DATA_1(pipe), \
+ _PLANE_PRE_CSC_GAMC_DATA_2(pipe))
+
#define _PLANE_CSC_RY_GY_1_A 0x70210
#define _PLANE_CSC_RY_GY_2_A 0x70310
#define _PLANE_CSC_RY_GY_1_B 0x71210
diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
index 890183de2277..a30060fd4429 100644
--- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c
+++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c
@@ -20,16 +20,6 @@
#include "gt/intel_gt_regs.h"
#ifdef CONFIG_64BIT
-static void _release_bars(struct pci_dev *pdev)
-{
- int resno;
-
- for (resno = PCI_STD_RESOURCES; resno < PCI_STD_RESOURCE_END; resno++) {
- if (pci_resource_len(pdev, resno))
- pci_release_resource(pdev, resno);
- }
-}
-
static void
_resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size)
{
@@ -37,9 +27,7 @@ _resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size)
int bar_size = pci_rebar_bytes_to_size(size);
int ret;
- _release_bars(pdev);
-
- ret = pci_resize_resource(pdev, resno, bar_size);
+ ret = pci_resize_resource(pdev, resno, bar_size, 0);
if (ret) {
drm_info(&i915->drm, "Failed to resize BAR%d to %dM (%pe)\n",
resno, 1 << bar_size, ERR_PTR(ret));
@@ -63,16 +51,12 @@ static void i915_resize_lmem_bar(struct drm_i915_private *i915, resource_size_t
current_size = roundup_pow_of_two(pci_resource_len(pdev, GEN12_LMEM_BAR));
if (i915->params.lmem_bar_size) {
- u32 bar_sizes;
-
- rebar_size = i915->params.lmem_bar_size *
- (resource_size_t)SZ_1M;
- bar_sizes = pci_rebar_get_possible_sizes(pdev, GEN12_LMEM_BAR);
-
+ rebar_size = i915->params.lmem_bar_size * (resource_size_t)SZ_1M;
if (rebar_size == current_size)
return;
- if (!(bar_sizes & BIT(pci_rebar_bytes_to_size(rebar_size))) ||
+ if (!pci_rebar_size_supported(pdev, GEN12_LMEM_BAR,
+ pci_rebar_bytes_to_size(rebar_size)) ||
rebar_size >= roundup_pow_of_two(lmem_size)) {
rebar_size = lmem_size;
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index bbeba0d3fca8..3abc9206f1a8 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1141,6 +1141,122 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
return func(vgpu, index, start, count, flags, data);
}
+static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
+ struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
+ int nr_areas = 1;
+ int cap_type_id;
+ unsigned int i;
+ int ret;
+
+ switch (info->index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vgpu->gvt->device_info.cfg_space_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vgpu->cfg_space.bar[info->index].size;
+ if (!info->size) {
+ info->flags = 0;
+ break;
+ }
+
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ break;
+ case VFIO_PCI_BAR1_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0;
+ info->flags = 0;
+ break;
+ case VFIO_PCI_BAR2_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->flags = VFIO_REGION_INFO_FLAG_CAPS |
+ VFIO_REGION_INFO_FLAG_MMAP |
+ VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ info->size = gvt_aperture_sz(vgpu->gvt);
+
+ sparse = kzalloc(struct_size(sparse, areas, nr_areas),
+ GFP_KERNEL);
+ if (!sparse)
+ return -ENOMEM;
+
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+ sparse->nr_areas = nr_areas;
+ cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->areas[0].offset =
+ PAGE_ALIGN(vgpu_aperture_offset(vgpu));
+ sparse->areas[0].size = vgpu_aperture_sz(vgpu);
+ break;
+
+ case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0;
+ info->flags = 0;
+
+ gvt_dbg_core("get region info bar:%d\n", info->index);
+ break;
+
+ case VFIO_PCI_ROM_REGION_INDEX:
+ case VFIO_PCI_VGA_REGION_INDEX:
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0;
+ info->flags = 0;
+
+ gvt_dbg_core("get region info index:%d\n", info->index);
+ break;
+ default: {
+ struct vfio_region_info_cap_type cap_type = {
+ .header.id = VFIO_REGION_INFO_CAP_TYPE,
+ .header.version = 1
+ };
+
+ if (info->index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions)
+ return -EINVAL;
+ info->index = array_index_nospec(
+ info->index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions);
+
+ i = info->index - VFIO_PCI_NUM_REGIONS;
+
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vgpu->region[i].size;
+ info->flags = vgpu->region[i].flags;
+
+ cap_type.type = vgpu->region[i].type;
+ cap_type.subtype = vgpu->region[i].subtype;
+
+ ret = vfio_info_add_capability(caps, &cap_type.header,
+ sizeof(cap_type));
+ if (ret)
+ return ret;
+ }
+ }
+
+ if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
+ ret = -EINVAL;
+ if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) {
+ ret = vfio_info_add_capability(
+ caps, &sparse->header,
+ struct_size(sparse, areas, sparse->nr_areas));
+ }
+ if (ret) {
+ kfree(sparse);
+ return ret;
+ }
+ }
+
+ kfree(sparse);
+ return 0;
+}
+
static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
unsigned long arg)
{
@@ -1169,152 +1285,6 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
- } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
- struct vfio_region_info info;
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
- unsigned int i;
- int ret;
- struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
- int nr_areas = 1;
- int cap_type_id;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
- case VFIO_PCI_CONFIG_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vgpu->gvt->device_info.cfg_space_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- break;
- case VFIO_PCI_BAR0_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vgpu->cfg_space.bar[info.index].size;
- if (!info.size) {
- info.flags = 0;
- break;
- }
-
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- break;
- case VFIO_PCI_BAR1_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0;
- info.flags = 0;
- break;
- case VFIO_PCI_BAR2_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.flags = VFIO_REGION_INFO_FLAG_CAPS |
- VFIO_REGION_INFO_FLAG_MMAP |
- VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- info.size = gvt_aperture_sz(vgpu->gvt);
-
- sparse = kzalloc(struct_size(sparse, areas, nr_areas),
- GFP_KERNEL);
- if (!sparse)
- return -ENOMEM;
-
- sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
- sparse->header.version = 1;
- sparse->nr_areas = nr_areas;
- cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
- sparse->areas[0].offset =
- PAGE_ALIGN(vgpu_aperture_offset(vgpu));
- sparse->areas[0].size = vgpu_aperture_sz(vgpu);
- break;
-
- case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0;
- info.flags = 0;
-
- gvt_dbg_core("get region info bar:%d\n", info.index);
- break;
-
- case VFIO_PCI_ROM_REGION_INDEX:
- case VFIO_PCI_VGA_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0;
- info.flags = 0;
-
- gvt_dbg_core("get region info index:%d\n", info.index);
- break;
- default:
- {
- struct vfio_region_info_cap_type cap_type = {
- .header.id = VFIO_REGION_INFO_CAP_TYPE,
- .header.version = 1 };
-
- if (info.index >= VFIO_PCI_NUM_REGIONS +
- vgpu->num_regions)
- return -EINVAL;
- info.index =
- array_index_nospec(info.index,
- VFIO_PCI_NUM_REGIONS +
- vgpu->num_regions);
-
- i = info.index - VFIO_PCI_NUM_REGIONS;
-
- info.offset =
- VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vgpu->region[i].size;
- info.flags = vgpu->region[i].flags;
-
- cap_type.type = vgpu->region[i].type;
- cap_type.subtype = vgpu->region[i].subtype;
-
- ret = vfio_info_add_capability(&caps,
- &cap_type.header,
- sizeof(cap_type));
- if (ret)
- return ret;
- }
- }
-
- if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
- ret = -EINVAL;
- if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP)
- ret = vfio_info_add_capability(&caps,
- &sparse->header,
- struct_size(sparse, areas,
- sparse->nr_areas));
- if (ret) {
- kfree(sparse);
- return ret;
- }
- }
-
- if (caps.size) {
- info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- info.cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user((void __user *)arg +
- sizeof(info), caps.buf,
- caps.size)) {
- kfree(caps.buf);
- kfree(sparse);
- return -EFAULT;
- }
- info.cap_offset = sizeof(info);
- }
-
- kfree(caps.buf);
- }
-
- kfree(sparse);
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
struct vfio_irq_info info;
@@ -1477,6 +1447,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = {
.write = intel_vgpu_write,
.mmap = intel_vgpu_mmap,
.ioctl = intel_vgpu_ioctl,
+ .get_region_info_caps = intel_vgpu_ioctl_get_region_info,
.dma_unmap = intel_vgpu_dma_unmap,
.bind_iommufd = vfio_iommufd_emulated_bind,
.unbind_iommufd = vfio_iommufd_emulated_unbind,
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index e4b273b025d2..62be4a5227e4 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -184,6 +184,10 @@ xe-$(CONFIG_PCI_IOV) += \
xe_sriov_pf_sysfs.o \
xe_tile_sriov_pf_debugfs.o
+ifdef CONFIG_XE_VFIO_PCI
+ xe-$(CONFIG_PCI_IOV) += xe_sriov_vfio.o
+endif
+
# include helpers for tests even when XE is built-in
ifdef CONFIG_DRM_XE_KUNIT_TEST
xe-y += tests/xe_kunit_helpers.o
@@ -242,6 +246,8 @@ xe-$(CONFIG_DRM_XE_DISPLAY) += \
i915-display/intel_cdclk.o \
i915-display/intel_cmtg.o \
i915-display/intel_color.o \
+ i915-display/intel_colorop.o \
+ i915-display/intel_color_pipeline.o \
i915-display/intel_combo_phy.o \
i915-display/intel_connector.o \
i915-display/intel_crtc.o \
diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
index 9955397aaaa9..c7a77a3a9681 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
@@ -54,13 +54,14 @@ static inline void xe_sched_tdr_queue_imm(struct xe_gpu_scheduler *sched)
static inline void xe_sched_resubmit_jobs(struct xe_gpu_scheduler *sched)
{
struct drm_sched_job *s_job;
+ bool restore_replay = false;
list_for_each_entry(s_job, &sched->base.pending_list, list) {
struct drm_sched_fence *s_fence = s_job->s_fence;
struct dma_fence *hw_fence = s_fence->parent;
- if (to_xe_sched_job(s_job)->skip_emit ||
- (hw_fence && !dma_fence_is_signaled(hw_fence)))
+ restore_replay |= to_xe_sched_job(s_job)->restore_replay;
+ if (restore_replay || (hw_fence && !dma_fence_is_signaled(hw_fence)))
sched->base.ops->run_job(s_job);
}
}
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
index 62f6cc45a764..59c5c6b4d994 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
@@ -711,7 +711,7 @@ static u64 pf_profile_fair_ggtt(struct xe_gt *gt, unsigned int num_vfs)
if (num_vfs > 56)
return SZ_64M - SZ_8M;
- return rounddown_pow_of_two(shareable / num_vfs);
+ return rounddown_pow_of_two(div_u64(shareable, num_vfs));
}
/**
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
index d5d918ddce4f..3174a8dee779 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
@@ -17,6 +17,7 @@
#include "xe_gt_sriov_pf_helpers.h"
#include "xe_gt_sriov_pf_migration.h"
#include "xe_gt_sriov_printk.h"
+#include "xe_guc.h"
#include "xe_guc_buf.h"
#include "xe_guc_ct.h"
#include "xe_migrate.h"
@@ -1023,6 +1024,12 @@ static void action_ring_cleanup(void *arg)
ptr_ring_cleanup(r, destroy_pf_packet);
}
+static void pf_gt_migration_check_support(struct xe_gt *gt)
+{
+ if (GUC_FIRMWARE_VER(&gt->uc.guc) < MAKE_GUC_VER(70, 54, 0))
+ xe_sriov_pf_migration_disable(gt_to_xe(gt), "requires GuC version >= 70.54.0");
+}
+
/**
* xe_gt_sriov_pf_migration_init() - Initialize support for VF migration.
* @gt: the &xe_gt
@@ -1039,6 +1046,8 @@ int xe_gt_sriov_pf_migration_init(struct xe_gt *gt)
xe_gt_assert(gt, IS_SRIOV_PF(xe));
+ pf_gt_migration_check_support(gt);
+
if (!pf_migration_supported(gt))
return 0;
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index d4ffdb71ef3d..ed7be50b2f72 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -822,7 +822,7 @@ static void submit_exec_queue(struct xe_exec_queue *q, struct xe_sched_job *job)
xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
- if (!job->skip_emit || job->last_replay) {
+ if (!job->restore_replay || job->last_replay) {
if (xe_exec_queue_is_parallel(q))
wq_item_append(q);
else
@@ -881,10 +881,10 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
if (!killed_or_banned_or_wedged && !xe_sched_job_is_error(job)) {
if (!exec_queue_registered(q))
register_exec_queue(q, GUC_CONTEXT_NORMAL);
- if (!job->skip_emit)
+ if (!job->restore_replay)
q->ring_ops->emit_job(job);
submit_exec_queue(q, job);
- job->skip_emit = false;
+ job->restore_replay = false;
}
/*
@@ -2112,6 +2112,18 @@ static void guc_exec_queue_revert_pending_state_change(struct xe_guc *guc,
q->guc->resume_time = 0;
}
+static void lrc_parallel_clear(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = gt_to_xe(lrc->gt);
+ struct iosys_map map = xe_lrc_parallel_map(lrc);
+ int i;
+
+ for (i = 0; i < WQ_SIZE / sizeof(u32); ++i)
+ parallel_write(xe, map, wq[i],
+ FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) |
+ FIELD_PREP(WQ_LEN_MASK, 0));
+}
+
/*
* This function is quite complex but only real way to ensure no state is lost
* during VF resume flows. The function scans the queue state, make adjustments
@@ -2135,8 +2147,8 @@ static void guc_exec_queue_pause(struct xe_guc *guc, struct xe_exec_queue *q)
guc_exec_queue_revert_pending_state_change(guc, q);
if (xe_exec_queue_is_parallel(q)) {
- struct xe_device *xe = guc_to_xe(guc);
- struct iosys_map map = xe_lrc_parallel_map(q->lrc[0]);
+ /* Pairs with WRITE_ONCE in __xe_exec_queue_init */
+ struct xe_lrc *lrc = READ_ONCE(q->lrc[0]);
/*
* NOP existing WQ commands that may contain stale GGTT
@@ -2144,14 +2156,14 @@ static void guc_exec_queue_pause(struct xe_guc *guc, struct xe_exec_queue *q)
* seems to get confused if the WQ head/tail pointers are
* adjusted.
*/
- for (i = 0; i < WQ_SIZE / sizeof(u32); ++i)
- parallel_write(xe, map, wq[i],
- FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) |
- FIELD_PREP(WQ_LEN_MASK, 0));
+ if (lrc)
+ lrc_parallel_clear(lrc);
}
job = xe_sched_first_pending_job(sched);
if (job) {
+ job->restore_replay = true;
+
/*
* Adjust software tail so jobs submitted overwrite previous
* position in ring buffer with new GGTT addresses.
@@ -2241,17 +2253,18 @@ static void guc_exec_queue_unpause_prepare(struct xe_guc *guc,
struct xe_exec_queue *q)
{
struct xe_gpu_scheduler *sched = &q->guc->sched;
- struct drm_sched_job *s_job;
struct xe_sched_job *job = NULL;
+ bool restore_replay = false;
- list_for_each_entry(s_job, &sched->base.pending_list, list) {
- job = to_xe_sched_job(s_job);
-
- xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d",
- q->guc->id, xe_sched_job_seqno(job));
+ list_for_each_entry(job, &sched->base.pending_list, drm.list) {
+ restore_replay |= job->restore_replay;
+ if (restore_replay) {
+ xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d",
+ q->guc->id, xe_sched_job_seqno(job));
- q->ring_ops->emit_job(job);
- job->skip_emit = true;
+ q->ring_ops->emit_job(job);
+ job->restore_replay = true;
+ }
}
if (job)
diff --git a/drivers/gpu/drm/xe/xe_pagefault.c b/drivers/gpu/drm/xe/xe_pagefault.c
index fe3e40145012..afb06598b6e1 100644
--- a/drivers/gpu/drm/xe/xe_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_pagefault.c
@@ -102,7 +102,6 @@ retry_userptr:
/* Lock VM and BOs dma-resv */
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
- drm_exec_init(&exec, 0, 0);
drm_exec_until_all_locked(&exec) {
err = xe_pagefault_begin(&exec, vma, tile->mem.vram,
needs_vram == 1);
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 4636e4ef9baa..9c9ea10d994c 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -1223,6 +1223,23 @@ static struct pci_driver xe_pci_driver = {
#endif
};
+/**
+ * xe_pci_to_pf_device() - Get PF &xe_device.
+ * @pdev: the VF &pci_dev device
+ *
+ * Return: pointer to PF &xe_device, NULL otherwise.
+ */
+struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev)
+{
+ struct drm_device *drm;
+
+ drm = pci_iov_get_pf_drvdata(pdev, &xe_pci_driver);
+ if (IS_ERR(drm))
+ return NULL;
+
+ return to_xe_device(drm);
+}
+
int xe_register_pci_driver(void)
{
return pci_register_driver(&xe_pci_driver);
diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h
index 611c1209b14c..11bcc5fe2c5b 100644
--- a/drivers/gpu/drm/xe/xe_pci.h
+++ b/drivers/gpu/drm/xe/xe_pci.h
@@ -6,7 +6,10 @@
#ifndef _XE_PCI_H_
#define _XE_PCI_H_
+struct pci_dev;
+
int xe_register_pci_driver(void);
void xe_unregister_pci_driver(void);
+struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev);
#endif
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index 44924512830f..766922530265 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -726,6 +726,13 @@ static void xe_pm_runtime_lockdep_prime(void)
/**
* xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously
* @xe: xe device instance
+ *
+ * When possible, scope-based runtime PM (through guard(xe_pm_runtime)) is
+ * be preferred over direct usage of this function. Manual get/put handling
+ * should only be used when the function contains goto-based logic which
+ * can break scope-based handling, or when the lifetime of the runtime PM
+ * reference does not match a specific scope (e.g., runtime PM obtained in one
+ * function and released in a different one).
*/
void xe_pm_runtime_get(struct xe_device *xe)
{
@@ -758,6 +765,13 @@ void xe_pm_runtime_put(struct xe_device *xe)
* xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl
* @xe: xe device instance
*
+ * When possible, scope-based runtime PM (through
+ * ACQUIRE(xe_pm_runtime_ioctl, ...)) is be preferred over direct usage of this
+ * function. Manual get/put handling should only be used when the function
+ * contains goto-based logic which can break scope-based handling, or when the
+ * lifetime of the runtime PM reference does not match a specific scope (e.g.,
+ * runtime PM obtained in one function and released in a different one).
+ *
* Returns: Any number greater than or equal to 0 for success, negative error
* code otherwise.
*/
@@ -827,6 +841,13 @@ static bool xe_pm_suspending_or_resuming(struct xe_device *xe)
* It will warn if not protected.
* The reference should be put back after this function regardless, since it
* will always bump the usage counter, regardless.
+ *
+ * When possible, scope-based runtime PM (through guard(xe_pm_runtime_noresume))
+ * is be preferred over direct usage of this function. Manual get/put handling
+ * should only be used when the function contains goto-based logic which can
+ * break scope-based handling, or when the lifetime of the runtime PM reference
+ * does not match a specific scope (e.g., runtime PM obtained in one function
+ * and released in a different one).
*/
void xe_pm_runtime_get_noresume(struct xe_device *xe)
{
diff --git a/drivers/gpu/drm/xe/xe_pm.h b/drivers/gpu/drm/xe/xe_pm.h
index f7f89a18b6fc..6b27039e7b2d 100644
--- a/drivers/gpu/drm/xe/xe_pm.h
+++ b/drivers/gpu/drm/xe/xe_pm.h
@@ -6,6 +6,7 @@
#ifndef _XE_PM_H_
#define _XE_PM_H_
+#include <linux/cleanup.h>
#include <linux/pm_runtime.h>
#define DEFAULT_VRAM_THRESHOLD 300 /* in MB */
@@ -37,4 +38,20 @@ int xe_pm_block_on_suspend(struct xe_device *xe);
void xe_pm_might_block_on_suspend(void);
int xe_pm_module_init(void);
+static inline void __xe_pm_runtime_noop(struct xe_device *xe) {}
+
+DEFINE_GUARD(xe_pm_runtime, struct xe_device *,
+ xe_pm_runtime_get(_T), xe_pm_runtime_put(_T))
+DEFINE_GUARD(xe_pm_runtime_noresume, struct xe_device *,
+ xe_pm_runtime_get_noresume(_T), xe_pm_runtime_put(_T))
+DEFINE_GUARD_COND(xe_pm_runtime, _ioctl, xe_pm_runtime_get_ioctl(_T), _RET >= 0)
+
+/*
+ * Used when a function needs to release runtime PM in all possible cases
+ * and error paths, but the wakeref was already acquired by a different
+ * function (i.e., get() has already happened so only a put() is needed).
+ */
+DEFINE_GUARD(xe_pm_runtime_release_only, struct xe_device *,
+ __xe_pm_runtime_noop(_T), xe_pm_runtime_put(_T));
+
#endif
diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
index d26612abb4ca..7c4c54fe920a 100644
--- a/drivers/gpu/drm/xe/xe_sched_job_types.h
+++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
@@ -63,8 +63,8 @@ struct xe_sched_job {
bool ring_ops_flush_tlb;
/** @ggtt: mapped in ggtt. */
bool ggtt;
- /** @skip_emit: skip emitting the job */
- bool skip_emit;
+ /** @restore_replay: job being replayed for restore */
+ bool restore_replay;
/** @last_replay: last job being replayed */
bool last_replay;
/** @ptrs: per instance pointers. */
diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c
index de06cc690fc8..6c4b16409cc9 100644
--- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c
+++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c
@@ -46,13 +46,37 @@ bool xe_sriov_pf_migration_supported(struct xe_device *xe)
{
xe_assert(xe, IS_SRIOV_PF(xe));
- return xe->sriov.pf.migration.supported;
+ return IS_ENABLED(CONFIG_DRM_XE_DEBUG) || !xe->sriov.pf.migration.disabled;
}
-static bool pf_check_migration_support(struct xe_device *xe)
+/**
+ * xe_sriov_pf_migration_disable() - Turn off SR-IOV VF migration support on PF.
+ * @xe: the &xe_device instance.
+ * @fmt: format string for the log message, to be combined with following VAs.
+ */
+void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list va_args;
+
+ xe_assert(xe, IS_SRIOV_PF(xe));
+
+ va_start(va_args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &va_args;
+ xe_sriov_notice(xe, "migration %s: %pV\n",
+ IS_ENABLED(CONFIG_DRM_XE_DEBUG) ?
+ "missing prerequisite" : "disabled",
+ &vaf);
+ va_end(va_args);
+
+ xe->sriov.pf.migration.disabled = true;
+}
+
+static void pf_migration_check_support(struct xe_device *xe)
{
- /* XXX: for now this is for feature enabling only */
- return IS_ENABLED(CONFIG_DRM_XE_DEBUG);
+ if (!xe_device_has_memirq(xe))
+ xe_sriov_pf_migration_disable(xe, "requires memory-based IRQ support");
}
static void pf_migration_cleanup(void *arg)
@@ -77,7 +101,8 @@ int xe_sriov_pf_migration_init(struct xe_device *xe)
xe_assert(xe, IS_SRIOV_PF(xe));
- xe->sriov.pf.migration.supported = pf_check_migration_support(xe);
+ pf_migration_check_support(xe);
+
if (!xe_sriov_pf_migration_supported(xe))
return 0;
diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h
index b806298a0bb6..f8f408df8481 100644
--- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h
+++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h
@@ -14,6 +14,7 @@ struct xe_sriov_packet;
int xe_sriov_pf_migration_init(struct xe_device *xe);
bool xe_sriov_pf_migration_supported(struct xe_device *xe);
+void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...);
int xe_sriov_pf_migration_restore_produce(struct xe_device *xe, unsigned int vfid,
struct xe_sriov_packet *data);
struct xe_sriov_packet *
diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h
index 363d673ee1dd..7d9a8a278d91 100644
--- a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h
+++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h
@@ -14,8 +14,8 @@
* struct xe_sriov_pf_migration - Xe device level VF migration data
*/
struct xe_sriov_pf_migration {
- /** @supported: indicates whether VF migration feature is supported */
- bool supported;
+ /** @disabled: indicates whether VF migration feature is disabled */
+ bool disabled;
};
/**
diff --git a/drivers/gpu/drm/xe/xe_sriov_vfio.c b/drivers/gpu/drm/xe/xe_sriov_vfio.c
new file mode 100644
index 000000000000..e9a7615bb5c5
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_sriov_vfio.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <drm/intel/xe_sriov_vfio.h>
+#include <linux/cleanup.h>
+
+#include "xe_pci.h"
+#include "xe_pm.h"
+#include "xe_sriov_pf_control.h"
+#include "xe_sriov_pf_helpers.h"
+#include "xe_sriov_pf_migration.h"
+
+struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev)
+{
+ return xe_pci_to_pf_device(pdev);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci");
+
+bool xe_sriov_vfio_migration_supported(struct xe_device *xe)
+{
+ if (!IS_SRIOV_PF(xe))
+ return -EPERM;
+
+ return xe_sriov_pf_migration_supported(xe);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_migration_supported, "xe-vfio-pci");
+
+#define DEFINE_XE_SRIOV_VFIO_FUNCTION(_type, _func, _impl) \
+_type xe_sriov_vfio_##_func(struct xe_device *xe, unsigned int vfid) \
+{ \
+ if (!IS_SRIOV_PF(xe)) \
+ return -EPERM; \
+ if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) \
+ return -EINVAL; \
+ \
+ guard(xe_pm_runtime_noresume)(xe); \
+ \
+ return xe_sriov_pf_##_impl(xe, vfid); \
+} \
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_##_func, "xe-vfio-pci")
+
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, wait_flr_done, control_wait_flr);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, suspend_device, control_pause_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_device, control_resume_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_enter, control_trigger_save_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_exit, control_finish_save_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_enter, control_trigger_restore_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_exit, control_finish_restore_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(int, error, control_stop_vf);
+DEFINE_XE_SRIOV_VFIO_FUNCTION(ssize_t, stop_copy_size, migration_size);
+
+ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid,
+ char __user *buf, size_t len)
+{
+ if (!IS_SRIOV_PF(xe))
+ return -EPERM;
+ if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
+ return -EINVAL;
+
+ guard(xe_pm_runtime_noresume)(xe);
+
+ return xe_sriov_pf_migration_read(xe, vfid, buf, len);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_read, "xe-vfio-pci");
+
+ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid,
+ const char __user *buf, size_t len)
+{
+ if (!IS_SRIOV_PF(xe))
+ return -EPERM;
+ if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe))
+ return -EINVAL;
+
+ guard(xe_pm_runtime_noresume)(xe);
+
+ return xe_sriov_pf_migration_write(xe, vfid, buf, len);
+}
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_write, "xe-vfio-pci");
diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c
index 0e10da790cc5..d50baefcd124 100644
--- a/drivers/gpu/drm/xe/xe_vram.c
+++ b/drivers/gpu/drm/xe/xe_vram.c
@@ -25,39 +25,13 @@
#include "xe_vram.h"
#include "xe_vram_types.h"
-#define BAR_SIZE_SHIFT 20
-
-/*
- * Release all the BARs that could influence/block LMEMBAR resizing, i.e.
- * assigned IORESOURCE_MEM_64 BARs
- */
-static void release_bars(struct pci_dev *pdev)
-{
- struct resource *res;
- int i;
-
- pci_dev_for_each_resource(pdev, res, i) {
- /* Resource already un-assigned, do not reset it */
- if (!res->parent)
- continue;
-
- /* No need to release unrelated BARs */
- if (!(res->flags & IORESOURCE_MEM_64))
- continue;
-
- pci_release_resource(pdev, i);
- }
-}
-
static void resize_bar(struct xe_device *xe, int resno, resource_size_t size)
{
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
int bar_size = pci_rebar_bytes_to_size(size);
int ret;
- release_bars(pdev);
-
- ret = pci_resize_resource(pdev, resno, bar_size);
+ ret = pci_resize_resource(pdev, resno, bar_size, 0);
if (ret) {
drm_info(&xe->drm, "Failed to resize BAR%d to %dM (%pe). Consider enabling 'Resizable BAR' support in your BIOS\n",
resno, 1 << bar_size, ERR_PTR(ret));
@@ -79,41 +53,37 @@ void xe_vram_resize_bar(struct xe_device *xe)
resource_size_t current_size;
resource_size_t rebar_size;
struct resource *root_res;
- u32 bar_size_mask;
+ int max_size, i;
u32 pci_cmd;
- int i;
/* gather some relevant info */
current_size = pci_resource_len(pdev, LMEM_BAR);
- bar_size_mask = pci_rebar_get_possible_sizes(pdev, LMEM_BAR);
-
- if (!bar_size_mask)
- return;
if (force_vram_bar_size < 0)
return;
/* set to a specific size? */
if (force_vram_bar_size) {
- u32 bar_size_bit;
+ rebar_size = pci_rebar_bytes_to_size(force_vram_bar_size *
+ (resource_size_t)SZ_1M);
- rebar_size = force_vram_bar_size * (resource_size_t)SZ_1M;
-
- bar_size_bit = bar_size_mask & BIT(pci_rebar_bytes_to_size(rebar_size));
-
- if (!bar_size_bit) {
+ if (!pci_rebar_size_supported(pdev, LMEM_BAR, rebar_size)) {
drm_info(&xe->drm,
- "Requested size: %lluMiB is not supported by rebar sizes: 0x%x. Leaving default: %lluMiB\n",
- (u64)rebar_size >> 20, bar_size_mask, (u64)current_size >> 20);
+ "Requested size: %lluMiB is not supported by rebar sizes: 0x%llx. Leaving default: %lluMiB\n",
+ (u64)pci_rebar_size_to_bytes(rebar_size) >> 20,
+ pci_rebar_get_possible_sizes(pdev, LMEM_BAR),
+ (u64)current_size >> 20);
return;
}
- rebar_size = 1ULL << (__fls(bar_size_bit) + BAR_SIZE_SHIFT);
-
+ rebar_size = pci_rebar_size_to_bytes(rebar_size);
if (rebar_size == current_size)
return;
} else {
- rebar_size = 1ULL << (__fls(bar_size_mask) + BAR_SIZE_SHIFT);
+ max_size = pci_rebar_get_max_size(pdev, LMEM_BAR);
+ if (max_size < 0)
+ return;
+ rebar_size = pci_rebar_size_to_bytes(max_size);
/* only resize if larger than current */
if (rebar_size <= current_size)
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index f0323f1d6f01..794b9778816b 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -80,6 +80,7 @@ config INFINIBAND_VIRT_DMA
if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
if !UML
source "drivers/infiniband/hw/bnxt_re/Kconfig"
+source "drivers/infiniband/hw/bng_re/Kconfig"
source "drivers/infiniband/hw/cxgb4/Kconfig"
source "drivers/infiniband/hw/efa/Kconfig"
source "drivers/infiniband/hw/erdma/Kconfig"
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 01bede8ba105..024df6ee239d 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -34,7 +34,6 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
-#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
@@ -1057,6 +1056,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
{
struct cm_id_private *cm_id_priv;
enum ib_cm_state old_state;
+ unsigned long timeout;
struct cm_work *work;
int ret;
@@ -1167,10 +1167,9 @@ retest:
xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
cm_deref_id(cm_id_priv);
+ timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4);
do {
- ret = wait_for_completion_timeout(&cm_id_priv->comp,
- msecs_to_jiffies(
- CM_DESTROY_ID_WAIT_TIMEOUT));
+ ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout);
if (!ret) /* timeout happened */
cm_destroy_id_wait_timeout(cm_id, old_state);
} while (!ret);
@@ -4518,7 +4517,7 @@ static int __init ib_cm_init(void)
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
INIT_LIST_HEAD(&cm.timewait_list);
- cm.wq = alloc_workqueue("ib_cm", 0, 1);
+ cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1);
if (!cm.wq) {
ret = -ENOMEM;
goto error2;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 5b2d3ae3f9fc..95e89f5c147c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4475,6 +4475,8 @@ int rdma_connect_locked(struct rdma_cm_id *id,
container_of(id, struct rdma_id_private, id);
int ret;
+ lockdep_assert_held(&id_priv->handler_mutex);
+
if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
return -EINVAL;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b4f3c835844a..13e8a1714bbd 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -3021,7 +3021,7 @@ static int __init ib_core_init(void)
{
int ret = -ENOMEM;
- ib_wq = alloc_workqueue("infiniband", 0, 0);
+ ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0);
if (!ib_wq)
return -ENOMEM;
@@ -3031,7 +3031,7 @@ static int __init ib_core_init(void)
goto err;
ib_comp_wq = alloc_workqueue("ib-comp-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0);
if (!ib_comp_wq)
goto err_unbound;
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index a7de6f403fca..b097cfcade1c 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -175,7 +175,7 @@ void rdma_restrack_new(struct rdma_restrack_entry *res,
EXPORT_SYMBOL(rdma_restrack_new);
/**
- * rdma_restrack_add() - add object to the reource tracking database
+ * rdma_restrack_add() - add object to the resource tracking database
* @res: resource entry
*/
void rdma_restrack_add(struct rdma_restrack_entry *res)
@@ -277,7 +277,7 @@ int rdma_restrack_put(struct rdma_restrack_entry *res)
EXPORT_SYMBOL(rdma_restrack_put);
/**
- * rdma_restrack_del() - delete object from the reource tracking database
+ * rdma_restrack_del() - delete object from the resource tracking database
* @res: resource entry
*/
void rdma_restrack_del(struct rdma_restrack_entry *res)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index f86ece701db6..ec3be65a2b88 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -366,7 +366,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
xa_lock(&ctx_table);
if (xa_load(&ctx_table, ctx->id) == ctx)
- queue_work(system_unbound_wq, &ctx->close_work);
+ queue_work(system_dfl_wq, &ctx->close_work);
xa_unlock(&ctx_table);
}
return 0;
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c5b686394760..8137031c2a65 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -45,6 +45,8 @@
#include "uverbs.h"
+#define RESCHED_LOOP_CNT_THRESHOLD 0x1000
+
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
bool make_dirty = umem->writable && dirty;
@@ -55,10 +57,14 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
DMA_BIDIRECTIONAL, 0);
- for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i)
+ for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
unpin_user_page_range_dirty_lock(sg_page(sg),
DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);
+ if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
+ cond_resched();
+ }
+
sg_free_append_table(&umem->sgt_append);
}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3a5f81402d2f..11b1a194de44 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -148,6 +148,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
case IB_RATE_400_GBPS: return 160;
case IB_RATE_600_GBPS: return 240;
case IB_RATE_800_GBPS: return 320;
+ case IB_RATE_1600_GBPS: return 640;
default: return -1;
}
}
@@ -178,6 +179,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
case 160: return IB_RATE_400_GBPS;
case 240: return IB_RATE_600_GBPS;
case 320: return IB_RATE_800_GBPS;
+ case 640: return IB_RATE_1600_GBPS;
default: return IB_RATE_PORT_CURRENT;
}
}
@@ -208,6 +210,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
case IB_RATE_400_GBPS: return 425000;
case IB_RATE_600_GBPS: return 637500;
case IB_RATE_800_GBPS: return 850000;
+ case IB_RATE_1600_GBPS: return 1700000;
default: return -1;
}
}
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index b706dc0d0263..c42b22ac3303 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -13,5 +13,6 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/
obj-$(CONFIG_INFINIBAND_QEDR) += qedr/
obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/
+obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re/
obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/
obj-$(CONFIG_INFINIBAND_IONIC) += ionic/
diff --git a/drivers/infiniband/hw/bng_re/Kconfig b/drivers/infiniband/hw/bng_re/Kconfig
new file mode 100644
index 000000000000..85845f72c64d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config INFINIBAND_BNG_RE
+ tristate "Broadcom Next generation RoCE HCA support"
+ depends on 64BIT
+ depends on INET && DCB && BNGE
+ help
+ This driver supports Broadcom Next generation
+ 50/100/200/400/800 gigabit RoCE HCAs. The module
+ will be called bng_re. To compile this driver
+ as a module, choose M here.
diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile
new file mode 100644
index 000000000000..c6aaaf853c77
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/drivers/infiniband/hw/bnxt_re
+
+obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o
+
+bng_re-y := bng_dev.o bng_fw.o \
+ bng_res.o bng_sp.o \
+ bng_debugfs.o
diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.c b/drivers/infiniband/hw/bng_re/bng_debugfs.c
new file mode 100644
index 000000000000..9ec5a8785250
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_debugfs.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/debugfs.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bnge.h"
+#include "bnge_auxr.h"
+#include "bng_re.h"
+#include "bng_debugfs.h"
+
+static struct dentry *bng_re_debugfs_root;
+
+void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev)
+{
+ struct pci_dev *pdev = rdev->aux_dev->pdev;
+
+ rdev->dbg_root =
+ debugfs_create_dir(dev_name(&pdev->dev), bng_re_debugfs_root);
+}
+
+void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev)
+{
+ debugfs_remove_recursive(rdev->dbg_root);
+ rdev->dbg_root = NULL;
+}
+
+void bng_re_register_debugfs(void)
+{
+ bng_re_debugfs_root = debugfs_create_dir("bng_re", NULL);
+}
+
+void bng_re_unregister_debugfs(void)
+{
+ debugfs_remove(bng_re_debugfs_root);
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.h b/drivers/infiniband/hw/bng_re/bng_debugfs.h
new file mode 100644
index 000000000000..baef71df4242
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_debugfs.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RE_DEBUGFS__
+#define __BNG_RE_DEBUGFS__
+
+void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev);
+void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev);
+
+void bng_re_register_debugfs(void);
+void bng_re_unregister_debugfs(void);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c
new file mode 100644
index 000000000000..d8f8d7f7075f
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_dev.c
@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/auxiliary_bus.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "bng_res.h"
+#include "bng_sp.h"
+#include "bng_fw.h"
+#include "bnge.h"
+#include "bnge_auxr.h"
+#include "bng_re.h"
+#include "bnge_hwrm.h"
+#include "bng_debugfs.h"
+
+MODULE_AUTHOR("Siva Reddy Kallam <siva.kallam@broadcom.com>");
+MODULE_DESCRIPTION(BNG_RE_DESC);
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct bng_re_dev *bng_re_dev_add(struct auxiliary_device *adev,
+ struct bnge_auxr_dev *aux_dev)
+{
+ struct bng_re_dev *rdev;
+
+ /* Allocate bng_re_dev instance */
+ rdev = ib_alloc_device(bng_re_dev, ibdev);
+ if (!rdev) {
+ pr_err("%s: bng_re_dev allocation failure!", KBUILD_MODNAME);
+ return NULL;
+ }
+
+ /* Assign auxiliary device specific data */
+ rdev->netdev = aux_dev->net;
+ rdev->aux_dev = aux_dev;
+ rdev->adev = adev;
+ rdev->fn_id = rdev->aux_dev->pdev->devfn;
+
+ return rdev;
+}
+
+
+static int bng_re_register_netdev(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev;
+
+ aux_dev = rdev->aux_dev;
+ return bnge_register_dev(aux_dev, rdev->adev);
+}
+
+static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev)
+{
+ struct bng_re_chip_ctx *chip_ctx;
+
+ if (!rdev->chip_ctx)
+ return;
+
+ kfree(rdev->dev_attr);
+ rdev->dev_attr = NULL;
+
+ chip_ctx = rdev->chip_ctx;
+ rdev->chip_ctx = NULL;
+ rdev->rcfw.res = NULL;
+ rdev->bng_res.cctx = NULL;
+ rdev->bng_res.pdev = NULL;
+ kfree(chip_ctx);
+}
+
+static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev)
+{
+ struct bng_re_chip_ctx *chip_ctx;
+ struct bnge_auxr_dev *aux_dev;
+ int rc = -ENOMEM;
+
+ aux_dev = rdev->aux_dev;
+ rdev->bng_res.pdev = aux_dev->pdev;
+ rdev->rcfw.res = &rdev->bng_res;
+ chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL);
+ if (!chip_ctx)
+ return -ENOMEM;
+ chip_ctx->chip_num = aux_dev->chip_num;
+ chip_ctx->hw_stats_size = aux_dev->hw_ring_stats_size;
+
+ rdev->chip_ctx = chip_ctx;
+ rdev->bng_res.cctx = rdev->chip_ctx;
+ rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL);
+ if (!rdev->dev_attr)
+ goto free_chip_ctx;
+ rdev->bng_res.dattr = rdev->dev_attr;
+
+ return 0;
+free_chip_ctx:
+ kfree(rdev->chip_ctx);
+ rdev->chip_ctx = NULL;
+ return rc;
+}
+
+static void bng_re_init_hwrm_hdr(struct input *hdr, u16 opcd)
+{
+ hdr->req_type = cpu_to_le16(opcd);
+ hdr->cmpl_ring = cpu_to_le16(-1);
+ hdr->target_id = cpu_to_le16(-1);
+}
+
+static void bng_re_fill_fw_msg(struct bnge_fw_msg *fw_msg, void *msg,
+ int msg_len, void *resp, int resp_max_len,
+ int timeout)
+{
+ fw_msg->msg = msg;
+ fw_msg->msg_len = msg_len;
+ fw_msg->resp = resp;
+ fw_msg->resp_max_len = resp_max_len;
+ fw_msg->timeout = timeout;
+}
+
+static int bng_re_net_ring_free(struct bng_re_dev *rdev,
+ u16 fw_ring_id, int type)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ring_free_input req = {};
+ struct hwrm_ring_free_output resp;
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!rdev)
+ return rc;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE);
+ req.ring_type = type;
+ req.ring_id = cpu_to_le16(fw_ring_id);
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc)
+ ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x",
+ req.ring_id, rc);
+ return rc;
+}
+
+static int bng_re_net_ring_alloc(struct bng_re_dev *rdev,
+ struct bng_re_ring_attr *ring_attr,
+ u16 *fw_ring_id)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ring_alloc_input req = {};
+ struct hwrm_ring_alloc_output resp;
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC);
+ req.enables = 0;
+ req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]);
+ if (ring_attr->pages > 1) {
+ /* Page size is in log2 units */
+ req.page_size = BNGE_PAGE_SHIFT;
+ req.page_tbl_depth = 1;
+ }
+ req.fbo = 0;
+ /* Association of ring index with doorbell index and MSIX number */
+ req.logical_id = cpu_to_le16(ring_attr->lrid);
+ req.length = cpu_to_le32(ring_attr->depth + 1);
+ req.ring_type = ring_attr->type;
+ req.int_mode = ring_attr->mode;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (!rc)
+ *fw_ring_id = le16_to_cpu(resp.ring_id);
+
+ return rc;
+}
+
+static int bng_re_stats_ctx_free(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_stat_ctx_free_input req = {};
+ struct hwrm_stat_ctx_free_output resp = {};
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE);
+ req.stat_ctx_id = cpu_to_le32(rdev->stats_ctx.fw_id);
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc)
+ ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x",
+ rc);
+
+ return rc;
+}
+
+static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct bng_re_stats *stats = &rdev->stats_ctx;
+ struct hwrm_stat_ctx_alloc_output resp = {};
+ struct hwrm_stat_ctx_alloc_input req = {};
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ stats->fw_id = BNGE_INVALID_STATS_CTX_ID;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC);
+ req.update_period_ms = cpu_to_le32(1000);
+ req.stats_dma_addr = cpu_to_le64(stats->dma_map);
+ req.stats_dma_length = cpu_to_le16(rdev->chip_ctx->hw_stats_size);
+ req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (!rc)
+ stats->fw_id = le32_to_cpu(resp.stat_ctx_id);
+ return rc;
+}
+
+static void bng_re_query_hwrm_version(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ver_get_output ver_get_resp = {};
+ struct hwrm_ver_get_input ver_get_req = {};
+ struct bng_re_chip_ctx *cctx;
+ struct bnge_fw_msg fw_msg = {};
+ int rc;
+
+ bng_re_init_hwrm_hdr((void *)&ver_get_req, HWRM_VER_GET);
+ ver_get_req.hwrm_intf_maj = HWRM_VERSION_MAJOR;
+ ver_get_req.hwrm_intf_min = HWRM_VERSION_MINOR;
+ ver_get_req.hwrm_intf_upd = HWRM_VERSION_UPDATE;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&ver_get_req, sizeof(ver_get_req),
+ (void *)&ver_get_resp, sizeof(ver_get_resp),
+ BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x",
+ rc);
+ return;
+ }
+
+ cctx = rdev->chip_ctx;
+ cctx->hwrm_intf_ver =
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_major) << 48 |
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_minor) << 32 |
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_build) << 16 |
+ le16_to_cpu(ver_get_resp.hwrm_intf_patch);
+
+ cctx->hwrm_cmd_max_timeout = le16_to_cpu(ver_get_resp.max_req_timeout);
+
+ if (!cctx->hwrm_cmd_max_timeout)
+ cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT;
+}
+
+static void bng_re_dev_uninit(struct bng_re_dev *rdev)
+{
+ int rc;
+ bng_re_debugfs_rem_pdev(rdev);
+
+ if (test_and_clear_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
+ rc = bng_re_deinit_rcfw(&rdev->rcfw);
+ if (rc)
+ ibdev_warn(&rdev->ibdev,
+ "Failed to deinitialize RCFW: %#x", rc);
+ bng_re_stats_ctx_free(rdev);
+ bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx);
+ bng_re_disable_rcfw_channel(&rdev->rcfw);
+ bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id,
+ RING_ALLOC_REQ_RING_TYPE_NQ);
+ bng_re_free_rcfw_channel(&rdev->rcfw);
+ }
+
+ kfree(rdev->nqr);
+ rdev->nqr = NULL;
+ bng_re_destroy_chip_ctx(rdev);
+ if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
+ bnge_unregister_dev(rdev->aux_dev);
+}
+
+static int bng_re_dev_init(struct bng_re_dev *rdev)
+{
+ struct bng_re_ring_attr rattr = {};
+ struct bng_re_creq_ctx *creq;
+ u32 db_offt;
+ int vid;
+ u8 type;
+ int rc;
+
+ /* Registered a new RoCE device instance to netdev */
+ rc = bng_re_register_netdev(rdev);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to register with netedev: %#x\n", rc);
+ return -EINVAL;
+ }
+
+ set_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+
+ if (rdev->aux_dev->auxr_info->msix_requested < BNG_RE_MIN_MSIX) {
+ ibdev_err(&rdev->ibdev,
+ "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n",
+ rdev->aux_dev->auxr_info->msix_requested);
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ return -EINVAL;
+ }
+ ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n",
+ rdev->aux_dev->auxr_info->msix_requested);
+
+ rc = bng_re_setup_chip_ctx(rdev);
+ if (rc) {
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ ibdev_err(&rdev->ibdev, "Failed to get chip context\n");
+ return -EINVAL;
+ }
+
+ bng_re_query_hwrm_version(rdev);
+
+ rc = bng_re_alloc_fw_channel(&rdev->bng_res, &rdev->rcfw);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate RCFW Channel: %#x\n", rc);
+ goto fail;
+ }
+
+ /* Allocate nq record memory */
+ rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL);
+ if (!rdev->nqr) {
+ bng_re_destroy_chip_ctx(rdev);
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ return -ENOMEM;
+ }
+
+ rdev->nqr->num_msix = rdev->aux_dev->auxr_info->msix_requested;
+ memcpy(rdev->nqr->msix_entries, rdev->aux_dev->msix_info,
+ sizeof(struct bnge_msix_info) * rdev->nqr->num_msix);
+
+ type = RING_ALLOC_REQ_RING_TYPE_NQ;
+ creq = &rdev->rcfw.creq;
+ rattr.dma_arr = creq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr;
+ rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count;
+ rattr.type = type;
+ rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+ rattr.depth = BNG_FW_CREQE_MAX_CNT - 1;
+ rattr.lrid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].ring_idx;
+ rc = bng_re_net_ring_alloc(rdev, &rattr, &creq->ring_id);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc);
+ goto free_rcfw;
+ }
+ db_offt = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].db_offset;
+ vid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].vector;
+
+ rc = bng_re_enable_fw_channel(&rdev->rcfw,
+ vid, db_offt);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n",
+ rc);
+ goto free_ring;
+ }
+
+ rc = bng_re_get_dev_attr(&rdev->rcfw);
+ if (rc)
+ goto disable_rcfw;
+
+ bng_re_debugfs_add_pdev(rdev);
+ rc = bng_re_alloc_stats_ctx_mem(rdev->bng_res.pdev, rdev->chip_ctx,
+ &rdev->stats_ctx);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate stats context: %#x\n", rc);
+ goto disable_rcfw;
+ }
+
+ rc = bng_re_stats_ctx_alloc(rdev);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate QPLIB context: %#x\n", rc);
+ goto free_stats_ctx;
+ }
+
+ rc = bng_re_init_rcfw(&rdev->rcfw, &rdev->stats_ctx);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to initialize RCFW: %#x\n", rc);
+ goto free_sctx;
+ }
+ set_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags);
+
+ return 0;
+free_sctx:
+ bng_re_stats_ctx_free(rdev);
+free_stats_ctx:
+ bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx);
+disable_rcfw:
+ bng_re_disable_rcfw_channel(&rdev->rcfw);
+free_ring:
+ bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type);
+free_rcfw:
+ bng_re_free_rcfw_channel(&rdev->rcfw);
+fail:
+ bng_re_dev_uninit(rdev);
+ return rc;
+}
+
+static int bng_re_add_device(struct auxiliary_device *adev)
+{
+ struct bnge_auxr_priv *auxr_priv =
+ container_of(adev, struct bnge_auxr_priv, aux_dev);
+ struct bng_re_en_dev_info *dev_info;
+ struct bng_re_dev *rdev;
+ int rc;
+
+ dev_info = auxiliary_get_drvdata(adev);
+
+ rdev = bng_re_dev_add(adev, auxr_priv->auxr_dev);
+ if (!rdev) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ dev_info->rdev = rdev;
+
+ rc = bng_re_dev_init(rdev);
+ if (rc)
+ goto re_dev_dealloc;
+
+ return 0;
+
+re_dev_dealloc:
+ ib_dealloc_device(&rdev->ibdev);
+exit:
+ return rc;
+}
+
+
+static void bng_re_remove_device(struct bng_re_dev *rdev,
+ struct auxiliary_device *aux_dev)
+{
+ bng_re_dev_uninit(rdev);
+ ib_dealloc_device(&rdev->ibdev);
+}
+
+
+static int bng_re_probe(struct auxiliary_device *adev,
+ const struct auxiliary_device_id *id)
+{
+ struct bnge_auxr_priv *aux_priv =
+ container_of(adev, struct bnge_auxr_priv, aux_dev);
+ struct bng_re_en_dev_info *en_info;
+ int rc;
+
+ en_info = kzalloc(sizeof(*en_info), GFP_KERNEL);
+ if (!en_info)
+ return -ENOMEM;
+
+ en_info->auxr_dev = aux_priv->auxr_dev;
+
+ auxiliary_set_drvdata(adev, en_info);
+
+ rc = bng_re_add_device(adev);
+ if (rc)
+ kfree(en_info);
+
+ return rc;
+}
+
+static void bng_re_remove(struct auxiliary_device *adev)
+{
+ struct bng_re_en_dev_info *dev_info = auxiliary_get_drvdata(adev);
+ struct bng_re_dev *rdev;
+
+ rdev = dev_info->rdev;
+
+ if (rdev)
+ bng_re_remove_device(rdev, adev);
+ kfree(dev_info);
+}
+
+static const struct auxiliary_device_id bng_re_id_table[] = {
+ { .name = BNG_RE_ADEV_NAME ".rdma", },
+ {},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, bng_re_id_table);
+
+static struct auxiliary_driver bng_re_driver = {
+ .name = "rdma",
+ .probe = bng_re_probe,
+ .remove = bng_re_remove,
+ .id_table = bng_re_id_table,
+};
+
+static int __init bng_re_mod_init(void)
+{
+ int rc;
+
+
+ bng_re_register_debugfs();
+
+ rc = auxiliary_driver_register(&bng_re_driver);
+ if (rc) {
+ pr_err("%s: Failed to register auxiliary driver\n",
+ KBUILD_MODNAME);
+ goto unreg_debugfs;
+ }
+ return 0;
+unreg_debugfs:
+ bng_re_unregister_debugfs();
+ return rc;
+}
+
+static void __exit bng_re_mod_exit(void)
+{
+ auxiliary_driver_unregister(&bng_re_driver);
+ bng_re_unregister_debugfs();
+}
+
+module_init(bng_re_mod_init);
+module_exit(bng_re_mod_exit);
diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c
new file mode 100644
index 000000000000..7d9539113cf5
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_fw.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/pci.h>
+
+#include "roce_hsi.h"
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bng_sp.h"
+
+/**
+ * bng_re_map_rc - map return type based on opcode
+ * @opcode: roce slow path opcode
+ *
+ * case #1
+ * Firmware initiated error recovery is a safe state machine and
+ * driver can consider all the underlying rdma resources are free.
+ * In this state, it is safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
+ *
+ * case #2
+ * If driver detect potential firmware stall, it is not safe state machine
+ * and the driver can not consider all the underlying rdma resources are
+ * freed.
+ * In this state, it is not safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
+ *
+ * Scope of this helper function is only for case #1.
+ *
+ * Returns:
+ * 0 to communicate success to caller.
+ * Non zero error code to communicate failure to caller.
+ */
+static int bng_re_map_rc(u8 opcode)
+{
+ switch (opcode) {
+ case CMDQ_BASE_OPCODE_DESTROY_QP:
+ case CMDQ_BASE_OPCODE_DESTROY_SRQ:
+ case CMDQ_BASE_OPCODE_DESTROY_CQ:
+ case CMDQ_BASE_OPCODE_DEALLOCATE_KEY:
+ case CMDQ_BASE_OPCODE_DEREGISTER_MR:
+ case CMDQ_BASE_OPCODE_DELETE_GID:
+ case CMDQ_BASE_OPCODE_DESTROY_QP1:
+ case CMDQ_BASE_OPCODE_DESTROY_AH:
+ case CMDQ_BASE_OPCODE_DEINITIALIZE_FW:
+ case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC:
+ case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE:
+ return 0;
+ default:
+ return -ETIMEDOUT;
+ }
+}
+
+void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw)
+{
+ kfree(rcfw->crsqe_tbl);
+ bng_re_free_hwq(rcfw->res, &rcfw->cmdq.hwq);
+ bng_re_free_hwq(rcfw->res, &rcfw->creq.hwq);
+ rcfw->pdev = NULL;
+}
+
+int bng_re_alloc_fw_channel(struct bng_re_res *res,
+ struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_hwq_attr hwq_attr = {};
+ struct bng_re_sg_info sginfo = {};
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_creq_ctx *creq;
+
+ rcfw->pdev = res->pdev;
+ cmdq = &rcfw->cmdq;
+ creq = &rcfw->creq;
+ rcfw->res = res;
+
+ sginfo.pgsize = PAGE_SIZE;
+ sginfo.pgshft = PAGE_SHIFT;
+
+ hwq_attr.sginfo = &sginfo;
+ hwq_attr.res = rcfw->res;
+ hwq_attr.depth = BNG_FW_CREQE_MAX_CNT;
+ hwq_attr.stride = BNG_FW_CREQE_UNITS;
+ hwq_attr.type = BNG_HWQ_TYPE_QUEUE;
+
+ if (bng_re_alloc_init_hwq(&creq->hwq, &hwq_attr)) {
+ dev_err(&rcfw->pdev->dev,
+ "HW channel CREQ allocation failed\n");
+ goto fail;
+ }
+
+ rcfw->cmdq_depth = BNG_FW_CMDQE_MAX_CNT;
+
+ sginfo.pgsize = bng_fw_cmdqe_page_size(rcfw->cmdq_depth);
+ hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF;
+ hwq_attr.stride = BNG_FW_CMDQE_UNITS;
+ hwq_attr.type = BNG_HWQ_TYPE_CTX;
+ if (bng_re_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) {
+ dev_err(&rcfw->pdev->dev,
+ "HW channel CMDQ allocation failed\n");
+ goto fail;
+ }
+
+ rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements,
+ sizeof(*rcfw->crsqe_tbl), GFP_KERNEL);
+ if (!rcfw->crsqe_tbl)
+ goto fail;
+
+ spin_lock_init(&rcfw->tbl_lock);
+
+ rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout;
+ return 0;
+
+fail:
+ bng_re_free_rcfw_channel(rcfw);
+ return -ENOMEM;
+}
+
+static int bng_re_process_qp_event(struct bng_re_rcfw *rcfw,
+ struct creq_qp_event *qp_event,
+ u32 *num_wait)
+{
+ struct bng_re_hwq *hwq = &rcfw->cmdq.hwq;
+ struct bng_re_crsqe *crsqe;
+ u32 req_size;
+ u16 cookie;
+ bool is_waiter_alive;
+ struct pci_dev *pdev;
+ u32 wait_cmds = 0;
+ int rc = 0;
+
+ pdev = rcfw->pdev;
+ switch (qp_event->event) {
+ case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
+ dev_err(&pdev->dev, "Received QP error notification\n");
+ break;
+ default:
+ /*
+ * Command Response
+ * cmdq->lock needs to be acquired to synchronie
+ * the command send and completion reaping. This function
+ * is always called with creq->lock held. Using
+ * the nested variant of spin_lock.
+ *
+ */
+
+ spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING);
+ cookie = le16_to_cpu(qp_event->cookie);
+ cookie &= BNG_FW_MAX_COOKIE_VALUE;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED,
+ &rcfw->cmdq.flags),
+ "Unreponsive rcfw channel detected.!!")) {
+ dev_info(&pdev->dev,
+ "rcfw timedout: cookie = %#x, free_slots = %d",
+ cookie, crsqe->free_slots);
+ spin_unlock(&hwq->lock);
+ return rc;
+ }
+
+ if (crsqe->is_waiter_alive) {
+ if (crsqe->resp) {
+ memcpy(crsqe->resp, qp_event, sizeof(*qp_event));
+ /* Insert write memory barrier to ensure that
+ * response data is copied before clearing the
+ * flags
+ */
+ smp_wmb();
+ }
+ }
+
+ wait_cmds++;
+
+ req_size = crsqe->req_size;
+ is_waiter_alive = crsqe->is_waiter_alive;
+
+ crsqe->req_size = 0;
+ if (!is_waiter_alive)
+ crsqe->resp = NULL;
+
+ crsqe->is_in_used = false;
+
+ hwq->cons += req_size;
+
+ spin_unlock(&hwq->lock);
+ }
+ *num_wait += wait_cmds;
+ return rc;
+}
+
+/* function events */
+static int bng_re_process_func_event(struct bng_re_rcfw *rcfw,
+ struct creq_func_event *func_event)
+{
+ switch (func_event->event) {
+ case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CQ_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TQM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TIM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST:
+ case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* CREQ Completion handlers */
+static void bng_re_service_creq(struct tasklet_struct *t)
+{
+ struct bng_re_rcfw *rcfw = from_tasklet(rcfw, t, creq.creq_tasklet);
+ struct bng_re_creq_ctx *creq = &rcfw->creq;
+ u32 type, budget = BNG_FW_CREQ_ENTRY_POLL_BUDGET;
+ struct bng_re_hwq *hwq = &creq->hwq;
+ struct creq_base *creqe;
+ u32 num_wakeup = 0;
+ u32 hw_polled = 0;
+
+ /* Service the CREQ until budget is over */
+ spin_lock_bh(&hwq->lock);
+ while (budget > 0) {
+ creqe = bng_re_get_qe(hwq, hwq->cons, NULL);
+ if (!BNG_FW_CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags))
+ break;
+ /* The valid test of the entry must be done first before
+ * reading any further.
+ */
+ dma_rmb();
+
+ type = creqe->type & CREQ_BASE_TYPE_MASK;
+ switch (type) {
+ case CREQ_BASE_TYPE_QP_EVENT:
+ bng_re_process_qp_event
+ (rcfw, (struct creq_qp_event *)creqe,
+ &num_wakeup);
+ creq->stats.creq_qp_event_processed++;
+ break;
+ case CREQ_BASE_TYPE_FUNC_EVENT:
+ if (!bng_re_process_func_event
+ (rcfw, (struct creq_func_event *)creqe))
+ creq->stats.creq_func_event_processed++;
+ else
+ dev_warn(&rcfw->pdev->dev,
+ "aeqe:%#x Not handled\n", type);
+ break;
+ default:
+ if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT)
+ dev_warn(&rcfw->pdev->dev,
+ "creqe with event 0x%x not handled\n",
+ type);
+ break;
+ }
+ budget--;
+ hw_polled++;
+ bng_re_hwq_incr_cons(hwq->max_elements, &hwq->cons,
+ 1, &creq->creq_db.dbinfo.flags);
+ }
+
+ if (hw_polled)
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo,
+ rcfw->res->cctx, true);
+ spin_unlock_bh(&hwq->lock);
+ if (num_wakeup)
+ wake_up_nr(&rcfw->cmdq.waitq, num_wakeup);
+}
+
+static int __send_message_basic_sanity(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg,
+ u8 opcode)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+
+ cmdq = &rcfw->cmdq;
+
+ if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
+ return -ETIMEDOUT;
+
+ if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+ opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
+ dev_err(&rcfw->pdev->dev, "RCFW already initialized!");
+ return -EINVAL;
+ }
+
+ if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+ (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
+ opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
+ opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
+ dev_err(&rcfw->pdev->dev,
+ "RCFW not initialized, reject opcode 0x%x",
+ opcode);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int __send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg, u8 opcode)
+{
+ u32 bsize, free_slots, required_slots;
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_crsqe *crsqe;
+ struct bng_fw_cmdqe *cmdqe;
+ struct bng_re_hwq *hwq;
+ u32 sw_prod, cmdq_prod;
+ struct pci_dev *pdev;
+ u16 cookie;
+ u8 *preq;
+
+ cmdq = &rcfw->cmdq;
+ hwq = &cmdq->hwq;
+ pdev = rcfw->pdev;
+
+ /* Cmdq are in 16-byte units, each request can consume 1 or more
+ * cmdqe
+ */
+ spin_lock_bh(&hwq->lock);
+ required_slots = bng_re_get_cmd_slots(msg->req);
+ free_slots = HWQ_FREE_SLOTS(hwq);
+ cookie = cmdq->seq_num & BNG_FW_MAX_COOKIE_VALUE;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ if (required_slots >= free_slots) {
+ dev_info_ratelimited(&pdev->dev,
+ "CMDQ is full req/free %d/%d!",
+ required_slots, free_slots);
+ spin_unlock_bh(&hwq->lock);
+ return -EAGAIN;
+ }
+ __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie));
+
+ bsize = bng_re_set_cmd_slots(msg->req);
+ crsqe->free_slots = free_slots;
+ crsqe->resp = (struct creq_qp_event *)msg->resp;
+ crsqe->is_waiter_alive = true;
+ crsqe->is_in_used = true;
+ crsqe->opcode = opcode;
+
+ crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz);
+ if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) {
+ struct bng_re_rcfw_sbuf *sbuf = msg->sb;
+
+ __set_cmdq_base_resp_addr(msg->req, msg->req_sz,
+ cpu_to_le64(sbuf->dma_addr));
+ __set_cmdq_base_resp_size(msg->req, msg->req_sz,
+ ALIGN(sbuf->size,
+ BNG_FW_CMDQE_UNITS) /
+ BNG_FW_CMDQE_UNITS);
+ }
+
+ preq = (u8 *)msg->req;
+ do {
+ /* Locate the next cmdq slot */
+ sw_prod = HWQ_CMP(hwq->prod, hwq);
+ cmdqe = bng_re_get_qe(hwq, sw_prod, NULL);
+ /* Copy a segment of the req cmd to the cmdq */
+ memset(cmdqe, 0, sizeof(*cmdqe));
+ memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe)));
+ preq += min_t(u32, bsize, sizeof(*cmdqe));
+ bsize -= min_t(u32, bsize, sizeof(*cmdqe));
+ hwq->prod++;
+ } while (bsize > 0);
+ cmdq->seq_num++;
+
+ cmdq_prod = hwq->prod & 0xFFFF;
+ if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) {
+ /* The very first doorbell write
+ * is required to set this flag
+ * which prompts the FW to reset
+ * its internal pointers
+ */
+ cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG);
+ clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+ }
+ /* ring CMDQ DB */
+ wmb();
+ writel(cmdq_prod, cmdq->cmdq_mbox.prod);
+ writel(BNG_FW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db);
+ spin_unlock_bh(&hwq->lock);
+ /* Return the CREQ response pointer */
+ return 0;
+}
+
+/**
+ * __wait_for_resp - Don't hold the cpu context and wait for response
+ * @rcfw: rcfw channel instance of rdev
+ * @cookie: cookie to track the command
+ *
+ * Wait for command completion in sleepable context.
+ *
+ * Returns:
+ * 0 if command is completed by firmware.
+ * Non zero error code for rest of the case.
+ */
+static int __wait_for_resp(struct bng_re_rcfw *rcfw, u16 cookie)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_crsqe *crsqe;
+
+ cmdq = &rcfw->cmdq;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ do {
+ wait_event_timeout(cmdq->waitq,
+ !crsqe->is_in_used,
+ secs_to_jiffies(rcfw->max_timeout));
+
+ if (!crsqe->is_in_used)
+ return 0;
+
+ bng_re_service_creq(&rcfw->creq.creq_tasklet);
+
+ if (!crsqe->is_in_used)
+ return 0;
+ } while (true);
+};
+
+/**
+ * bng_re_rcfw_send_message - interface to send
+ * and complete rcfw command.
+ * @rcfw: rcfw channel instance of rdev
+ * @msg: message to send
+ *
+ * This function does not account shadow queue depth. It will send
+ * all the command unconditionally as long as send queue is not full.
+ *
+ * Returns:
+ * 0 if command completed by firmware.
+ * Non zero if the command is not completed by firmware.
+ */
+int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg)
+{
+ struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp;
+ struct bng_re_crsqe *crsqe;
+ u16 cookie;
+ int rc;
+ u8 opcode;
+
+ opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz);
+
+ rc = __send_message_basic_sanity(rcfw, msg, opcode);
+ if (rc)
+ return rc == -ENXIO ? bng_re_map_rc(opcode) : rc;
+
+ rc = __send_message(rcfw, msg, opcode);
+ if (rc)
+ return rc;
+
+ cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz))
+ & BNG_FW_MAX_COOKIE_VALUE;
+
+ rc = __wait_for_resp(rcfw, cookie);
+
+ if (rc) {
+ spin_lock_bh(&rcfw->cmdq.hwq.lock);
+ crsqe = &rcfw->crsqe_tbl[cookie];
+ crsqe->is_waiter_alive = false;
+ if (rc == -ENODEV)
+ set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags);
+ spin_unlock_bh(&rcfw->cmdq.hwq.lock);
+ return -ETIMEDOUT;
+ }
+
+ if (evnt->status) {
+ /* failed with status */
+ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n",
+ cookie, opcode, evnt->status);
+ rc = -EIO;
+ }
+
+ return rc;
+}
+
+static int bng_re_map_cmdq_mbox(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_cmdq_mbox *mbox;
+ resource_size_t bar_reg;
+ struct pci_dev *pdev;
+
+ pdev = rcfw->pdev;
+ mbox = &rcfw->cmdq.cmdq_mbox;
+
+ mbox->reg.bar_id = BNG_FW_COMM_PCI_BAR_REGION;
+ mbox->reg.len = BNG_FW_COMM_SIZE;
+ mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id);
+ if (!mbox->reg.bar_base) {
+ dev_err(&pdev->dev,
+ "CMDQ BAR region %d resc start is 0!\n",
+ mbox->reg.bar_id);
+ return -ENOMEM;
+ }
+
+ bar_reg = mbox->reg.bar_base + BNG_FW_COMM_BASE_OFFSET;
+ mbox->reg.len = BNG_FW_COMM_SIZE;
+ mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len);
+ if (!mbox->reg.bar_reg) {
+ dev_err(&pdev->dev,
+ "CMDQ BAR region %d mapping failed\n",
+ mbox->reg.bar_id);
+ return -ENOMEM;
+ }
+
+ mbox->prod = (void __iomem *)(mbox->reg.bar_reg +
+ BNG_FW_PF_VF_COMM_PROD_OFFSET);
+ mbox->db = (void __iomem *)(mbox->reg.bar_reg + BNG_FW_COMM_TRIG_OFFSET);
+ return 0;
+}
+
+static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance)
+{
+ struct bng_re_rcfw *rcfw = dev_instance;
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_hwq *hwq;
+ u32 sw_cons;
+
+ creq = &rcfw->creq;
+ hwq = &creq->hwq;
+ /* Prefetch the CREQ element */
+ sw_cons = HWQ_CMP(hwq->cons, hwq);
+ bng_re_get_qe(hwq, sw_cons, NULL);
+
+ tasklet_schedule(&creq->creq_tasklet);
+ return IRQ_HANDLED;
+}
+
+int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector,
+ bool need_init)
+{
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_res *res;
+ int rc;
+
+ creq = &rcfw->creq;
+ res = rcfw->res;
+
+ if (creq->irq_handler_avail)
+ return -EFAULT;
+
+ creq->msix_vec = msix_vector;
+ if (need_init)
+ tasklet_setup(&creq->creq_tasklet, bng_re_service_creq);
+ else
+ tasklet_enable(&creq->creq_tasklet);
+
+ creq->irq_name = kasprintf(GFP_KERNEL, "bng_re-creq@pci:%s",
+ pci_name(res->pdev));
+ if (!creq->irq_name)
+ return -ENOMEM;
+ rc = request_irq(creq->msix_vec, bng_re_creq_irq, 0,
+ creq->irq_name, rcfw);
+ if (rc) {
+ kfree(creq->irq_name);
+ creq->irq_name = NULL;
+ tasklet_disable(&creq->creq_tasklet);
+ return rc;
+ }
+ creq->irq_handler_avail = true;
+
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true);
+ atomic_inc(&rcfw->rcfw_intr_enabled);
+
+ return 0;
+}
+
+static int bng_re_map_creq_db(struct bng_re_rcfw *rcfw, u32 reg_offt)
+{
+ struct bng_re_creq_db *creq_db;
+ resource_size_t bar_reg;
+ struct pci_dev *pdev;
+
+ pdev = rcfw->pdev;
+ creq_db = &rcfw->creq.creq_db;
+
+ creq_db->dbinfo.flags = 0;
+ creq_db->reg.bar_id = BNG_FW_COMM_CONS_PCI_BAR_REGION;
+ creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id);
+ if (!creq_db->reg.bar_id)
+ dev_err(&pdev->dev,
+ "CREQ BAR region %d resc start is 0!",
+ creq_db->reg.bar_id);
+
+ bar_reg = creq_db->reg.bar_base + reg_offt;
+
+ creq_db->reg.len = BNG_FW_CREQ_DB_LEN;
+ creq_db->reg.bar_reg = ioremap(bar_reg, creq_db->reg.len);
+ if (!creq_db->reg.bar_reg) {
+ dev_err(&pdev->dev,
+ "CREQ BAR region %d mapping failed",
+ creq_db->reg.bar_id);
+ return -ENOMEM;
+ }
+ creq_db->dbinfo.db = creq_db->reg.bar_reg;
+ creq_db->dbinfo.hwq = &rcfw->creq.hwq;
+ creq_db->dbinfo.xid = rcfw->creq.ring_id;
+ return 0;
+}
+
+void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill)
+{
+ struct bng_re_creq_ctx *creq;
+
+ creq = &rcfw->creq;
+
+ if (!creq->irq_handler_avail)
+ return;
+
+ creq->irq_handler_avail = false;
+ /* Mask h/w interrupts */
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false);
+ /* Sync with last running IRQ-handler */
+ synchronize_irq(creq->msix_vec);
+ free_irq(creq->msix_vec, rcfw);
+ kfree(creq->irq_name);
+ creq->irq_name = NULL;
+ atomic_set(&rcfw->rcfw_intr_enabled, 0);
+ if (kill)
+ tasklet_kill(&creq->creq_tasklet);
+ tasklet_disable(&creq->creq_tasklet);
+}
+
+void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_cmdq_ctx *cmdq;
+
+ creq = &rcfw->creq;
+ cmdq = &rcfw->cmdq;
+ /* Make sure the HW channel is stopped! */
+ bng_re_rcfw_stop_irq(rcfw, true);
+
+ iounmap(cmdq->cmdq_mbox.reg.bar_reg);
+ iounmap(creq->creq_db.reg.bar_reg);
+
+ cmdq->cmdq_mbox.reg.bar_reg = NULL;
+ creq->creq_db.reg.bar_reg = NULL;
+ creq->msix_vec = 0;
+}
+
+static void bng_re_start_rcfw(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_cmdq_mbox *mbox;
+ struct cmdq_init init = {0};
+
+ cmdq = &rcfw->cmdq;
+ creq = &rcfw->creq;
+ mbox = &cmdq->cmdq_mbox;
+
+ init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr[0]);
+ init.cmdq_size_cmdq_lvl =
+ cpu_to_le16(((rcfw->cmdq_depth <<
+ CMDQ_INIT_CMDQ_SIZE_SFT) &
+ CMDQ_INIT_CMDQ_SIZE_MASK) |
+ ((cmdq->hwq.level <<
+ CMDQ_INIT_CMDQ_LVL_SFT) &
+ CMDQ_INIT_CMDQ_LVL_MASK));
+ init.creq_ring_id = cpu_to_le16(creq->ring_id);
+ /* Write to the mailbox register */
+ __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4);
+}
+
+int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw,
+ int msix_vector,
+ int cp_bar_reg_off)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ int rc;
+
+ cmdq = &rcfw->cmdq;
+
+ /* Assign defaults */
+ cmdq->seq_num = 0;
+ set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+ init_waitqueue_head(&cmdq->waitq);
+
+ rc = bng_re_map_cmdq_mbox(rcfw);
+ if (rc)
+ return rc;
+
+ rc = bng_re_map_creq_db(rcfw, cp_bar_reg_off);
+ if (rc)
+ return rc;
+
+ rc = bng_re_rcfw_start_irq(rcfw, msix_vector, true);
+ if (rc) {
+ dev_err(&rcfw->pdev->dev,
+ "Failed to request IRQ for CREQ rc = 0x%x\n", rc);
+ bng_re_disable_rcfw_channel(rcfw);
+ return rc;
+ }
+
+ bng_re_start_rcfw(rcfw);
+ return 0;
+}
+
+int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw)
+{
+ struct creq_deinitialize_fw_resp resp = {};
+ struct cmdq_deinitialize_fw req = {};
+ struct bng_re_cmdqmsg msg = {};
+ int rc;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_DEINITIALIZE_FW,
+ sizeof(req));
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL,
+ sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return rc;
+
+ clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
+ return 0;
+}
+static inline bool _is_hw_retx_supported(u16 dev_cap_flags)
+{
+ return dev_cap_flags &
+ (CREQ_QUERY_FUNC_RESP_SB_HW_REQUESTER_RETX_ENABLED |
+ CREQ_QUERY_FUNC_RESP_SB_HW_RESPONDER_RETX_ENABLED);
+}
+
+#define BNG_RE_HW_RETX(a) _is_hw_retx_supported((a))
+static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2)
+{
+ return dev_cap_ext_flags2 &
+ CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED;
+}
+
+int bng_re_init_rcfw(struct bng_re_rcfw *rcfw,
+ struct bng_re_stats *stats_ctx)
+{
+ struct creq_initialize_fw_resp resp = {};
+ struct cmdq_initialize_fw req = {};
+ struct bng_re_cmdqmsg msg = {};
+ int rc;
+ u16 flags = 0;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_INITIALIZE_FW,
+ sizeof(req));
+ /* Supply (log-base-2-of-host-page-size - base-page-shift)
+ * to bono to adjust the doorbell page sizes.
+ */
+ req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT -
+ BNG_FW_DBR_BASE_PAGE_SHIFT);
+ if (BNG_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags))
+ flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED;
+ if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2))
+ flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED;
+ req.flags |= cpu_to_le16(flags);
+ req.stat_ctx_id = cpu_to_le32(stats_ctx->fw_id);
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return rc;
+ set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
+ return 0;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h
new file mode 100644
index 000000000000..c89c926ec2fc
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_fw.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_FW_H__
+#define __BNG_FW_H__
+
+#include "bng_tlv.h"
+
+/* FW DB related */
+#define BNG_FW_CMDQ_TRIG_VAL 1
+#define BNG_FW_COMM_PCI_BAR_REGION 0
+#define BNG_FW_COMM_CONS_PCI_BAR_REGION 2
+#define BNG_FW_DBR_BASE_PAGE_SHIFT 12
+#define BNG_FW_COMM_SIZE 0x104
+#define BNG_FW_COMM_BASE_OFFSET 0x600
+#define BNG_FW_COMM_TRIG_OFFSET 0x100
+#define BNG_FW_PF_VF_COMM_PROD_OFFSET 0xc
+#define BNG_FW_CREQ_DB_LEN 8
+
+/* CREQ */
+#define BNG_FW_CREQE_MAX_CNT (64 * 1024)
+#define BNG_FW_CREQE_UNITS 16
+#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100
+#define BNG_FW_CREQ_CMP_VALID(hdr, pass) \
+ (!!((hdr)->v & CREQ_BASE_V) == \
+ !((pass) & BNG_RE_FLAG_EPOCH_CONS_MASK))
+#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100
+
+/* CMDQ */
+struct bng_fw_cmdqe {
+ u8 data[16];
+};
+
+#define BNG_FW_CMDQE_MAX_CNT 8192
+#define BNG_FW_CMDQE_UNITS sizeof(struct bng_fw_cmdqe)
+#define BNG_FW_CMDQE_BYTES(depth) ((depth) * BNG_FW_CMDQE_UNITS)
+
+#define BNG_FW_MAX_COOKIE_VALUE (BNG_FW_CMDQE_MAX_CNT - 1)
+#define BNG_FW_CMD_IS_BLOCKING 0x8000
+
+/* Crsq buf is 1024-Byte */
+struct bng_re_crsbe {
+ u8 data[1024];
+};
+
+
+static inline u32 bng_fw_cmdqe_npages(u32 depth)
+{
+ u32 npages;
+
+ npages = BNG_FW_CMDQE_BYTES(depth) / PAGE_SIZE;
+ if (BNG_FW_CMDQE_BYTES(depth) % PAGE_SIZE)
+ npages++;
+ return npages;
+}
+
+static inline u32 bng_fw_cmdqe_page_size(u32 depth)
+{
+ return (bng_fw_cmdqe_npages(depth) * PAGE_SIZE);
+}
+struct bng_re_cmdq_mbox {
+ struct bng_re_reg_desc reg;
+ void __iomem *prod;
+ void __iomem *db;
+};
+
+/* HWQ */
+struct bng_re_cmdq_ctx {
+ struct bng_re_hwq hwq;
+ struct bng_re_cmdq_mbox cmdq_mbox;
+ unsigned long flags;
+#define FIRMWARE_INITIALIZED_FLAG (0)
+#define FIRMWARE_STALL_DETECTED (3)
+#define FIRMWARE_FIRST_FLAG (31)
+ wait_queue_head_t waitq;
+ u32 seq_num;
+};
+
+struct bng_re_creq_db {
+ struct bng_re_reg_desc reg;
+ struct bng_re_db_info dbinfo;
+};
+
+struct bng_re_creq_stat {
+ u64 creq_qp_event_processed;
+ u64 creq_func_event_processed;
+};
+
+struct bng_re_creq_ctx {
+ struct bng_re_hwq hwq;
+ struct bng_re_creq_db creq_db;
+ struct bng_re_creq_stat stats;
+ struct tasklet_struct creq_tasklet;
+ u16 ring_id;
+ int msix_vec;
+ bool irq_handler_avail;
+ char *irq_name;
+};
+
+struct bng_re_crsqe {
+ struct creq_qp_event *resp;
+ u32 req_size;
+ /* Free slots at the time of submission */
+ u32 free_slots;
+ u8 opcode;
+ bool is_waiter_alive;
+ bool is_in_used;
+};
+
+struct bng_re_rcfw_sbuf {
+ void *sb;
+ dma_addr_t dma_addr;
+ u32 size;
+};
+
+/* RoCE FW Communication Channels */
+struct bng_re_rcfw {
+ struct pci_dev *pdev;
+ struct bng_re_res *res;
+ struct bng_re_cmdq_ctx cmdq;
+ struct bng_re_creq_ctx creq;
+ struct bng_re_crsqe *crsqe_tbl;
+ /* To synchronize the qp-handle hash table */
+ spinlock_t tbl_lock;
+ u32 cmdq_depth;
+ /* cached from chip cctx for quick reference in slow path */
+ u16 max_timeout;
+ atomic_t rcfw_intr_enabled;
+};
+
+struct bng_re_cmdqmsg {
+ struct cmdq_base *req;
+ struct creq_base *resp;
+ void *sb;
+ u32 req_sz;
+ u32 res_sz;
+ u8 block;
+};
+
+static inline void bng_re_rcfw_cmd_prep(struct cmdq_base *req,
+ u8 opcode, u8 cmd_size)
+{
+ req->opcode = opcode;
+ req->cmd_size = cmd_size;
+}
+
+static inline void bng_re_fill_cmdqmsg(struct bng_re_cmdqmsg *msg,
+ void *req, void *resp, void *sb,
+ u32 req_sz, u32 res_sz, u8 block)
+{
+ msg->req = req;
+ msg->resp = resp;
+ msg->sb = sb;
+ msg->req_sz = req_sz;
+ msg->res_sz = res_sz;
+ msg->block = block;
+}
+
+/* Get the number of command units required for the req. The
+ * function returns correct value only if called before
+ * setting using bng_re_set_cmd_slots
+ */
+static inline u32 bng_re_get_cmd_slots(struct cmdq_base *req)
+{
+ u32 cmd_units = 0;
+
+ if (HAS_TLV_HEADER(req)) {
+ struct roce_tlv *tlv_req = (struct roce_tlv *)req;
+
+ cmd_units = tlv_req->total_size;
+ } else {
+ cmd_units = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) /
+ BNG_FW_CMDQE_UNITS;
+ }
+
+ return cmd_units;
+}
+
+static inline u32 bng_re_set_cmd_slots(struct cmdq_base *req)
+{
+ u32 cmd_byte = 0;
+
+ if (HAS_TLV_HEADER(req)) {
+ struct roce_tlv *tlv_req = (struct roce_tlv *)req;
+
+ cmd_byte = tlv_req->total_size * BNG_FW_CMDQE_UNITS;
+ } else {
+ cmd_byte = req->cmd_size;
+ req->cmd_size = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) /
+ BNG_FW_CMDQE_UNITS;
+ }
+
+ return cmd_byte;
+}
+
+void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw);
+int bng_re_alloc_fw_channel(struct bng_re_res *res,
+ struct bng_re_rcfw *rcfw);
+int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw,
+ int msix_vector,
+ int cp_bar_reg_off);
+void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw);
+int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector,
+ bool need_init);
+void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill);
+int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg);
+int bng_re_init_rcfw(struct bng_re_rcfw *rcfw,
+ struct bng_re_stats *stats_ctx);
+int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h
new file mode 100644
index 000000000000..dae4862621a7
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_re.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RE_H__
+#define __BNG_RE_H__
+
+#include "bng_res.h"
+
+#define BNG_RE_ADEV_NAME "bng_en"
+
+#define BNG_RE_DESC "Broadcom 800G RoCE Driver"
+
+#define rdev_to_dev(rdev) ((rdev) ? (&(rdev)->ibdev.dev) : NULL)
+
+#define BNG_RE_MIN_MSIX 2
+#define BNG_RE_MAX_MSIX BNGE_MAX_ROCE_MSIX
+
+#define BNG_RE_CREQ_NQ_IDX 0
+
+#define BNGE_INVALID_STATS_CTX_ID -1
+/* NQ specific structures */
+struct bng_re_nq_db {
+ struct bng_re_reg_desc reg;
+ struct bng_re_db_info dbinfo;
+};
+
+struct bng_re_nq {
+ struct pci_dev *pdev;
+ struct bng_re_res *res;
+ char *name;
+ struct bng_re_hwq hwq;
+ struct bng_re_nq_db nq_db;
+ u16 ring_id;
+ int msix_vec;
+ cpumask_t mask;
+ struct tasklet_struct nq_tasklet;
+ bool requested;
+ int budget;
+ u32 load;
+
+ struct workqueue_struct *cqn_wq;
+};
+
+struct bng_re_nq_record {
+ struct bnge_msix_info msix_entries[BNG_RE_MAX_MSIX];
+ struct bng_re_nq nq[BNG_RE_MAX_MSIX];
+ int num_msix;
+ /* serialize NQ access */
+ struct mutex load_lock;
+};
+
+struct bng_re_en_dev_info {
+ struct bng_re_dev *rdev;
+ struct bnge_auxr_dev *auxr_dev;
+};
+
+struct bng_re_ring_attr {
+ dma_addr_t *dma_arr;
+ int pages;
+ int type;
+ u32 depth;
+ u32 lrid; /* Logical ring id */
+ u8 mode;
+};
+
+struct bng_re_dev {
+ struct ib_device ibdev;
+ unsigned long flags;
+#define BNG_RE_FLAG_NETDEV_REGISTERED 0
+#define BNG_RE_FLAG_RCFW_CHANNEL_EN 1
+ struct net_device *netdev;
+ struct auxiliary_device *adev;
+ struct bnge_auxr_dev *aux_dev;
+ struct bng_re_chip_ctx *chip_ctx;
+ int fn_id;
+ struct bng_re_res bng_res;
+ struct bng_re_rcfw rcfw;
+ struct bng_re_nq_record *nqr;
+ /* Device Resources */
+ struct bng_re_dev_attr *dev_attr;
+ struct dentry *dbg_root;
+ struct bng_re_stats stats_ctx;
+};
+
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c
new file mode 100644
index 000000000000..c50823758b53
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_res.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/bnxt/hsi.h>
+#include "bng_res.h"
+#include "roce_hsi.h"
+
+/* Stats */
+void bng_re_free_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_stats *stats)
+{
+ if (stats->dma) {
+ dma_free_coherent(&pdev->dev, stats->size,
+ stats->dma, stats->dma_map);
+ }
+ memset(stats, 0, sizeof(*stats));
+ stats->fw_id = -1;
+}
+
+int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_chip_ctx *cctx,
+ struct bng_re_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->fw_id = -1;
+ stats->size = cctx->hw_stats_size;
+ stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
+ &stats->dma_map, GFP_KERNEL);
+ if (!stats->dma)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void bng_free_pbl(struct bng_re_res *res, struct bng_re_pbl *pbl)
+{
+ struct pci_dev *pdev = res->pdev;
+ int i;
+
+ for (i = 0; i < pbl->pg_count; i++) {
+ if (pbl->pg_arr[i])
+ dma_free_coherent(&pdev->dev, pbl->pg_size,
+ (void *)((unsigned long)
+ pbl->pg_arr[i] &
+ PAGE_MASK),
+ pbl->pg_map_arr[i]);
+ else
+ dev_warn(&pdev->dev,
+ "PBL free pg_arr[%d] empty?!\n", i);
+ pbl->pg_arr[i] = NULL;
+ }
+
+ vfree(pbl->pg_arr);
+ pbl->pg_arr = NULL;
+ vfree(pbl->pg_map_arr);
+ pbl->pg_map_arr = NULL;
+ pbl->pg_count = 0;
+ pbl->pg_size = 0;
+}
+
+static int bng_alloc_pbl(struct bng_re_res *res,
+ struct bng_re_pbl *pbl,
+ struct bng_re_sg_info *sginfo)
+{
+ struct pci_dev *pdev = res->pdev;
+ u32 pages;
+ int i;
+
+ if (sginfo->nopte)
+ return 0;
+ pages = sginfo->npages;
+
+ /* page ptr arrays */
+ pbl->pg_arr = vmalloc_array(pages, sizeof(void *));
+ if (!pbl->pg_arr)
+ return -ENOMEM;
+
+ pbl->pg_map_arr = vmalloc_array(pages, sizeof(dma_addr_t));
+ if (!pbl->pg_map_arr) {
+ vfree(pbl->pg_arr);
+ pbl->pg_arr = NULL;
+ return -ENOMEM;
+ }
+ pbl->pg_count = 0;
+ pbl->pg_size = sginfo->pgsize;
+
+ for (i = 0; i < pages; i++) {
+ pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
+ pbl->pg_size,
+ &pbl->pg_map_arr[i],
+ GFP_KERNEL);
+ if (!pbl->pg_arr[i])
+ goto fail;
+ pbl->pg_count++;
+ }
+
+ return 0;
+fail:
+ bng_free_pbl(res, pbl);
+ return -ENOMEM;
+}
+
+void bng_re_free_hwq(struct bng_re_res *res,
+ struct bng_re_hwq *hwq)
+{
+ int i;
+
+ if (!hwq->max_elements)
+ return;
+ if (hwq->level >= BNG_PBL_LVL_MAX)
+ return;
+
+ for (i = 0; i < hwq->level + 1; i++)
+ bng_free_pbl(res, &hwq->pbl[i]);
+
+ hwq->level = BNG_PBL_LVL_MAX;
+ hwq->max_elements = 0;
+ hwq->element_size = 0;
+ hwq->prod = 0;
+ hwq->cons = 0;
+}
+
+/* All HWQs are power of 2 in size */
+int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq,
+ struct bng_re_hwq_attr *hwq_attr)
+{
+ u32 npages, pg_size;
+ struct bng_re_sg_info sginfo = {};
+ u32 depth, stride, npbl, npde;
+ dma_addr_t *src_phys_ptr, **dst_virt_ptr;
+ struct bng_re_res *res;
+ struct pci_dev *pdev;
+ int i, rc, lvl;
+
+ res = hwq_attr->res;
+ pdev = res->pdev;
+ pg_size = hwq_attr->sginfo->pgsize;
+ hwq->level = BNG_PBL_LVL_MAX;
+
+ depth = roundup_pow_of_two(hwq_attr->depth);
+ stride = roundup_pow_of_two(hwq_attr->stride);
+
+ npages = (depth * stride) / pg_size;
+ if ((depth * stride) % pg_size)
+ npages++;
+ if (!npages)
+ return -EINVAL;
+ hwq_attr->sginfo->npages = npages;
+
+ if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) {
+ /* This request is Level 0, map PTE */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_0;
+ goto done;
+ }
+
+ if (npages >= MAX_PBL_LVL_0_PGS) {
+ if (npages > MAX_PBL_LVL_1_PGS) {
+ u32 flag = PTU_PTE_VALID;
+ /* 2 levels of indirection */
+ npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+ npbl++;
+ npde = npbl >> MAX_PDL_LVL_SHIFT;
+ if (npbl % BIT(MAX_PDL_LVL_SHIFT))
+ npde++;
+ /* Alloc PDE pages */
+ sginfo.pgsize = npde * pg_size;
+ sginfo.npages = 1;
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo);
+ if (rc)
+ goto fail;
+
+ /* Alloc PBL pages */
+ sginfo.npages = npbl;
+ sginfo.pgsize = PAGE_SIZE;
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], &sginfo);
+ if (rc)
+ goto fail;
+ /* Fill PDL with PBL page pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++)
+ dst_virt_ptr[0][i] = src_phys_ptr[i] | flag;
+
+ /* Alloc or init PTEs */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_2],
+ hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_2;
+ if (hwq_attr->sginfo->nopte)
+ goto done;
+ /* Fill PBLs with PTE pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_1].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_2].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_2].pg_count; i++) {
+ dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+ src_phys_ptr[i] | PTU_PTE_VALID;
+ }
+ if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) {
+ /* Find the last pg of the size */
+ i = hwq->pbl[BNG_PBL_LVL_2].pg_count;
+ dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
+ PTU_PTE_LAST;
+ if (i > 1)
+ dst_virt_ptr[PTR_PG(i - 2)]
+ [PTR_IDX(i - 2)] |=
+ PTU_PTE_NEXT_TO_LAST;
+ }
+ } else { /* pages < 512 npbl = 1, npde = 0 */
+ u32 flag = PTU_PTE_VALID;
+
+ /* 1 level of indirection */
+ npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+ npbl++;
+ sginfo.npages = npbl;
+ sginfo.pgsize = PAGE_SIZE;
+ /* Alloc PBL page */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo);
+ if (rc)
+ goto fail;
+ /* Alloc or init PTEs */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1],
+ hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_1;
+ if (hwq_attr->sginfo->nopte)
+ goto done;
+ /* Fill PBL with PTE pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++)
+ dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+ src_phys_ptr[i] | flag;
+ if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) {
+ /* Find the last pg of the size */
+ i = hwq->pbl[BNG_PBL_LVL_1].pg_count;
+ dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
+ PTU_PTE_LAST;
+ if (i > 1)
+ dst_virt_ptr[PTR_PG(i - 2)]
+ [PTR_IDX(i - 2)] |=
+ PTU_PTE_NEXT_TO_LAST;
+ }
+ }
+ }
+done:
+ hwq->prod = 0;
+ hwq->cons = 0;
+ hwq->pdev = pdev;
+ hwq->depth = hwq_attr->depth;
+ hwq->max_elements = hwq->depth;
+ hwq->element_size = stride;
+ hwq->qe_ppg = pg_size / stride;
+ /* For direct access to the elements */
+ lvl = hwq->level;
+ if (hwq_attr->sginfo->nopte && hwq->level)
+ lvl = hwq->level - 1;
+ hwq->pbl_ptr = hwq->pbl[lvl].pg_arr;
+ hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr;
+ spin_lock_init(&hwq->lock);
+
+ return 0;
+fail:
+ bng_re_free_hwq(res, hwq);
+ return -ENOMEM;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h
new file mode 100644
index 000000000000..9997f86d6a0e
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_res.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RES_H__
+#define __BNG_RES_H__
+
+#include "roce_hsi.h"
+
+#define BNG_ROCE_FW_MAX_TIMEOUT 60
+
+#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *))
+#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1)
+#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG)
+#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG)
+
+#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1))
+#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \
+ ((HWQ_CMP(hwq->prod, hwq)\
+ - HWQ_CMP(hwq->cons, hwq))\
+ & (hwq->max_elements - 1)))
+
+#define MAX_PBL_LVL_0_PGS 1
+#define MAX_PBL_LVL_1_PGS 512
+#define MAX_PBL_LVL_1_PGS_SHIFT 9
+#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256
+#define MAX_PBL_LVL_2_PGS (256 * 512)
+#define MAX_PDL_LVL_SHIFT 9
+
+#define BNG_RE_DBR_VALID (0x1UL << 26)
+#define BNG_RE_DBR_EPOCH_SHIFT 24
+#define BNG_RE_DBR_TOGGLE_SHIFT 25
+
+#define BNG_MAX_TQM_ALLOC_REQ 48
+
+struct bng_re_reg_desc {
+ u8 bar_id;
+ resource_size_t bar_base;
+ unsigned long offset;
+ void __iomem *bar_reg;
+ size_t len;
+};
+
+struct bng_re_db_info {
+ void __iomem *db;
+ void __iomem *priv_db;
+ struct bng_re_hwq *hwq;
+ u32 xid;
+ u32 max_slot;
+ u32 flags;
+ u8 toggle;
+};
+
+enum bng_re_db_info_flags_mask {
+ BNG_RE_FLAG_EPOCH_CONS_SHIFT = 0x0UL,
+ BNG_RE_FLAG_EPOCH_PROD_SHIFT = 0x1UL,
+ BNG_RE_FLAG_EPOCH_CONS_MASK = 0x1UL,
+ BNG_RE_FLAG_EPOCH_PROD_MASK = 0x2UL,
+};
+
+enum bng_re_db_epoch_flag_shift {
+ BNG_RE_DB_EPOCH_CONS_SHIFT = BNG_RE_DBR_EPOCH_SHIFT,
+ BNG_RE_DB_EPOCH_PROD_SHIFT = (BNG_RE_DBR_EPOCH_SHIFT - 1),
+};
+
+struct bng_re_chip_ctx {
+ u16 chip_num;
+ u16 hw_stats_size;
+ u64 hwrm_intf_ver;
+ u16 hwrm_cmd_max_timeout;
+};
+
+struct bng_re_pbl {
+ u32 pg_count;
+ u32 pg_size;
+ void **pg_arr;
+ dma_addr_t *pg_map_arr;
+};
+
+enum bng_re_pbl_lvl {
+ BNG_PBL_LVL_0,
+ BNG_PBL_LVL_1,
+ BNG_PBL_LVL_2,
+ BNG_PBL_LVL_MAX
+};
+
+enum bng_re_hwq_type {
+ BNG_HWQ_TYPE_CTX,
+ BNG_HWQ_TYPE_QUEUE
+};
+
+struct bng_re_sg_info {
+ u32 npages;
+ u32 pgshft;
+ u32 pgsize;
+ bool nopte;
+};
+
+struct bng_re_hwq_attr {
+ struct bng_re_res *res;
+ struct bng_re_sg_info *sginfo;
+ enum bng_re_hwq_type type;
+ u32 depth;
+ u32 stride;
+ u32 aux_stride;
+ u32 aux_depth;
+};
+
+struct bng_re_hwq {
+ struct pci_dev *pdev;
+ /* lock to protect hwq */
+ spinlock_t lock;
+ struct bng_re_pbl pbl[BNG_PBL_LVL_MAX + 1];
+ /* Valid values: 0, 1, 2 */
+ enum bng_re_pbl_lvl level;
+ /* PBL entries */
+ void **pbl_ptr;
+ /* PBL dma_addr */
+ dma_addr_t *pbl_dma_ptr;
+ u32 max_elements;
+ u32 depth;
+ u16 element_size;
+ u32 prod;
+ u32 cons;
+ /* queue entry per page */
+ u16 qe_ppg;
+};
+
+struct bng_re_stats {
+ dma_addr_t dma_map;
+ void *dma;
+ u32 size;
+ u32 fw_id;
+};
+
+struct bng_re_res {
+ struct pci_dev *pdev;
+ struct bng_re_chip_ctx *cctx;
+ struct bng_re_dev_attr *dattr;
+};
+
+static inline void *bng_re_get_qe(struct bng_re_hwq *hwq,
+ u32 indx, u64 *pg)
+{
+ u32 pg_num, pg_idx;
+
+ pg_num = (indx / hwq->qe_ppg);
+ pg_idx = (indx % hwq->qe_ppg);
+ if (pg)
+ *pg = (u64)&hwq->pbl_ptr[pg_num];
+ return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx);
+}
+
+#define BNG_RE_INIT_DBHDR(xid, type, indx, toggle) \
+ (((u64)(((xid) & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | \
+ (type) | BNG_RE_DBR_VALID) << 32) | (indx) | \
+ (((u32)(toggle)) << (BNG_RE_DBR_TOGGLE_SHIFT)))
+
+static inline void bng_re_ring_db(struct bng_re_db_info *info,
+ u32 type)
+{
+ u64 key = 0;
+ u32 indx;
+ u8 toggle = 0;
+
+ if (type == DBC_DBC_TYPE_CQ_ARMALL ||
+ type == DBC_DBC_TYPE_CQ_ARMSE)
+ toggle = info->toggle;
+
+ indx = (info->hwq->cons & DBC_DBC_INDEX_MASK) |
+ ((info->flags & BNG_RE_FLAG_EPOCH_CONS_MASK) <<
+ BNG_RE_DB_EPOCH_CONS_SHIFT);
+
+ key = BNG_RE_INIT_DBHDR(info->xid, type, indx, toggle);
+ writeq(key, info->db);
+}
+
+static inline void bng_re_ring_nq_db(struct bng_re_db_info *info,
+ struct bng_re_chip_ctx *cctx,
+ bool arm)
+{
+ u32 type;
+
+ type = arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+ bng_re_ring_db(info, type);
+}
+
+static inline void bng_re_hwq_incr_cons(u32 max_elements, u32 *cons, u32 cnt,
+ u32 *dbinfo_flags)
+{
+ /* move cons and update toggle/epoch if wrap around */
+ *cons += cnt;
+ if (*cons >= max_elements) {
+ *cons %= max_elements;
+ *dbinfo_flags ^= 1UL << BNG_RE_FLAG_EPOCH_CONS_SHIFT;
+ }
+}
+
+static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2)
+{
+ return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED);
+}
+
+void bng_re_free_hwq(struct bng_re_res *res,
+ struct bng_re_hwq *hwq);
+
+int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq,
+ struct bng_re_hwq_attr *hwq_attr);
+
+void bng_re_free_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_stats *stats);
+
+int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_chip_ctx *cctx,
+ struct bng_re_stats *stats);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_sp.c b/drivers/infiniband/hw/bng_re/bng_sp.c
new file mode 100644
index 000000000000..83099e05328d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_sp.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bng_sp.h"
+#include "bng_tlv.h"
+
+static bool bng_re_is_atomic_cap(struct bng_re_rcfw *rcfw)
+{
+ u16 pcie_ctl2 = 0;
+
+ pcie_capability_read_word(rcfw->pdev, PCI_EXP_DEVCTL2, &pcie_ctl2);
+ return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ);
+}
+
+static void bng_re_query_version(struct bng_re_rcfw *rcfw,
+ char *fw_ver)
+{
+ struct creq_query_version_resp resp = {};
+ struct bng_re_cmdqmsg msg = {};
+ struct cmdq_query_version req = {};
+ int rc;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_QUERY_VERSION,
+ sizeof(req));
+
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return;
+ fw_ver[0] = resp.fw_maj;
+ fw_ver[1] = resp.fw_minor;
+ fw_ver[2] = resp.fw_bld;
+ fw_ver[3] = resp.fw_rsvd;
+}
+
+int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_dev_attr *attr = rcfw->res->dattr;
+ struct creq_query_func_resp resp = {};
+ struct bng_re_cmdqmsg msg = {};
+ struct creq_query_func_resp_sb *sb;
+ struct bng_re_rcfw_sbuf sbuf;
+ struct cmdq_query_func req = {};
+ u8 *tqm_alloc;
+ int i, rc;
+ u32 temp;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_QUERY_FUNC,
+ sizeof(req));
+
+ sbuf.size = ALIGN(sizeof(*sb), BNG_FW_CMDQE_UNITS);
+ sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size,
+ &sbuf.dma_addr, GFP_KERNEL);
+ if (!sbuf.sb)
+ return -ENOMEM;
+ sb = sbuf.sb;
+ req.resp_size = sbuf.size / BNG_FW_CMDQE_UNITS;
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req),
+ sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ goto bail;
+ /* Extract the context from the side buffer */
+ attr->max_qp = le32_to_cpu(sb->max_qp);
+ /* max_qp value reported by FW doesn't include the QP1 */
+ attr->max_qp += 1;
+ attr->max_qp_rd_atom =
+ sb->max_qp_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ?
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom;
+ attr->max_qp_init_rd_atom =
+ sb->max_qp_init_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ?
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom;
+ attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1;
+
+ /* Adjust for max_qp_wqes for variable wqe */
+ attr->max_qp_wqes = min_t(u32, attr->max_qp_wqes, BNG_VAR_MAX_WQE - 1);
+
+ attr->max_qp_sges = min_t(u32, sb->max_sge_var_wqe, BNG_VAR_MAX_SGE);
+ attr->max_cq = le32_to_cpu(sb->max_cq);
+ attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
+ attr->max_cq_sges = attr->max_qp_sges;
+ attr->max_mr = le32_to_cpu(sb->max_mr);
+ attr->max_mw = le32_to_cpu(sb->max_mw);
+
+ attr->max_mr_size = le64_to_cpu(sb->max_mr_size);
+ attr->max_pd = 64 * 1024;
+ attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp);
+ attr->max_ah = le32_to_cpu(sb->max_ah);
+
+ attr->max_srq = le16_to_cpu(sb->max_srq);
+ attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
+ attr->max_srq_sges = sb->max_srq_sge;
+ attr->max_pkey = 1;
+ attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+ /*
+ * Read the max gid supported by HW.
+ * For each entry in HW GID in HW table, we consume 2
+ * GID entries in the kernel GID table. So max_gid reported
+ * to stack can be up to twice the value reported by the HW, up to 256 gids.
+ */
+ attr->max_sgid = le32_to_cpu(sb->max_gid);
+ attr->max_sgid = min_t(u32, BNG_RE_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid);
+ attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags);
+ attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2);
+
+ if (_is_max_srq_ext_supported(attr->dev_cap_flags2))
+ attr->max_srq += le16_to_cpu(sb->max_srq_ext);
+
+ bng_re_query_version(rcfw, attr->fw_ver);
+ for (i = 0; i < BNG_MAX_TQM_ALLOC_REQ / 4; i++) {
+ temp = le32_to_cpu(sb->tqm_alloc_reqs[i]);
+ tqm_alloc = (u8 *)&temp;
+ attr->tqm_alloc_reqs[i * 4] = *tqm_alloc;
+ attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc);
+ attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc);
+ attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc);
+ }
+
+ attr->max_dpi = le32_to_cpu(sb->max_dpi);
+ attr->is_atomic = bng_re_is_atomic_cap(rcfw);
+bail:
+ dma_free_coherent(&rcfw->pdev->dev, sbuf.size,
+ sbuf.sb, sbuf.dma_addr);
+ return rc;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_sp.h b/drivers/infiniband/hw/bng_re/bng_sp.h
new file mode 100644
index 000000000000..e15190515ed1
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_sp.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_SP_H__
+#define __BNG_SP_H__
+
+#include "bng_fw.h"
+
+#define BNG_VAR_MAX_WQE 4352
+#define BNG_VAR_MAX_SGE 13
+
+struct bng_re_dev_attr {
+#define FW_VER_ARR_LEN 4
+ u8 fw_ver[FW_VER_ARR_LEN];
+#define BNG_RE_NUM_GIDS_SUPPORTED 256
+ u16 max_sgid;
+ u16 max_mrw;
+ u32 max_qp;
+#define BNG_RE_MAX_OUT_RD_ATOM 126
+ u32 max_qp_rd_atom;
+ u32 max_qp_init_rd_atom;
+ u32 max_qp_wqes;
+ u32 max_qp_sges;
+ u32 max_cq;
+ u32 max_cq_wqes;
+ u32 max_cq_sges;
+ u32 max_mr;
+ u64 max_mr_size;
+ u32 max_pd;
+ u32 max_mw;
+ u32 max_raw_ethy_qp;
+ u32 max_ah;
+ u32 max_srq;
+ u32 max_srq_wqes;
+ u32 max_srq_sges;
+ u32 max_pkey;
+ u32 max_inline_data;
+ u32 l2_db_size;
+ u8 tqm_alloc_reqs[BNG_MAX_TQM_ALLOC_REQ];
+ bool is_atomic;
+ u16 dev_cap_flags;
+ u16 dev_cap_flags2;
+ u32 max_dpi;
+};
+
+int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_tlv.h b/drivers/infiniband/hw/bng_re/bng_tlv.h
new file mode 100644
index 000000000000..278f4922962d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_tlv.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+#ifndef __BNG_TLV_H__
+#define __BNG_TLV_H__
+
+#include "roce_hsi.h"
+
+struct roce_tlv {
+ struct tlv tlv;
+ u8 total_size; // in units of 16 byte chunks
+ u8 unused[7]; // for 16 byte alignment
+};
+
+/*
+ * TLV size in units of 16 byte chunks
+ */
+#define TLV_SIZE ((sizeof(struct roce_tlv) + 15) / 16)
+/*
+ * TLV length in bytes
+ */
+#define TLV_BYTES (TLV_SIZE * 16)
+
+#define HAS_TLV_HEADER(msg) (le16_to_cpu(((struct tlv *)(msg))->cmd_discr) == CMD_DISCR_TLV_ENCAP)
+#define GET_TLV_DATA(tlv) ((void *)&((uint8_t *)(tlv))[TLV_BYTES])
+
+static inline u8 __get_cmdq_base_opcode(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->opcode;
+ else
+ return req->opcode;
+}
+
+static inline void __set_cmdq_base_opcode(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->opcode = val;
+ else
+ req->opcode = val;
+}
+
+static inline __le16 __get_cmdq_base_cookie(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->cookie;
+ else
+ return req->cookie;
+}
+
+static inline void __set_cmdq_base_cookie(struct cmdq_base *req,
+ u32 size, __le16 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->cookie = val;
+ else
+ req->cookie = val;
+}
+
+static inline __le64 __get_cmdq_base_resp_addr(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr;
+ else
+ return req->resp_addr;
+}
+
+static inline void __set_cmdq_base_resp_addr(struct cmdq_base *req,
+ u32 size, __le64 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr = val;
+ else
+ req->resp_addr = val;
+}
+
+static inline u8 __get_cmdq_base_resp_size(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size;
+ else
+ return req->resp_size;
+}
+
+static inline void __set_cmdq_base_resp_size(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size = val;
+ else
+ req->resp_size = val;
+}
+
+static inline u8 __get_cmdq_base_cmd_size(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct roce_tlv *)(req))->total_size;
+ else
+ return req->cmd_size;
+}
+
+static inline void __set_cmdq_base_cmd_size(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->cmd_size = val;
+ else
+ req->cmd_size = val;
+}
+
+static inline __le16 __get_cmdq_base_flags(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->flags;
+ else
+ return req->flags;
+}
+
+static inline void __set_cmdq_base_flags(struct cmdq_base *req,
+ u32 size, __le16 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->flags = val;
+ else
+ req->flags = val;
+}
+
+#endif /* __BNG_TLV_H__ */
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 3485e495ac6a..3a7ce4729fcf 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -224,6 +224,8 @@ struct bnxt_re_dev {
struct workqueue_struct *dcb_wq;
struct dentry *cc_config;
struct bnxt_re_dbg_cc_config_params *cc_config_params;
+ struct dentry *cq_coal_cfg;
+ struct bnxt_re_dbg_cq_coal_params *cq_coal_cfg_params;
#define BNXT_VPD_FLD_LEN 32
char board_partno[BNXT_VPD_FLD_LEN];
/* RoCE mirror */
diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c
index be5e9b5ca2f0..88817c86ae24 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.c
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.c
@@ -23,6 +23,14 @@
static struct dentry *bnxt_re_debugfs_root;
+static const char * const bnxt_re_cq_coal_str[] = {
+ "buf_maxtime",
+ "normal_maxbuf",
+ "during_maxbuf",
+ "en_ring_idle_mode",
+ "enable",
+};
+
static const char * const bnxt_re_cc_gen0_name[] = {
"enable_cc",
"run_avg_weight_g",
@@ -349,6 +357,123 @@ static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev)
debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops);
}
+static ssize_t cq_coal_cfg_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct seq_file *s = file->private_data;
+ struct bnxt_re_cq_coal_param *param = s->private;
+ struct bnxt_re_dev *rdev = param->rdev;
+ int offset = param->offset;
+ char lbuf[16] = { };
+ u32 val;
+
+ if (count > sizeof(lbuf))
+ return -EINVAL;
+
+ if (copy_from_user(lbuf, buf, count))
+ return -EFAULT;
+
+ lbuf[sizeof(lbuf) - 1] = '\0';
+
+ if (kstrtou32(lbuf, 0, &val))
+ return -EINVAL;
+
+ switch (offset) {
+ case BNXT_RE_COAL_CQ_BUF_MAXTIME:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME)
+ return -EINVAL;
+ rdev->cq_coalescing.buf_maxtime = val;
+ break;
+ case BNXT_RE_COAL_CQ_NORMAL_MAXBUF:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF)
+ return -EINVAL;
+ rdev->cq_coalescing.normal_maxbuf = val;
+ break;
+ case BNXT_RE_COAL_CQ_DURING_MAXBUF:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF)
+ return -EINVAL;
+ rdev->cq_coalescing.during_maxbuf = val;
+ break;
+ case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE:
+ if (val > BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE)
+ return -EINVAL;
+ rdev->cq_coalescing.en_ring_idle_mode = val;
+ break;
+ case BNXT_RE_COAL_CQ_ENABLE:
+ if (val > 1)
+ return -EINVAL;
+ rdev->cq_coalescing.enable = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return count;
+}
+
+static int cq_coal_cfg_show(struct seq_file *s, void *unused)
+{
+ struct bnxt_re_cq_coal_param *param = s->private;
+ struct bnxt_re_dev *rdev = param->rdev;
+ int offset = param->offset;
+ u32 val = 0;
+
+ switch (offset) {
+ case BNXT_RE_COAL_CQ_BUF_MAXTIME:
+ val = rdev->cq_coalescing.buf_maxtime;
+ break;
+ case BNXT_RE_COAL_CQ_NORMAL_MAXBUF:
+ val = rdev->cq_coalescing.normal_maxbuf;
+ break;
+ case BNXT_RE_COAL_CQ_DURING_MAXBUF:
+ val = rdev->cq_coalescing.during_maxbuf;
+ break;
+ case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE:
+ val = rdev->cq_coalescing.en_ring_idle_mode;
+ break;
+ case BNXT_RE_COAL_CQ_ENABLE:
+ val = rdev->cq_coalescing.enable;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ seq_printf(s, "%u\n", val);
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(cq_coal_cfg);
+
+static void bnxt_re_cleanup_cq_coal_debugfs(struct bnxt_re_dev *rdev)
+{
+ debugfs_remove_recursive(rdev->cq_coal_cfg);
+ kfree(rdev->cq_coal_cfg_params);
+}
+
+static void bnxt_re_init_cq_coal_debugfs(struct bnxt_re_dev *rdev)
+{
+ struct bnxt_re_dbg_cq_coal_params *dbg_cq_coal_params;
+ int i;
+
+ if (!_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2))
+ return;
+
+ dbg_cq_coal_params = kzalloc(sizeof(*dbg_cq_coal_params), GFP_KERNEL);
+ if (!dbg_cq_coal_params)
+ return;
+
+ rdev->cq_coal_cfg = debugfs_create_dir("cq_coal_cfg", rdev->dbg_root);
+ rdev->cq_coal_cfg_params = dbg_cq_coal_params;
+
+ for (i = 0; i < BNXT_RE_COAL_CQ_MAX; i++) {
+ dbg_cq_coal_params->params[i].offset = i;
+ dbg_cq_coal_params->params[i].rdev = rdev;
+ debugfs_create_file(bnxt_re_cq_coal_str[i],
+ 0600, rdev->cq_coal_cfg,
+ &dbg_cq_coal_params->params[i],
+ &cq_coal_cfg_fops);
+ }
+}
+
void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev)
{
struct pci_dev *pdev = rdev->en_dev->pdev;
@@ -374,10 +499,13 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev)
rdev->cc_config, tmp_params,
&bnxt_re_cc_config_ops);
}
+
+ bnxt_re_init_cq_coal_debugfs(rdev);
}
void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev)
{
+ bnxt_re_cleanup_cq_coal_debugfs(rdev);
debugfs_remove_recursive(rdev->qp_debugfs);
debugfs_remove_recursive(rdev->cc_config);
kfree(rdev->cc_config_params);
diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h
index 8f101df4e838..98f4620ef245 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.h
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.h
@@ -33,4 +33,23 @@ struct bnxt_re_cc_param {
struct bnxt_re_dbg_cc_config_params {
struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0];
};
+
+struct bnxt_re_cq_coal_param {
+ struct bnxt_re_dev *rdev;
+ u32 offset;
+};
+
+enum bnxt_re_cq_coal_types {
+ BNXT_RE_COAL_CQ_BUF_MAXTIME,
+ BNXT_RE_COAL_CQ_NORMAL_MAXBUF,
+ BNXT_RE_COAL_CQ_DURING_MAXBUF,
+ BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE,
+ BNXT_RE_COAL_CQ_ENABLE,
+ BNXT_RE_COAL_CQ_MAX
+
+};
+
+struct bnxt_re_dbg_cq_coal_params {
+ struct bnxt_re_cq_coal_param params[BNXT_RE_COAL_CQ_MAX];
+};
#endif
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 84ce3fce2826..f19b55c13d58 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -601,7 +601,8 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
mr->qplib_mr.va = (u64)(unsigned long)fence->va;
mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES;
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL,
- BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE);
+ BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE,
+ _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags));
if (rc) {
ibdev_err(&rdev->ibdev, "Failed to register fence-MR\n");
goto fail;
@@ -4027,7 +4028,7 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags)
mr->qplib_mr.hwq.level = PBL_LVL_MAX;
mr->qplib_mr.total_size = -1; /* Infinte length */
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, 0,
- PAGE_SIZE);
+ PAGE_SIZE, false);
if (rc)
goto fail_mr;
@@ -4257,7 +4258,8 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64
umem_pgs = ib_umem_num_dma_blocks(umem, page_size);
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem,
- umem_pgs, page_size);
+ umem_pgs, page_size,
+ _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags));
if (rc) {
ibdev_err(&rdev->ibdev, "Failed to register user MR - rc = %d\n", rc);
rc = -EIO;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index b13810572c2e..73003ad25ee8 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1453,6 +1453,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev,
atomic_set(&rdev->stats.res.pd_count, 0);
rdev->cosq[0] = 0xFFFF;
rdev->cosq[1] = 0xFFFF;
+ rdev->cq_coalescing.enable = 1;
rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME;
if (bnxt_re_chip_gen_p7(en_dev->chip_num)) {
rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index ce90d3d834d4..c88f049136fc 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -2226,7 +2226,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
req.cq_handle = cpu_to_le64(cq->cq_handle);
req.cq_size = cpu_to_le32(cq->max_wqe);
- if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) {
+ if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2) &&
+ cq->coalescing->enable) {
req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID);
coalescing |= ((cq->coalescing->buf_maxtime <<
CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) &
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index b990d0c0ce1a..1b414a73b46d 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -395,6 +395,7 @@ struct bnxt_qplib_cq_coal_param {
u8 normal_maxbuf;
u8 during_maxbuf;
u8 en_ring_idle_mode;
+ u8 enable;
};
#define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 9ef581ed785c..408a34df2667 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -162,7 +162,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw)
attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
attr->max_srq_sges = sb->max_srq_sge;
attr->max_pkey = 1;
- attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+ attr->max_inline_data = attr->max_qp_sges * sizeof(struct sq_sge);
if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx))
attr->l2_db_size = (sb->l2_db_space_size + 1) *
(0x01 << RCFW_DBR_BASE_PAGE_SHIFT);
@@ -578,7 +578,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
}
int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
- struct ib_umem *umem, int num_pbls, u32 buf_pg_size)
+ struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct bnxt_qplib_hwq_attr hwq_attr = {};
@@ -640,7 +640,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
req.access = (mr->access_flags & BNXT_QPLIB_MR_ACCESS_MASK);
req.va = cpu_to_le64(mr->va);
req.key = cpu_to_le32(mr->lkey);
- if (_is_alloc_mr_unified(res->dattr->dev_cap_flags))
+ if (unified_mr)
req.key = cpu_to_le32(mr->pd->id);
req.flags = cpu_to_le16(mr->flags);
req.mr_size = cpu_to_le64(mr->total_size);
@@ -651,7 +651,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
if (rc)
goto fail;
- if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) {
+ if (unified_mr) {
mr->lkey = le32_to_cpu(resp.xid);
mr->rkey = mr->lkey;
}
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 147b5d9c0313..5a45c55c6464 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -341,7 +341,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
bool block);
int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
- struct ib_umem *umem, int num_pbls, u32 buf_pg_size);
+ struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr);
int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr);
int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res,
struct bnxt_qplib_mrw *mr, int max);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index dcdfe250bdbe..adeed7447e7b 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -348,7 +348,7 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
{
int err;
- pr_debug("*pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+ pr_debug("*pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
pbl_addr, rdev->lldi.vr->pbl.start,
pbl_size);
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index b35f92e7d865..e4aef102dac0 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -745,8 +745,8 @@ static int create_workqueues(struct hfi1_devdata *dd)
ppd->hfi1_wq =
alloc_workqueue(
"hfi%d_%d",
- WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
- WQ_MEM_RECLAIM,
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+ WQ_PERCPU,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
dd->unit, pidx);
if (!ppd->hfi1_wq)
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
index 370a5a8eaa71..6e0e3458d202 100644
--- a/drivers/infiniband/hw/hfi1/opfn.c
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -305,8 +305,8 @@ void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
int opfn_init(void)
{
opfn_wq = alloc_workqueue("hfi_opfn",
- WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
- WQ_MEM_RECLAIM,
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+ WQ_PERCPU,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
if (!opfn_wq)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index baf592e6f21b..d07ef02c5231 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -4,11 +4,13 @@
#
ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common
ccflags-y += -I $(src)
hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \
- hns_roce_debugfs.o hns_roce_hw_v2.o
+ hns_roce_debugfs.o hns_roce_hw_v2.o hns_roce_bond.o
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 307c35888b30..0c1c32d23c88 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "hns_roce_device.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
new file mode 100644
index 000000000000..cc85f3ce1f3e
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -0,0 +1,1012 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#include <net/lag.h>
+#include <net/bonding.h>
+#include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
+
+static DEFINE_XARRAY(roce_bond_xa);
+
+static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev)
+{
+ struct ib_device *ibdev =
+ ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS);
+
+ if (!ibdev)
+ return NULL;
+
+ return container_of(ibdev, struct hns_roce_dev, ib_dev);
+}
+
+static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev)
+{
+ struct net_device *upper_dev;
+
+ rcu_read_lock();
+ upper_dev = netdev_master_upper_dev_get_rcu(net_dev);
+ dev_hold(upper_dev);
+ rcu_read_unlock();
+
+ return upper_dev;
+}
+
+static int get_netdev_bond_slave_id(struct net_device *net_dev,
+ struct hns_roce_bond_group *bond_grp)
+{
+ int i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++)
+ if (net_dev == bond_grp->bond_func_info[i].net_dev)
+ return i;
+
+ return -ENOENT;
+}
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ struct hns_roce_bond_group *bond_grp;
+ struct net_device *upper_dev = NULL;
+ int i;
+
+ if (!die_info)
+ return NULL;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return bond_grp;
+ if (bond_grp->upper_dev) {
+ upper_dev = get_upper_dev_from_ndev(net_dev);
+ if (bond_grp->upper_dev == upper_dev) {
+ dev_put(upper_dev);
+ return bond_grp;
+ }
+ dev_put(upper_dev);
+ }
+ }
+
+ return NULL;
+}
+
+static int hns_roce_set_bond_netdev(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ struct net_device *active_dev;
+ struct net_device *old_dev;
+ int i, ret = 0;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ rcu_read_lock();
+ active_dev =
+ bond_option_active_slave_get_rcu(netdev_priv(bond_grp->upper_dev));
+ rcu_read_unlock();
+ } else {
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ active_dev = bond_grp->bond_func_info[i].net_dev;
+ if (active_dev &&
+ ib_get_curr_port_state(active_dev) == IB_PORT_ACTIVE)
+ break;
+ }
+ }
+
+ if (!active_dev || i == ROCE_BOND_FUNC_MAX)
+ active_dev = get_hr_netdev(hr_dev, 0);
+
+ old_dev = ib_device_get_netdev(&hr_dev->ib_dev, 1);
+ if (old_dev == active_dev)
+ goto out;
+
+ ret = ib_device_set_netdev(&hr_dev->ib_dev, active_dev, 1);
+ if (ret) {
+ dev_err(hr_dev->dev, "failed to set netdev for bond.\n");
+ goto out;
+ }
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ if (old_dev)
+ roce_del_all_netdev_gids(&hr_dev->ib_dev, 1, old_dev);
+ rdma_roce_rescan_port(&hr_dev->ib_dev, 1);
+ }
+out:
+ dev_put(old_dev);
+ return ret;
+}
+
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED &&
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return true;
+
+ return false;
+}
+
+static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u32 active_slave_map = 0;
+ u8 active_slave_num = 0;
+ bool active;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if (!net_dev || !(bond_grp->slave_map & (1U << i)))
+ continue;
+
+ active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ?
+ net_lag_port_dev_txable(net_dev) :
+ (ib_get_curr_port_state(net_dev) == IB_PORT_ACTIVE);
+ if (active) {
+ active_slave_num++;
+ active_slave_map |= (1U << i);
+ }
+ }
+
+ bond_grp->active_slave_num = active_slave_num;
+ bond_grp->active_slave_map = active_slave_map;
+}
+
+static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ bond_grp->main_hr_dev = hr_dev;
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+}
+
+static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx)
+{
+ struct hnae3_handle *handle;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle->priv)
+ hns_roce_bond_uninit_client(bond_grp, func_idx);
+}
+
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch);
+
+static int switch_main_dev(struct hns_roce_bond_group *bond_grp,
+ u8 main_func_idx)
+{
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ u8 i;
+
+ bond_grp->main_hr_dev = NULL;
+ hns_roce_bond_uninit_client(bond_grp, main_func_idx);
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if ((bond_grp->slave_map & (1U << i)) && net_dev) {
+ /* In case this slave is still being registered as
+ * a non-bonded PF, uninit it first and then re-init
+ * it as the main device.
+ */
+ hns_roce_slave_uninit(bond_grp, i);
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev)
+ return -ENODEV;
+
+ return 0;
+}
+
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch)
+{
+ struct hns_roce_dev *hr_dev = NULL;
+ struct hnae3_handle *handle;
+ u8 main_func_idx;
+ int ret;
+
+ if (need_switch) {
+ main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ if (func_idx == main_func_idx) {
+ ret = switch_main_dev(bond_grp, main_func_idx);
+ if (ret == -ENODEV)
+ return NULL;
+ }
+ }
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle) {
+ if (handle->priv)
+ return handle->priv;
+ /* Prevent this device from being initialized as a bond device */
+ if (need_switch)
+ bond_grp->bond_func_info[func_idx].net_dev = NULL;
+ hr_dev = hns_roce_bond_init_client(bond_grp, func_idx);
+ if (!hr_dev)
+ BOND_ERR_LOG("failed to init slave %u.\n", func_idx);
+ }
+
+ return hr_dev;
+}
+
+static struct hns_roce_die_info *alloc_die_info(int bus_num)
+{
+ struct hns_roce_die_info *die_info;
+ int ret;
+
+ die_info = kzalloc(sizeof(*die_info), GFP_KERNEL);
+ if (!die_info)
+ return NULL;
+
+ ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL));
+ if (ret) {
+ kfree(die_info);
+ return NULL;
+ }
+
+ mutex_init(&die_info->die_mutex);
+
+ return die_info;
+}
+
+static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num)
+{
+ mutex_destroy(&die_info->die_mutex);
+ xa_erase(&roce_bond_xa, bus_num);
+ kfree(die_info);
+}
+
+static int alloc_bond_id(struct hns_roce_bond_group *bond_grp)
+{
+ u8 bus_num = bond_grp->bus_num;
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ int i;
+
+ if (!die_info) {
+ die_info = alloc_die_info(bus_num);
+ if (!die_info)
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ if (die_info->bond_id_mask & BOND_ID(i))
+ continue;
+
+ die_info->bond_id_mask |= BOND_ID(i);
+ die_info->bgrps[i] = bond_grp;
+ bond_grp->bond_id = i;
+
+ return 0;
+ }
+
+ return -ENOSPC;
+}
+
+static int remove_bond_id(int bus_num, u8 bond_id)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+
+ if (bond_id >= ROCE_BOND_NUM_MAX)
+ return -EINVAL;
+
+ if (!die_info)
+ return -ENODEV;
+
+ die_info->bond_id_mask &= ~BOND_ID(bond_id);
+ die_info->bgrps[bond_id] = NULL;
+ if (!die_info->bond_id_mask)
+ dealloc_die_info(die_info, bus_num);
+
+ return 0;
+}
+
+static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)
+{
+ struct hns_roce_dev *hr_dev;
+ int ret;
+ int i;
+
+ for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) {
+ if (bond_grp->slave_map & (1 << i))
+ hns_roce_slave_uninit(bond_grp, i);
+ }
+
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ bond_grp->main_hr_dev = NULL;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1 << i)) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE set bond finished!\n");
+ }
+}
+
+static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp)
+{
+ u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ struct hns_roce_dev *hr_dev;
+ u8 i;
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED)
+ goto out;
+
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->main_hr_dev = NULL;
+
+ hns_roce_slave_uninit(bond_grp, main_func_idx);
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev)
+ bond_grp->main_hr_dev = hr_dev;
+ }
+
+out:
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGESTATE)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "failed to change RoCE bond slave state, ret = %d.\n",
+ ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave changestate finished!\n");
+}
+
+static void hns_roce_slave_change_num(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1U << i)) {
+ if (i == PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn))
+ continue;
+ hns_roce_slave_uninit(bond_grp, i);
+ } else {
+ hns_roce_slave_init(bond_grp, i, true);
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ bond_grp->bond_func_info[i].net_dev = NULL;
+ bond_grp->bond_func_info[i].handle = NULL;
+ }
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to change RoCE bond slave num, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGE_NUM)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave change num finished!\n");
+ }
+}
+
+static void hns_roce_bond_info_update_nolock(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct hns_roce_v2_priv *priv;
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ int func_idx;
+
+ bond_grp->slave_map = 0;
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ func_idx = get_netdev_bond_slave_id(net_dev, bond_grp);
+ if (func_idx < 0) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ continue;
+ func_idx = PCI_FUNC(hr_dev->pci_dev->devfn);
+ if (!bond_grp->bond_func_info[func_idx].net_dev) {
+ priv = hr_dev->priv;
+ bond_grp->bond_func_info[func_idx].net_dev =
+ net_dev;
+ bond_grp->bond_func_info[func_idx].handle =
+ priv->handle;
+ }
+ ib_device_put(&hr_dev->ib_dev);
+ }
+
+ bond_grp->slave_map |= (1 << func_idx);
+ }
+ rcu_read_unlock();
+}
+
+static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ bool ret = true;
+
+ if (!hr_dev) {
+ if (bond_grp &&
+ get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return true;
+ else
+ return false;
+ }
+
+ if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) {
+ ret = false;
+ goto out;
+ }
+
+ if (hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0) {
+ ret = false;
+ goto out;
+ }
+
+ if (bond_grp->bus_num != get_hr_bus_num(hr_dev))
+ ret = false;
+
+out:
+ ib_device_put(&hr_dev->ib_dev);
+ return ret;
+}
+
+static bool check_slave_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ if (is_dev_bond_supported(bond_grp, net_dev)) {
+ slave_num++;
+ continue;
+ }
+ rcu_read_unlock();
+ return false;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX);
+}
+
+static void hns_roce_bond_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work = to_delayed_work(work);
+ struct hns_roce_bond_group *bond_grp =
+ container_of(delayed_work, struct hns_roce_bond_group,
+ bond_work);
+ enum hns_roce_bond_state bond_state;
+ bool bond_ready;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_ready = check_slave_support(bond_grp, bond_grp->upper_dev);
+ hns_roce_bond_info_update_nolock(bond_grp, bond_grp->upper_dev);
+ bond_state = bond_grp->bond_state;
+ bond_grp->bond_ready = bond_ready;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "bond work: bond_ready - %d, bond_state - %d.\n",
+ bond_ready, bond_state);
+
+ if (!bond_ready) {
+ hns_roce_clear_bond(bond_grp);
+ return;
+ }
+
+ switch (bond_state) {
+ case HNS_ROCE_BOND_NOT_BONDED:
+ hns_roce_set_bond(bond_grp);
+ /* In set_bond flow, we don't need to set bond netdev here as
+ * it has been done when bond_grp->main_hr_dev is registered.
+ */
+ return;
+ case HNS_ROCE_BOND_SLAVE_CHANGESTATE:
+ hns_roce_slave_changestate(bond_grp);
+ break;
+ case HNS_ROCE_BOND_SLAVE_CHANGE_NUM:
+ hns_roce_slave_change_num(bond_grp);
+ break;
+ default:
+ return;
+ }
+ hns_roce_set_bond_netdev(bond_grp, bond_grp->main_hr_dev);
+}
+
+static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev,
+ struct net_device *upper_dev)
+{
+ bond_grp->upper_dev = upper_dev;
+ bond_grp->main_hr_dev = hr_dev;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->bond_ready = false;
+}
+
+static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ cancel_delayed_work(&bond_grp->bond_work);
+ bond_grp->upper_dev = NULL;
+ bond_grp->main_hr_dev = NULL;
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
+ bond_grp->slave_map = 0;
+ memset(bond_grp->bond_func_info, 0, sizeof(bond_grp->bond_func_info));
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ ret = bond_grp->main_hr_dev ?
+ hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO;
+ if (ret)
+ BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE clear bond finished!\n");
+
+ hns_roce_detach_bond_grp(bond_grp);
+}
+
+static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_bond_group *bond_grp_tmp;
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bond_grp->bus_num);
+ return bond_grp_tmp == bond_grp;
+}
+
+static void lowerstate_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ if (bond_grp->bond_ready &&
+ bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE;
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+static bool hns_roce_bond_lowerstate_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+
+ if (!netif_is_lag_port(net_dev))
+ return false;
+
+ if (!lowerstate_event_filter(bond_grp, net_dev))
+ return false;
+
+ lowerstate_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info)
+{
+ if (!bond_info)
+ return false;
+
+ if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
+ bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
+ return false;
+
+ if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH &&
+ bond_info->hash_type > NETDEV_LAG_HASH_L23)
+ return false;
+
+ return true;
+}
+
+static void upper_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netdev_lag_upper_info *bond_upper_info = NULL;
+ bool slave_inc = info->linking;
+
+ if (slave_inc)
+ bond_upper_info = info->upper_info;
+
+ if (bond_upper_info) {
+ bond_grp->tx_type = bond_upper_info->tx_type;
+ bond_grp->hash_type = bond_upper_info->hash_type;
+ }
+}
+
+static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) {
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ slave_num++;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1);
+}
+
+static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ if (!is_bond_setting_supported(bond_info))
+ return false;
+
+ return check_slave_support(bond_grp, upper_dev);
+}
+
+static enum bond_support_type
+ check_bond_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ bool bond_grp_exist = false;
+ bool support;
+
+ if (upper_dev == bond_grp->upper_dev)
+ bond_grp_exist = true;
+
+ if (!info->linking && !bond_grp_exist)
+ return BOND_NOT_SUPPORT;
+
+ if (info->linking)
+ support = check_linking_bond_support(info->upper_info, bond_grp,
+ upper_dev);
+ else
+ support = check_unlinking_bond_support(bond_grp);
+
+ if (support)
+ return BOND_SUPPORT;
+
+ return bond_grp_exist ? BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT;
+}
+
+static bool upper_event_filter(struct netdev_notifier_changeupper_info *info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct net_device *upper_dev = info->upper_dev;
+ struct hns_roce_bond_group *bond_grp_tmp;
+ struct hns_roce_dev *hr_dev;
+ bool ret = true;
+ u8 bus_num;
+
+ if (!info->linking ||
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return bond_grp->upper_dev == upper_dev;
+
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ return false;
+
+ bus_num = get_hr_bus_num(hr_dev);
+ if (bond_grp->bus_num != bus_num) {
+ ret = false;
+ goto out;
+ }
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp_tmp && bond_grp_tmp != bond_grp)
+ ret = false;
+out:
+ ib_device_put(&hr_dev->ib_dev);
+ return ret;
+}
+
+static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+ struct net_device *upper_dev = info->upper_dev;
+ enum bond_support_type support = BOND_SUPPORT;
+ struct hns_roce_dev *hr_dev;
+ int slave_id;
+
+ if (!upper_dev || !netif_is_lag_master(upper_dev))
+ return false;
+
+ if (!upper_event_filter(info, bond_grp, net_dev))
+ return false;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ support = check_bond_support(bond_grp, upper_dev, info);
+ if (support == BOND_NOT_SUPPORT) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_ATTACHED) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+ hns_roce_attach_bond_grp(bond_grp, hr_dev, upper_dev);
+ ib_device_put(&hr_dev->ib_dev);
+ }
+
+ /* In the case of netdev being unregistered, the roce
+ * instance shouldn't be inited.
+ */
+ if (net_dev->reg_state >= NETREG_UNREGISTERING) {
+ slave_id = get_netdev_bond_slave_id(net_dev, bond_grp);
+ if (slave_id >= 0) {
+ bond_grp->bond_func_info[slave_id].net_dev = NULL;
+ bond_grp->bond_func_info[slave_id].handle = NULL;
+ }
+ }
+
+ if (support == BOND_SUPPORT) {
+ bond_grp->bond_ready = true;
+ if (bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGE_NUM;
+ }
+ mutex_unlock(&bond_grp->bond_mutex);
+ if (support == BOND_SUPPORT)
+ upper_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static int hns_roce_bond_event(struct notifier_block *self,
+ unsigned long event, void *ptr)
+{
+ struct hns_roce_bond_group *bond_grp =
+ container_of(self, struct hns_roce_bond_group, bond_nb);
+ bool changed = false;
+
+ if (event == NETDEV_CHANGEUPPER)
+ changed = hns_roce_bond_upper_event(bond_grp, ptr);
+ if (event == NETDEV_CHANGELOWERSTATE)
+ changed = hns_roce_bond_lowerstate_event(bond_grp, ptr);
+
+ if (changed)
+ schedule_delayed_work(&bond_grp->bond_work, HZ);
+
+ return NOTIFY_DONE;
+}
+
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
+ int i;
+
+ if (xa_load(&roce_bond_xa, bus_num))
+ return 0;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = kvzalloc(sizeof(*bond_grp), GFP_KERNEL);
+ if (!bond_grp) {
+ ret = -ENOMEM;
+ goto mem_err;
+ }
+
+ mutex_init(&bond_grp->bond_mutex);
+ INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_bond_work);
+
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
+ bond_grp->bus_num = bus_num;
+
+ ret = alloc_bond_id(bond_grp);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to alloc bond ID, ret = %d.\n", ret);
+ goto alloc_id_err;
+ }
+
+ bond_grp->bond_nb.notifier_call = hns_roce_bond_event;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret) {
+ ibdev_err(&hr_dev->ib_dev,
+ "failed to register bond nb, ret = %d.\n", ret);
+ goto register_nb_err;
+ }
+ bgrps[i] = bond_grp;
+ }
+
+ return 0;
+
+register_nb_err:
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+alloc_id_err:
+ mutex_destroy(&bond_grp->bond_mutex);
+ kvfree(bond_grp);
+mem_err:
+ for (i--; i >= 0; i--) {
+ unregister_netdevice_notifier(&bgrps[i]->bond_nb);
+ cancel_delayed_work_sync(&bgrps[i]->bond_work);
+ remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id);
+ mutex_destroy(&bgrps[i]->bond_mutex);
+ kvfree(bgrps[i]);
+ }
+ return ret;
+}
+
+void hns_roce_dealloc_bond_grp(void)
+{
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ unsigned long id;
+ int i;
+
+ xa_for_each(&roce_bond_xa, id, die_info) {
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+ mutex_destroy(&bond_grp->bond_mutex);
+ kvfree(bond_grp);
+ }
+ }
+}
+
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_v2_priv *priv = hr_dev->priv;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+
+ if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT) {
+ ret = hns_roce_recover_bond(bond_grp, hr_dev);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to recover RoCE bond, ret = %d.\n", ret);
+ return ret;
+ }
+ }
+
+ return hns_roce_set_bond_netdev(bond_grp, hr_dev);
+}
+
+void hns_roce_bond_suspend(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ /*
+ * Avoid duplicated processing when calling this function
+ * multiple times.
+ */
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
+ }
+
+out:
+ die_info->suspend_cnt++;
+ mutex_unlock(&die_info->die_mutex);
+}
+
+void hns_roce_bond_resume(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i, ret;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ die_info->suspend_cnt--;
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret)
+ dev_err(&handle->pdev->dev,
+ "failed to resume bond notifier(bus_num = %u, id = %u), ret = %d.\n",
+ bus_num, bond_grp->bond_id, ret);
+ }
+
+out:
+ mutex_unlock(&die_info->die_mutex);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
new file mode 100644
index 000000000000..98c295d78ca1
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#ifndef _HNS_ROCE_BOND_H
+#define _HNS_ROCE_BOND_H
+
+#include <linux/netdevice.h>
+#include <net/bonding.h>
+
+#define ROCE_BOND_FUNC_MAX 4
+#define ROCE_BOND_NUM_MAX 2
+
+#define BOND_ID(id) BIT(id)
+
+#define BOND_ERR_LOG(fmt, ...) \
+ pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__)
+
+enum {
+ BOND_MODE_1,
+ BOND_MODE_2_4,
+};
+
+enum hns_roce_bond_hashtype {
+ BOND_HASH_L2,
+ BOND_HASH_L34,
+ BOND_HASH_L23,
+};
+
+enum bond_support_type {
+ BOND_NOT_SUPPORT,
+ /*
+ * bond_grp already exists, but in the current
+ * conditions it's no longer supported
+ */
+ BOND_EXISTING_NOT_SUPPORT,
+ BOND_SUPPORT,
+};
+
+enum hns_roce_bond_state {
+ HNS_ROCE_BOND_NOT_ATTACHED,
+ HNS_ROCE_BOND_NOT_BONDED,
+ HNS_ROCE_BOND_IS_BONDED,
+ HNS_ROCE_BOND_SLAVE_CHANGE_NUM,
+ HNS_ROCE_BOND_SLAVE_CHANGESTATE,
+};
+
+enum hns_roce_bond_cmd_type {
+ HNS_ROCE_SET_BOND,
+ HNS_ROCE_CHANGE_BOND,
+ HNS_ROCE_CLEAR_BOND,
+};
+
+struct hns_roce_func_info {
+ struct net_device *net_dev;
+ struct hnae3_handle *handle;
+};
+
+struct hns_roce_bond_group {
+ struct net_device *upper_dev;
+ struct hns_roce_dev *main_hr_dev;
+ u8 active_slave_num;
+ u32 slave_map;
+ u32 active_slave_map;
+ u8 bond_id;
+ u8 bus_num;
+ struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX];
+ bool bond_ready;
+ enum hns_roce_bond_state bond_state;
+ enum netdev_lag_tx_type tx_type;
+ enum netdev_lag_hash hash_type;
+ struct mutex bond_mutex;
+ struct notifier_block bond_nb;
+ struct delayed_work bond_work;
+};
+
+struct hns_roce_die_info {
+ u8 bond_id_mask;
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct mutex die_mutex;
+ u8 suspend_cnt;
+};
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num);
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev);
+void hns_roce_dealloc_bond_grp(void);
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp);
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev);
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev);
+void hns_roce_bond_suspend(struct hnae3_handle *handle);
+void hns_roce_bond_resume(struct hnae3_handle *handle);
+
+#endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 06832c0ac055..318f18cf37aa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -33,6 +33,7 @@
#ifndef _HNS_ROCE_DEVICE_H
#define _HNS_ROCE_DEVICE_H
+#include <linux/pci.h>
#include <rdma/ib_verbs.h>
#include <rdma/hns-abi.h>
#include "hns_roce_debugfs.h"
@@ -153,6 +154,7 @@ enum {
HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14),
HNS_ROCE_CAP_FLAG_STASH = BIT(17),
HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19),
+ HNS_ROCE_CAP_FLAG_BOND = BIT(21),
HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22),
};
@@ -177,6 +179,7 @@ enum hns_roce_instance_state {
HNS_ROCE_STATE_INIT,
HNS_ROCE_STATE_INITED,
HNS_ROCE_STATE_UNINIT,
+ HNS_ROCE_STATE_BOND_UNINIT,
};
enum {
@@ -1167,6 +1170,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh)
grh->traffic_class >> DSCP_SHIFT : grh->traffic_class;
}
+static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev,
+ u8 port)
+{
+ return hr_dev->iboe.netdevs[port];
+}
+
+static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev)
+{
+ return hr_dev->pci_dev->bus->number;
+}
+
void hns_roce_init_uar_table(struct hns_roce_dev *dev);
int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar);
@@ -1293,7 +1307,7 @@ void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
int hns_roce_init(struct hns_roce_dev *hr_dev);
-void hns_roce_exit(struct hns_roce_dev *hr_dev);
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup);
int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 63052c0e7613..2d6ae89e525b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -43,11 +43,13 @@
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
+#include "hclge_main.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
#define CREATE_TRACE_POINTS
#include "hns_roce_trace.h"
@@ -1434,6 +1436,79 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
return ret;
}
+static enum hns_roce_opcode_type
+ get_bond_opcode(enum hns_roce_bond_cmd_type bond_type)
+{
+ switch (bond_type) {
+ case HNS_ROCE_SET_BOND:
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ case HNS_ROCE_CHANGE_BOND:
+ return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT;
+ case HNS_ROCE_CLEAR_BOND:
+ return HNS_ROCE_OPC_CLEAR_BOND_INFO;
+ default:
+ WARN(true, "Invalid bond type %d!\n", bond_type);
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ }
+}
+
+static enum hns_roce_bond_hashtype
+ get_bond_hashtype(enum netdev_lag_hash netdev_hashtype)
+{
+ switch (netdev_hashtype) {
+ case NETDEV_LAG_HASH_L2:
+ return BOND_HASH_L2;
+ case NETDEV_LAG_HASH_L34:
+ return BOND_HASH_L34;
+ case NETDEV_LAG_HASH_L23:
+ return BOND_HASH_L23;
+ default:
+ WARN(true, "Invalid hash type %d!\n", netdev_hashtype);
+ return BOND_HASH_L2;
+ }
+}
+
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type)
+{
+ enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type);
+ struct hns_roce_bond_info *slave_info;
+ struct hns_roce_cmq_desc desc = {};
+ int ret;
+
+ slave_info = (struct hns_roce_bond_info *)desc.data;
+ hns_roce_cmq_setup_basic_desc(&desc, opcode, false);
+
+ slave_info->bond_id = cpu_to_le32(bond_grp->bond_id);
+ if (bond_type == HNS_ROCE_CLEAR_BOND)
+ goto out;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_1);
+ if (bond_grp->active_slave_num != 1)
+ ibdev_warn(&bond_grp->main_hr_dev->ib_dev,
+ "active slave cnt(%u) in Mode 1 is invalid.\n",
+ bond_grp->active_slave_num);
+ } else {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4);
+ slave_info->hash_policy =
+ cpu_to_le32(get_bond_hashtype(bond_grp->hash_type));
+ }
+
+ slave_info->active_slave_cnt = cpu_to_le32(bond_grp->active_slave_num);
+ slave_info->active_slave_mask = cpu_to_le32(bond_grp->active_slave_map);
+ slave_info->slave_mask = cpu_to_le32(bond_grp->slave_map);
+
+out:
+ ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1);
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "cmq bond type(%d) failed, ret = %d.\n",
+ bond_type, ret);
+
+ return ret;
+}
+
static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev,
dma_addr_t base_addr, u8 cmd, unsigned long tag)
{
@@ -2275,6 +2350,9 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) <<
HNS_ROCE_CAP_FLAGS_EX_SHIFT;
+ if (hr_dev->is_vf)
+ caps->flags &= ~HNS_ROCE_CAP_FLAG_BOND;
+
caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS);
caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID);
caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH);
@@ -7067,7 +7145,7 @@ error_failed_kzalloc:
}
static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
- bool reset)
+ bool reset, bool bond_cleanup)
{
struct hns_roce_dev *hr_dev = handle->priv;
@@ -7079,7 +7157,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT;
hns_roce_handle_device_err(hr_dev);
- hns_roce_exit(hr_dev);
+ hns_roce_exit(hr_dev, bond_cleanup);
kfree(hr_dev->priv);
ib_dealloc_device(&hr_dev->ib_dev);
}
@@ -7130,12 +7208,51 @@ reset_chk_err:
static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
bool reset)
{
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
- return;
+ goto out;
handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
- __hns_roce_hw_v2_uninit_instance(handle, reset);
+ __hns_roce_hw_v2_uninit_instance(handle, reset, true);
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+
+out:
+ hns_roce_bond_resume(handle);
+}
+
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle;
+ int ret;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (!handle || !handle->client)
+ return NULL;
+
+ ret = hns_roce_hw_v2_init_instance(handle);
+ if (ret)
+ return NULL;
+
+ return handle->priv;
+}
+
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle;
+
+ if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
+ return;
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT;
+
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
}
@@ -7144,6 +7261,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
{
struct hns_roce_dev *hr_dev;
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
return 0;
@@ -7174,6 +7294,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN,
&handle->rinfo.state)) {
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+ hns_roce_bond_resume(handle);
return 0;
}
@@ -7193,6 +7314,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
dev_info(dev, "reset done, RoCE client reinit finished.\n");
}
+ hns_roce_bond_resume(handle);
return ret;
}
@@ -7204,7 +7326,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n");
msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY);
- __hns_roce_hw_v2_uninit_instance(handle, false);
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
return 0;
}
@@ -7240,6 +7362,14 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle,
if (linkup || !hr_dev)
return;
+ /* For bond device, the link status depends on the upper netdev,
+ * and the upper device's link status depends on all the slaves'
+ * netdev but not only one. So bond device cannot get a correct
+ * link status from this path.
+ */
+ if (hns_roce_get_bond_grp(netdev, get_hr_bus_num(hr_dev)))
+ return;
+
ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev);
}
@@ -7264,6 +7394,7 @@ static int __init hns_roce_hw_v2_init(void)
static void __exit hns_roce_hw_v2_exit(void)
{
+ hns_roce_dealloc_bond_grp();
hnae3_unregister_client(&hns_roce_hw_v2_client);
hns_roce_cleanup_debugfs();
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e64a04d6f85b..285fe0875fac 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -35,6 +35,7 @@
#include <linux/bitops.h>
#include "hnae3.h"
+#include "hns_roce_bond.h"
#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32
#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
@@ -228,6 +229,9 @@ enum hns_roce_opcode_type {
HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
HNS_ROCE_QUERY_RAM_ECC = 0x8513,
HNS_SWITCH_PARAMETER_CFG = 0x1033,
+ HNS_ROCE_OPC_SET_BOND_INFO = 0x8601,
+ HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602,
+ HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603,
};
#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000
@@ -1465,7 +1469,23 @@ struct hns_roce_sccc_clr_done {
__le32 rsv[5];
};
+struct hns_roce_bond_info {
+ __le32 bond_id;
+ __le32 bond_mode;
+ __le32 active_slave_cnt;
+ __le32 active_slave_mask;
+ __le32 slave_mask;
+ __le32 hash_policy;
+};
+
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type);
static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
void __iomem *dest)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index f3607fe107a7..2f4864ab7d4e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -32,7 +32,6 @@
*/
#include <linux/acpi.h>
#include <linux/module.h>
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
@@ -41,6 +40,7 @@
#include "hns_roce_device.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port,
const u8 *addr)
@@ -89,30 +89,75 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
return ret;
}
-static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port,
- unsigned long event)
+static int hns_roce_get_port_state(struct hns_roce_dev *hr_dev, u32 port_num,
+ enum ib_port_state *state)
{
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ struct net_device *net_dev;
+
+ net_dev = ib_device_get_netdev(&hr_dev->ib_dev, port_num);
+ if (!net_dev)
+ return -ENODEV;
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp) {
+ *state = ib_get_curr_port_state(bond_grp->upper_dev);
+ goto out;
+ }
+ }
+
+ *state = ib_get_curr_port_state(net_dev);
+out:
+ dev_put(net_dev);
+ return 0;
+}
+
+static int handle_en_event(struct net_device *netdev,
+ struct hns_roce_dev *hr_dev,
+ u32 port, unsigned long event)
+{
+ struct ib_device *ibdev = &hr_dev->ib_dev;
struct device *dev = hr_dev->dev;
- struct net_device *netdev;
+ enum ib_port_state curr_state;
+ struct ib_event ibevent;
int ret = 0;
- netdev = hr_dev->iboe.netdevs[port];
if (!netdev) {
dev_err(dev, "can't find netdev on port(%u)!\n", port);
return -ENODEV;
}
switch (event) {
- case NETDEV_UP:
- case NETDEV_CHANGE:
case NETDEV_REGISTER:
case NETDEV_CHANGEADDR:
ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
break;
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
+ if (ret)
+ return ret;
+ fallthrough;
case NETDEV_DOWN:
- /*
- * In v1 engine, only support all ports closed together.
- */
+ if (!netif_is_lag_master(netdev))
+ break;
+ curr_state = ib_get_curr_port_state(netdev);
+
+ write_lock_irq(&ibdev->cache_lock);
+ if (ibdev->port_data[port].cache.last_port_state == curr_state) {
+ write_unlock_irq(&ibdev->cache_lock);
+ return 0;
+ }
+ ibdev->port_data[port].cache.last_port_state = curr_state;
+ write_unlock_irq(&ibdev->cache_lock);
+
+ ibevent.event = (curr_state == IB_PORT_DOWN) ?
+ IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE;
+ ibevent.device = ibdev;
+ ibevent.element.port_num = port + 1;
+ ib_dispatch_event(&ibevent);
break;
default:
dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event));
@@ -126,17 +171,25 @@ static int hns_roce_netdev_event(struct notifier_block *self,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct hns_roce_bond_group *bond_grp;
struct hns_roce_ib_iboe *iboe = NULL;
struct hns_roce_dev *hr_dev = NULL;
+ struct net_device *upper = NULL;
int ret;
u32 port;
hr_dev = container_of(self, struct hns_roce_dev, iboe.nb);
iboe = &hr_dev->iboe;
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0),
+ get_hr_bus_num(hr_dev));
+ upper = bond_grp ? bond_grp->upper_dev : NULL;
+ }
for (port = 0; port < hr_dev->caps.num_ports; port++) {
- if (dev == iboe->netdevs[port]) {
- ret = handle_en_event(hr_dev, port, event);
+ if ((!upper && dev == iboe->netdevs[port]) ||
+ (upper && dev == upper)) {
+ ret = handle_en_event(dev, hr_dev, port, event);
if (ret)
return NOTIFY_DONE;
break;
@@ -148,12 +201,13 @@ static int hns_roce_netdev_event(struct notifier_block *self,
static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev)
{
+ struct net_device *net_dev;
int ret;
u8 i;
for (i = 0; i < hr_dev->caps.num_ports; i++) {
- ret = hns_roce_set_mac(hr_dev, i,
- hr_dev->iboe.netdevs[i]->dev_addr);
+ net_dev = get_hr_netdev(hr_dev, i);
+ ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr);
if (ret)
return ret;
}
@@ -221,9 +275,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
struct ib_port_attr *props)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
- struct device *dev = hr_dev->dev;
struct net_device *net_dev;
- unsigned long flags;
enum ib_mtu mtu;
u32 port;
int ret;
@@ -244,26 +296,26 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
if (ret)
ibdev_warn(ib_dev, "failed to get speed, ret = %d.\n", ret);
- spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
- net_dev = hr_dev->iboe.netdevs[port];
+ net_dev = ib_device_get_netdev(ib_dev, port_num);
if (!net_dev) {
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
- dev_err(dev, "find netdev %u failed!\n", port);
+ ibdev_err(ib_dev, "find netdev %u failed!\n", port);
return -EINVAL;
}
mtu = iboe_get_mtu(net_dev->mtu);
props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256;
- props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ?
- IB_PORT_ACTIVE :
- IB_PORT_DOWN;
+
+ dev_put(net_dev);
+
+ ret = hns_roce_get_port_state(hr_dev, port_num, &props->state);
+ if (ret) {
+ ibdev_err(ib_dev, "failed to get port state.\n");
+ return ret;
+ }
+
props->phys_state = props->state == IB_PORT_ACTIVE ?
IB_PORT_PHYS_STATE_LINK_UP :
IB_PORT_PHYS_STATE_DISABLED;
-
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-
return 0;
}
@@ -617,9 +669,40 @@ static int hns_roce_get_hw_stats(struct ib_device *device,
return num_counters;
}
-static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
+static void
+ hns_roce_unregister_bond_cleanup(struct hns_roce_dev *hr_dev,
+ struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ int i;
+
+ /* To avoid the loss of other slave devices when main_hr_dev
+ * is unregistered, re-initialize the remaining slaves before
+ * the bond resources cleanup.
+ */
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if (net_dev && net_dev != get_hr_netdev(hr_dev, 0))
+ hns_roce_bond_init_client(bond_grp, i);
+ }
+
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev,
+ bool bond_cleanup)
{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
struct hns_roce_ib_iboe *iboe = &hr_dev->iboe;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ if (bond_cleanup && hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp)
+ hns_roce_unregister_bond_cleanup(hr_dev, bond_grp);
+ }
hr_dev->active = false;
unregister_netdevice_notifier(&iboe->nb);
@@ -708,11 +791,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = {
static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
{
- int ret;
struct hns_roce_ib_iboe *iboe = NULL;
- struct ib_device *ib_dev = NULL;
struct device *dev = hr_dev->dev;
+ struct ib_device *ib_dev = NULL;
+ struct net_device *net_dev;
unsigned int i;
+ int ret;
iboe = &hr_dev->iboe;
spin_lock_init(&iboe->lock);
@@ -747,17 +831,38 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops);
- for (i = 0; i < hr_dev->caps.num_ports; i++) {
- if (!hr_dev->iboe.netdevs[i])
- continue;
- ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i],
- i + 1);
- if (ret)
+ dma_set_max_seg_size(dev, SZ_2G);
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ ret = hns_roce_alloc_bond_grp(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n",
+ get_hr_bus_num(hr_dev), ret);
return ret;
+ }
+ }
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND &&
+ hns_roce_bond_is_active(hr_dev)) {
+ ret = hns_roce_bond_init(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to init bond!\n");
+ return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_bond_%d", dev);
+ } else {
+ for (i = 0; i < hr_dev->caps.num_ports; i++) {
+ net_dev = get_hr_netdev(hr_dev, i);
+ if (!net_dev)
+ continue;
+
+ ret = ib_device_set_netdev(ib_dev, net_dev, i + 1);
+ if (ret)
+ return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_%d", dev);
}
- dma_set_max_seg_size(dev, SZ_2G);
- ret = ib_register_device(ib_dev, "hns_%d", dev);
if (ret) {
dev_err(dev, "ib_register_device failed!\n");
return ret;
@@ -1157,10 +1262,10 @@ error_failed_alloc_dfx_cnt:
return ret;
}
-void hns_roce_exit(struct hns_roce_dev *hr_dev)
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup)
{
hns_roce_unregister_debugfs(hr_dev);
- hns_roce_unregister_device(hr_dev);
+ hns_roce_unregister_device(hr_dev, bond_cleanup);
if (hr_dev->hw->hw_exit)
hr_dev->hw->hw_exit(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index d35cf59d0f43..225c3e328e0e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include "hns_roce_device.h"
void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index bdd879ac12dd..d1640c5fbaab 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -31,7 +31,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
@@ -1348,11 +1347,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp,
struct ib_qp_attr *attr, int attr_mask)
{
+ struct net_device *net_dev;
enum ib_mtu active_mtu;
int p;
p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
- active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu);
+ net_dev = get_hr_netdev(hr_dev, p);
+ active_mtu = iboe_get_mtu(net_dev->mtu);
if ((hr_dev->caps.max_mtu >= IB_MTU_2048 &&
attr->path_mtu > hr_dev->caps.max_mtu) ||
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 1090051f493b..8a6efb6b9c9e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -3,7 +3,6 @@
* Copyright (c) 2018 Hisilicon Limited.
*/
-#include <linux/pci.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
#include "hns_roce_device.h"
diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index c6a0a661d6e7..f4f4f92ba63a 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -3710,7 +3710,7 @@ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
iwpd = iwqp->iwpd;
tagged_offset = (uintptr_t)iwqp->ietf_mem.va;
ibmr = irdma_reg_phys_mr(&iwpd->ibpd, iwqp->ietf_mem.pa, buf_len,
- IB_ACCESS_LOCAL_WRITE, &tagged_offset);
+ IB_ACCESS_LOCAL_WRITE, &tagged_offset, false);
if (IS_ERR(ibmr)) {
ret = -ENOMEM;
goto error;
diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c
index 4ef1c29032f7..ce5cf89c463c 100644
--- a/drivers/infiniband/hw/irdma/ctrl.c
+++ b/drivers/infiniband/hw/irdma/ctrl.c
@@ -2943,8 +2943,6 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch,
__le64 *wqe;
struct irdma_sc_cqp *cqp;
u64 hdr;
- struct irdma_sc_ceq *ceq;
- int ret_code = 0;
cqp = cq->dev->cqp;
if (cq->cq_uk.cq_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt)
@@ -2953,19 +2951,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch,
if (cq->ceq_id >= cq->dev->hmc_fpm_misc.max_ceqs)
return -EINVAL;
- ceq = cq->dev->ceq[cq->ceq_id];
- if (ceq && ceq->reg_cq)
- ret_code = irdma_sc_add_cq_ctx(ceq, cq);
-
- if (ret_code)
- return ret_code;
-
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
- if (!wqe) {
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, cq);
+ if (!wqe)
return -ENOMEM;
- }
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
@@ -3018,17 +3006,12 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq)
struct irdma_sc_cqp *cqp;
__le64 *wqe;
u64 hdr;
- struct irdma_sc_ceq *ceq;
cqp = cq->dev->cqp;
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
if (!wqe)
return -ENOMEM;
- ceq = cq->dev->ceq[cq->ceq_id];
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, cq);
-
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
set_64bit_val(wqe, 40, cq->shadow_area_pa);
@@ -3602,71 +3585,6 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
}
/**
- * irdma_sc_find_reg_cq - find cq ctx index
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-static u32 irdma_sc_find_reg_cq(struct irdma_sc_ceq *ceq,
- struct irdma_sc_cq *cq)
-{
- u32 i;
-
- for (i = 0; i < ceq->reg_cq_size; i++) {
- if (cq == ceq->reg_cq[i])
- return i;
- }
-
- return IRDMA_INVALID_CQ_IDX;
-}
-
-/**
- * irdma_sc_add_cq_ctx - add cq ctx tracking for ceq
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
-
- if (ceq->reg_cq_size == ceq->elem_cnt) {
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
- return -ENOMEM;
- }
-
- ceq->reg_cq[ceq->reg_cq_size++] = cq;
-
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
-
- return 0;
-}
-
-/**
- * irdma_sc_remove_cq_ctx - remove cq ctx tracking for ceq
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-void irdma_sc_remove_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq)
-{
- unsigned long flags;
- u32 cq_ctx_idx;
-
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
- cq_ctx_idx = irdma_sc_find_reg_cq(ceq, cq);
- if (cq_ctx_idx == IRDMA_INVALID_CQ_IDX)
- goto exit;
-
- ceq->reg_cq_size--;
- if (cq_ctx_idx != ceq->reg_cq_size)
- ceq->reg_cq[cq_ctx_idx] = ceq->reg_cq[ceq->reg_cq_size];
- ceq->reg_cq[ceq->reg_cq_size] = NULL;
-
-exit:
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
-}
-
-/**
* irdma_sc_cqp_init - Initialize buffers for a control Queue Pair
* @cqp: IWARP control queue pair pointer
* @info: IWARP control queue pair init info pointer
@@ -3950,11 +3868,13 @@ int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp)
*/
void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq)
{
+ unsigned long flags;
u64 temp_val;
u16 sw_cq_sel;
u8 arm_next_se;
u8 arm_seq_num;
+ spin_lock_irqsave(&ccq->dev->cqp_lock, flags);
get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val);
sw_cq_sel = (u16)FIELD_GET(IRDMA_CQ_DBSA_SW_CQ_SELECT, temp_val);
arm_next_se = (u8)FIELD_GET(IRDMA_CQ_DBSA_ARM_NEXT_SE, temp_val);
@@ -3965,6 +3885,7 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq)
FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT_SE, arm_next_se) |
FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT, 1);
set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val);
+ spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags);
dma_wmb(); /* make sure shadow area is updated before arming */
@@ -4387,9 +4308,6 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq,
ceq->ceq_elem_pa = info->ceqe_pa;
ceq->virtual_map = info->virtual_map;
ceq->itr_no_expire = info->itr_no_expire;
- ceq->reg_cq = info->reg_cq;
- ceq->reg_cq_size = 0;
- spin_lock_init(&ceq->req_cq_lock);
ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0);
ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0);
ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL);
@@ -4472,9 +4390,6 @@ int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq)
{
struct irdma_sc_cqp *cqp;
- if (ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, ceq->dev->ccq);
-
cqp = ceq->dev->cqp;
cqp->process_cqp_sds = irdma_update_sds_noccq;
@@ -4493,11 +4408,6 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch)
struct irdma_sc_dev *dev = ceq->dev;
dev->ccq->vsi_idx = ceq->vsi_idx;
- if (ceq->reg_cq) {
- ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq);
- if (ret_code)
- return ret_code;
- }
ret_code = irdma_sc_ceq_create(ceq, scratch, true);
if (!ret_code)
@@ -4562,7 +4472,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
struct irdma_sc_cq *temp_cq;
u8 polarity;
u32 cq_idx;
- unsigned long flags;
do {
cq_idx = 0;
@@ -4583,11 +4492,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
}
cq = temp_cq;
- if (ceq->reg_cq) {
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
- cq_idx = irdma_sc_find_reg_cq(ceq, cq);
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
- }
IRDMA_RING_MOVE_TAIL(ceq->ceq_ring);
if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring))
@@ -4731,7 +4635,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch,
u64 hdr;
dev = aeq->dev;
- if (dev->privileged)
+
+ if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2)
writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]);
cqp = dev->cqp;
diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c
index 7bad0e38786a..d1fc5726b979 100644
--- a/drivers/infiniband/hw/irdma/hw.c
+++ b/drivers/infiniband/hw/irdma/hw.c
@@ -2365,7 +2365,6 @@ static int irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.manage_apbvt_entry.info;
- memset(info, 0, sizeof(*info));
info->add = add_port;
info->port = accel_local_port;
cqp_info->cqp_cmd = IRDMA_OP_MANAGE_APBVT_ENTRY;
@@ -2474,7 +2473,6 @@ void irdma_manage_arp_cache(struct irdma_pci_f *rf,
if (action == IRDMA_ARP_ADD) {
cqp_info->cqp_cmd = IRDMA_OP_ADD_ARP_CACHE_ENTRY;
info = &cqp_info->in.u.add_arp_cache_entry.info;
- memset(info, 0, sizeof(*info));
info->arp_index = (u16)arp_index;
info->permanent = true;
ether_addr_copy(info->mac_addr, mac_addr);
@@ -2533,7 +2531,6 @@ int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.manage_qhash_table_entry.info;
- memset(info, 0, sizeof(*info));
info->vsi = &iwdev->vsi;
info->manage = mtype;
info->entry_type = etype;
diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c
index 27b191f61caf..b49fd9cf2476 100644
--- a/drivers/infiniband/hw/irdma/icrdma_if.c
+++ b/drivers/infiniband/hw/irdma/icrdma_if.c
@@ -302,7 +302,8 @@ err_rt_init:
err_ctrl_init:
icrdma_deinit_interrupts(rf, cdev_info);
err_init_interrupts:
- kfree(iwdev->rf);
+ mutex_destroy(&rf->ah_tbl_lock);
+ kfree(rf);
ib_dealloc_device(&iwdev->ibdev);
return err;
@@ -319,6 +320,9 @@ static void icrdma_remove(struct auxiliary_device *aux_dev)
ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false);
irdma_ib_unregister_device(iwdev);
icrdma_deinit_interrupts(iwdev->rf, cdev_info);
+ mutex_destroy(&iwdev->rf->ah_tbl_lock);
+
+ kfree(iwdev->rf);
pr_debug("INIT: Gen[%d] func[%d] device remove success\n",
rdma_ver, PCI_FUNC(cdev_info->pdev->devfn));
diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c
index 1bb42eb298ba..e1d6670d9396 100644
--- a/drivers/infiniband/hw/irdma/ig3rdma_if.c
+++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c
@@ -55,6 +55,7 @@ static int ig3rdma_vchnl_init(struct irdma_pci_f *rf,
ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info);
if (ret) {
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
return ret;
}
@@ -124,7 +125,9 @@ static void ig3rdma_decfg_rf(struct irdma_pci_f *rf)
{
struct irdma_hw *hw = &rf->hw;
+ mutex_destroy(&rf->ah_tbl_lock);
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
kfree(hw->io_regs);
iounmap(hw->rdma_reg.addr);
}
@@ -149,6 +152,7 @@ static int ig3rdma_cfg_rf(struct irdma_pci_f *rf,
err = ig3rdma_cfg_regions(&rf->hw, cdev_info);
if (err) {
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
return err;
}
diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h
index 886b30da188a..baab61e424a2 100644
--- a/drivers/infiniband/hw/irdma/main.h
+++ b/drivers/infiniband/hw/irdma/main.h
@@ -556,7 +556,7 @@ void irdma_copy_ip_htonl(__be32 *dst, u32 *src);
u16 irdma_get_vlan_ipv4(u32 *addr);
void irdma_get_vlan_mac_ipv6(u32 *addr, u16 *vlan_id, u8 *mac);
struct ib_mr *irdma_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size,
- int acc, u64 *iova_start);
+ int acc, u64 *iova_start, bool dma_mr);
int irdma_upload_qp_context(struct irdma_qp *iwqp, bool freeze, bool raw);
void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq);
int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd,
@@ -564,7 +564,6 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd,
void (*callback_fcn)(struct irdma_cqp_request *cqp_request),
void *cb_param);
void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request);
-bool irdma_cq_empty(struct irdma_cq *iwcq);
int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event,
void *ptr);
int irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event,
diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c
index fa6325adaede..28dfad7f940c 100644
--- a/drivers/infiniband/hw/irdma/pble.c
+++ b/drivers/infiniband/hw/irdma/pble.c
@@ -506,12 +506,14 @@ exit:
void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
struct irdma_pble_alloc *palloc)
{
- pble_rsrc->freedpbles += palloc->total_cnt;
-
if (palloc->level == PBLE_LEVEL_2)
free_lvl2(pble_rsrc, palloc);
else
irdma_prm_return_pbles(&pble_rsrc->pinfo,
&palloc->level1.chunkinfo);
+
+ mutex_lock(&pble_rsrc->pble_mutex_lock);
+ pble_rsrc->freedpbles += palloc->total_cnt;
pble_rsrc->stats_alloc_freed++;
+ mutex_unlock(&pble_rsrc->pble_mutex_lock);
}
diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c
index 694e5a9ed15d..cee47ddbd1b5 100644
--- a/drivers/infiniband/hw/irdma/puda.c
+++ b/drivers/infiniband/hw/irdma/puda.c
@@ -685,7 +685,6 @@ static int irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc)
ukqp->rq_size = rsrc->rq_size;
IRDMA_RING_INIT(ukqp->sq_ring, ukqp->sq_size);
- IRDMA_RING_INIT(ukqp->initial_ring, ukqp->sq_size);
IRDMA_RING_INIT(ukqp->rq_ring, ukqp->rq_size);
ukqp->wqe_alloc_db = qp->pd->dev->wqe_alloc_db;
@@ -726,7 +725,6 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq)
struct irdma_sc_cqp *cqp;
u64 hdr;
struct irdma_ccq_cqe_info compl_info;
- int status = 0;
cqp = dev->cqp;
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0);
@@ -756,16 +754,8 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq)
print_hex_dump_debug("PUDA: PUDA CREATE CQ", DUMP_PREFIX_OFFSET, 16,
8, wqe, IRDMA_CQP_WQE_SIZE * 8, false);
irdma_sc_cqp_post_sq(dev->cqp);
- status = irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ,
- &compl_info);
- if (!status) {
- struct irdma_sc_ceq *ceq = dev->ceq[0];
-
- if (ceq && ceq->reg_cq)
- status = irdma_sc_add_cq_ctx(ceq, cq);
- }
-
- return status;
+ return irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ,
+ &compl_info);
}
/**
@@ -897,23 +887,17 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type,
struct irdma_puda_buf *buf = NULL;
struct irdma_puda_buf *nextbuf = NULL;
struct irdma_virt_mem *vmem;
- struct irdma_sc_ceq *ceq;
- ceq = vsi->dev->ceq[0];
switch (type) {
case IRDMA_PUDA_RSRC_TYPE_ILQ:
rsrc = vsi->ilq;
vmem = &vsi->ilq_mem;
vsi->ilq = NULL;
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
break;
case IRDMA_PUDA_RSRC_TYPE_IEQ:
rsrc = vsi->ieq;
vmem = &vsi->ieq_mem;
vsi->ieq = NULL;
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
break;
default:
ibdev_dbg(to_ibdev(dev), "PUDA: error resource type = 0x%x\n",
diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h
index c1b8f81ea283..cab4896640a1 100644
--- a/drivers/infiniband/hw/irdma/type.h
+++ b/drivers/infiniband/hw/irdma/type.h
@@ -492,9 +492,6 @@ struct irdma_sc_ceq {
u32 first_pm_pbl_idx;
u8 polarity;
u16 vsi_idx;
- struct irdma_sc_cq **reg_cq;
- u32 reg_cq_size;
- spinlock_t req_cq_lock; /* protect access to reg_cq array */
bool virtual_map:1;
bool tph_en:1;
bool itr_no_expire:1;
@@ -894,8 +891,6 @@ struct irdma_ceq_init_info {
u8 tph_val;
u16 vsi_idx;
u32 first_pm_pbl_idx;
- struct irdma_sc_cq **reg_cq;
- u32 reg_cq_idx;
};
struct irdma_aeq_init_info {
diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c
index ce1ae10c30fc..f0846b800913 100644
--- a/drivers/infiniband/hw/irdma/uk.c
+++ b/drivers/infiniband/hw/irdma/uk.c
@@ -114,33 +114,8 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx)
*/
void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp)
{
- u64 temp;
- u32 hw_sq_tail;
- u32 sw_sq_head;
-
- /* valid bit is written and loads completed before reading shadow */
- mb();
-
- /* read the doorbell shadow area */
- get_64bit_val(qp->shadow_area, 0, &temp);
-
- hw_sq_tail = (u32)FIELD_GET(IRDMA_QP_DBSA_HW_SQ_TAIL, temp);
- sw_sq_head = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
- if (sw_sq_head != qp->initial_ring.head) {
- if (sw_sq_head != hw_sq_tail) {
- if (sw_sq_head > qp->initial_ring.head) {
- if (hw_sq_tail >= qp->initial_ring.head &&
- hw_sq_tail < sw_sq_head)
- writel(qp->qp_id, qp->wqe_alloc_db);
- } else {
- if (hw_sq_tail >= qp->initial_ring.head ||
- hw_sq_tail < sw_sq_head)
- writel(qp->qp_id, qp->wqe_alloc_db);
- }
- }
- }
-
- qp->initial_ring.head = qp->sq_ring.head;
+ dma_wmb();
+ writel(qp->qp_id, qp->wqe_alloc_db);
}
/**
@@ -194,6 +169,7 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx,
qp->sq_wrtrk_array[*wqe_idx].wrid = info->wr_id;
qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size;
qp->sq_wrtrk_array[*wqe_idx].quanta = quanta;
+ qp->sq_wrtrk_array[*wqe_idx].signaled = info->signaled;
return wqe;
}
@@ -1137,6 +1113,27 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq,
}
/**
+ * irdma_uk_cq_empty - Check if CQ is empty
+ * @cq: hw cq
+ */
+bool irdma_uk_cq_empty(struct irdma_cq_uk *cq)
+{
+ __le64 *cqe;
+ u8 polarity;
+ u64 qword3;
+
+ if (cq->avoid_mem_cflct)
+ cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(cq);
+ else
+ cqe = IRDMA_GET_CURRENT_CQ_ELEM(cq);
+
+ get_64bit_val(cqe, 24, &qword3);
+ polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
+
+ return polarity != cq->polarity;
+}
+
+/**
* irdma_uk_cq_poll_cmpl - get cq completion info
* @cq: hw cq
* @info: cq poll information returned
@@ -1287,6 +1284,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3);
if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) {
+ unsigned long flags;
+
srq = qp->srq_uk;
get_64bit_val(cqe, 8, &info->wr_id);
@@ -1299,8 +1298,11 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
} else {
info->stag_invalid_set = false;
}
+ spin_lock_irqsave(srq->lock, flags);
IRDMA_RING_MOVE_TAIL(srq->srq_ring);
+ spin_unlock_irqrestore(srq->lock, flags);
pring = &srq->srq_ring;
+
} else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) {
u32 array_idx;
@@ -1355,6 +1357,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
if (!info->comp_status)
info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len;
+ if (!qp->sq_wrtrk_array[wqe_idx].signaled) {
+ ret_code = -EFAULT;
+ goto exit;
+ }
info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3);
IRDMA_RING_SET_TAIL(qp->sq_ring,
wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta);
@@ -1420,8 +1426,9 @@ exit:
IRDMA_RING_MOVE_TAIL(cq->cq_ring);
if (!cq->avoid_mem_cflct && ext_valid)
IRDMA_RING_MOVE_TAIL(cq->cq_ring);
- set_64bit_val(cq->shadow_area, 0,
- IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
+ if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) & 0x3F || irdma_uk_cq_empty(cq))
+ set_64bit_val(cq->shadow_area, 0,
+ IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
} else {
qword3 &= ~IRDMA_CQ_WQEIDX;
qword3 |= FIELD_PREP(IRDMA_CQ_WQEIDX, pring->tail);
@@ -1574,7 +1581,6 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp,
qp->conn_wqes = move_cnt;
IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->sq_ring, move_cnt);
IRDMA_RING_MOVE_TAIL_BY_COUNT(qp->sq_ring, move_cnt);
- IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->initial_ring, move_cnt);
}
/**
@@ -1719,7 +1725,6 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info)
qp->max_sq_frag_cnt = info->max_sq_frag_cnt;
sq_ring_size = qp->sq_size << info->sq_shift;
IRDMA_RING_INIT(qp->sq_ring, sq_ring_size);
- IRDMA_RING_INIT(qp->initial_ring, sq_ring_size);
if (info->first_sq_wq) {
irdma_setup_connection_wqes(qp, info);
qp->swqe_polarity = 1;
diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h
index ab57f689827a..9eb7fd0b1cbf 100644
--- a/drivers/infiniband/hw/irdma/user.h
+++ b/drivers/infiniband/hw/irdma/user.h
@@ -429,6 +429,7 @@ struct irdma_wqe_uk_ops {
struct irdma_bind_window *op_info);
};
+bool irdma_uk_cq_empty(struct irdma_cq_uk *cq);
int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
struct irdma_cq_poll_info *info);
void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq,
@@ -456,7 +457,6 @@ struct irdma_srq_uk {
struct irdma_uk_attrs *uk_attrs;
__le64 *shadow_area;
struct irdma_ring srq_ring;
- struct irdma_ring initial_ring;
u32 srq_id;
u32 srq_size;
u32 max_srq_frag_cnt;
@@ -465,6 +465,7 @@ struct irdma_srq_uk {
u8 wqe_size;
u8 wqe_size_multiplier;
u8 deferred_flag;
+ spinlock_t *lock;
};
struct irdma_srq_uk_init_info {
@@ -482,7 +483,8 @@ struct irdma_sq_uk_wr_trk_info {
u64 wrid;
u32 wr_len;
u16 quanta;
- u8 reserved[2];
+ u8 signaled;
+ u8 reserved[1];
};
struct irdma_qp_quanta {
diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c
index 8b94d87b0192..cc2a12f735d3 100644
--- a/drivers/infiniband/hw/irdma/utils.c
+++ b/drivers/infiniband/hw/irdma/utils.c
@@ -452,6 +452,7 @@ struct irdma_cqp_request *irdma_alloc_and_get_cqp_request(struct irdma_cqp *cqp,
cqp_request->waiting = wait;
refcount_set(&cqp_request->refcnt, 1);
memset(&cqp_request->compl_info, 0, sizeof(cqp_request->compl_info));
+ memset(&cqp_request->info, 0, sizeof(cqp_request->info));
return cqp_request;
}
@@ -1068,7 +1069,6 @@ int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp)
cqp_info = &cqp_request->info;
qp_info = &cqp_request->info.in.u.qp_create.info;
- memset(qp_info, 0, sizeof(*qp_info));
qp_info->cq_num_valid = true;
qp_info->next_iwarp_state = IRDMA_QP_STATE_RTS;
cqp_info->cqp_cmd = IRDMA_OP_QP_CREATE;
@@ -1343,7 +1343,6 @@ int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp)
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = IRDMA_OP_QP_DESTROY;
cqp_info->post_sq = 1;
cqp_info->in.u.qp_destroy.qp = qp;
@@ -1749,7 +1748,6 @@ int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = IRDMA_OP_STATS_GATHER;
cqp_info->post_sq = 1;
cqp_info->in.u.stats_gather.info = pestat->gather_info;
@@ -1789,7 +1787,6 @@ int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = cmd;
cqp_info->post_sq = 1;
cqp_info->in.u.stats_manage.info = *stats_info;
@@ -1890,7 +1887,6 @@ int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = cmd;
cqp_info->post_sq = 1;
cqp_info->in.u.ws_node.info = *node_info;
@@ -2357,24 +2353,6 @@ void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event)
iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context);
}
-bool irdma_cq_empty(struct irdma_cq *iwcq)
-{
- struct irdma_cq_uk *ukcq;
- u64 qword3;
- __le64 *cqe;
- u8 polarity;
-
- ukcq = &iwcq->sc_cq.cq_uk;
- if (ukcq->avoid_mem_cflct)
- cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq);
- else
- cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq);
- get_64bit_val(cqe, 24, &qword3);
- polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
-
- return polarity != ukcq->polarity;
-}
-
void irdma_remove_cmpls_list(struct irdma_cq *iwcq)
{
struct irdma_cmpl_gen *cmpl_node;
@@ -2436,6 +2414,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
struct irdma_qp_uk *qp = &iwqp->sc_qp.qp_uk;
struct irdma_ring *sq_ring = &qp->sq_ring;
struct irdma_ring *rq_ring = &qp->rq_ring;
+ struct irdma_cq *iwscq = iwqp->iwscq;
+ struct irdma_cq *iwrcq = iwqp->iwrcq;
struct irdma_cmpl_gen *cmpl;
__le64 *sw_wqe;
u64 wqe_qword;
@@ -2443,8 +2423,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
bool compl_generated = false;
unsigned long flags1;
- spin_lock_irqsave(&iwqp->iwscq->lock, flags1);
- if (irdma_cq_empty(iwqp->iwscq)) {
+ spin_lock_irqsave(&iwscq->lock, flags1);
+ if (irdma_uk_cq_empty(&iwscq->sc_cq.cq_uk)) {
unsigned long flags2;
spin_lock_irqsave(&iwqp->lock, flags2);
@@ -2452,7 +2432,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC);
if (!cmpl) {
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
return;
}
@@ -2471,24 +2451,24 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
kfree(cmpl);
continue;
}
- ibdev_dbg(iwqp->iwscq->ibcq.device,
+ ibdev_dbg(iwscq->ibcq.device,
"DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n",
__func__, cmpl->cpi.wr_id, qp->qp_id);
- list_add_tail(&cmpl->list, &iwqp->iwscq->cmpl_generated);
+ list_add_tail(&cmpl->list, &iwscq->cmpl_generated);
compl_generated = true;
}
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
if (compl_generated)
- irdma_comp_handler(iwqp->iwscq);
+ irdma_comp_handler(iwscq);
} else {
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush,
msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS));
}
- spin_lock_irqsave(&iwqp->iwrcq->lock, flags1);
- if (irdma_cq_empty(iwqp->iwrcq)) {
+ spin_lock_irqsave(&iwrcq->lock, flags1);
+ if (irdma_uk_cq_empty(&iwrcq->sc_cq.cq_uk)) {
unsigned long flags2;
spin_lock_irqsave(&iwqp->lock, flags2);
@@ -2496,7 +2476,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC);
if (!cmpl) {
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
return;
}
@@ -2508,20 +2488,20 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ;
/* remove the RQ WR by moving RQ tail */
IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1);
- ibdev_dbg(iwqp->iwrcq->ibcq.device,
+ ibdev_dbg(iwrcq->ibcq.device,
"DEV: %s: adding wr_id = 0x%llx RQ Completion to list qp_id=%d, wqe_idx=%d\n",
__func__, cmpl->cpi.wr_id, qp->qp_id,
wqe_idx);
- list_add_tail(&cmpl->list, &iwqp->iwrcq->cmpl_generated);
+ list_add_tail(&cmpl->list, &iwrcq->cmpl_generated);
compl_generated = true;
}
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
if (compl_generated)
- irdma_comp_handler(iwqp->iwrcq);
+ irdma_comp_handler(iwrcq);
} else {
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush,
msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS));
}
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index c883c9ea5a83..6d9af41a2884 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -27,7 +27,8 @@ static int irdma_query_device(struct ib_device *ibdev,
irdma_fw_minor_ver(&rf->sc_dev);
props->device_cap_flags = IB_DEVICE_MEM_WINDOW |
IB_DEVICE_MEM_MGT_EXTENSIONS;
- props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY;
+ if (hw_attrs->uk_attrs.hw_rev < IRDMA_GEN_3)
+ props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY;
props->vendor_id = pcidev->vendor;
props->vendor_part_id = pcidev->device;
@@ -771,7 +772,6 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp)
cqp_info = &cqp_request->info;
qp_info = &cqp_request->info.in.u.qp_create.info;
- memset(qp_info, 0, sizeof(*qp_info));
qp_info->mac_valid = true;
qp_info->cq_num_valid = true;
qp_info->next_iwarp_state = IRDMA_QP_STATE_IDLE;
@@ -2029,6 +2029,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
struct irdma_pci_f *rf;
struct irdma_cq_buf *cq_buf = NULL;
unsigned long flags;
+ u8 cqe_size;
int ret;
iwdev = to_iwdev(ibcq->device);
@@ -2045,7 +2046,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
return -EINVAL;
if (!iwcq->user_mode) {
- entries++;
+ entries += 2;
if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct &&
dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2)
@@ -2053,6 +2054,10 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
if (entries & 1)
entries += 1; /* cq size must be an even number */
+
+ cqe_size = iwcq->sc_cq.cq_uk.avoid_mem_cflct ? 64 : 32;
+ if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)
+ entries += 2;
}
info.cq_size = max(entries, 4);
@@ -2306,8 +2311,8 @@ static int irdma_setup_kmode_srq(struct irdma_device *iwdev,
ukinfo->srq_size = depth >> shift;
ukinfo->shadow_area = mem->va + ring_size;
- info->shadow_area_pa = info->srq_pa + ring_size;
info->srq_pa = mem->pa;
+ info->shadow_area_pa = info->srq_pa + ring_size;
return 0;
}
@@ -2384,6 +2389,7 @@ static int irdma_create_srq(struct ib_srq *ibsrq,
info.vsi = &iwdev->vsi;
info.pd = &iwpd->sc_pd;
+ iwsrq->sc_srq.srq_uk.lock = &iwsrq->lock;
err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info);
if (err_code)
goto free_dmem;
@@ -2483,6 +2489,7 @@ static int irdma_create_cq(struct ib_cq *ibcq,
int err_code;
int entries = attr->cqe;
bool cqe_64byte_ena;
+ u8 cqe_size;
err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev);
if (err_code)
@@ -2509,6 +2516,7 @@ static int irdma_create_cq(struct ib_cq *ibcq,
ukinfo->cq_id = cq_num;
cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ?
true : false;
+ cqe_size = cqe_64byte_ena ? 64 : 32;
ukinfo->avoid_mem_cflct = cqe_64byte_ena;
iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size;
if (attr->comp_vector < rf->ceqs_count)
@@ -2581,13 +2589,16 @@ static int irdma_create_cq(struct ib_cq *ibcq,
goto cq_free_rsrc;
}
- entries++;
+ entries += 2;
if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2)
entries *= 2;
if (entries & 1)
entries += 1; /* cq size must be an even number */
+ if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)
+ entries += 2;
+
ukinfo->cq_size = entries;
if (cqe_64byte_ena)
@@ -3103,12 +3114,10 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.alloc_stag.info;
- memset(info, 0, sizeof(*info));
info->page_size = PAGE_SIZE;
info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S;
info->pd_id = iwpd->sc_pd.pd_id;
info->total_len = iwmr->len;
- info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY;
info->remote_access = true;
cqp_info->cqp_cmd = IRDMA_OP_ALLOC_STAG;
cqp_info->post_sq = 1;
@@ -3119,7 +3128,7 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev,
if (status)
return status;
- iwmr->is_hwreg = 1;
+ iwmr->is_hwreg = true;
return 0;
}
@@ -3253,7 +3262,6 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
cqp_info = &cqp_request->info;
stag_info = &cqp_info->in.u.mr_reg_non_shared.info;
- memset(stag_info, 0, sizeof(*stag_info));
stag_info->va = iwpbl->user_base;
stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S;
stag_info->stag_key = (u8)iwmr->stag;
@@ -3263,7 +3271,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS)
stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
stag_info->pd_id = iwpd->sc_pd.pd_id;
- stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY;
+ stag_info->all_memory = iwmr->dma_mr;
if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED)
stag_info->addr_type = IRDMA_ADDR_TYPE_ZERO_BASED;
else
@@ -3290,7 +3298,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request);
if (!ret)
- iwmr->is_hwreg = 1;
+ iwmr->is_hwreg = true;
return ret;
}
@@ -3647,7 +3655,6 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr)
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.dealloc_stag.info;
- memset(info, 0, sizeof(*info));
info->pd_id = iwpd->sc_pd.pd_id;
info->stag_idx = ib_mr->rkey >> IRDMA_CQPSQ_STAG_IDX_S;
info->mr = true;
@@ -3663,7 +3670,7 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr)
if (status)
return status;
- iwmr->is_hwreg = 0;
+ iwmr->is_hwreg = false;
return 0;
}
@@ -3786,9 +3793,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
* @size: size of memory to register
* @access: Access rights
* @iova_start: start of virtual address for physical buffers
+ * @dma_mr: Flag indicating whether this region is a PD DMA MR
*/
struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access,
- u64 *iova_start)
+ u64 *iova_start, bool dma_mr)
{
struct irdma_device *iwdev = to_iwdev(pd->device);
struct irdma_pbl *iwpbl;
@@ -3805,6 +3813,7 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access
iwpbl = &iwmr->iwpbl;
iwpbl->iwmr = iwmr;
iwmr->type = IRDMA_MEMREG_TYPE_MEM;
+ iwmr->dma_mr = dma_mr;
iwpbl->user_base = *iova_start;
stag = irdma_create_stag(iwdev);
if (!stag) {
@@ -3843,7 +3852,7 @@ static struct ib_mr *irdma_get_dma_mr(struct ib_pd *pd, int acc)
{
u64 kva = 0;
- return irdma_reg_phys_mr(pd, 0, 0, acc, &kva);
+ return irdma_reg_phys_mr(pd, 0, 0, acc, &kva, true);
}
/**
@@ -4078,7 +4087,7 @@ static int irdma_post_send(struct ib_qp *ibqp,
break;
case IB_WR_LOCAL_INV:
info.op_type = IRDMA_OP_TYPE_INV_STAG;
- info.local_fence = info.read_fence;
+ info.local_fence = true;
info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
err = irdma_uk_stag_local_invalidate(ukqp, &info, true);
break;
@@ -4505,7 +4514,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq,
}
if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
- (!irdma_cq_empty(iwcq) || !list_empty(&iwcq->cmpl_generated)))
+ (!irdma_uk_cq_empty(ukcq) || !list_empty(&iwcq->cmpl_generated)))
ret = 1;
spin_unlock_irqrestore(&iwcq->lock, flags);
@@ -5204,7 +5213,7 @@ static int irdma_create_user_ah(struct ib_ah *ibah,
struct irdma_ah *parent_ah;
int err;
- if (udata && udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN)
+ if (udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN)
return -EINVAL;
err = irdma_setup_ah(ibah, attr);
@@ -5500,7 +5509,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev)
irdma_rt_deinit_hw(iwdev);
if (!iwdev->is_vport) {
irdma_ctrl_deinit_hw(iwdev->rf);
- if (iwdev->rf->vchnl_wq)
+ if (iwdev->rf->vchnl_wq) {
destroy_workqueue(iwdev->rf->vchnl_wq);
+ mutex_destroy(&iwdev->rf->sc_dev.vchnl_mutex);
+ }
}
}
diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h
index ac8b38701835..aabbb3442098 100644
--- a/drivers/infiniband/hw/irdma/verbs.h
+++ b/drivers/infiniband/hw/irdma/verbs.h
@@ -111,7 +111,8 @@ struct irdma_mr {
};
struct ib_umem *region;
int access;
- u8 is_hwreg;
+ bool is_hwreg:1;
+ bool dma_mr:1;
u16 type;
u32 page_cnt;
u64 page_size;
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index 12b481d138cf..03aacd526860 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -591,7 +591,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
int mlx4_ib_cm_init(void)
{
- cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0);
+ cm_wq = alloc_workqueue("mlx4_ib_cm", WQ_PERCPU, 0);
if (!cm_wq)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 8b506417ad2f..d31d7f3005c6 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1225,6 +1225,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(create_flow_table_in, in, other_vport));
MLX5_SET(destroy_flow_table_in, din, vport_number,
MLX5_GET(create_flow_table_in, in, vport_number));
+ MLX5_SET(destroy_flow_table_in, din, other_eswitch,
+ MLX5_GET(create_flow_table_in, in, other_eswitch));
+ MLX5_SET(destroy_flow_table_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(create_flow_table_in, in,
+ eswitch_owner_vhca_id));
MLX5_SET(destroy_flow_table_in, din, table_type,
MLX5_GET(create_flow_table_in, in, table_type));
MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id);
@@ -1237,6 +1242,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(create_flow_group_in, in, other_vport));
MLX5_SET(destroy_flow_group_in, din, vport_number,
MLX5_GET(create_flow_group_in, in, vport_number));
+ MLX5_SET(destroy_flow_group_in, din, other_eswitch,
+ MLX5_GET(create_flow_group_in, in, other_eswitch));
+ MLX5_SET(destroy_flow_group_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(create_flow_group_in, in,
+ eswitch_owner_vhca_id));
MLX5_SET(destroy_flow_group_in, din, table_type,
MLX5_GET(create_flow_group_in, in, table_type));
MLX5_SET(destroy_flow_group_in, din, table_id,
@@ -1251,6 +1261,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(set_fte_in, in, other_vport));
MLX5_SET(delete_fte_in, din, vport_number,
MLX5_GET(set_fte_in, in, vport_number));
+ MLX5_SET(delete_fte_in, din, other_eswitch,
+ MLX5_GET(set_fte_in, in, other_eswitch));
+ MLX5_SET(delete_fte_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(set_fte_in, in, eswitch_owner_vhca_id));
MLX5_SET(delete_fte_in, din, table_type,
MLX5_GET(set_fte_in, in, table_type));
MLX5_SET(delete_fte_in, din, table_id,
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index b0f7663c24c1..d17823ce7f38 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -691,22 +691,13 @@ static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device)
return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed);
}
-static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
- struct mlx5_flow_namespace *ns,
+static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
struct mlx5_ib_flow_prio *prio,
- int priority,
- int num_entries, int num_groups,
- u32 flags, u16 vport)
+ struct mlx5_flow_table_attr *ft_attr)
{
- struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_table *ft;
- ft_attr.prio = priority;
- ft_attr.max_fte = num_entries;
- ft_attr.flags = flags;
- ft_attr.vport = vport;
- ft_attr.autogroup.max_num_groups = num_groups;
- ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+ ft = mlx5_create_auto_grouped_flow_table(ns, ft_attr);
if (IS_ERR(ft))
return ERR_CAST(ft);
@@ -720,6 +711,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
enum flow_table_type ft_type)
{
bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
+ struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_namespace *ns = NULL;
enum mlx5_flow_namespace_type fn_type;
struct mlx5_ib_flow_prio *prio;
@@ -797,11 +789,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
max_table_size = min_t(int, num_entries, max_table_size);
ft = prio->flow_table;
- if (!ft)
- return _get_prio(dev, ns, prio, priority, max_table_size,
- num_groups, flags, 0);
+ if (ft)
+ return prio;
- return prio;
+ ft_attr.prio = priority;
+ ft_attr.max_fte = max_table_size;
+ ft_attr.flags = flags;
+ ft_attr.autogroup.max_num_groups = num_groups;
+ return _get_prio(ns, prio, &ft_attr);
}
enum {
@@ -950,6 +945,7 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev,
enum mlx5_ib_optional_counter_type type)
{
enum mlx5_ib_optional_counter_type per_qp_type;
+ struct mlx5_flow_table_attr ft_attr = {};
enum mlx5_flow_namespace_type fn_type;
struct mlx5_flow_namespace *ns;
struct mlx5_ib_flow_prio *prio;
@@ -1003,7 +999,10 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev,
if (prio->flow_table)
return 0;
- prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = MLX5_FS_MAX_POOL_SIZE;
+ ft_attr.autogroup.max_num_groups = 1;
+ prio = _get_prio(ns, prio, &ft_attr);
if (IS_ERR(prio))
return PTR_ERR(prio);
@@ -1223,6 +1222,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
struct mlx5_ib_op_fc *opfc,
enum mlx5_ib_optional_counter_type type)
{
+ struct mlx5_flow_table_attr ft_attr = {};
enum mlx5_flow_namespace_type fn_type;
int priority, i, err, spec_num;
struct mlx5_flow_act flow_act = {};
@@ -1304,8 +1304,10 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
if (err)
goto free;
- prio = _get_prio(dev, ns, prio, priority,
- dev->num_ports * MAX_OPFC_RULES, 1, 0, 0);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = dev->num_ports * MAX_OPFC_RULES;
+ ft_attr.autogroup.max_num_groups = 1;
+ prio = _get_prio(ns, prio, &ft_attr);
if (IS_ERR(prio)) {
err = PTR_ERR(prio);
goto put_prio;
@@ -1872,7 +1874,7 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev,
u32 *flags, u16 *vport_idx,
u16 *vport,
struct mlx5_core_dev **ft_mdev,
- u32 ib_port)
+ u32 ib_port, u16 *esw_owner_vhca_id)
{
struct mlx5_core_dev *esw_mdev;
@@ -1886,8 +1888,13 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev,
return -EINVAL;
esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw);
- if (esw_mdev != dev->mdev)
- return -EOPNOTSUPP;
+ if (esw_mdev != dev->mdev) {
+ if (!MLX5_CAP_ADV_RDMA(dev->mdev,
+ rdma_transport_manager_other_eswitch))
+ return -EOPNOTSUPP;
+ *flags |= MLX5_FLOW_TABLE_OTHER_ESWITCH;
+ *esw_owner_vhca_id = MLX5_CAP_GEN(esw_mdev, vhca_id);
+ }
*flags |= MLX5_FLOW_TABLE_OTHER_VPORT;
*ft_mdev = esw_mdev;
@@ -1903,8 +1910,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
bool mcast, u32 ib_port)
{
struct mlx5_core_dev *ft_mdev = dev->mdev;
+ struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_namespace *ns = NULL;
struct mlx5_ib_flow_prio *prio = NULL;
+ u16 esw_owner_vhca_id = 0;
int max_table_size = 0;
u16 vport_idx = 0;
bool esw_encap;
@@ -1966,7 +1975,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
return ERR_PTR(-EINVAL);
ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags,
&vport_idx, &vport,
- &ft_mdev, ib_port);
+ &ft_mdev, ib_port,
+ &esw_owner_vhca_id);
if (ret)
return ERR_PTR(ret);
@@ -2026,8 +2036,13 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
if (prio->flow_table)
return prio;
- return _get_prio(dev, ns, prio, priority, max_table_size,
- MLX5_FS_MAX_TYPES, flags, vport);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = max_table_size;
+ ft_attr.flags = flags;
+ ft_attr.vport = vport;
+ ft_attr.esw_owner_vhca_id = esw_owner_vhca_id;
+ ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES;
+ return _get_prio(ns, prio, &ft_attr);
}
static struct mlx5_ib_flow_handler *
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index cc8859d3c2f5..bbecca405171 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -44,6 +44,63 @@ static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports)
}
}
+static int mlx5_ib_set_owner_transport(struct mlx5_core_dev *cur_owner,
+ struct mlx5_core_dev *new_owner)
+{
+ int ret;
+
+ if (!MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(cur_owner, ft_support) ||
+ !MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(cur_owner, ft_support))
+ return 0;
+
+ if (!MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager) ||
+ !MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager_other_eswitch))
+ return 0;
+
+ ret = mlx5_fs_set_root_dev(cur_owner, new_owner,
+ FS_FT_RDMA_TRANSPORT_TX);
+ if (ret)
+ return ret;
+
+ ret = mlx5_fs_set_root_dev(cur_owner, new_owner,
+ FS_FT_RDMA_TRANSPORT_RX);
+ if (ret) {
+ mlx5_fs_set_root_dev(cur_owner, cur_owner,
+ FS_FT_RDMA_TRANSPORT_TX);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mlx5_ib_release_transport(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *peer_dev;
+ int i, ret;
+
+ mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
+ ret = mlx5_ib_set_owner_transport(peer_dev, peer_dev);
+ WARN_ON_ONCE(ret);
+ }
+}
+
+static int mlx5_ib_take_transport(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *peer_dev;
+ int ret;
+ int i;
+
+ mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
+ ret = mlx5_ib_set_owner_transport(peer_dev, dev);
+ if (ret) {
+ mlx5_ib_release_transport(dev);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static int
mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
@@ -88,10 +145,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
else
return mlx5_ib_set_vport_rep(lag_master, rep, vport_index);
+ if (mlx5_lag_is_shared_fdb(dev)) {
+ ret = mlx5_ib_take_transport(lag_master);
+ if (ret)
+ return ret;
+ }
+
ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
mlx5_core_net(lag_master));
- if (!ibdev)
- return -ENOMEM;
+ if (!ibdev) {
+ ret = -ENOMEM;
+ goto release_transport;
+ }
ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port),
GFP_KERNEL);
@@ -127,6 +192,10 @@ fail_add:
kfree(ibdev->port);
fail_port:
ib_dealloc_device(&ibdev->ib_dev);
+release_transport:
+ if (mlx5_lag_is_shared_fdb(lag_master))
+ mlx5_ib_release_transport(lag_master);
+
return ret;
}
@@ -182,6 +251,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
esw = peer_mdev->priv.eswitch;
mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
}
+ mlx5_ib_release_transport(mdev);
}
__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 90daa58126f4..40284bbb45d6 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -511,6 +511,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_XDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_1600TAUI_8_1600TBASE_CR8_KR8):
+ *active_width = IB_WIDTH_8X;
+ *active_speed = IB_SPEED_XDR;
+ break;
default:
return -EINVAL;
}
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 0e8ae85af5a6..e71ee3d52eb0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -97,33 +97,28 @@ struct mlx5_pagefault {
* a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000
-#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
-#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
-#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
-#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
-#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
-
-#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
-
static u64 mlx5_imr_ksm_entries;
+static u64 mlx5_imr_mtt_entries;
+static u64 mlx5_imr_mtt_size;
+static u8 mlx5_imr_mtt_shift;
+static u8 mlx5_imr_ksm_page_shift;
-static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries,
struct mlx5_ib_mr *imr, int flags)
{
struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
- struct mlx5_klm *end = pklm + nentries;
- int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+ struct mlx5_ksm *end = pksm + nentries;
+ u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0;
__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
cpu_to_be32(imr->null_mmkey.key) :
mr_to_mdev(imr)->mkeys.null_mkey;
u64 va =
- MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
+ MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0;
if (flags & MLX5_IB_UPD_XLT_ZAP) {
- for (; pklm != end; pklm++, idx++, va += step) {
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- pklm->key = key;
- pklm->va = cpu_to_be64(va);
+ for (; pksm != end; pksm++, idx++, va += step) {
+ pksm->key = key;
+ pksm->va = cpu_to_be64(va);
}
return;
}
@@ -147,16 +142,15 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
*/
lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
- for (; pklm != end; pklm++, idx++, va += step) {
+ for (; pksm != end; pksm++, idx++, va += step) {
struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
if (mtt) {
- pklm->key = cpu_to_be32(mtt->ibmr.lkey);
- pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
+ pksm->key = cpu_to_be32(mtt->ibmr.lkey);
+ pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size);
} else {
- pklm->key = key;
- pklm->va = cpu_to_be64(va);
+ pksm->key = key;
+ pksm->va = cpu_to_be64(va);
}
}
}
@@ -201,7 +195,7 @@ int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
- populate_klm(xlt, idx, nentries, mr, flags);
+ populate_ksm(xlt, idx, nentries, mr, flags);
return 0;
} else {
return populate_mtt(xlt, idx, nentries, mr, flags);
@@ -226,7 +220,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
mutex_lock(&odp_imr->umem_mutex);
mlx5r_umr_update_xlt(mr->parent,
- ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
+ ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
mutex_unlock(&odp_imr->umem_mutex);
mlx5_ib_dereg_mr(&mr->ibmr, NULL);
@@ -237,7 +231,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift;
struct mlx5_ib_mr *imr = mr->parent;
/*
@@ -265,7 +259,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
- queue_work(system_unbound_wq, &mr->odp_destroy.work);
+ queue_work(system_dfl_wq, &mr->odp_destroy.work);
}
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -425,7 +419,10 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
MLX5_CAP_GEN(dev->mdev, null_mkey) &&
MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
- !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
+ !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) &&
+ mlx5_imr_ksm_entries != 0 &&
+ !(mlx5_imr_ksm_page_shift >
+ get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM)))
caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}
@@ -476,14 +473,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
int err;
odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
- idx * MLX5_IMR_MTT_SIZE,
- MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+ idx * mlx5_imr_mtt_size,
+ mlx5_imr_mtt_size, &mlx5_mn_ops);
if (IS_ERR(odp))
return ERR_CAST(odp);
mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
MLX5_MKC_ACCESS_MODE_MTT,
- MLX5_IMR_MTT_ENTRIES);
+ mlx5_imr_mtt_entries);
if (IS_ERR(mr)) {
ib_umem_odp_release(odp);
return mr;
@@ -495,7 +492,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
mr->umem = &odp->umem;
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
- mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
+ mr->ibmr.iova = idx * mlx5_imr_mtt_size;
mr->parent = imr;
odp->private = mr;
@@ -506,7 +503,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_set(&mr->mmkey.usecount, 2);
err = mlx5r_umr_update_xlt(mr, 0,
- MLX5_IMR_MTT_ENTRIES,
+ mlx5_imr_mtt_entries,
PAGE_SHIFT,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ENABLE);
@@ -611,7 +608,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct mlx5_ib_mr *imr;
int err;
- if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
+ if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE))
return ERR_PTR(-EOPNOTSUPP);
umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
@@ -647,7 +644,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
err = mlx5r_umr_update_xlt(imr, 0,
mlx5_imr_ksm_entries,
- MLX5_KSM_PAGE_SHIFT,
+ mlx5_imr_ksm_page_shift,
MLX5_IB_UPD_XLT_INDIRECT |
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ENABLE);
@@ -750,20 +747,20 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
struct ib_umem_odp *odp_imr, u64 user_va,
size_t bcnt, u32 *bytes_mapped, u32 flags)
{
- unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift;
unsigned long upd_start_idx = end_idx + 1;
unsigned long upd_len = 0;
unsigned long npages = 0;
int err;
int ret;
- if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
- mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+ if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size ||
+ mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt))
return -EFAULT;
/* Fault each child mr that intersects with our interval. */
while (bcnt) {
- unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+ unsigned long idx = user_va >> mlx5_imr_mtt_shift;
struct ib_umem_odp *umem_odp;
struct mlx5_ib_mr *mtt;
u64 len;
@@ -1924,9 +1921,25 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
int mlx5_ib_odp_init(void)
{
+ u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT;
+ u8 mlx5_imr_mtt_bits;
+
+ /* 48 is default ARM64 VA space and covers X86 4-level paging which is 47 */
+ if (log_va_pages <= 48 - PAGE_SHIFT)
+ mlx5_imr_mtt_shift = 30;
+ /* 56 is x86-64, 5-level paging */
+ else if (log_va_pages <= 56 - PAGE_SHIFT)
+ mlx5_imr_mtt_shift = 34;
+ else
+ return 0;
+
+ mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift);
+ mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT;
+ mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits);
mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
- MLX5_IMR_MTT_BITS);
+ mlx5_imr_mtt_bits);
+ mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift;
return 0;
}
@@ -2093,6 +2106,6 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
destroy_prefetch_work(work);
return rc;
}
- queue_work(system_unbound_wq, &work->work);
+ queue_work(system_dfl_wq, &work->work);
return 0;
}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 88724d15705d..69af20790481 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3451,10 +3451,11 @@ int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
{
u32 stat_rate_support;
- if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
+ if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS ||
+ rate == IB_RATE_1600_GBPS)
return 0;
- if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS)
+ if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_1600_GBPS)
return -EINVAL;
stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support);
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index 0ca2743f1075..e7835ca70e2b 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -518,7 +518,8 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
*/
int rvt_driver_cq_init(void)
{
- comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ comp_vector_wq = alloc_workqueue("%s",
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_PERCPU,
0, "rdmavt_cq");
if (!comp_vector_wq)
return -ENOMEM;
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index bcb97b3ea58a..b1df05238848 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -452,7 +452,6 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng
length -= bytes;
iova += bytes;
- page_offset = 0;
}
return 0;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index ac0183a2ff7a..0195d361e5e3 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -20,6 +20,54 @@
static struct rxe_recv_sockets recv_sockets;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can detect false positive circular dependencies
+ * when there are user-space socket API users or in kernel
+ * users switching between a tcp and rdma transport.
+ * Maybe also switching between siw and rxe may cause
+ * problems as per default sockets are only classified
+ * by family and not by ip protocol. And there might
+ * be different locks used between the application
+ * and the low level sockets.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko,
+ * switching transports, use git blame to find
+ * more details.
+ */
+static struct lock_class_key rxe_recv_sk_key[2];
+static struct lock_class_key rxe_recv_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void rxe_reclassify_recv_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-RXE-RECV",
+ &rxe_recv_slock_key[0],
+ "sk_lock-AF_INET-RDMA-RXE-RECV",
+ &rxe_recv_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-RXE-RECV",
+ &rxe_recv_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-RXE-RECV",
+ &rxe_recv_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
struct net_device *ndev,
struct in_addr *saddr,
@@ -192,6 +240,7 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
err = udp_sock_create(net, &udp_cfg, &sock);
if (err < 0)
return ERR_PTR(err);
+ rxe_reclassify_recv_socket(sock);
tnl_cfg.encap_type = 1;
tnl_cfg.encap_rcv = rxe_udp_encap_recv;
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index f58e3ec6252f..ae71812bea82 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -358,7 +358,6 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
length -= bytes;
iova += bytes;
- page_offset = 0;
}
mutex_unlock(&umem_odp->umem_mutex);
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 95f1c1c2949d..845bdd03ca28 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -15,6 +15,54 @@
#include "rxe_queue.h"
#include "rxe_task.h"
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can detect false positive circular dependencies
+ * when there are user-space socket API users or in kernel
+ * users switching between a tcp and rdma transport.
+ * Maybe also switching between siw and rxe may cause
+ * problems as per default sockets are only classified
+ * by family and not by ip protocol. And there might
+ * be different locks used between the application
+ * and the low level sockets.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko,
+ * switching transports, use git blame to find
+ * more details.
+ */
+static struct lock_class_key rxe_send_sk_key[2];
+static struct lock_class_key rxe_send_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void rxe_reclassify_send_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-RXE-SEND",
+ &rxe_send_slock_key[0],
+ "sk_lock-AF_INET-RDMA-RXE-SEND",
+ &rxe_send_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-RXE-SEND",
+ &rxe_send_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-RXE-SEND",
+ &rxe_send_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
int has_srq)
{
@@ -244,6 +292,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
if (err < 0)
return err;
+ rxe_reclassify_send_socket(qp->sk);
qp->sk->sk->sk_user_data = qp;
/* pick a source UDP port number for this QP based on
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
index 3661cb627d28..2a234f26ac10 100644
--- a/drivers/infiniband/sw/rxe/rxe_srq.c
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -171,7 +171,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
udata, mi, &srq->rq.producer_lock,
&srq->rq.consumer_lock);
if (err)
- goto err_free;
+ return err;
srq->rq.max_wr = attr->max_wr;
}
@@ -180,11 +180,6 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
srq->limit = attr->srq_limit;
return 0;
-
-err_free:
- rxe_queue_cleanup(q);
- srq->rq.queue = NULL;
- return err;
}
void rxe_srq_cleanup(struct rxe_pool_elem *elem)
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index eb0bd4f79a85..1d3de8209bfa 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -39,6 +39,55 @@ static void siw_cm_llp_error_report(struct sock *s);
static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
int status);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can detect false positive circular dependencies
+ * when there are user-space socket API users or in kernel
+ * users switching between a tcp and rdma transport.
+ * Maybe also switching between siw and rxe may cause
+ * problems as per default sockets are only classified
+ * by family and not by ip protocol. And there might
+ * be different locks used between the application
+ * and the low level sockets.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko,
+ * switching transports, use git blame to find
+ * more details.
+ */
+static struct lock_class_key siw_sk_key[2];
+static struct lock_class_key siw_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void siw_reclassify_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-SIW",
+ &siw_slock_key[0],
+ "sk_lock-AF_INET-RDMA-SIW",
+ &siw_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-SIW",
+ &siw_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-SIW",
+ &siw_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static void siw_sk_assign_cm_upcalls(struct sock *sk)
{
struct siw_cep *cep = sk_to_cep(sk);
@@ -1394,6 +1443,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
if (rv < 0)
goto error;
+ siw_reclassify_socket(s);
/*
* NOTE: For simplification, connect() is called in blocking
@@ -1770,6 +1820,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
if (rv < 0)
return rv;
+ siw_reclassify_socket(s);
/*
* Allow binding local port when still in TIME_WAIT from last close.
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 2e3c0516ce8f..dc531fad73de 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -1029,7 +1029,7 @@ static int __init iser_init(void)
mutex_init(&ig.connlist_mutex);
INIT_LIST_HEAD(&ig.connlist);
- release_wq = alloc_workqueue("release workqueue", 0, 0);
+ release_wq = alloc_workqueue("release workqueue", WQ_PERCPU, 0);
if (!release_wq) {
iser_err("failed to allocate release workqueue\n");
err = -ENOMEM;
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 42977a5326ee..af811d060cc8 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -2613,7 +2613,7 @@ static struct iscsit_transport iser_target_transport = {
static int __init isert_init(void)
{
- isert_login_wq = alloc_workqueue("isert_login_wq", 0, 0);
+ isert_login_wq = alloc_workqueue("isert_login_wq", WQ_PERCPU, 0);
if (!isert_login_wq) {
isert_err("Unable to allocate isert_login_wq\n");
return -ENOMEM;
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index ef4abdea3c2d..9ecc6343455d 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -1450,7 +1450,7 @@ err_free_chunks:
kfree(srv->chunks);
err_free_srv:
- kfree(srv);
+ put_device(&srv->dev);
return ERR_PTR(-ENOMEM);
}
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 70d29b14d851..99095645134f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -40,12 +40,13 @@ config IOMMU_IO_PGTABLE_LPAE
sizes at both stage-1 and stage-2, as well as address spaces
up to 48-bits in size.
-config IOMMU_IO_PGTABLE_LPAE_SELFTEST
- bool "LPAE selftests"
- depends on IOMMU_IO_PGTABLE_LPAE
+config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST
+ tristate "KUnit tests for LPAE"
+ depends on IOMMU_IO_PGTABLE_LPAE && KUNIT
+ default KUNIT_ALL_TESTS
help
- Enable self-tests for LPAE page table allocator. This performs
- a series of page-table consistency checks during boot.
+ Enable kunit tests for LPAE page table allocator. This performs
+ a series of page-table consistency checks.
If unsure, say N here.
@@ -247,7 +248,7 @@ config SUN50I_IOMMU
config TEGRA_IOMMU_SMMU
bool "NVIDIA Tegra SMMU Support"
- depends on ARCH_TEGRA
+ depends on ARCH_TEGRA || COMPILE_TEST
depends on TEGRA_AHB
depends on TEGRA_MC
select IOMMU_API
@@ -384,3 +385,5 @@ config SPRD_IOMMU
Say Y here if you want to use the multimedia devices listed above.
endif # IOMMU_SUPPORT
+
+source "drivers/iommu/generic_pt/Kconfig"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 355294fa9033..8e8843316c4b 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -3,6 +3,7 @@ obj-y += arm/ iommufd/
obj-$(CONFIG_AMD_IOMMU) += amd/
obj-$(CONFIG_INTEL_IOMMU) += intel/
obj-$(CONFIG_RISCV_IOMMU) += riscv/
+obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
obj-$(CONFIG_IOMMU_API) += iommu.o
obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
obj-$(CONFIG_IOMMU_API) += iommu-traces.o
@@ -12,6 +13,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
obj-$(CONFIG_IOMMU_IOVA) += iova.o
obj-$(CONFIG_OF_IOMMU) += of_iommu.o
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index ecef69c11144..f2acf471cb5d 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -11,10 +11,13 @@ config AMD_IOMMU
select MMU_NOTIFIER
select IOMMU_API
select IOMMU_IOVA
- select IOMMU_IO_PGTABLE
select IOMMU_SVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_AMDV1
+ select IOMMU_PT_X86_64
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 59c04a67f398..5412a563c697 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
+obj-y += iommu.o init.o quirks.o ppr.o pasid.o
obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 9b4b589a54b5..25044d28f28a 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag);
* the IOMMU used by this driver.
*/
void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
void amd_iommu_domain_flush_pages(struct protection_domain *domain,
u64 address, size_t size);
void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index a698a2e7ce2a..78b1c44bd6b5 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -18,7 +18,7 @@
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/irqreturn.h>
-#include <linux/io-pgtable.h>
+#include <linux/generic_pt/iommu.h>
/*
* Maximum number of IOMMUs supported
@@ -247,6 +247,10 @@
#define CMD_BUFFER_ENTRIES 512
#define MMIO_CMD_SIZE_SHIFT 56
#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */
+#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x))
+#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */
+#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x))
/* constants for event buffer handling */
#define EVT_BUFFER_SIZE 8192 /* 512 entries */
@@ -337,76 +341,7 @@
#define GUEST_PGTABLE_4_LEVEL 0x00
#define GUEST_PGTABLE_5_LEVEL 0x01
-#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
- ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
- (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k 0
#define PM_ADDR_MASK 0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
- (~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
- ((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
- (1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
- ((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address and a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize) \
- (((address) | ((pagesize) - 1)) & \
- (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
- (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-/*
- * Takes a page-table level and returns the default page-size for this level
- */
-#define PTE_LEVEL_PAGE_SIZE(level) \
- (1ULL << (12 + (9 * (level))))
-
-/*
- * The IOPTE dirty bit
- */
-#define IOMMU_PTE_HD_BIT (6)
-
-/*
- * Bit value definition for I/O PTE fields
- */
-#define IOMMU_PTE_PR BIT_ULL(0)
-#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
-#define IOMMU_PTE_U BIT_ULL(59)
-#define IOMMU_PTE_FC BIT_ULL(60)
-#define IOMMU_PTE_IR BIT_ULL(61)
-#define IOMMU_PTE_IW BIT_ULL(62)
/*
* Bit value definition for DTE fields
@@ -436,12 +371,6 @@
/* DTE[128:179] | DTE[184:191] */
#define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52)
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
-#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
-#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
#define IOMMU_PROT_MASK 0x03
#define IOMMU_PROT_IR 0x01
#define IOMMU_PROT_IW 0x02
@@ -534,19 +463,6 @@ struct amd_irte_ops;
#define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0)
-#define io_pgtable_to_data(x) \
- container_of((x), struct amd_io_pgtable, pgtbl)
-
-#define io_pgtable_ops_to_data(x) \
- io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
-
-#define io_pgtable_ops_to_domain(x) \
- container_of(io_pgtable_ops_to_data(x), \
- struct protection_domain, iop)
-
-#define io_pgtable_cfg_to_data(x) \
- container_of((x), struct amd_io_pgtable, pgtbl.cfg)
-
struct gcr3_tbl_info {
u64 *gcr3_tbl; /* Guest CR3 table */
int glx; /* Number of levels for GCR3 table */
@@ -554,14 +470,6 @@ struct gcr3_tbl_info {
u16 domid; /* Per device domain ID */
};
-struct amd_io_pgtable {
- seqcount_t seqcount; /* Protects root/mode update */
- struct io_pgtable pgtbl;
- int mode;
- u64 *root;
- u64 *pgd; /* v2 pgtable pgd pointer */
-};
-
enum protection_domain_mode {
PD_MODE_NONE,
PD_MODE_V1,
@@ -589,10 +497,13 @@ struct pdom_iommu_info {
* independent of their use.
*/
struct protection_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ struct pt_iommu_x86_64 amdv2;
+ };
struct list_head dev_list; /* List of all devices in this domain */
- struct iommu_domain domain; /* generic domain handle used by
- iommu core code */
- struct amd_io_pgtable iop;
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
enum protection_domain_mode pd_mode; /* Track page table type */
@@ -602,6 +513,9 @@ struct protection_domain {
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
};
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain);
/*
* This structure contains information about one PCI segment in the system.
diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c
index 10fa217a7119..20b04996441d 100644
--- a/drivers/iommu/amd/debugfs.c
+++ b/drivers/iommu/amd/debugfs.c
@@ -37,7 +37,7 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf,
if (ret)
return ret;
- if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - 4) {
+ if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) {
iommu->dbg_mmio_offset = -1;
return -EINVAL;
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index f2991c11867c..4f4d4955269e 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1710,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id,
list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list);
if (alloc_dev_table(pci_seg))
- return NULL;
+ goto err_free_pci_seg;
if (alloc_alias_table(pci_seg))
- return NULL;
+ goto err_free_dev_table;
if (alloc_rlookup_table(pci_seg))
- return NULL;
+ goto err_free_alias_table;
return pci_seg;
+
+err_free_alias_table:
+ free_alias_table(pci_seg);
+err_free_dev_table:
+ free_dev_table(pci_seg);
+err_free_pci_seg:
+ list_del(&pci_seg->list);
+ kfree(pci_seg);
+ return NULL;
}
static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id,
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
deleted file mode 100644
index 70c2f5b1631b..000000000000
--- a/drivers/iommu/amd/io_pgtable.c
+++ /dev/null
@@ -1,577 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table allocator.
- *
- * Copyright (C) 2020 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/dma-mapping.h>
-#include <linux/seqlock.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-/*
- * Helper function to get the first pte of a large mapping
- */
-static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
- unsigned long *count)
-{
- unsigned long pte_mask, pg_size, cnt;
- u64 *fpte;
-
- pg_size = PTE_PAGE_SIZE(*pte);
- cnt = PAGE_SIZE_PTE_COUNT(pg_size);
- pte_mask = ~((cnt << 3) - 1);
- fpte = (u64 *)(((unsigned long)pte) & pte_mask);
-
- if (page_size)
- *page_size = pg_size;
-
- if (count)
- *count = cnt;
-
- return fpte;
-}
-
-static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < 512; ++i) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- /* Large PTE? */
- if (PM_PTE_LEVEL(pt[i]) == 0 ||
- PM_PTE_LEVEL(pt[i]) == 7)
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = IOMMU_PTE_PAGE(pt[i]);
- if (lvl > 2)
- free_pt_lvl(p, freelist, lvl - 1);
- else
- iommu_pages_list_add(freelist, p);
- }
-
- iommu_pages_list_add(freelist, pt);
-}
-
-static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist)
-{
- switch (mode) {
- case PAGE_MODE_NONE:
- case PAGE_MODE_7_LEVEL:
- break;
- case PAGE_MODE_1_LEVEL:
- iommu_pages_list_add(freelist, root);
- break;
- case PAGE_MODE_2_LEVEL:
- case PAGE_MODE_3_LEVEL:
- case PAGE_MODE_4_LEVEL:
- case PAGE_MODE_5_LEVEL:
- case PAGE_MODE_6_LEVEL:
- free_pt_lvl(root, freelist, mode);
- break;
- default:
- BUG();
- }
-}
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned int page_size_level,
- gfp_t gfp)
-{
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- struct protection_domain *domain =
- container_of(pgtable, struct protection_domain, iop);
- unsigned long flags;
- bool ret = true;
- u64 *pte;
-
- pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K);
- if (!pte)
- return false;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
- pgtable->mode - 1 >= page_size_level)
- goto out;
-
- ret = false;
- if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level))
- goto out;
-
- *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));
-
- write_seqcount_begin(&pgtable->seqcount);
- pgtable->root = pte;
- pgtable->mode += 1;
- write_seqcount_end(&pgtable->seqcount);
-
- amd_iommu_update_and_flush_device_table(domain);
-
- pte = NULL;
- ret = true;
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
- iommu_free_pages(pte);
-
- return ret;
-}
-
-static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned long page_size,
- u64 **pte_page,
- gfp_t gfp,
- bool *updated)
-{
- unsigned long last_addr = address + (page_size - 1);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- unsigned int seqcount;
- int level, end_lvl;
- u64 *pte, *page;
-
- BUG_ON(!is_power_of_2(page_size));
-
- while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
- pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
- /*
- * Return an error if there is no memory to update the
- * page-table.
- */
- if (!increase_address_space(pgtable, last_addr,
- PAGE_SIZE_LEVEL(page_size), gfp))
- return NULL;
- }
-
-
- do {
- seqcount = read_seqcount_begin(&pgtable->seqcount);
-
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
- } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
-
- address = PAGE_SIZE_ALIGN(address, page_size);
- end_lvl = PAGE_SIZE_LEVEL(page_size);
-
- while (level > end_lvl) {
- u64 __pte, __npte;
- int pte_level;
-
- __pte = *pte;
- pte_level = PM_PTE_LEVEL(__pte);
-
- /*
- * If we replace a series of large PTEs, we need
- * to tear down all of them.
- */
- if (IOMMU_PTE_PRESENT(__pte) &&
- pte_level == PAGE_MODE_7_LEVEL) {
- unsigned long count, i;
- u64 *lpte;
-
- lpte = first_pte_l7(pte, NULL, &count);
-
- /*
- * Unmap the replicated PTEs that still match the
- * original large mapping
- */
- for (i = 0; i < count; ++i)
- cmpxchg64(&lpte[i], __pte, 0ULL);
-
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte) ||
- pte_level == PAGE_MODE_NONE) {
- page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp,
- SZ_4K);
-
- if (!page)
- return NULL;
-
- __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
-
- /* pte could have been changed somewhere. */
- if (!try_cmpxchg64(pte, &__pte, __npte))
- iommu_free_pages(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- /* No level skipping support yet */
- if (pte_level != level)
- return NULL;
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(__pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned long *page_size)
-{
- int level;
- unsigned int seqcount;
- u64 *pte;
-
- *page_size = 0;
-
- if (address > PM_LEVEL_SIZE(pgtable->mode))
- return NULL;
-
- do {
- seqcount = read_seqcount_begin(&pgtable->seqcount);
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
- } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
-
- while (level > 0) {
-
- /* Not Present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Large PTE */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
- PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
- break;
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- /* Walk to the next level */
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
- }
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
- pte = first_pte_l7(pte, page_size, NULL);
-
- return pte;
-}
-
-static void free_clear_pte(u64 *pte, u64 pteval,
- struct iommu_pages_list *freelist)
-{
- u64 *pt;
- int mode;
-
- while (!try_cmpxchg64(pte, &pteval, 0))
- pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
-
- if (!IOMMU_PTE_PRESENT(pteval))
- return;
-
- pt = IOMMU_PTE_PAGE(pteval);
- mode = IOMMU_PTE_MODE(pteval);
-
- free_sub_pt(pt, mode, freelist);
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
- bool updated = false;
- u64 __pte, *pte;
- int ret, i, count;
- size_t size = pgcount << __ffs(pgsize);
- unsigned long o_iova = iova;
-
- BUG_ON(!IS_ALIGNED(iova, pgsize));
- BUG_ON(!IS_ALIGNED(paddr, pgsize));
-
- ret = -EINVAL;
- if (!(prot & IOMMU_PROT_MASK))
- goto out;
-
- while (pgcount > 0) {
- count = PAGE_SIZE_PTE_COUNT(pgsize);
- pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);
-
- ret = -ENOMEM;
- if (!pte)
- goto out;
-
- for (i = 0; i < count; ++i)
- free_clear_pte(&pte[i], pte[i], &freelist);
-
- if (!iommu_pages_list_empty(&freelist))
- updated = true;
-
- if (count > 1) {
- __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
- } else
- __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- for (i = 0; i < count; ++i)
- pte[i] = __pte;
-
- iova += pgsize;
- paddr += pgsize;
- pgcount--;
- if (mapped)
- *mapped += pgsize;
- }
-
- ret = 0;
-
-out:
- if (updated) {
- struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- /*
- * Flush domain TLB(s) and wait for completion. Any Device-Table
- * Updates and flushing already happened in
- * increase_address_space().
- */
- amd_iommu_domain_flush_pages(dom, o_iova, size);
- spin_unlock_irqrestore(&dom->lock, flags);
- }
-
- /* Everything flushed out, free pages now */
- iommu_put_pages_list(&freelist);
-
- return ret;
-}
-
-static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long long unmapped;
- unsigned long unmap_size;
- u64 *pte;
- size_t size = pgcount << __ffs(pgsize);
-
- BUG_ON(!is_power_of_2(pgsize));
-
- unmapped = 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (pte) {
- int i, count;
-
- count = PAGE_SIZE_PTE_COUNT(unmap_size);
- for (i = 0; i < count; i++)
- pte[i] = 0ULL;
- } else {
- return unmapped;
- }
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
- unsigned long flags)
-{
- bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
- bool dirty = false;
- int i, count;
-
- /*
- * 2.2.3.2 Host Dirty Support
- * When a non-default page size is used , software must OR the
- * Dirty bits in all of the replicated host PTEs used to map
- * the page. The IOMMU does not guarantee the Dirty bits are
- * set in all of the replicated PTEs. Any portion of the page
- * may have been written even if the Dirty bit is set in only
- * one of the replicated PTEs.
- */
- count = PAGE_SIZE_PTE_COUNT(size);
- for (i = 0; i < count && test_only; i++) {
- if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
- dirty = true;
- break;
- }
- }
-
- for (i = 0; i < count && !test_only; i++) {
- if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
- (unsigned long *)&ptep[i])) {
- dirty = true;
- }
- }
-
- return dirty;
-}
-
-static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long end = iova + size - 1;
-
- do {
- unsigned long pgsize = 0;
- u64 *ptep, pte;
-
- ptep = fetch_pte(pgtable, iova, &pgsize);
- if (ptep)
- pte = READ_ONCE(*ptep);
- if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
- pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
- iova += pgsize;
- continue;
- }
-
- /*
- * Mark the whole IOVA range as dirty even if only one of
- * the replicated PTEs were marked dirty.
- */
- if (pte_test_and_clear_dirty(ptep, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v1_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
- struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
-
- if (pgtable->mode == PAGE_MODE_NONE)
- return;
-
- /* Page-table is not visible to IOMMU anymore, so free it */
- BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
- pgtable->mode > amd_iommu_hpt_level);
-
- free_sub_pt(pgtable->root, pgtable->mode, &freelist);
- iommu_put_pages_list(&freelist);
-}
-
-static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
-
- pgtable->root =
- iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
- if (!pgtable->root)
- return NULL;
- pgtable->mode = PAGE_MODE_3_LEVEL;
- seqcount_init(&pgtable->seqcount);
-
- cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
- cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
-
- pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages;
- pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages;
- pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys;
- pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
-
- return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
- .alloc = v1_alloc_pgtable,
- .free = v1_free_pgtable,
-};
diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c
deleted file mode 100644
index b47941353ccb..000000000000
--- a/drivers/iommu/amd/io_pgtable_v2.c
+++ /dev/null
@@ -1,370 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table v2 allocator.
- *
- * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- * Author: Vasant Hegde <vasant.hegde@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */
-#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */
-#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */
-#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */
-#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */
-#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */
-#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */
-#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */
-#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */
-
-#define MAX_PTRS_PER_PAGE 512
-
-#define IOMMU_PAGE_SIZE_2M BIT_ULL(21)
-#define IOMMU_PAGE_SIZE_1G BIT_ULL(30)
-
-
-static inline int get_pgtable_level(void)
-{
- return amd_iommu_gpt_level;
-}
-
-static inline bool is_large_pte(u64 pte)
-{
- return (pte & IOMMU_PAGE_PSE);
-}
-
-static inline u64 set_pgtable_attr(u64 *page)
-{
- u64 prot;
-
- prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER;
- prot |= IOMMU_PAGE_ACCESS;
-
- return (iommu_virt_to_phys(page) | prot);
-}
-
-static inline void *get_pgtable_pte(u64 pte)
-{
- return iommu_phys_to_virt(pte & PM_ADDR_MASK);
-}
-
-static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot)
-{
- u64 pte;
-
- pte = __sme_set(paddr & PM_ADDR_MASK);
- pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER;
- pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
-
- if (prot & IOMMU_PROT_IW)
- pte |= IOMMU_PAGE_RW;
-
- /* Large page */
- if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M)
- pte |= IOMMU_PAGE_PSE;
-
- return pte;
-}
-
-static inline u64 get_alloc_page_size(u64 size)
-{
- if (size >= IOMMU_PAGE_SIZE_1G)
- return IOMMU_PAGE_SIZE_1G;
-
- if (size >= IOMMU_PAGE_SIZE_2M)
- return IOMMU_PAGE_SIZE_2M;
-
- return PAGE_SIZE;
-}
-
-static inline int page_size_to_level(u64 pg_size)
-{
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- return PAGE_MODE_3_LEVEL;
- if (pg_size == IOMMU_PAGE_SIZE_2M)
- return PAGE_MODE_2_LEVEL;
-
- return PAGE_MODE_1_LEVEL;
-}
-
-static void free_pgtable(u64 *pt, int level)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < MAX_PTRS_PER_PAGE; i++) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- if (is_large_pte(pt[i]))
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = get_pgtable_pte(pt[i]);
- if (level > 2)
- free_pgtable(p, level - 1);
- else
- iommu_free_pages(p);
- }
-
- iommu_free_pages(pt);
-}
-
-/* Allocate page table */
-static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova,
- unsigned long pg_size, gfp_t gfp, bool *updated)
-{
- u64 *pte, *page;
- int level, end_level;
-
- level = get_pgtable_level() - 1;
- end_level = page_size_to_level(pg_size);
- pte = &pgd[PM_LEVEL_INDEX(level, iova)];
- iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE);
-
- while (level >= end_level) {
- u64 __pte, __npte;
-
- __pte = *pte;
-
- if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) {
- /* Unmap large pte */
- cmpxchg64(pte, *pte, 0ULL);
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte)) {
- page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K);
- if (!page)
- return NULL;
-
- __npte = set_pgtable_attr(page);
- /* pte could have been changed somewhere. */
- if (!try_cmpxchg64(pte, &__pte, __npte))
- iommu_free_pages(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- level -= 1;
- pte = get_pgtable_pte(__pte);
- pte = &pte[PM_LEVEL_INDEX(level, iova)];
- }
-
- /* Tear down existing pte entries */
- if (IOMMU_PTE_PRESENT(*pte)) {
- u64 *__pte;
-
- *updated = true;
- __pte = get_pgtable_pte(*pte);
- cmpxchg64(pte, *pte, 0ULL);
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- free_pgtable(__pte, end_level - 1);
- else if (pg_size == IOMMU_PAGE_SIZE_2M)
- iommu_free_pages(__pte);
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address.
- * If there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long iova, unsigned long *page_size)
-{
- u64 *pte;
- int level;
-
- level = get_pgtable_level() - 1;
- pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)];
- /* Default page size is 4K */
- *page_size = PAGE_SIZE;
-
- while (level) {
- /* Not present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Walk to the next level */
- pte = get_pgtable_pte(*pte);
- pte = &pte[PM_LEVEL_INDEX(level - 1, iova)];
-
- /* Large page */
- if (is_large_pte(*pte)) {
- if (level == PAGE_MODE_3_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_1G;
- else if (level == PAGE_MODE_2_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_2M;
- else
- return NULL; /* Wrongly set PSE bit in PTE */
-
- break;
- }
-
- level -= 1;
- }
-
- return pte;
-}
-
-static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- u64 *pte;
- unsigned long map_size;
- unsigned long mapped_size = 0;
- unsigned long o_iova = iova;
- size_t size = pgcount << __ffs(pgsize);
- int ret = 0;
- bool updated = false;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount)
- return -EINVAL;
-
- if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- while (mapped_size < size) {
- map_size = get_alloc_page_size(pgsize);
- pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd,
- iova, map_size, gfp, &updated);
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
-
- *pte = set_pte_attr(paddr, map_size, prot);
-
- iova += map_size;
- paddr += map_size;
- mapped_size += map_size;
- }
-
-out:
- if (updated) {
- struct protection_domain *pdom = io_pgtable_ops_to_domain(ops);
- unsigned long flags;
-
- spin_lock_irqsave(&pdom->lock, flags);
- amd_iommu_domain_flush_pages(pdom, o_iova, size);
- spin_unlock_irqrestore(&pdom->lock, flags);
- }
-
- if (mapped)
- *mapped += mapped_size;
-
- return ret;
-}
-
-static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
- unsigned long unmap_size;
- unsigned long unmapped = 0;
- size_t size = pgcount << __ffs(pgsize);
- u64 *pte;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
- return 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (!pte)
- return unmapped;
-
- *pte = 0ULL;
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v2_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
-
- if (!pgtable || !pgtable->pgd)
- return;
-
- /* Free page table */
- free_pgtable(pgtable->pgd, get_pgtable_level());
- pgtable->pgd = NULL;
-}
-
-static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
- int ias = IOMMU_IN_ADDR_BIT_SIZE;
-
- pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K);
- if (!pgtable->pgd)
- return NULL;
-
- if (get_pgtable_level() == PAGE_MODE_5_LEVEL)
- ias = 57;
-
- pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages;
- pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages;
- pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys;
-
- cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
- cfg->ias = ias;
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
-
- return &pgtable->pgtbl;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = {
- .alloc = v2_alloc_pgtable,
- .free = v2_free_pgtable,
-};
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2e1865daa1ce..9f1d56a5e145 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -30,7 +30,6 @@
#include <linux/msi.h>
#include <linux/irqdomain.h>
#include <linux/percpu.h>
-#include <linux/io-pgtable.h>
#include <linux/cc_platform.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
@@ -41,9 +40,9 @@
#include <asm/gart.h>
#include <asm/dma.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
#include "amd_iommu.h"
-#include "../dma-iommu.h"
#include "../irq_remapping.h"
#include "../iommu-pages.h"
@@ -60,7 +59,6 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
-static const struct iommu_dirty_ops amd_dirty_ops;
int amd_iommu_max_glx_val = -1;
@@ -70,15 +68,22 @@ int amd_iommu_max_glx_val = -1;
*/
DEFINE_IDA(pdom_ids);
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev);
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old);
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data);
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level);
+
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level);
static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid);
static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid);
+static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain);
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
/****************************************************************************
*
@@ -1157,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
*
****************************************************************************/
+static void dump_command_buffer(struct amd_iommu *iommu)
+{
+ struct iommu_cmd *cmd;
+ u32 head, tail;
+ int i;
+
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head),
+ MMIO_CMD_BUFFER_TAIL(tail));
+
+ for (i = 0; i < CMD_BUFFER_ENTRIES; i++) {
+ cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd));
+ pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2],
+ cmd->data[3]);
+ }
+}
+
static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
int i = 0;
@@ -1167,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data)
}
if (i == LOOP_TIMEOUT) {
- pr_alert("Completion-Wait loop timed out\n");
+
+ pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n",
+ iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid),
+ PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid));
+
+ if (amd_iommu_dump)
+ DO_ONCE_LITE(dump_command_buffer, iommu);
+
return -EIO;
}
@@ -1756,42 +1787,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}
-/* Flush the not present cache if it exists */
-static void domain_flush_np_cache(struct protection_domain *domain,
- dma_addr_t iova, size_t size)
-{
- if (unlikely(amd_iommu_np_cache)) {
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
- amd_iommu_domain_flush_pages(domain, iova, size);
- spin_unlock_irqrestore(&domain->lock, flags);
- }
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- lockdep_assert_held(&domain->lock);
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
-
- set_dte_entry(iommu, dev_data);
- clone_aliases(iommu, dev_data->dev);
- }
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- device_flush_dte(dev_data);
-
- domain_flush_complete(domain);
-}
-
int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
{
struct iommu_dev_data *dev_data;
@@ -2051,7 +2046,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
}
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data)
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level)
{
u16 domid;
u32 old_domid;
@@ -2060,19 +2056,36 @@ static void set_dte_entry(struct amd_iommu *iommu,
struct protection_domain *domain = dev_data->domain;
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
+ struct pt_iommu_amdv1_hw_info pt_info;
+
+ make_clear_dte(dev_data, dte, &new);
if (gcr3_info && gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
- else
+ else {
domid = domain->id;
- make_clear_dte(dev_data, dte, &new);
-
- if (domain->iop.mode != PAGE_MODE_NONE)
- new.data[0] |= iommu_virt_to_phys(domain->iop.root);
+ if (domain->domain.type & __IOMMU_DOMAIN_PAGING) {
+ /*
+ * When updating the IO pagetable, the new top and level
+ * are provided as parameters. For other operations i.e.
+ * device attach, retrieve the current pagetable info
+ * via the IOMMU PT API.
+ */
+ if (top_paddr) {
+ pt_info.host_pt_root = top_paddr;
+ pt_info.mode = top_level + 1;
+ } else {
+ WARN_ON(top_paddr || top_level);
+ pt_iommu_amdv1_hw_info(&domain->amdv1,
+ &pt_info);
+ }
- new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
+ new.data[0] |= __sme_set(pt_info.host_pt_root) |
+ (pt_info.mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ }
+ }
new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
@@ -2138,7 +2151,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
if (set)
- set_dte_entry(iommu, dev_data);
+ set_dte_entry(iommu, dev_data, 0, 0);
else
clear_dte_entry(iommu, dev_data);
@@ -2156,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
{
struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
int max_pasids = dev_data->max_pasids;
+ struct pt_iommu_x86_64_hw_info pt_info;
int ret = 0;
/*
@@ -2178,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
if (!pdom_is_v2_pgtbl_mode(pdom))
return ret;
- ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
+ pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
+ ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
if (ret)
free_gcr3_table(&dev_data->gcr3_info);
@@ -2500,94 +2515,240 @@ struct protection_domain *protection_domain_alloc(void)
return domain;
}
-static int pdom_setup_pgtable(struct protection_domain *domain,
- struct device *dev)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
+{
+ if (amd_iommu_hatdis)
+ return false;
+
+ return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt)
{
- struct io_pgtable_ops *pgtbl_ops;
- enum io_pgtable_fmt fmt;
+ struct protection_domain *pdom =
+ container_of(iommupt, struct protection_domain, iommu);
- switch (domain->pd_mode) {
- case PD_MODE_V1:
- fmt = AMD_IOMMU_V1;
- break;
- case PD_MODE_V2:
- fmt = AMD_IOMMU_V2;
- break;
- case PD_MODE_NONE:
- WARN_ON_ONCE(1);
- return -EPERM;
+ return &pdom->lock;
+}
+
+/*
+ * Update all HW references to the domain with a new pgtable configuration.
+ */
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
+{
+ struct protection_domain *pdom =
+ container_of(iommu_table, struct protection_domain, iommu);
+ struct iommu_dev_data *dev_data;
+
+ lockdep_assert_held(&pdom->lock);
+
+ /* Update the DTE for all devices attached to this domain */
+ list_for_each_entry(dev_data, &pdom->dev_list, list) {
+ struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
+
+ /* Update the HW references with the new level and top ptr */
+ set_dte_entry(iommu, dev_data, top_paddr, top_level);
+ clone_aliases(iommu, dev_data->dev);
}
- domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev);
- pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain);
- if (!pgtbl_ops)
- return -ENOMEM;
+ list_for_each_entry(dev_data, &pdom->dev_list, list)
+ device_flush_dte(dev_data);
+
+ domain_flush_complete(pdom);
+}
+
+/*
+ * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to
+ * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non
+ * present caching (like hypervisor shadowing).
+ */
+static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
+ unsigned long iova, size_t size)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ if (likely(!amd_iommu_np_cache))
+ return 0;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ amd_iommu_domain_flush_pages(domain, iova, size);
+ spin_unlock_irqrestore(&domain->lock, flags);
return 0;
}
-static inline u64 dma_max_address(enum protection_domain_mode pgtable)
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
{
- if (pgtable == PD_MODE_V1)
- return PM_LEVEL_SIZE(amd_iommu_hpt_level);
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
- /*
- * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page
- * Translation" shows that the V2 table sign extends the top of the
- * address space creating a reserved region in the middle of the
- * translation, just like the CPU does. Further Vasant says the docs are
- * incomplete and this only applies to non-zero PASIDs. If the AMDv2
- * page table is assigned to the 0 PASID then there is no sign extension
- * check.
- *
- * Since the IOMMU must have a fixed geometry, and the core code does
- * not understand sign extended addressing, we have to chop off the high
- * bit to get consistent behavior with attachments of the domain to any
- * PASID.
- */
- return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1);
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_all(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
}
-static bool amd_iommu_hd_support(struct amd_iommu *iommu)
+static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- if (amd_iommu_hatdis)
- return false;
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
- return iommu && (iommu->features & FEATURE_HDSUP);
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_pages(dom, gather->start,
+ gather->end - gather->start + 1);
+ spin_unlock_irqrestore(&dom->lock, flags);
+ iommu_put_pages_list(&gather->freelist);
}
-static struct iommu_domain *
-do_iommu_domain_alloc(struct device *dev, u32 flags,
- enum protection_domain_mode pgtable)
+static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = {
+ .get_top_lock = amd_iommu_get_top_lock,
+ .change_top = amd_iommu_change_top,
+};
+
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
+
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+};
+
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev,
+ u32 flags)
{
- bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
+ struct pt_iommu_amdv1_cfg cfg = {};
struct protection_domain *domain;
int ret;
+ if (amd_iommu_hatdis)
+ return ERR_PTR(-EOPNOTSUPP);
+
domain = protection_domain_alloc();
if (!domain)
return ERR_PTR(-ENOMEM);
- domain->pd_mode = pgtable;
- ret = pdom_setup_pgtable(domain, dev);
+ domain->pd_mode = PD_MODE_V1;
+ domain->iommu.driver_ops = &amd_hw_driver_ops_v1;
+ domain->iommu.nid = dev_to_node(dev);
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ domain->domain.dirty_ops = &amdv1_dirty_ops;
+
+ /*
+ * Someday FORCE_COHERENCE should be set by
+ * amd_iommu_enforce_cache_coherency() like VT-d does.
+ */
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+
+ /*
+ * AMD's IOMMU can flush as many pages as necessary in a single flush.
+ * Unless we run in a virtual machine, which can be inferred according
+ * to whether "non-present cache" is on, it is probably best to prefer
+ * (potentially) too extensive TLB flushing (i.e., more misses) over
+ * multiple TLB flushes (i.e., more flushes). For virtual machines the
+ * hypervisor needs to synchronize the host IOMMU PTEs with those of
+ * the guest, and the trade-off is different: unnecessary TLB flushes
+ * should be avoided.
+ */
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+ cfg.common.hw_max_vasz_lg2 =
+ min(64, (amd_iommu_hpt_level - 1) * 9 + 21);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.starting_level = 2;
+ domain->domain.ops = &amdv1_ops;
+
+ ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL);
if (ret) {
- pdom_id_free(domain->id);
- kfree(domain);
+ amd_iommu_domain_free(&domain->domain);
return ERR_PTR(ret);
}
- domain->domain.geometry.aperture_start = 0;
- domain->domain.geometry.aperture_end = dma_max_address(pgtable);
- domain->domain.geometry.force_aperture = true;
- domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
+ /*
+ * Narrow the supported page sizes to those selected by the kernel
+ * command line.
+ */
+ domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap;
+ return &domain->domain;
+}
- domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
- domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+static const struct iommu_domain_ops amdv2_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ /*
+ * Note the AMDv2 page table format does not support a Force Coherency
+ * bit, so enforce_cache_coherency should not be set. However VFIO is
+ * not prepared to handle a case where some domains will support
+ * enforcement and others do not. VFIO and iommufd will have to be fixed
+ * before it can fully use the V2 page table. See the comment in
+ * iommufd_hwpt_paging_alloc(). For now leave things as they have
+ * historically been and lie about enforce_cache_coherencey.
+ */
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
- if (dirty_tracking)
- domain->domain.dirty_ops = &amd_dirty_ops;
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
+ u32 flags)
+{
+ struct pt_iommu_x86_64_cfg cfg = {};
+ struct protection_domain *domain;
+ int ret;
+ if (!amd_iommu_v2_pgtbl_supported())
+ return ERR_PTR(-EOPNOTSUPP);
+
+ domain = protection_domain_alloc();
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ domain->pd_mode = PD_MODE_V2;
+ domain->iommu.nid = dev_to_node(dev);
+
+ cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES);
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+ /*
+ * The v2 table behaves differently if it is attached to PASID 0 vs a
+ * non-zero PASID. On PASID 0 it has no sign extension and the full
+ * 57/48 bits decode the lower addresses. Otherwise it behaves like a
+ * normal sign extended x86 page table. Since we want the domain to work
+ * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not
+ * set which creates a table that is compatible in both modes.
+ */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.top_level = 4;
+ } else {
+ cfg.common.hw_max_vasz_lg2 = 47;
+ cfg.top_level = 3;
+ }
+ cfg.common.hw_max_oasz_lg2 = 52;
+ domain->domain.ops = &amdv2_ops;
+
+ ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL);
+ if (ret) {
+ amd_iommu_domain_free(&domain->domain);
+ return ERR_PTR(ret);
+ }
return &domain->domain;
}
@@ -2608,15 +2769,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
/* Allocate domain with v1 page table for dirty tracking */
if (!amd_iommu_hd_support(iommu))
break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
case IOMMU_HWPT_ALLOC_PASID:
/* Allocate domain with v2 page table if IOMMU supports PASID. */
if (!amd_iommu_pasid_supported())
break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
- case 0:
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ case 0: {
+ struct iommu_domain *ret;
+
/* If nothing specific is required use the kernel commandline default */
- return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
+ if (amd_iommu_pgtable == PD_MODE_V1) {
+ ret = amd_iommu_domain_alloc_paging_v1(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ }
+ ret = amd_iommu_domain_alloc_paging_v2(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
+ }
default:
break;
}
@@ -2628,14 +2801,14 @@ void amd_iommu_domain_free(struct iommu_domain *dom)
struct protection_domain *domain = to_pdomain(dom);
WARN_ON(!list_empty(&domain->dev_list));
- if (domain->domain.type & __IOMMU_DOMAIN_PAGING)
- free_io_pgtable_ops(&domain->iop.pgtbl.ops);
+ pt_iommu_deinit(&domain->iommu);
pdom_id_free(domain->id);
kfree(domain);
}
static int blocked_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
@@ -2685,16 +2858,8 @@ void amd_iommu_init_identity_domain(void)
protection_domain_init(&identity_domain);
}
-/* Same as blocked domain except it supports only ops->attach_dev() */
-static struct iommu_domain release_domain = {
- .type = IOMMU_DOMAIN_BLOCKED,
- .ops = &(const struct iommu_domain_ops) {
- .attach_dev = blocked_domain_attach_device,
- }
-};
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
struct protection_domain *domain = to_pdomain(dom);
@@ -2734,93 +2899,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
return ret;
}
-static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
- unsigned long iova, size_t size)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
- if (ops->map_pages)
- domain_flush_np_cache(domain, iova, size);
- return 0;
-}
-
-static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int iommu_prot, gfp_t gfp, size_t *mapped)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
- int prot = 0;
- int ret = -EINVAL;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return -EINVAL;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- if (ops->map_pages) {
- ret = ops->map_pages(ops, iova, paddr, pgsize,
- pgcount, prot, gfp, mapped);
- }
-
- return ret;
-}
-
-static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size)
-{
- /*
- * AMD's IOMMU can flush as many pages as necessary in a single flush.
- * Unless we run in a virtual machine, which can be inferred according
- * to whether "non-present cache" is on, it is probably best to prefer
- * (potentially) too extensive TLB flushing (i.e., more misses) over
- * mutliple TLB flushes (i.e., more flushes). For virtual machines the
- * hypervisor needs to synchronize the host IOMMU PTEs with those of
- * the guest, and the trade-off is different: unnecessary TLB flushes
- * should be avoided.
- */
- if (amd_iommu_np_cache &&
- iommu_iotlb_gather_is_disjoint(gather, iova, size))
- iommu_iotlb_sync(domain, gather);
-
- iommu_iotlb_gather_add_range(gather, iova, size);
-}
-
-static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
- size_t r;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return 0;
-
- r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
-
- if (r)
- amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
-
- return r;
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- dma_addr_t iova)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
- return ops->iova_to_phys(ops, iova);
-}
-
static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
{
switch (cap) {
@@ -2887,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct protection_domain *pdomain = to_pdomain(domain);
- struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops;
- unsigned long lflags;
-
- if (!ops || !ops->read_and_clear_dirty)
- return -EOPNOTSUPP;
-
- spin_lock_irqsave(&pdomain->lock, lflags);
- if (!pdomain->dirty_tracking && dirty->bitmap) {
- spin_unlock_irqrestore(&pdomain->lock, lflags);
- return -EINVAL;
- }
- spin_unlock_irqrestore(&pdomain->lock, lflags);
-
- return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
-}
-
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
@@ -2978,28 +3034,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev)
return dev_data->defer_attach;
}
-static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_all(dom);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
-static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_pages(dom, gather->start,
- gather->end - gather->start + 1);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
static int amd_iommu_def_domain_type(struct device *dev)
{
struct iommu_dev_data *dev_data;
@@ -3034,15 +3068,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
-static const struct iommu_dirty_ops amd_dirty_ops = {
- .set_dirty_tracking = amd_iommu_set_dirty_tracking,
- .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
-};
-
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.blocked_domain = &blocked_domain,
- .release_domain = &release_domain,
+ .release_domain = &blocked_domain,
.identity_domain = &identity_domain.domain,
.domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
.domain_alloc_sva = amd_iommu_domain_alloc_sva,
@@ -3053,17 +3082,6 @@ const struct iommu_ops amd_iommu_ops = {
.is_attach_deferred = amd_iommu_is_attach_deferred,
.def_domain_type = amd_iommu_def_domain_type,
.page_response = amd_iommu_page_response,
- .default_domain_ops = &(const struct iommu_domain_ops) {
- .attach_dev = amd_iommu_attach_device,
- .map_pages = amd_iommu_map_pages,
- .unmap_pages = amd_iommu_unmap_pages,
- .iotlb_sync_map = amd_iommu_iotlb_sync_map,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .flush_iotlb_all = amd_iommu_flush_iotlb_all,
- .iotlb_sync = amd_iommu_iotlb_sync,
- .free = amd_iommu_domain_free,
- .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
- }
};
#ifdef CONFIG_IRQ_REMAP
@@ -3354,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
struct irte_ga *irte)
{
- bool ret;
+ int ret;
ret = __modify_irte_ga(iommu, devid, index, irte);
if (ret)
@@ -4072,3 +4090,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
return 0;
}
#endif
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index 95a4e62b8f63..83a5aabcd15d 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain,
}
static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret, i;
struct apple_dart_stream_map *stream_map;
@@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
}
static int apple_dart_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
@@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = {
};
static int apple_dart_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
@@ -802,6 +805,8 @@ static int apple_dart_of_xlate(struct device *dev,
struct apple_dart *cfg_dart;
int i, sid;
+ put_device(&iommu_pdev->dev);
+
if (args->args_count != 1)
return -EINVAL;
sid = args->args[0];
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 8cd8929bbfdf..93fdadd07431 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -99,6 +99,8 @@ static void arm_smmu_make_nested_domain_ste(
int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
struct arm_smmu_nested_domain *nested_domain)
{
+ unsigned int cfg =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
struct arm_smmu_vmaster *vmaster;
unsigned long vsid;
int ret;
@@ -107,8 +109,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core,
state->master->dev, &vsid);
- if (ret)
+ /*
+ * Attaching to a translate nested domain must allocate a vDEVICE prior,
+ * as CD/ATS invalidations and vevents require a vSID to work properly.
+ * A abort/bypass domain is allowed to attach w/o vmaster for GBPA case.
+ */
+ if (ret) {
+ if (cfg == STRTAB_STE_0_CFG_ABORT ||
+ cfg == STRTAB_STE_0_CFG_BYPASS)
+ return 0;
return ret;
+ }
vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL);
if (!vmaster)
@@ -138,14 +149,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master)
}
static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_nested_domain *nested_domain =
to_smmu_nested_domain(domain);
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct arm_smmu_attach_state state = {
.master = master,
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
struct arm_smmu_ste ste;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 2a8b46b948f0..d16d35c78c06 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1464,7 +1464,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master)
cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size,
&cd_table->cdtab_dma,
GFP_KERNEL);
- if (!cd_table->l2.l2ptrs) {
+ if (!cd_table->l2.l1tab) {
ret = -ENOMEM;
goto err_free_l2ptrs;
}
@@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
master->ats_enabled = state->ats_enabled;
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old_domain)
{
int ret = 0;
struct arm_smmu_ste target;
@@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
struct arm_smmu_device *smmu;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_attach_state state = {
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
struct arm_smmu_master *master;
@@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
/*
* When the last user of the CD table goes away downgrade the STE back
- * to a non-cd_table one.
+ * to a non-cd_table one, by re-attaching its sid_domain.
*/
if (!arm_smmu_ssids_in_use(&master->cd_table)) {
struct iommu_domain *sid_domain =
@@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
if (sid_domain->type == IOMMU_DOMAIN_IDENTITY ||
sid_domain->type == IOMMU_DOMAIN_BLOCKED)
- sid_domain->ops->attach_dev(sid_domain, dev);
+ sid_domain->ops->attach_dev(sid_domain, dev,
+ sid_domain);
}
return 0;
}
static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
+ struct iommu_domain *old_domain,
struct device *dev,
struct arm_smmu_ste *ste,
unsigned int s1dss)
@@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct arm_smmu_attach_state state = {
.master = master,
- .old_domain = iommu_get_domain_for_dev(dev),
+ .old_domain = old_domain,
.ssid = IOMMU_NO_PASID,
};
@@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
arm_smmu_master_clear_vmaster(master);
arm_smmu_make_bypass_ste(master->smmu, &ste);
- arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS);
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
+ STRTAB_STE_1_S1DSS_BYPASS);
return 0;
}
@@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
arm_smmu_master_clear_vmaster(master);
arm_smmu_make_abort_ste(&ste);
- arm_smmu_attach_dev_ste(domain, dev, &ste,
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
STRTAB_STE_1_S1DSS_TERMINATE);
return 0;
}
@@ -3582,12 +3588,6 @@ static void arm_smmu_release_device(struct device *dev)
WARN_ON(master->iopf_refcount);
- /* Put the STE back to what arm_smmu_init_strtab() sets */
- if (dev->iommu->require_direct)
- arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev);
- else
- arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev);
-
arm_smmu_disable_pasid(master);
arm_smmu_remove_master(master);
if (arm_smmu_cdtab_allocated(&master->cd_table))
@@ -3678,6 +3678,7 @@ static int arm_smmu_def_domain_type(struct device *dev)
static const struct iommu_ops arm_smmu_ops = {
.identity_domain = &arm_smmu_identity_domain,
.blocked_domain = &arm_smmu_blocked_domain,
+ .release_domain = &arm_smmu_blocked_domain,
.capable = arm_smmu_capable,
.hw_info = arm_smmu_hw_info,
.domain_alloc_sva = arm_smmu_sva_domain_alloc,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 57c097e87613..573085349df3 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -367,6 +367,7 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
{ .compatible = "qcom,adreno" },
{ .compatible = "qcom,adreno-gmu" },
+ { .compatible = "qcom,glymur-mdss" },
{ .compatible = "qcom,mdp4" },
{ .compatible = "qcom,mdss" },
{ .compatible = "qcom,qcm2290-mdss" },
@@ -431,17 +432,19 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
/*
* Some platforms support more than the Arm SMMU architected maximum of
- * 128 stream matching groups. For unknown reasons, the additional
- * groups don't exhibit the same behavior as the architected registers,
- * so limit the groups to 128 until the behavior is fixed for the other
- * groups.
+ * 128 stream matching groups. The additional registers appear to have
+ * the same behavior as the architected registers in the hardware.
+ * However, on some firmware versions, the hypervisor does not
+ * correctly trap and emulate accesses to the additional registers,
+ * resulting in unexpected behavior.
+ *
+ * If there are more than 128 groups, use the last reliable group to
+ * detect if we need to apply the bypass quirk.
*/
- if (smmu->num_mapping_groups > 128) {
- dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
- smmu->num_mapping_groups = 128;
- }
-
- last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
+ if (smmu->num_mapping_groups > 128)
+ last_s2cr = ARM_SMMU_GR0_S2CR(127);
+ else
+ last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
/*
* With some firmware versions writes to S2CR of type FAULT are
@@ -464,6 +467,11 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS);
arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg);
+
+ if (smmu->num_mapping_groups > 128) {
+ dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
+ smmu->num_mapping_groups = 128;
+ }
}
for (i = 0; i < smmu->num_mapping_groups; i++) {
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4ced4b5bee4d..5e690cf85ec9 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg,
}
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev,
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS);
}
@@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT);
}
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index c5be95e56031..f69d9276dc55 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
kfree(qcom_domain);
}
-static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int qcom_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
@@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
}
static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct qcom_iommu_domain *qcom_domain;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
unsigned int i;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- qcom_domain = to_qcom_iommu_domain(domain);
+ qcom_domain = to_qcom_iommu_domain(old);
if (WARN_ON(!qcom_domain->iommu))
return -EINVAL;
@@ -565,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev,
qcom_iommu = platform_get_drvdata(iommu_pdev);
+ put_device(&iommu_pdev->dev);
+
/* make sure the asid specified in dt is valid, so we don't have
* to sanity check this elsewhere:
*/
if (WARN_ON(asid > qcom_iommu->max_asid) ||
- WARN_ON(qcom_iommu->ctxs[asid] == NULL)) {
- put_device(&iommu_pdev->dev);
+ WARN_ON(qcom_iommu->ctxs[asid] == NULL))
return -EINVAL;
- }
if (!dev_iommu_priv_get(dev)) {
dev_iommu_priv_set(dev, qcom_iommu);
@@ -581,10 +582,8 @@ static int qcom_iommu_of_xlate(struct device *dev,
* multiple different iommu devices. Multiple context
* banks are ok, but multiple devices are not:
*/
- if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) {
- put_device(&iommu_pdev->dev);
+ if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev)))
return -EINVAL;
- }
}
return iommu_fwspec_add_ids(dev, &asid, 1);
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f1fb27681b0b..c92088855450 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
* as a bus address, __finalise_sg() will copy the dma
* address into the output segment.
*/
- s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
- sg_phys(s));
+ s->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(s));
sg_dma_len(s) = sg->length;
sg_dma_mark_bus_address(s);
continue;
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index b6edd178fe25..b512c6b939ac 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
}
static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct exynos_iommu_domain *domain;
@@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = {
};
static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
@@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
unsigned long flags;
int err;
- err = exynos_iommu_identity_attach(&exynos_identity_domain, dev);
+ err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old);
if (err)
return err;
@@ -1429,8 +1431,6 @@ static void exynos_iommu_release_device(struct device *dev)
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct sysmmu_drvdata *data;
- WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev));
-
list_for_each_entry(data, &owner->controllers, owner_node)
device_link_del(data->link);
}
@@ -1446,17 +1446,14 @@ static int exynos_iommu_of_xlate(struct device *dev,
return -ENODEV;
data = platform_get_drvdata(sysmmu);
- if (!data) {
- put_device(&sysmmu->dev);
+ put_device(&sysmmu->dev);
+ if (!data)
return -ENODEV;
- }
if (!owner) {
owner = kzalloc(sizeof(*owner), GFP_KERNEL);
- if (!owner) {
- put_device(&sysmmu->dev);
+ if (!owner)
return -ENOMEM;
- }
INIT_LIST_HEAD(&owner->controllers);
mutex_init(&owner->rpm_lock);
@@ -1476,6 +1473,7 @@ static int exynos_iommu_of_xlate(struct device *dev,
static const struct iommu_ops exynos_iommu_ops = {
.identity_domain = &exynos_identity_domain,
+ .release_domain = &exynos_identity_domain,
.domain_alloc_paging = exynos_iommu_domain_alloc_paging,
.device_group = generic_device_group,
.probe_device = exynos_iommu_probe_device,
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 5f08523f97cb..9664ef9840d2 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
}
static int fsl_pamu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
unsigned long flags;
@@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain,
* switches to what looks like BLOCKING.
*/
static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct fsl_dma_domain *dma_domain;
const u32 *prop;
int len;
@@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
* Hack to keep things working as they always have, only leaving an
* UNMANAGED domain makes it BLOCKING.
*/
- if (domain == platform_domain || !domain ||
- domain->type != IOMMU_DOMAIN_UNMANAGED)
+ if (old == platform_domain || !old ||
+ old->type != IOMMU_DOMAIN_UNMANAGED)
return 0;
- dma_domain = to_fsl_dma_domain(domain);
+ dma_domain = to_fsl_dma_domain(old);
/*
* Use LIODN of the PCI controller while detaching a
diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
new file mode 100644
index 000000000000..52ac9e661ffd
--- /dev/null
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -0,0 +1,14 @@
+CONFIG_KUNIT=y
+CONFIG_GENERIC_PT=y
+CONFIG_DEBUG_GENERIC_PT=y
+CONFIG_IOMMU_PT=y
+CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_VTDSS=y
+CONFIG_IOMMU_PT_X86_64=y
+CONFIG_IOMMU_PT_KUNIT_TEST=y
+
+CONFIG_IOMMUFD=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_FAULT_INJECTION=y
+CONFIG_RUNTIME_TESTING_MENU=y
+CONFIG_IOMMUFD_TEST=y
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
new file mode 100644
index 000000000000..ce4fb4786914
--- /dev/null
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig GENERIC_PT
+ bool "Generic Radix Page Table" if COMPILE_TEST
+ help
+ Generic library for building radix tree page tables.
+
+ Generic PT provides a set of HW page table formats and a common
+ set of APIs to work with them.
+
+if GENERIC_PT
+config DEBUG_GENERIC_PT
+ bool "Extra debugging checks for GENERIC_PT"
+ help
+ Enable extra run time debugging checks for GENERIC_PT code. This
+ incurs a runtime cost and should not be enabled for production
+ kernels.
+
+ The kunit tests require this to be enabled to get full coverage.
+
+config IOMMU_PT
+ tristate "IOMMU Page Tables"
+ select IOMMU_API
+ depends on IOMMU_SUPPORT
+ depends on GENERIC_PT
+ help
+ Generic library for building IOMMU page tables
+
+ IOMMU_PT provides an implementation of the page table operations
+ related to struct iommu_domain using GENERIC_PT. It provides a single
+ implementation of the page table operations that can be shared by
+ multiple drivers.
+
+if IOMMU_PT
+config IOMMU_PT_AMDV1
+ tristate "IOMMU page table for 64-bit AMD IOMMU v1"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the AMD v1 page table. AMDv1 is the
+ "host" page table. It supports granular page sizes of almost every
+ power of 2 and decodes the full 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_VTDSS
+ tristate "IOMMU page table for Intel VT-d Second Stage"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5
+ level Second Stage page table. It is similar to the X86_64 format with
+ 4K/2M/1G page sizes.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_X86_64
+ tristate "IOMMU page table for x86 64-bit, 4/5 levels"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the x86 64-bit 4/5 level page table.
+ It supports 4K/2M/1G page sizes and can decode a sign-extended
+ portion of the 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_KUNIT_TEST
+ tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
+ depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
+ depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
+ default KUNIT_ALL_TESTS
+ help
+ Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
+ enabled page table formats. The test covers most of the GENERIC_PT
+ functions provided by the page table format, as well as covering the
+ iommu_domain related functions.
+
+endif
+endif
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
new file mode 100644
index 000000000000..976b49ec97dc
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
+iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
+
+IOMMU_PT_KUNIT_TEST :=
+define create_format
+obj-$(2) += iommu_$(1).o
+iommu_pt_kunit_test-y += kunit_iommu_$(1).o
+CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1
+IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o
+
+endef
+
+$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y)))
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m)))
+
+# The kunit objects are constructed by compiling the main source
+# with -DGENERIC_PT_KUNIT
+$(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE
+ $(call rule_mkdir)
+ $(call if_changed_dep,cc_o_c)
+
+obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST)
diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h
new file mode 100644
index 000000000000..aa8e1a8ec95f
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/amdv1.h
@@ -0,0 +1,411 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * AMD IOMMU v1 page table
+ *
+ * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
+ * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
+ *
+ * Note the level numbering here matches the core code, so level 0 is the same
+ * as mode 1.
+ *
+ */
+#ifndef __GENERIC_PT_FMT_AMDV1_H
+#define __GENERIC_PT_FMT_AMDV1_H
+
+#include "defs_amdv1.h"
+#include "../pt_defs.h"
+
+#include <asm/page.h>
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/mem_encrypt.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+
+enum {
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ /*
+ * The IOMMUFD selftest uses the AMDv1 format with some alterations It
+ * uses a 2k page size to test cases where the CPU page size is not the
+ * same.
+ */
+#ifdef AMDV1_IOMMUFD_SELFTEST
+ PT_MAX_VA_ADDRESS_LG2 = 56,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 11,
+#else
+ PT_MAX_VA_ADDRESS_LG2 = 64,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_TOP_LEVEL = 5,
+ PT_GRANULE_LG2SZ = 12,
+#endif
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* The DTE only has these bits for the top phyiscal address */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* PTE bits */
+enum {
+ AMDV1PT_FMT_PR = BIT(0),
+ AMDV1PT_FMT_D = BIT(6),
+ AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
+ AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
+ AMDV1PT_FMT_FC = BIT_ULL(60),
+ AMDV1PT_FMT_IR = BIT_ULL(61),
+ AMDV1PT_FMT_IW = BIT_ULL(62),
+};
+
+/*
+ * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
+ * these defines to avoid it.
+ */
+#define AMDV1PT_FMT_NL_DEFAULT 0
+#define AMDV1PT_FMT_NL_SIZE 7
+
+static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
+}
+#define pt_table_pa amdv1pt_table_pa
+
+/* Returns the oa for the start of the contiguous entry */
+static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+ pt_oaddr_t oa;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ oa = FIELD_GET(AMDV1PT_FMT_OA, entry);
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
+ unsigned int sz_bits = oaffz(oa);
+
+ oa = oalog2_set_mod(oa, 0, sz_bits);
+ } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
+ AMDV1PT_FMT_NL_DEFAULT))
+ return 0;
+ return oalog2_mul(oa, PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa amdv1pt_entry_oa
+
+static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
+{
+ /*
+ * Table 15: Page Table Level Parameters
+ * The top most level cannot have translation entries
+ */
+ return pts->level < PT_MAX_TOP_LEVEL;
+}
+#define pt_can_have_leaf amdv1pt_can_have_leaf
+
+/* Body in pt_fmt_defaults.h */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+static inline unsigned int
+amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ u32 code;
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
+ AMDV1PT_FMT_NL_DEFAULT)
+ return ilog2(1);
+
+ PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
+ AMDV1PT_FMT_NL_SIZE);
+
+ /*
+ * The contiguous size is encoded in the length of a string of 1's in
+ * the low bits of the OA. Reverse the equation:
+ * code = log2_to_int(num_contig_lg2 + item_lg2sz -
+ * PT_GRANULE_LG2SZ - 1) - 1
+ * Which can be expressed as:
+ * num_contig_lg2 = oalog2_ffz(code) + 1 -
+ * item_lg2sz - PT_GRANULE_LG2SZ
+ *
+ * Assume the bit layout is correct and remove the masking. Reorganize
+ * the equation to move all the arithmetic before the ffz.
+ */
+ code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
+ pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
+ return ffz_t(u32, code);
+}
+#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
+
+static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
+{
+ /*
+ * Top entry covers bits [63:57] only, this is handled through
+ * max_vasz_lg2.
+ */
+ if (PT_WARN_ON(pts->level == 5))
+ return 7;
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 amdv1pt_num_items_lg2
+
+static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!amdv1pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * Table 14: Example Page Size Encodings
+ * Address bits 51:32 can be used to encode page sizes greater than 4
+ * Gbytes. Address bits 63:52 are zero-extended.
+ *
+ * 512GB Pages are not supported due to a hardware bug.
+ * Otherwise every power of two size is supported.
+ */
+ return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
+ isz_lg2) & ~SZ_512G;
+}
+#define pt_possible_sizes amdv1pt_possible_sizes
+
+static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ unsigned int next_level;
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(*tablep);
+ if (!(entry & AMDV1PT_FMT_PR))
+ return PT_ENTRY_EMPTY;
+
+ next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
+ if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
+ next_level == AMDV1PT_FMT_NL_SIZE)
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw amdv1pt_load_entry_raw
+
+static inline void
+amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+
+ if (oasz_lg2 == isz_lg2) {
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_DEFAULT);
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_SIZE) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
+ 1) -
+ 1);
+
+ /* See amdv1pt_clear_entries() */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ memset64(tablep, entry, log2_to_int(num_contig_lg2));
+ }
+ }
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry amdv1pt_install_leaf_entry
+
+static inline bool amdv1pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ /*
+ * IR and IW are ANDed from the table levels along with the PTE. We
+ * always control permissions from the PTE, so always set IR and IW for
+ * tables.
+ */
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ log2_div(table_pa, PT_GRANULE_LG2SZ)) |
+ AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table amdv1pt_install_table
+
+static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits =
+ pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
+}
+#define pt_attr_from_entry amdv1pt_attr_from_entry
+
+static inline void amdv1pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ /*
+ * gcc generates rep stos for the io-pgtable code, and this difference
+ * can show in microbenchmarks with larger contiguous page sizes.
+ * rep is slower for small cases.
+ */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+ } else {
+ memset64(tablep, 0, log2_to_int(num_contig_lg2));
+ }
+}
+#define pt_clear_entries amdv1pt_clear_entries
+
+static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
+ return true;
+ return false;
+}
+#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty
+
+static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
+}
+#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean
+
+static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | AMDV1PT_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_amdv1
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
+ ->amdpt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
+}
+
+static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
+ pte |= AMDV1PT_FMT_FC;
+ if (iommu_prot & IOMMU_READ)
+ pte |= AMDV1PT_FMT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= AMDV1PT_FMT_IW;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now if the tables use sme_set then so do the ptes.
+ */
+ if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot amdv1pt_iommu_set_prot
+
+static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
+ const struct pt_iommu_amdv1_cfg *cfg)
+{
+ struct pt_amdv1 *table = &iommu_table->amdpt;
+ unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+
+ if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
+ return -EINVAL;
+
+ if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
+ cfg->starting_level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
+ (cfg->starting_level + 1);
+
+ table->common.max_vasz_lg2 =
+ min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ pt_top_set_level(&table->common, cfg->starting_level);
+ return 0;
+}
+#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
+
+#ifndef PT_FMT_VARIANT
+static inline void
+amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_amdv1_hw_info *info)
+{
+ info->host_pt_root = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
+ info->mode = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
+#endif
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
+ /* Matches what io_pgtable does */
+ [0] = { .starting_level = 2 },
+};
+#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = 0 };
+#endif
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
new file mode 100644
index 000000000000..0b9614ca6d10
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H
+#define __GENERIC_PT_FMT_DEFS_AMDV1_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct amdv1pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs amdv1pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
new file mode 100644
index 000000000000..4a239bcaae2a
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
+#define __GENERIC_PT_FMT_DEFS_VTDSS_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct vtdss_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs vtdss_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
new file mode 100644
index 000000000000..6f589e1f55d3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H
+#define __GENERIC_PT_FMT_DEFS_X86_64_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct x86_64_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs x86_64_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
new file mode 100644
index 000000000000..72a2337d0c55
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT amdv1
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \
+ BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+#define PT_FORCE_ENABLED_FEATURES \
+ (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c
new file mode 100644
index 000000000000..74e597cba9d9
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define AMDV1_IOMMUFD_SELFTEST 1
+#define PT_FMT amdv1
+#define PT_FMT_VARIANT mock
+#define PT_SUPPORTED_FEATURES 0
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
new file mode 100644
index 000000000000..d28e86abdf2e
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Template to build the iommu module and kunit from the format and
+ * implementation headers.
+ *
+ * The format should have:
+ * #define PT_FMT <name>
+ * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
+ * And optionally:
+ * #define PT_FORCE_ENABLED_FEATURES ..
+ * #define PT_FMT_VARIANT <suffix>
+ */
+#include <linux/args.h>
+#include <linux/stringify.h>
+
+#ifdef PT_FMT_VARIANT
+#define PTPFX_RAW \
+ CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
+#else
+#define PTPFX_RAW PT_FMT
+#endif
+
+#define PTPFX CONCATENATE(PTPFX_RAW, _)
+
+#define _PT_FMT_H PT_FMT.h
+#define PT_FMT_H __stringify(_PT_FMT_H)
+
+#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
+#define PT_DEFS_H __stringify(_PT_DEFS_H)
+
+#include <linux/generic_pt/common.h>
+#include PT_DEFS_H
+#include "../pt_defs.h"
+#include PT_FMT_H
+#include "../pt_common.h"
+
+#ifndef GENERIC_PT_KUNIT
+#include "../iommu_pt.h"
+#else
+/*
+ * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set
+ * which means we are building the kunit modle.
+ */
+#include "../kunit_generic_pt.h"
+#include "../kunit_iommu_pt.h"
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
new file mode 100644
index 000000000000..f551711e2a33
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT vtdss
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
+ BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
new file mode 100644
index 000000000000..5472660c2d71
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT x86_64
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \
+ BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
new file mode 100644
index 000000000000..f5f8981edde7
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Intel VT-d Second Stange 5/4 level page table
+ *
+ * This is described in
+ * Section "3.7 Second-Stage Translation"
+ * Section "9.8 Second-Stage Paging Entries"
+ *
+ * Of the "Intel Virtualization Technology for Directed I/O Architecture
+ * Specification".
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/SS-PTE - 0
+ * Directory/SS-PDE - 1
+ * Directory Ptr/SS-PDPTE - 2
+ * PML4/SS-PML4E - 3
+ * PML5/SS-PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_VTDSS_H
+#define __GENERIC_PT_FMT_VTDSS_H
+
+#include "defs_vtdss.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* SSPTPTR is 4k aligned and limited by HAW */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ VTDSS_FMT_R = BIT(0),
+ VTDSS_FMT_W = BIT(1),
+ VTDSS_FMT_A = BIT(8),
+ VTDSS_FMT_D = BIT(9),
+ VTDSS_FMT_SNP = BIT(11),
+ VTDSS_FMT_OA = GENMASK_ULL(51, 12),
+};
+
+/* PDPTE/PDE */
+enum {
+ VTDSS_FMT_PS = BIT(7),
+};
+
+#define common_to_vtdss_pt(common_ptr) \
+ container_of_const(common_ptr, struct pt_vtdss, common)
+#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
+
+static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa vtdss_pt_table_pa
+
+static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa vtdss_pt_entry_oa
+
+static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf vtdss_pt_can_have_leaf
+
+static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 vtdss_pt_num_items_lg2
+
+static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!entry)
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw vtdss_pt_load_entry_raw
+
+static inline void
+vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= VTDSS_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
+
+static inline bool vtdss_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = VTDSS_FMT_R | VTDSS_FMT_W |
+ FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table vtdss_pt_install_table
+
+static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
+}
+#define pt_attr_from_entry vtdss_pt_attr_from_entry
+
+static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ return READ_ONCE(*tablep) & VTDSS_FMT_D;
+}
+#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty
+
+static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
+}
+#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean
+
+static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | VTDSS_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
+
+static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
+{
+ return 10;
+}
+#define pt_max_sw_bit vtdss_pt_max_sw_bit
+
+static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 10)
+ BUILD_BUG();
+
+ /* Bits marked Ignored in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(10);
+ case 1 ... 9:
+ return BIT_ULL((bitnr - 1) + 52);
+ case 10:
+ return BIT_ULL(63);
+ /* Some bits in 9-3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit vtdss_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_vtdss
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->vtdss_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
+ ->iommu;
+}
+
+static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ /*
+ * VTDSS does not have a present bit, so we tell if any entry is present
+ * by checking for R or W.
+ */
+ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+ return -EINVAL;
+
+ if (iommu_prot & IOMMU_READ)
+ pte |= VTDSS_FMT_R;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= VTDSS_FMT_W;
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+ pte |= VTDSS_FMT_SNP;
+
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+ !(iommu_prot & IOMMU_WRITE)) {
+ pr_err_ratelimited(
+ "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+ const struct pt_iommu_vtdss_cfg *cfg)
+{
+ struct pt_vtdss *table = &iommu_table->vtdss_pt;
+
+ if (cfg->top_level > 4 || cfg->top_level < 2)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+ return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_vtdss_hw_info *info)
+{
+ info->ssptptr = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+ /*
+ * top_level = 2 = 3 level table aw=1
+ * top_level = 3 = 4 level table aw=2
+ * top_level = 4 = 5 level table aw=3
+ */
+ info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+ [0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2},
+ [1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3},
+ [2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4},
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
new file mode 100644
index 000000000000..210748d9d6e8
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * x86 page table. Supports the 4 and 5 level variations.
+ *
+ * The 4 and 5 level version is described in:
+ * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software
+ * Developer's Manual Volume 3
+ *
+ * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization
+ * Technology for Directed I/O Architecture Specification"
+ *
+ * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
+ * Virtualization Technology (IOMMU) Specification"
+ *
+ * It is used by x86 CPUs, AMD and VT-d IOMMU HW.
+ *
+ * Note the 3 level format is very similar and almost implemented here. The
+ * reserved/ignored layout is different and there are functional bit
+ * differences.
+ *
+ * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower
+ * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign
+ * extended addressing with this page table format.
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/PTE - 0
+ * Directory/PDE - 1
+ * Directory Ptr/PDPTE - 2
+ * PML4/PML4E - 3
+ * PML5/PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_X86_64_H
+#define __GENERIC_PT_FMT_X86_64_H
+
+#include "defs_x86_64.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+#include <linux/mem_encrypt.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /*
+ * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k
+ * aligned and is limited by the architected HAW
+ */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ X86_64_FMT_P = BIT(0),
+ X86_64_FMT_RW = BIT(1),
+ X86_64_FMT_U = BIT(2),
+ X86_64_FMT_A = BIT(5),
+ X86_64_FMT_D = BIT(6),
+ X86_64_FMT_OA = GENMASK_ULL(51, 12),
+ X86_64_FMT_XD = BIT_ULL(63),
+};
+
+/* PDPTE/PDE */
+enum {
+ X86_64_FMT_PS = BIT(7),
+};
+
+static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa x86_64_pt_table_pa
+
+static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa x86_64_pt_entry_oa
+
+static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf x86_64_pt_can_have_leaf
+
+static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 x86_64_pt_num_items_lg2
+
+static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!(entry & X86_64_FMT_P))
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw x86_64_pt_load_entry_raw
+
+static inline void
+x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = X86_64_FMT_P |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= X86_64_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry x86_64_pt_install_leaf_entry
+
+static inline bool x86_64_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table x86_64_pt_install_table
+
+static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ X86_64_FMT_D | X86_64_FMT_XD);
+}
+#define pt_attr_from_entry x86_64_pt_attr_from_entry
+
+static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common)
+{
+ return 12;
+}
+#define pt_max_sw_bit x86_64_pt_max_sw_bit
+
+static inline u64 x86_64_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 12)
+ BUILD_BUG();
+
+ /* Bits marked Ignored/AVL in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(9);
+ case 1:
+ return BIT(11);
+ case 2 ... 12:
+ return BIT_ULL((bitnr - 2) + 52);
+ /* Some bits in 8,6,4,3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit x86_64_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_x86_64
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->x86_64_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, x86_64_pt.common)
+ ->iommu;
+}
+
+static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte;
+
+ pte = X86_64_FMT_U | X86_64_FMT_A;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= X86_64_FMT_RW | X86_64_FMT_D;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now if the tables use sme_set then so do the ptes.
+ */
+ if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot x86_64_pt_iommu_set_prot
+
+static inline int
+x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table,
+ const struct pt_iommu_x86_64_cfg *cfg)
+{
+ struct pt_x86_64 *table = &iommu_table->x86_64_pt;
+
+ if (cfg->top_level < 3 || cfg->top_level > 4)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ return 0;
+}
+#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init
+
+static inline void
+x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_x86_64_hw_info *info)
+{
+ info->gcr3_pt = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK);
+ info->levels = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = {
+ [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 48, .top_level = 3 },
+ [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 57, .top_level = 4 },
+ /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */
+ [2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 },
+ [3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4},
+};
+#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)};
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
new file mode 100644
index 000000000000..97aeda1ad01c
--- /dev/null
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -0,0 +1,1289 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * "Templated C code" for implementing the iommu operations for page tables.
+ * This is compiled multiple times, over all the page table formats to pick up
+ * the per-format definitions.
+ */
+#ifndef __GENERIC_PT_IOMMU_PT_H
+#define __GENERIC_PT_IOMMU_PT_H
+
+#include "pt_iter.h"
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include "../iommu-pages.h"
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+enum {
+ SW_BIT_CACHE_FLUSH_DONE = 0,
+};
+
+static void flush_writes_range(const struct pt_state *pts,
+ unsigned int start_index, unsigned int end_index)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, start_index * PT_ITEM_WORD_SIZE,
+ (end_index - start_index) * PT_ITEM_WORD_SIZE);
+}
+
+static void flush_writes_item(const struct pt_state *pts)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, pts->index * PT_ITEM_WORD_SIZE,
+ PT_ITEM_WORD_SIZE);
+}
+
+static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
+ struct pt_iommu *iommu_table, pt_vaddr_t iova,
+ pt_vaddr_t len,
+ struct iommu_pages_list *free_list)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(free_list,
+ iommu_table->iommu_device);
+
+ if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
+ iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
+ iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
+ /*
+ * Note that the sync frees the gather's free list, so we must
+ * not have any pages on that list that are covered by iova/len
+ */
+ } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
+ iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
+ }
+
+ iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
+}
+
+#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
+
+static int make_range_ul(struct pt_common *common, struct pt_range *range,
+ unsigned long iova, unsigned long len)
+{
+ unsigned long last;
+
+ if (unlikely(len == 0))
+ return -EINVAL;
+
+ if (check_add_overflow(iova, len - 1, &last))
+ return -EOVERFLOW;
+
+ *range = pt_make_range(common, iova, last);
+ if (sizeof(iova) > sizeof(range->va)) {
+ if (unlikely(range->va != iova || range->last_va != last))
+ return -EOVERFLOW;
+ }
+ return 0;
+}
+
+static __maybe_unused int make_range_u64(struct pt_common *common,
+ struct pt_range *range, u64 iova,
+ u64 len)
+{
+ if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
+ return -EOVERFLOW;
+ return make_range_ul(common, range, iova, len);
+}
+
+/*
+ * Some APIs use unsigned long, while othersuse dma_addr_t as the type. Dispatch
+ * to the correct validation based on the type.
+ */
+#define make_range_no_check(common, range, iova, len) \
+ ({ \
+ int ret; \
+ if (sizeof(iova) > sizeof(unsigned long) || \
+ sizeof(len) > sizeof(unsigned long)) \
+ ret = make_range_u64(common, range, iova, len); \
+ else \
+ ret = make_range_ul(common, range, iova, len); \
+ ret; \
+ })
+
+#define make_range(common, range, iova, len) \
+ ({ \
+ int ret = make_range_no_check(common, range, iova, len); \
+ if (!ret) \
+ ret = pt_check_range(range); \
+ ret; \
+ })
+
+static inline unsigned int compute_best_pgsize(struct pt_state *pts,
+ pt_oaddr_t oa)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * The page size is limited by the domain's bitmap. This allows the core
+ * code to reduce the supported page sizes by changing the bitmap.
+ */
+ return pt_compute_best_pgsize(pt_possible_sizes(pts) &
+ iommu_table->domain.pgsize_bitmap,
+ pts->range->va, pts->range->last_va, oa);
+}
+
+static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ pt_oaddr_t *res = arg;
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, descend_fn);
+ case PT_ENTRY_OA:
+ *res = pt_entry_oa_exact(&pts);
+ return 0;
+ }
+ return -ENOENT;
+}
+PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys);
+
+/**
+ * iova_to_phys() - Return the output address for the given IOVA
+ * @domain: Table to query
+ * @iova: IO virtual address to query
+ *
+ * Determine the output address from the given IOVA. @iova may have any
+ * alignment, the returned physical will be adjusted with any sub page offset.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Return: 0 if there is no translation for the given iova.
+ */
+phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_range range;
+ pt_oaddr_t res;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __iova_to_phys, &res);
+ /* PHYS_ADDR_MAX would be a better error code */
+ if (ret)
+ return 0;
+ return res;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
+
+struct pt_iommu_dirty_args {
+ struct iommu_dirty_bitmap *dirty;
+ unsigned int flags;
+};
+
+static void record_dirty(struct pt_state *pts,
+ struct pt_iommu_dirty_args *dirty,
+ unsigned int num_contig_lg2)
+{
+ pt_vaddr_t dirty_len;
+
+ if (num_contig_lg2 != ilog2(1)) {
+ unsigned int index = pts->index;
+ unsigned int end_index = log2_set_mod_max_t(
+ unsigned int, pts->index, num_contig_lg2);
+
+ /* Adjust for being contained inside a contiguous page */
+ end_index = min(end_index, pts->end_index);
+ dirty_len = (end_index - index) *
+ log2_to_int(pt_table_item_lg2sz(pts));
+ } else {
+ dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
+ }
+
+ if (dirty->dirty->bitmap)
+ iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
+ dirty_len);
+
+ if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
+ /*
+ * No write log required because DMA incoherence and atomic
+ * dirty tracking bits can't work together
+ */
+ pt_entry_make_write_clean(pts);
+ iommu_iotlb_gather_add_range(dirty->dirty->gather,
+ pts->range->va, dirty_len);
+ }
+}
+
+static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_dirty_args *dirty = arg;
+ int ret;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ ret = pt_descend(&pts, arg, __read_and_clear_dirty);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
+ record_dirty(&pts, dirty,
+ pt_entry_num_contig_lg2(&pts));
+ }
+ return 0;
+}
+
+/**
+ * read_and_clear_dirty() - Manipulate the HW set write dirty state
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the IOVA
+ * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
+ * @dirty: Place to store the dirty bits
+ *
+ * Iterate over all the entries in the mapped range and record their write dirty
+ * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is not specified then
+ * the entries will be left dirty, otherwise they are returned to being not
+ * write dirty.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_iommu_dirty_args dirty_args = {
+ .dirty = dirty,
+ .flags = flags,
+ };
+ struct pt_range range;
+ int ret;
+
+#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
+ return -EOPNOTSUPP;
+#endif
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
+ PT_WARN_ON(ret);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
+
+static inline int __set_dirty(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, __set_dirty);
+ case PT_ENTRY_OA:
+ if (!pt_entry_make_write_dirty(&pts))
+ return -EAGAIN;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
+ dma_addr_t iova)
+{
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Note: There is no locking here yet, if the test suite races this it
+ * can crash. It should use RCU locking eventually.
+ */
+ return pt_walk_range(&range, __set_dirty, NULL);
+}
+
+struct pt_iommu_collect_args {
+ struct iommu_pages_list free_list;
+ /* Fail if any OAs are within the range */
+ u8 check_mapped : 1;
+};
+
+static int __collect_tables(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_collect_args *collect = arg;
+ int ret;
+
+ if (!collect->check_mapped && !pt_can_have_table(&pts))
+ return 0;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ iommu_pages_list_add(&collect->free_list, pts.table_lower);
+ ret = pt_descend(&pts, arg, __collect_tables);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && collect->check_mapped)
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+enum alloc_mode {ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH};
+
+/* Allocate a table, the empty table will be ready to be installed. */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+ size_t lg2sz, gfp_t gfp,
+ enum alloc_mode mode)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(common);
+ struct pt_table_p *table_mem;
+
+ table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
+ log2_to_int(lg2sz));
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ mode == ALLOC_NORMAL) {
+ int ret = iommu_pages_start_incoherent(
+ table_mem, iommu_table->iommu_device);
+ if (ret) {
+ iommu_free_pages(table_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ return table_mem;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+ uintptr_t top_of_table,
+ gfp_t gfp,
+ enum alloc_mode mode)
+{
+ /*
+ * Top doesn't need the free list or otherwise, so it technically
+ * doesn't need to use iommu pages. Use the API anyhow as the top is
+ * usually not smaller than PAGE_SIZE to keep things simple.
+ */
+ return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
+ gfp, mode);
+}
+
+/* Allocate an interior table */
+static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
+ gfp_t gfp, enum alloc_mode mode)
+{
+ struct pt_state child_pts =
+ pt_init(parent_pts->range, parent_pts->level - 1, NULL);
+
+ return _table_alloc(parent_pts->range->common,
+ pt_num_items_lg2(&child_pts) +
+ ilog2(PT_ITEM_WORD_SIZE),
+ gfp, mode);
+}
+
+static inline int pt_iommu_new_table(struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ struct pt_table_p *table_mem;
+ phys_addr_t phys;
+
+ /* Given PA/VA/length can't be represented */
+ if (PT_WARN_ON(!pt_can_have_table(pts)))
+ return -ENXIO;
+
+ table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+
+ phys = virt_to_phys(table_mem);
+ if (!pt_install_table(pts, phys, attrs)) {
+ iommu_pages_free_incoherent(
+ table_mem,
+ iommu_from_common(pts->range->common)->iommu_device);
+ return -EAGAIN;
+ }
+
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
+ flush_writes_item(pts);
+ pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ /*
+ * The underlying table can't store the physical table address.
+ * This happens when kunit testing tables outside their normal
+ * environment where a CPU might be limited.
+ */
+ pt_load_single_entry(pts);
+ if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
+ pt_clear_entries(pts, ilog2(1));
+ iommu_pages_free_incoherent(
+ table_mem, iommu_from_common(pts->range->common)
+ ->iommu_device);
+ return -EINVAL;
+ }
+ }
+
+ pts->table_lower = table_mem;
+ return 0;
+}
+
+struct pt_iommu_map_args {
+ struct iommu_iotlb_gather *iotlb_gather;
+ struct pt_write_attrs attrs;
+ pt_oaddr_t oa;
+ unsigned int leaf_pgsize_lg2;
+ unsigned int leaf_level;
+};
+
+/*
+ * This will recursively check any tables in the block to validate they are
+ * empty and then free them through the gather.
+ */
+static int clear_contig(const struct pt_state *start_pts,
+ struct iommu_iotlb_gather *iotlb_gather,
+ unsigned int step, unsigned int pgsize_lg2)
+{
+ struct pt_iommu *iommu_table =
+ iommu_from_common(start_pts->range->common);
+ struct pt_range range = *start_pts->range;
+ struct pt_state pts =
+ pt_init(&range, start_pts->level, start_pts->table);
+ struct pt_iommu_collect_args collect = { .check_mapped = true };
+ int ret;
+
+ pts.index = start_pts->index;
+ pts.end_index = start_pts->index + step;
+ for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ collect.free_list =
+ IOMMU_PAGES_LIST_INIT(collect.free_list);
+ ret = pt_walk_descend_all(&pts, __collect_tables,
+ &collect);
+ if (ret)
+ return ret;
+
+ /*
+ * The table item must be cleared before we can update
+ * the gather
+ */
+ pt_clear_entries(&pts, ilog2(1));
+ flush_writes_item(&pts);
+
+ iommu_pages_list_add(&collect.free_list,
+ pt_table_ptr(&pts));
+ gather_range_pages(
+ iotlb_gather, iommu_table, range.va,
+ log2_to_int(pt_table_item_lg2sz(&pts)),
+ &collect.free_list);
+ } else if (pts.type != PT_ENTRY_EMPTY) {
+ return -EADDRINUSE;
+ }
+ }
+ return 0;
+}
+
+static int __map_range_leaf(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
+ unsigned int start_index;
+ pt_oaddr_t oa = map->oa;
+ unsigned int step;
+ bool need_contig;
+ int ret = 0;
+
+ PT_WARN_ON(map->leaf_level != level);
+ PT_WARN_ON(!pt_can_have_leaf(&pts));
+
+ step = log2_to_int_t(unsigned int,
+ leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts));
+ need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts);
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+ if (pts.type != PT_ENTRY_EMPTY || need_contig) {
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ ret = clear_contig(&pts, map->iotlb_gather, step,
+ leaf_pgsize_lg2);
+ if (ret)
+ break;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ pt_index_to_va(&pts);
+ PT_WARN_ON(compute_best_pgsize(&pts, oa) !=
+ leaf_pgsize_lg2);
+ }
+ pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);
+
+ oa += log2_to_int(leaf_pgsize_lg2);
+ pts.index += step;
+ } while (pts.index < pts.end_index);
+
+ flush_writes_range(&pts, start_index, pts.index);
+
+ map->oa = oa;
+ return ret;
+}
+
+static int __map_range(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ int ret;
+
+ PT_WARN_ON(map->leaf_level == level);
+ PT_WARN_ON(!pt_can_have_table(&pts));
+
+ _pt_iter_first(&pts);
+
+ /* Descend to a child table */
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ ret = pt_iommu_new_table(&pts, &map->attrs);
+ if (ret) {
+ /*
+ * Racing with another thread installing a table
+ */
+ if (ret == -EAGAIN)
+ continue;
+ return ret;
+ }
+ } else {
+ pts.table_lower = pt_table_ptr(&pts);
+ /*
+ * Racing with a shared pt_iommu_new_table()? The other
+ * thread is still flushing the cache, so we have to
+ * also flush it to ensure that when our thread's map
+ * completes all the table items leading to our mapping
+ * are visible.
+ *
+ * This requires the pt_set_bit_release() to be a
+ * release of the cache flush so that this can acquire
+ * visibility at the iommu.
+ */
+ if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
+ !pt_test_sw_bit_acquire(&pts,
+ SW_BIT_CACHE_FLUSH_DONE))
+ flush_writes_item(&pts);
+ }
+
+ /*
+ * The already present table can possibly be shared with another
+ * concurrent map.
+ */
+ if (map->leaf_level == level - 1)
+ ret = pt_descend(&pts, arg, __map_range_leaf);
+ else
+ ret = pt_descend(&pts, arg, __map_range);
+ if (ret)
+ return ret;
+
+ pts.index++;
+ pt_index_to_va(&pts);
+ if (pts.index >= pts.end_index)
+ break;
+ } while (true);
+ return 0;
+}
+
+/*
+ * Fast path for the easy case of mapping a 4k page to an already allocated
+ * table. This is a common workload. If it returns EAGAIN run the full algorithm
+ * instead.
+ */
+static __always_inline int __do_map_single_page(struct pt_range *range,
+ void *arg, unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+
+ pts.type = pt_load_single_entry(&pts);
+ if (level == 0) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
+ &map->attrs);
+ /* No flush, not used when incoherent */
+ map->oa += PAGE_SIZE;
+ return 0;
+ }
+ if (pts.type == PT_ENTRY_TABLE)
+ return pt_descend(&pts, arg, descend_fn);
+ /* Something else, use the slow path */
+ return -EAGAIN;
+}
+PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);
+
+/*
+ * Add a table to the top, increasing the top level as much as necessary to
+ * encompass range.
+ */
+static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
+ struct pt_common *common = common_from_iommu(iommu_table);
+ uintptr_t top_of_table = READ_ONCE(common->top_of_table);
+ uintptr_t new_top_of_table = top_of_table;
+ struct pt_table_p *table_mem;
+ unsigned int new_level;
+ spinlock_t *domain_lock;
+ unsigned long flags;
+ int ret;
+
+ while (true) {
+ struct pt_range top_range =
+ _pt_top_range(common, new_top_of_table);
+ struct pt_state pts = pt_init_top(&top_range);
+
+ top_range.va = range->va;
+ top_range.last_va = range->last_va;
+
+ if (!pt_check_range(&top_range) &&
+ map->leaf_level <= pts.level) {
+ new_level = pts.level;
+ break;
+ }
+
+ pts.level++;
+ if (pts.level > PT_MAX_TOP_LEVEL ||
+ pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
+ ret = -ERANGE;
+ goto err_free;
+ }
+
+ table_mem =
+ table_alloc_top(common, _pt_top_set(NULL, pts.level),
+ map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH);
+ if (IS_ERR(table_mem)) {
+ ret = PTR_ERR(table_mem);
+ goto err_free;
+ }
+ iommu_pages_list_add(&free_list, table_mem);
+
+ /* The new table links to the lower table always at index 0 */
+ top_range.va = 0;
+ top_range.top_level = pts.level;
+ pts.table_lower = pts.table;
+ pts.table = table_mem;
+ pt_load_single_entry(&pts);
+ PT_WARN_ON(pts.index != 0);
+ pt_install_table(&pts, virt_to_phys(pts.table_lower),
+ &map->attrs);
+ new_top_of_table = _pt_top_set(pts.table, pts.level);
+ }
+
+ /*
+ * Avoid double flushing, flush it once after all pt_install_table()
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ ret = iommu_pages_start_incoherent_list(
+ &free_list, iommu_table->iommu_device);
+ if (ret)
+ goto err_free;
+ }
+
+ /*
+ * top_of_table is write locked by the spinlock, but readers can use
+ * READ_ONCE() to get the value. Since we encode both the level and the
+ * pointer in one quanta the lockless reader will always see something
+ * valid. The HW must be updated to the new level under the spinlock
+ * before top_of_table is updated so that concurrent readers don't map
+ * into the new level until it is fully functional. If another thread
+ * already updated it while we were working then throw everything away
+ * and try again.
+ */
+ domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table);
+ spin_lock_irqsave(domain_lock, flags);
+ if (common->top_of_table != top_of_table ||
+ top_of_table == new_top_of_table) {
+ spin_unlock_irqrestore(domain_lock, flags);
+ ret = -EAGAIN;
+ goto err_free;
+ }
+
+ /*
+ * We do not issue any flushes for change_top on the expectation that
+ * any walk cache will not become a problem by adding another layer to
+ * the tree. Misses will rewalk from the updated top pointer, hits
+ * continue to be correct. Negative caching is fine too since all the
+ * new IOVA added by the new top is non-present.
+ */
+ iommu_table->driver_ops->change_top(
+ iommu_table, virt_to_phys(table_mem), new_level);
+ WRITE_ONCE(common->top_of_table, new_top_of_table);
+ spin_unlock_irqrestore(domain_lock, flags);
+ return 0;
+
+err_free:
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&free_list);
+ return ret;
+}
+
+static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ int ret;
+
+ do {
+ ret = pt_check_range(range);
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ return ret;
+
+ if (!ret && map->leaf_level <= range->top_level)
+ break;
+
+ ret = increase_top(iommu_table, range, map);
+ if (ret && ret != -EAGAIN)
+ return ret;
+
+ /* Reload the new top */
+ *range = pt_make_range(common, range->va, range->last_va);
+ } while (ret);
+ PT_WARN_ON(pt_check_range(range));
+ return 0;
+}
+
+static int do_map(struct pt_range *range, struct pt_common *common,
+ bool single_page, struct pt_iommu_map_args *map)
+{
+ /*
+ * The __map_single_page() fast path does not support DMA_INCOHERENT
+ * flushing to keep its .text small.
+ */
+ if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ int ret;
+
+ ret = pt_walk_range(range, __map_single_page, map);
+ if (ret != -EAGAIN)
+ return ret;
+ /* EAGAIN falls through to the full path */
+ }
+
+ if (map->leaf_level == range->top_level)
+ return pt_walk_range(range, __map_range_leaf, map);
+ return pt_walk_range(range, __map_range, map);
+}
+
+/**
+ * map_pages() - Install translation for an IOVA range
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ * @mapped: Total bytes successfully mapped
+ *
+ * The range starting at IOVA will have paddr installed into it. The caller
+ * must specify a valid pgsize and pgcount to segment the range into compatible
+ * blocks.
+ *
+ * On error the caller will probably want to invoke unmap on the range from iova
+ * up to the amount indicated by @mapped to return the table back to an
+ * unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes the whole
+ * range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were
+ * mapped are added to @mapped, @mapped is not zerod first.
+ */
+int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct iommu_iotlb_gather iotlb_gather;
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_iommu_map_args map = {
+ .iotlb_gather = &iotlb_gather,
+ .oa = paddr,
+ .leaf_pgsize_lg2 = vaffs(pgsize),
+ };
+ bool single_page = false;
+ struct pt_range range;
+ int ret;
+
+ iommu_iotlb_gather_init(&iotlb_gather);
+
+ if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
+ return -EINVAL;
+
+ /* Check the paddr doesn't exceed what the table can store */
+ if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
+ (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
+ (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
+ oalog2_div(paddr, common->max_oasz_lg2)))
+ return -ERANGE;
+
+ ret = pt_iommu_set_prot(common, &map.attrs, prot);
+ if (ret)
+ return ret;
+ map.attrs.gfp = gfp;
+
+ ret = make_range_no_check(common, &range, iova, len);
+ if (ret)
+ return ret;
+
+ /* Calculate target page size and level for the leaves */
+ if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
+ pgcount == 1) {
+ PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
+ if (log2_mod(iova | paddr, PAGE_SHIFT))
+ return -ENXIO;
+ map.leaf_pgsize_lg2 = PAGE_SHIFT;
+ map.leaf_level = 0;
+ single_page = true;
+ } else {
+ map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
+ pgsize_bitmap, range.va, range.last_va, paddr);
+ if (!map.leaf_pgsize_lg2)
+ return -ENXIO;
+ map.leaf_level =
+ pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+ }
+
+ ret = check_map_range(iommu_table, &range, &map);
+ if (ret)
+ return ret;
+
+ PT_WARN_ON(map.leaf_level > range.top_level);
+
+ ret = do_map(&range, common, single_page, &map);
+
+ /*
+ * Table levels were freed and replaced with large items, flush any walk
+ * cache that may refer to the freed levels.
+ */
+ if (!iommu_pages_list_empty(&iotlb_gather.freelist))
+ iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);
+
+ /* Bytes successfully mapped */
+ PT_WARN_ON(!ret && map.oa - paddr != len);
+ *mapped += map.oa - paddr;
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
+
+struct pt_unmap_args {
+ struct iommu_pages_list free_list;
+ pt_vaddr_t unmapped;
+};
+
+static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_unmap_args *unmap = arg;
+ unsigned int num_oas = 0;
+ unsigned int start_index;
+ int ret = 0;
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ pts.type = pt_load_entry_raw(&pts);
+ /*
+ * A starting index is in the middle of a contiguous entry
+ *
+ * The IOMMU API does not require drivers to support unmapping parts of
+ * large pages. Long ago VFIO would try to split maps but the current
+ * version never does.
+ *
+ * Instead when unmap reaches a partial unmap of the start of a large
+ * IOPTE it should remove the entire IOPTE and return that size to the
+ * caller.
+ */
+ if (pts.type == PT_ENTRY_OA) {
+ if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
+ return -EINVAL;
+ /* Micro optimization */
+ goto start_oa;
+ }
+
+ do {
+ if (pts.type != PT_ENTRY_OA) {
+ bool fully_covered;
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ pts.table_lower = pt_table_ptr(&pts);
+
+ fully_covered = pt_entry_fully_covered(
+ &pts, pt_table_item_lg2sz(&pts));
+
+ ret = pt_descend(&pts, arg, __unmap_range);
+ if (ret)
+ break;
+
+ /*
+ * If the unmapping range fully covers the table then we
+ * can free it as well. The clear is delayed until we
+ * succeed in clearing the lower table levels.
+ */
+ if (fully_covered) {
+ iommu_pages_list_add(&unmap->free_list,
+ pts.table_lower);
+ pt_clear_entries(&pts, ilog2(1));
+ }
+ pts.index++;
+ } else {
+ unsigned int num_contig_lg2;
+start_oa:
+ /*
+ * If the caller requested an last that falls within a
+ * single entry then the entire entry is unmapped and
+ * the length returned will be larger than requested.
+ */
+ num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
+ pt_clear_entries(&pts, num_contig_lg2);
+ num_oas += log2_to_int(num_contig_lg2);
+ pts.index += log2_to_int(num_contig_lg2);
+ }
+ if (pts.index >= pts.end_index)
+ break;
+ pts.type = pt_load_entry_raw(&pts);
+ } while (true);
+
+ unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
+ flush_writes_range(&pts, start_index, pts.index);
+
+ return ret;
+}
+
+/**
+ * unmap_pages() - Make a range of IOVA empty/not present
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @iotlb_gather: Gather struct that must be flushed on return
+ *
+ * unmap_pages() will remove a translation created by map_pages(). It cannot
+ * subdivide a mapping created by map_pages(), so it should be called with IOVA
+ * ranges that match those passed to map_pages(). The IOVA range can aggregate
+ * contiguous map_pages() calls so long as no individual range is split.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: Number of bytes of VA unmapped. iova + res will be the point
+ * unmapping stopped.
+ */
+size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
+ unmap.free_list) };
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
+ if (ret)
+ return 0;
+
+ pt_walk_range(&range, __unmap_range, &unmap);
+
+ gather_range_pages(iotlb_gather, iommu_table, iova, len,
+ &unmap.free_list);
+
+ return unmap.unmapped;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
+
+static void NS(get_info)(struct pt_iommu *iommu_table,
+ struct pt_iommu_info *info)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_top_range(common);
+ struct pt_state pts = pt_init_top(&range);
+ pt_vaddr_t pgsize_bitmap = 0;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
+ for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
+ pts.level++) {
+ if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
+ break;
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+ } else {
+ for (pts.level = 0; pts.level <= range.top_level; pts.level++)
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+
+ /* Hide page sizes larger than the maximum OA */
+ info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
+}
+
+static void NS(deinit)(struct pt_iommu *iommu_table)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+
+ /*
+ * The driver has to already have fenced the HW access to the page table
+ * and invalidated any caching referring to this memory.
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&collect.free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&collect.free_list);
+}
+
+static const struct pt_iommu_ops NS(ops) = {
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
+ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
+ .set_dirty = NS(set_dirty),
+#endif
+ .get_info = NS(get_info),
+ .deinit = NS(deinit),
+};
+
+static int pt_init_common(struct pt_common *common)
+{
+ struct pt_range top_range = pt_top_range(common);
+
+ if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
+ return -EINVAL;
+
+ if (top_range.top_level == PT_MAX_TOP_LEVEL ||
+ common->max_vasz_lg2 == top_range.max_vasz_lg2)
+ common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
+
+ if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
+ common->features |= BIT(PT_FEAT_FULL_VA);
+
+ /* Requested features must match features compiled into this format */
+ if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
+ (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
+ (common->features & PT_FORCE_ENABLED_FEATURES) !=
+ PT_FORCE_ENABLED_FEATURES))
+ return -EOPNOTSUPP;
+
+ /*
+ * Check if the top level of the page table is too small to hold the
+ * specified maxvasz.
+ */
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ top_range.top_level != PT_MAX_TOP_LEVEL) {
+ struct pt_state pts = { .range = &top_range,
+ .level = top_range.top_level };
+
+ if (common->max_vasz_lg2 >
+ pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts))
+ return -EOPNOTSUPP;
+ }
+
+ if (common->max_oasz_lg2 == 0)
+ common->max_oasz_lg2 = pt_max_oa_lg2(common);
+ else
+ common->max_oasz_lg2 = min(common->max_oasz_lg2,
+ pt_max_oa_lg2(common));
+ return 0;
+}
+
+static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
+ struct iommu_domain *domain)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_iommu_info info;
+ struct pt_range range;
+
+ NS(get_info)(iommu_table, &info);
+
+ domain->type = __IOMMU_DOMAIN_PAGING;
+ domain->pgsize_bitmap = info.pgsize_bitmap;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ range = _pt_top_range(common,
+ _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
+ else
+ range = pt_top_range(common);
+
+ /* A 64-bit high address space table on a 32-bit system cannot work. */
+ domain->geometry.aperture_start = (unsigned long)range.va;
+ if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
+ return -EOVERFLOW;
+
+ /*
+ * The aperture is limited to what the API can do after considering all
+ * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
+ * to store a VA. Set the aperture to something that is valid for all
+ * cases. Saturate instead of truncate the end if the types are smaller
+ * than the top range. aperture_end should be called aperture_last.
+ */
+ domain->geometry.aperture_end = (unsigned long)range.last_va;
+ if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
+ domain->geometry.aperture_end = ULONG_MAX;
+ domain->pgsize_bitmap &= ULONG_MAX;
+ }
+ domain->geometry.force_aperture = true;
+
+ return 0;
+}
+
+static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_iommu cfg = *iommu_table;
+
+ static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
+ memset_after(fmt_table, 0, iommu.domain);
+
+ /* The caller can initialize some of these values */
+ iommu_table->iommu_device = cfg.iommu_device;
+ iommu_table->driver_ops = cfg.driver_ops;
+ iommu_table->nid = cfg.nid;
+}
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_table_p *table_mem;
+ int ret;
+
+ if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
+ !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
+ return -EINVAL;
+
+ pt_iommu_zero(fmt_table);
+ common->features = cfg->common.features;
+ common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+ common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
+ ret = pt_iommu_fmt_init(fmt_table, cfg);
+ if (ret)
+ return ret;
+
+ if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
+ return -EINVAL;
+
+ ret = pt_init_common(common);
+ if (ret)
+ return ret;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ WARN_ON(!iommu_table->driver_ops ||
+ !iommu_table->driver_ops->change_top ||
+ !iommu_table->driver_ops->get_top_lock))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
+ (pt_feature(common, PT_FEAT_FULL_VA) ||
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ WARN_ON(!iommu_table->iommu_device))
+ return -EINVAL;
+
+ ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
+ if (ret)
+ return ret;
+
+ table_mem = table_alloc_top(common, common->top_of_table, gfp,
+ ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+ pt_top_set(common, table_mem, pt_top_get_level(common));
+
+ /* Must be last, see pt_iommu_deinit() */
+ iommu_table->ops = &NS(ops);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
+
+#ifdef pt_iommu_fmt_hw_info
+#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
+#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
+void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
+ struct pt_iommu_table_hw_info *info)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range top_range = pt_top_range(common);
+
+ pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
+MODULE_IMPORT_NS("GENERIC_PT");
+/* For iommu_dirty_bitmap_record() */
+MODULE_IMPORT_NS("IOMMUFD");
+
+#endif /* __GENERIC_PT_IOMMU_PT_H */
diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h
new file mode 100644
index 000000000000..68278bf15cfe
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_generic_pt.h
@@ -0,0 +1,823 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Test the format API directly.
+ *
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ int ret;
+
+ KUNIT_ASSERT_EQ(test, len, (size_t)len);
+
+ ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE,
+ GFP_KERNEL);
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret);
+}
+
+#define KUNIT_ASSERT_PT_LOAD(test, pts, entry) \
+ ({ \
+ pt_load_entry(pts); \
+ KUNIT_ASSERT_EQ(test, (pts)->type, entry); \
+ })
+
+struct check_levels_arg {
+ struct kunit *test;
+ void *fn_arg;
+ void (*fn)(struct kunit *test, struct pt_state *pts, void *arg);
+};
+
+static int __check_all_levels(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct check_levels_arg *chk = arg;
+ struct kunit *test = chk->test;
+ int ret;
+
+ _pt_iter_first(&pts);
+
+
+ /*
+ * If we were able to use the full VA space this should always be the
+ * last index in each table.
+ */
+ if (!(IS_32BIT && range->max_vasz_lg2 > 32)) {
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) &&
+ pts.level == pts.range->top_level)
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(range->max_vasz_lg2 - 1 -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ else
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(pt_table_oa_lg2sz(&pts) -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ }
+
+ if (pt_can_have_table(&pts)) {
+ pt_load_single_entry(&pts);
+ KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE);
+ ret = pt_descend(&pts, arg, __check_all_levels);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* Index 0 is used by the test */
+ if (IS_32BIT && !pts.index)
+ return 0;
+ KUNIT_ASSERT_NE(chk->test, pts.index, 0);
+ }
+
+ /*
+ * A format should not create a table with only one entry, at least this
+ * test approach won't work.
+ */
+ KUNIT_ASSERT_GT(chk->test, pts.end_index, 1);
+
+ /*
+ * For increase top we end up using index 0 for the original top's tree,
+ * so use index 1 for testing instead.
+ */
+ pts.index = 0;
+ pt_index_to_va(&pts);
+ pt_load_single_entry(&pts);
+ if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) {
+ pts.index = 1;
+ pt_index_to_va(&pts);
+ }
+ (*chk->fn)(chk->test, &pts, chk->fn_arg);
+ return 0;
+}
+
+/*
+ * Call fn for each level in the table with a pts setup to index 0 in a table
+ * for that level. This allows writing tests that run on every level.
+ * The test can use every index in the table except the last one.
+ */
+static void check_all_levels(struct kunit *test,
+ void (*fn)(struct kunit *test,
+ struct pt_state *pts, void *arg),
+ void *fn_arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct check_levels_arg chk = {
+ .test = test,
+ .fn = fn,
+ .fn_arg = fn_arg,
+ };
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) &&
+ priv->common->max_vasz_lg2 > range.max_vasz_lg2)
+ range.last_va = fvalog2_set_mod_max(range.va,
+ priv->common->max_vasz_lg2);
+
+ /*
+ * Map a page at the highest VA, this will populate all the levels so we
+ * can then iterate over them. Index 0 will be used for testing.
+ */
+ if (IS_32BIT && range.max_vasz_lg2 > 32)
+ range.last_va = (u32)range.last_va;
+ range.va = range.last_va - (priv->smallest_pgsz - 1);
+ do_map(test, range.va, 0, priv->smallest_pgsz);
+
+ range = pt_make_range(priv->common, range.va, range.last_va);
+ ret = pt_walk_range(&range, __check_all_levels, &chk);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+}
+
+static void test_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ /* Fixture does the setup */
+ KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0);
+}
+
+/*
+ * Basic check that the log2_* functions are working, especially at the integer
+ * limits.
+ */
+static void test_bitops(struct kunit *test)
+{
+ int i;
+
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32);
+
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63);
+
+ for (i = 0; i != 31; i++)
+ KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);
+
+ for (i = 0; i != 63; i++)
+ KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);
+
+ for (i = 0; i != 32; i++) {
+ u64 val = get_random_u64();
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0);
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0);
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)),
+ log2_to_max_int_t(u32, ffz_t(u32, val)));
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)),
+ log2_to_max_int_t(u64, ffz_t(u64, val)));
+ }
+}
+
+static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va,
+ pt_vaddr_t last_va, pt_oaddr_t oa)
+{
+ pt_vaddr_t pgsz_lg2;
+
+ /* Brute force the constraints described in pt_compute_best_pgsize() */
+ for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) {
+ if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) &&
+ log2_mod(va, pgsz_lg2) == 0 &&
+ oalog2_mod(oa, pgsz_lg2) == 0 &&
+ va + log2_to_int(pgsz_lg2) - 1 <= last_va &&
+ log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) &&
+ oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2))
+ return pgsz_lg2;
+ }
+ return 0;
+}
+
+/* Check that the bit logic in pt_compute_best_pgsize() works. */
+static void test_best_pgsize(struct kunit *test)
+{
+ unsigned int a_lg2;
+ unsigned int b_lg2;
+ unsigned int c_lg2;
+
+ /* Try random prefixes with every suffix combination */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* 0 prefix, every suffix */
+ for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = 0;
+ pt_oaddr_t oa = 0;
+ pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2);
+
+ KUNIT_ASSERT_EQ(test,
+ pt_compute_best_pgsize(pgsz_bitmap, va, last_va,
+ oa),
+ ref_best_pgsize(pgsz_bitmap, va, last_va, oa));
+ }
+
+ /* 1's prefix, every suffix */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = PT_VADDR_MAX << a_lg2;
+ pt_oaddr_t oa = PT_VADDR_MAX << b_lg2;
+ pt_vaddr_t last_va = PT_VADDR_MAX;
+
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* pgsize_bitmap is always 0 */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = 0;
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ 0);
+ }
+ }
+ }
+
+ if (sizeof(pt_vaddr_t) <= 4)
+ return;
+
+ /* over 32 bit page sizes */
+ for (a_lg2 = 32; a_lg2 != 42; a_lg2++) {
+ for (b_lg2 = 32; b_lg2 != 42; b_lg2++) {
+ for (c_lg2 = 32; c_lg2 != 42; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+}
+
+/*
+ * Check that pt_install_table() and pt_table_pa() match
+ */
+static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_table(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ pt_load_single_entry(pts);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ /* A second install should pass because install updates pts->entry. */
+ KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true);
+
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE);
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static void test_table_ptr(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_table_ptr, NULL);
+}
+
+struct lvl_radix_arg {
+ pt_vaddr_t vbits;
+};
+
+/*
+ * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz() they need to decode a
+ * continuous list of VA across all the levels that covers the entire advertised
+ * VA space.
+ */
+static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct lvl_radix_arg *radix = arg;
+
+ /* Every bit below us is decoded */
+ KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits);
+
+ /* We are not decoding bits someone else is */
+ KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0);
+
+ /* Can't decode past the pt_vaddr_t size */
+ KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2);
+ KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2),
+ 0);
+
+ radix->vbits = fvalog2_set_mod_max(0, table_lg2sz);
+}
+
+static void test_max_va(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+
+ KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2);
+}
+
+static void test_table_radix(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 };
+ struct pt_range range;
+
+ check_all_levels(test, test_lvl_radix, &radix);
+
+ range = pt_top_range(priv->common);
+ if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) {
+ KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX);
+ } else {
+ if (!IS_32BIT)
+ KUNIT_ASSERT_EQ(test,
+ log2_set_mod_max(0, range.max_vasz_lg2),
+ radix.vbits);
+ KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2),
+ 0);
+ }
+}
+
+static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts)
+{
+ struct pt_range top_range = pt_top_range(pts->range->common);
+ struct pt_state top_pts = pt_init_top(&top_range);
+
+ /*
+ * Avoid calling pt_num_items_lg2() on the top, instead we can derive
+ * the size of the top table from the top range.
+ */
+ if (pts->level == top_range.top_level)
+ return ilog2(pt_range_to_end_index(&top_pts));
+ return pt_num_items_lg2(pts);
+}
+
+static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts);
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts)) {
+ KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0);
+ return;
+ }
+
+ /* No bits for sizes that would be outside this table */
+ KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0);
+ KUNIT_ASSERT_EQ(
+ test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0);
+
+ /*
+ * Non contiguous must be supported. AMDv1 has a HW bug where it does
+ * not support it on one of the levels.
+ */
+ if ((u64)pgsize_bitmap != 0xff0000000000ULL ||
+ strcmp(__stringify(PTPFX_RAW), "amdv1") != 0)
+ KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2));
+ else
+ KUNIT_ASSERT_NE(test, pgsize_bitmap, 0);
+
+ /* A contiguous entry should not span the whole table */
+ if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2)
+ KUNIT_ASSERT_FALSE(
+ test,
+ pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2));
+}
+
+static void test_entry_possible_sizes(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_possible_sizes, NULL);
+}
+
+static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts,
+ struct pt_write_attrs *attrs,
+ pt_oaddr_t test_oaddr)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ unsigned int len_lg2;
+
+ if (pts->index != 0)
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ struct pt_state sub_pts = *pts;
+ pt_oaddr_t oaddr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(test_oaddr, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, attrs);
+ /* Verify that every contiguous item translates correctly */
+ for (sub_pts.index = 0;
+ sub_pts.index != log2_to_int(len_lg2 - isz_lg2);
+ sub_pts.index++) {
+ KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA);
+ KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts),
+ oaddr + sub_pts.index *
+ oalog2_mul(1, isz_lg2));
+ KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr);
+ KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts),
+ len_lg2 - isz_lg2);
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+/*
+ * Check that pt_install_leaf_entry() and pt_entry_oa() match.
+ * Check that pt_clear_entries() works.
+ */
+static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2;
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ sweep_all_pgsizes(test, pts, &attrs, priv->test_oa);
+
+ /* Check that the table can store the boundary OAs */
+ sweep_all_pgsizes(test, pts, &attrs, 0);
+ if (max_oa_lg2 == PT_OADDR_MAX_LG2)
+ sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX);
+ else
+ sweep_all_pgsizes(test, pts, &attrs,
+ oalog2_to_max_int(max_oa_lg2));
+}
+
+static void test_entry_oa(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_entry_oa, NULL);
+}
+
+/* Test pt_attr_from_entry() */
+static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int len_lg2;
+ unsigned int prot;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+ for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE |
+ IOMMU_NOEXEC | IOMMU_MMIO);
+ prot++) {
+ pt_oaddr_t oaddr;
+ struct pt_write_attrs attrs = {};
+ u64 good_entry;
+
+ /*
+ * If the format doesn't support this combination of
+ * prot bits skip it
+ */
+ if (pt_iommu_set_prot(pts->range->common, &attrs,
+ prot)) {
+ /* But RW has to be supported */
+ KUNIT_ASSERT_NE(test, prot,
+ IOMMU_READ | IOMMU_WRITE);
+ continue;
+ }
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ good_entry = pts->entry;
+
+ memset(&attrs, 0, sizeof(attrs));
+ pt_attr_from_entry(pts, &attrs);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ /*
+ * The descriptor produced by pt_attr_from_entry()
+ * produce an identical entry value when re-written
+ */
+ KUNIT_ASSERT_EQ(test, good_entry, pts->entry);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+ }
+}
+
+static void test_attr_from_entry(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_attr_from_entry, NULL);
+}
+
+static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int start_idx = pts->index;
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ | IOMMU_WRITE));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ pt_oaddr_t oaddr;
+ unsigned int i;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ pt_load_entry(pts);
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+
+ for (i = 0; i != log2_to_int(len_lg2 - isz_lg2); i++) {
+ /* dirty every contiguous entry */
+ pts->index = start_idx + i;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts));
+ pts->index = start_idx;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts));
+
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+}
+
+static __maybe_unused void test_dirty(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!pt_dirty_supported(priv->common))
+ kunit_skip(test,
+ "Page table features do not support dirty tracking");
+
+ check_all_levels(test, test_lvl_dirty, NULL);
+}
+
+static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ struct pt_write_attrs new_attrs = {};
+ unsigned int bitnr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ pt_install_leaf_entry(pts, paddr, len_lg2, &attrs);
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++) {
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr);
+
+ /* SW bits didn't leak into the attrs */
+ pt_attr_from_entry(pts, &new_attrs);
+ KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs));
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+static __maybe_unused void test_sw_bit_leaf(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_leaf, NULL);
+}
+
+static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ unsigned int bitnr;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) {
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static __maybe_unused void test_sw_bit_table(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_table, NULL);
+}
+
+static struct kunit_case generic_pt_test_cases[] = {
+ KUNIT_CASE_FMT(test_init),
+ KUNIT_CASE_FMT(test_bitops),
+ KUNIT_CASE_FMT(test_best_pgsize),
+ KUNIT_CASE_FMT(test_table_ptr),
+ KUNIT_CASE_FMT(test_max_va),
+ KUNIT_CASE_FMT(test_table_radix),
+ KUNIT_CASE_FMT(test_entry_possible_sizes),
+ KUNIT_CASE_FMT(test_entry_oa),
+ KUNIT_CASE_FMT(test_attr_from_entry),
+#ifdef pt_entry_is_write_dirty
+ KUNIT_CASE_FMT(test_dirty),
+#endif
+#ifdef pt_sw_bit
+ KUNIT_CASE_FMT(test_sw_bit_leaf),
+ KUNIT_CASE_FMT(test_sw_bit_table),
+#endif
+ {},
+};
+
+static int pt_kunit_generic_pt_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_generic_pt_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(generic_pt_suite) = {
+ .name = __stringify(NS(fmt_test)),
+ .init = pt_kunit_generic_pt_init,
+ .exit = pt_kunit_generic_pt_exit,
+ .test_cases = generic_pt_test_cases,
+};
+kunit_test_suites(&NS(generic_pt_suite));
diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h
new file mode 100644
index 000000000000..22c9e4c4dd97
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_KUNIT_IOMMU_H
+#define __GENERIC_PT_KUNIT_IOMMU_H
+
+#define GENERIC_PT_KUNIT 1
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include "../iommu-pages.h"
+#include "pt_iter.h"
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp);
+
+/* The format can provide a list of configurations it would like to test */
+#ifdef kunit_fmt_cfgs
+static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev,
+ char *desc)
+{
+ uintptr_t cfg_id = (uintptr_t)prev;
+
+ cfg_id++;
+ if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1)
+ return NULL;
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u",
+ __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1));
+ return (void *)cfg_id;
+}
+#define KUNIT_CASE_FMT(test_name) \
+ KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg)
+#else
+#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name)
+#endif
+
+#define KUNIT_ASSERT_NO_ERRNO(test, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \
+ ERR_PTR(ret))
+
+#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, \
+ KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \
+ ERR_PTR(ret), fn)
+
+/*
+ * When the test is run on a 32 bit system unsigned long can be 32 bits. This
+ * cause the iommu op signatures to be restricted to 32 bits. Meaning the test
+ * has to be mindful not to create any VA's over the 32 bit limit. Reduce the
+ * scope of the testing as the main purpose of checking on full 32 bit is to
+ * look for 32bitism in the core code. Run the test on i386 with X86_PAE=y to
+ * get the full coverage when dma_addr_t & phys_addr_t are 8 bytes
+ */
+#define IS_32BIT (sizeof(unsigned long) == 4)
+
+struct kunit_iommu_priv {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu_table fmt_table;
+ };
+ spinlock_t top_lock;
+ struct device *dummy_dev;
+ struct pt_iommu *iommu;
+ struct pt_common *common;
+ struct pt_iommu_table_cfg cfg;
+ struct pt_iommu_info info;
+ unsigned int smallest_pgsz_lg2;
+ pt_vaddr_t smallest_pgsz;
+ unsigned int largest_pgsz_lg2;
+ pt_oaddr_t test_oa;
+ pt_vaddr_t safe_pgsize_bitmap;
+ unsigned long orig_nr_secondary_pagetable;
+
+};
+PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain);
+
+static void pt_kunit_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ iommu_put_pages_list(&gather->freelist);
+}
+
+#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x)
+static const struct iommu_domain_ops kunit_pt_ops = {
+ IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW),
+ .iotlb_sync = &pt_kunit_iotlb_sync,
+};
+
+static void pt_kunit_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
+{
+}
+
+static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table)
+{
+ struct kunit_iommu_priv *priv = container_of(
+ iommu_table, struct kunit_iommu_priv, fmt_table.iommu);
+
+ return &priv->top_lock;
+}
+
+static const struct pt_iommu_driver_ops pt_kunit_driver_ops = {
+ .change_top = &pt_kunit_change_top,
+ .get_top_lock = &pt_kunit_get_top_lock,
+};
+
+static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
+{
+ unsigned int va_lg2sz;
+ int ret;
+
+ /* Enough so the memory allocator works */
+ priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev");
+ if (IS_ERR(priv->dummy_dev))
+ return PTR_ERR(priv->dummy_dev);
+ set_dev_node(priv->dummy_dev, NUMA_NO_NODE);
+
+ spin_lock_init(&priv->top_lock);
+
+#ifdef kunit_fmt_cfgs
+ priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1];
+ /*
+ * The format can set a list of features that the kunit_fmt_cfgs
+ * controls, other features are default to on.
+ */
+ priv->cfg.common.features |= PT_SUPPORTED_FEATURES &
+ (~KUNIT_FMT_FEATURES);
+#else
+ priv->cfg.common.features = PT_SUPPORTED_FEATURES;
+#endif
+
+ /* Defaults, for the kunit */
+ if (!priv->cfg.common.hw_max_vasz_lg2)
+ priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+ if (!priv->cfg.common.hw_max_oasz_lg2)
+ priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL);
+
+ priv->fmt_table.iommu.nid = NUMA_NO_NODE;
+ priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops;
+ priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
+ priv->domain.ops = &kunit_pt_ops;
+ ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
+ if (ret) {
+ if (ret == -EOVERFLOW)
+ kunit_skip(
+ test,
+ "This configuration cannot be tested on 32 bit");
+ return ret;
+ }
+
+ priv->iommu = &priv->fmt_table.iommu;
+ priv->common = common_from_iommu(&priv->fmt_table.iommu);
+ priv->iommu->ops->get_info(priv->iommu, &priv->info);
+
+ /*
+ * size_t is used to pass the mapping length, it can be 32 bit, truncate
+ * the pagesizes so we don't use large sizes.
+ */
+ priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap;
+
+ priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap);
+ priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2);
+ priv->largest_pgsz_lg2 =
+ vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1;
+
+ priv->test_oa =
+ oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2);
+
+ /*
+ * We run out of VA space if the mappings get too big, make something
+ * smaller that can safely pass through dma_addr_t API.
+ */
+ va_lg2sz = priv->common->max_vasz_lg2;
+ if (IS_32BIT && va_lg2sz > 32)
+ va_lg2sz = 32;
+ priv->safe_pgsize_bitmap =
+ log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1);
+
+ return 0;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h
new file mode 100644
index 000000000000..e8a63c8ea850
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len);
+
+struct count_valids {
+ u64 per_size[PT_VADDR_MAX_LG2];
+};
+
+static int __count_valids(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct count_valids *valids = arg;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ pt_descend(&pts, arg, __count_valids);
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA) {
+ valids->per_size[pt_entry_oa_lg2sz(&pts)]++;
+ continue;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Number of valid table entries. This counts contiguous entries as a single
+ * valid.
+ */
+static unsigned int count_valids(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++)
+ total += valids.per_size[i];
+ return total;
+}
+
+/* Only a single page size is present, count the number of valid entries */
+static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) {
+ if ((1ULL << i) == pgsz)
+ total = valids.per_size[i];
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[i], 0);
+ }
+ return total;
+}
+
+static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ size_t ret;
+
+ ret = iommu_unmap(&priv->domain, va, len);
+ KUNIT_ASSERT_EQ(test, ret, len);
+}
+
+static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2);
+ pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2);
+
+ for (; pfn != end_pfn; pfn++) {
+ phys_addr_t res = iommu_iova_to_phys(&priv->domain,
+ pfn * priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa);
+ if (res != pa)
+ break;
+ pa += priv->smallest_pgsz;
+ }
+}
+
+static void test_increase_level(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_common *common = priv->common;
+
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format");
+
+ if (IS_32BIT)
+ kunit_skip(test, "Unable to test on 32bit");
+
+ KUNIT_ASSERT_GT(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+
+ /* Add every possible level to the max */
+ while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) {
+ struct pt_range top_range = pt_top_range(common);
+
+ if (top_range.va == 0)
+ do_map(test, top_range.last_va + 1, 0,
+ priv->smallest_pgsz);
+ else
+ do_map(test, top_range.va - priv->smallest_pgsz, 0,
+ priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level,
+ top_range.top_level + 1);
+ KUNIT_ASSERT_GE(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+ }
+}
+
+static void test_map_simple(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t cur_va;
+
+ /* Map every reported page size */
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+
+ cur_va = ALIGN(cur_va, len);
+ do_map(test, cur_va, paddr, len);
+ if (len <= SZ_2G)
+ check_iova(test, cur_va, paddr, len);
+ cur_va += len;
+ }
+
+ /* The read interface reports that every page size was created */
+ range = pt_top_range(priv->common);
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ if (pgsize_bitmap & (1ULL << pgsz_lg2))
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1);
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0);
+ }
+
+ /* Unmap works */
+ range = pt_top_range(priv->common);
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+ cur_va = ALIGN(cur_va, len);
+ do_unmap(test, cur_va, len);
+ cur_va += len;
+ }
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+}
+
+/*
+ * Test to convert a table pointer into an OA by mapping something small,
+ * unmapping it so as to leave behind a table pointer, then mapping something
+ * larger that will convert the table into an OA.
+ */
+static void test_map_table_to_oa(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t limited_pgbitmap =
+ priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G);
+ struct pt_range range = pt_top_range(priv->common);
+ unsigned int pgsz_lg2;
+ pt_vaddr_t max_pgsize;
+ pt_vaddr_t cur_va;
+
+ max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1);
+ KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize);
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+ pt_vaddr_t offset;
+
+ if (!(priv->info.pgsize_bitmap & len))
+ continue;
+ if (len > max_pgsize)
+ break;
+
+ cur_va = ALIGN(range.va + priv->smallest_pgsz * 256,
+ max_pgsize);
+ for (offset = 0; offset != max_pgsize; offset += len)
+ do_map(test, cur_va + offset, paddr + offset, len);
+ check_iova(test, cur_va, paddr, max_pgsize);
+ KUNIT_ASSERT_EQ(test, count_valids_single(test, len),
+ log2_div(max_pgsize, pgsz_lg2));
+
+ if (len == max_pgsize) {
+ do_unmap(test, cur_va, max_pgsize);
+ } else {
+ do_unmap(test, cur_va, max_pgsize / 2);
+ for (offset = max_pgsize / 2; offset != max_pgsize;
+ offset += len)
+ do_unmap(test, cur_va + offset, len);
+ }
+
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+ }
+}
+
+/*
+ * Test unmapping a small page at the start of a large page. This always unmaps
+ * the large page.
+ */
+static void test_unmap_split(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ unsigned int count = 0;
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_vaddr_t base_len = log2_to_int(pgsz_lg2);
+ unsigned int next_pgsz_lg2;
+
+ if (!(pgsize_bitmap & base_len))
+ continue;
+
+ for (next_pgsz_lg2 = pgsz_lg2 + 1;
+ next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) {
+ pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2);
+ pt_vaddr_t vaddr = top_range.va;
+ pt_oaddr_t paddr = 0;
+ size_t gnmapped;
+
+ if (!(pgsize_bitmap & next_len))
+ continue;
+
+ do_map(test, vaddr, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ /* Make sure unmap doesn't keep going */
+ do_map(test, vaddr, paddr, next_len);
+ do_map(test, vaddr + next_len, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr + next_len,
+ next_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ count++;
+ }
+ }
+
+ if (count == 0)
+ kunit_skip(test, "Test needs two page sizes");
+}
+
+static void unmap_collisions(struct kunit *test, struct maple_tree *mt,
+ pt_vaddr_t start, pt_vaddr_t last)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ MA_STATE(mas, mt, start, last);
+ void *entry;
+
+ mtree_lock(mt);
+ mas_for_each(&mas, entry, last) {
+ pt_vaddr_t mas_start = mas.index;
+ pt_vaddr_t len = (mas.last - mas_start) + 1;
+ pt_oaddr_t paddr;
+
+ mas_erase(&mas);
+ mas_pause(&mas);
+ mtree_unlock(mt);
+
+ paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2);
+ check_iova(test, mas_start, paddr, len);
+ do_unmap(test, mas_start, len);
+ mtree_lock(mt);
+ }
+ mtree_unlock(mt);
+}
+
+static void clamp_range(struct kunit *test, struct pt_range *range)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (range->last_va - range->va > SZ_1G)
+ range->last_va = range->va + SZ_1G;
+ KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX);
+ if (range->va <= MAPLE_RESERVED_RANGE)
+ range->va =
+ ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz);
+}
+
+/*
+ * Randomly map and unmap ranges that can large physical pages. If a random
+ * range overlaps with existing ranges then unmap them. This hits all the
+ * special cases.
+ */
+static void test_random_map(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range upper_range = pt_upper_range(priv->common);
+ struct pt_range top_range = pt_top_range(priv->common);
+ struct maple_tree mt;
+ unsigned int iter;
+
+ mt_init(&mt);
+
+ /*
+ * Shrink the range so randomization is more likely to have
+ * intersections
+ */
+ clamp_range(test, &top_range);
+ clamp_range(test, &upper_range);
+
+ for (iter = 0; iter != 1000; iter++) {
+ struct pt_range *range = &top_range;
+ pt_oaddr_t paddr;
+ pt_vaddr_t start;
+ pt_vaddr_t end;
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) &&
+ ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1))
+ range = &upper_range;
+
+ start = get_random_u32_below(
+ min(U32_MAX, range->last_va - range->va));
+ end = get_random_u32_below(
+ min(U32_MAX, range->last_va - start));
+
+ start = ALIGN_DOWN(start, priv->smallest_pgsz);
+ end = ALIGN(end, priv->smallest_pgsz);
+ start += range->va;
+ end += start;
+ if (start < range->va || end > range->last_va + 1 ||
+ start >= end)
+ continue;
+
+ /* Try overmapping to test the failure handling */
+ paddr = oalog2_mod(start, priv->common->max_oasz_lg2);
+ ret = iommu_map(&priv->domain, start, paddr, end - start,
+ IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
+ if (ret) {
+ KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE);
+ unmap_collisions(test, &mt, start, end - 1);
+ do_map(test, start, paddr, end - start);
+ }
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range",
+ mtree_insert_range(&mt, start, end - 1,
+ XA_ZERO_ENTRY,
+ GFP_KERNEL));
+
+ check_iova(test, start, paddr, end - start);
+ if (iter % 100)
+ cond_resched();
+ }
+
+ unmap_collisions(test, &mt, 0, PT_VADDR_MAX);
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+
+ mtree_destroy(&mt);
+}
+
+/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */
+static void test_pgsize_boundary(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+
+ if (top_range.va != 0 || top_range.last_va < 0xfef9ffff ||
+ priv->smallest_pgsz != SZ_4K)
+ kunit_skip(test, "Format does not have the required range");
+
+ do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1);
+}
+
+/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */
+static void test_mixed(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ u64 start = 0x3fe400ULL << 12;
+ u64 end = 0x4c0600ULL << 12;
+ pt_vaddr_t len = end - start;
+ pt_oaddr_t oa = start;
+
+ if (top_range.last_va <= start || sizeof(unsigned long) == 4)
+ kunit_skip(test, "range is too small");
+ if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
+ kunit_skip(test, "incompatible psize");
+
+ do_map(test, start, oa, len);
+ /* 14 2M, 3 1G, 3 2M */
+ KUNIT_ASSERT_EQ(test, count_valids(test), 20);
+ check_iova(test, start, oa, len);
+}
+
+static struct kunit_case iommu_test_cases[] = {
+ KUNIT_CASE_FMT(test_increase_level),
+ KUNIT_CASE_FMT(test_map_simple),
+ KUNIT_CASE_FMT(test_map_table_to_oa),
+ KUNIT_CASE_FMT(test_unmap_split),
+ KUNIT_CASE_FMT(test_random_map),
+ KUNIT_CASE_FMT(test_pgsize_boundary),
+ KUNIT_CASE_FMT(test_mixed),
+ {},
+};
+
+static int pt_kunit_iommu_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->orig_nr_secondary_pagetable =
+ global_node_page_state(NR_SECONDARY_PAGETABLE);
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_iommu_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+ /*
+ * Look for memory leaks, assumes kunit is running isolated and nothing
+ * else is using secondary page tables.
+ */
+ KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable,
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(iommu_suite) = {
+ .name = __stringify(NS(iommu_test)),
+ .init = pt_kunit_iommu_init,
+ .exit = pt_kunit_iommu_exit,
+ .test_cases = iommu_test_cases,
+};
+kunit_test_suites(&NS(iommu_suite));
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kunit for generic page table");
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h
new file mode 100644
index 000000000000..e1123d35c907
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_common.h
@@ -0,0 +1,389 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included after the format. It contains definitions
+ * that build on the format definitions to create the basic format API.
+ *
+ * The format API is listed here, with kdocs. The functions without bodies are
+ * implemented in the format using the pattern:
+ * static inline FMTpt_XXX(..) {..}
+ * #define pt_XXX FMTpt_XXX
+ *
+ * If the format doesn't implement a function then pt_fmt_defaults.h can provide
+ * a generic version.
+ *
+ * The routines marked "@pts: Entry to query" operate on the entire contiguous
+ * entry and can be called with a pts->index pointing to any sub item that makes
+ * up that entry.
+ *
+ * The header order is:
+ * pt_defs.h
+ * FMT.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_PT_COMMON_H
+#define __GENERIC_PT_PT_COMMON_H
+
+#include "pt_defs.h"
+#include "pt_fmt_defaults.h"
+
+/**
+ * pt_attr_from_entry() - Convert the permission bits back to attrs
+ * @pts: Entry to convert from
+ * @attrs: Resulting attrs
+ *
+ * Fill in the attrs with the permission bits encoded in the current leaf entry.
+ * The attrs should be usable with pt_install_leaf_entry() to reconstruct the
+ * same entry.
+ */
+static inline void pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs);
+
+/**
+ * pt_can_have_leaf() - True if the current level can have an OA entry
+ * @pts: The current level
+ *
+ * True if the current level can support pt_install_leaf_entry(). A leaf
+ * entry produce an OA.
+ */
+static inline bool pt_can_have_leaf(const struct pt_state *pts);
+
+/**
+ * pt_can_have_table() - True if the current level can have a lower table
+ * @pts: The current level
+ *
+ * Every level except 0 is allowed to have a lower table.
+ */
+static inline bool pt_can_have_table(const struct pt_state *pts)
+{
+ /* No further tables at level 0 */
+ return pts->level > 0;
+}
+
+/**
+ * pt_clear_entries() - Make entries empty (non-present)
+ * @pts: Starting table index
+ * @num_contig_lg2: Number of contiguous items to clear
+ *
+ * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY
+ * and does not have any effect on table walking. The starting index must be
+ * aligned to num_contig_lg2.
+ */
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2);
+
+/**
+ * pt_entry_make_write_dirty() - Make an entry dirty
+ * @pts: Table entry to change
+ *
+ * Make pt_entry_is_write_dirty() return true for this entry. This can be called
+ * asynchronously with any other table manipulation under a RCU lock and must
+ * not corrupt the table.
+ */
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts);
+
+/**
+ * pt_entry_make_write_clean() - Make the entry write clean
+ * @pts: Table entry to change
+ *
+ * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will
+ * eventually be notified of this change via a TLB flush, which is the point
+ * that the HW must become synchronized. Any "write dirty" prior to the TLB
+ * flush can be lost, but once the TLB flush completes all writes must make
+ * their entries write dirty.
+ *
+ * The format should alter the entry in a way that is compatible with any
+ * concurrent update from HW. The entire contiguous entry is changed.
+ */
+static inline void pt_entry_make_write_clean(struct pt_state *pts);
+
+/**
+ * pt_entry_is_write_dirty() - True if the entry has been written to
+ * @pts: Entry to query
+ *
+ * "write dirty" means that the HW has written to the OA translated
+ * by this entry. If the entry is contiguous then the consolidated
+ * "write dirty" for all the items must be returned.
+ */
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts);
+
+/**
+ * pt_dirty_supported() - True if the page table supports dirty tracking
+ * @common: Page table to query
+ */
+static inline bool pt_dirty_supported(struct pt_common *common);
+
+/**
+ * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the number of contiguous items this leaf entry spans. If the entry
+ * is single item it returns ilog2(1).
+ */
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa() - Output Address for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the output address for the start of the entry. If the entry
+ * is contiguous this returns the same value for each sub-item. I.e.::
+ *
+ * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0
+ *
+ * See pt_item_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa_lg2sz() - Return the size of an OA entry
+ * @pts: Entry to query
+ *
+ * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
+ * it returns the total VA/OA size of the entire contiguous entry.
+ */
+static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
+{
+ return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
+}
+
+/**
+ * pt_entry_oa_exact() - Return the complete OA for an entry
+ * @pts: Entry to query
+ *
+ * During iteration the first entry could have a VA with an offset from the
+ * natural start of the entry. Return the exact OA including the pts's VA
+ * offset.
+ */
+static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts)
+{
+ return _pt_entry_oa_fast(pts) |
+ log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts));
+}
+
+/**
+ * pt_full_va_prefix() - The top bits of the VA
+ * @common: Page table to query
+ *
+ * This is usually 0, but some formats have their VA space going downward from
+ * PT_VADDR_MAX, and will return that instead. This value must always be
+ * adjusted by struct pt_common max_vasz_lg2.
+ */
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common);
+
+/**
+ * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry
+ * @common: Page table to query
+ *
+ * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT).
+ * This is useful to create optimized paths for common cases of PAGE_SIZE
+ * mappings.
+ */
+static inline bool pt_has_system_page_size(const struct pt_common *common);
+
+/**
+ * pt_install_leaf_entry() - Write a leaf entry to the table
+ * @pts: Table index to change
+ * @oa: Output Address for this leaf
+ * @oasz_lg2: Size in VA/OA for this leaf
+ * @attrs: Attributes to modify the entry
+ *
+ * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates
+ * the VA indicated by pts to the given OA.
+ *
+ * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz().
+ * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2.
+ *
+ * This must not be called if pt_can_have_leaf() == false. Contiguous sizes
+ * not indicated by pt_possible_sizes() must not be specified.
+ */
+static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs);
+
+/**
+ * pt_install_table() - Write a table entry to the table
+ * @pts: Table index to change
+ * @table_pa: CPU physical address of the lower table's memory
+ * @attrs: Attributes to modify the table index
+ *
+ * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa
+ * is the table at pts->level - 1. This is done by cmpxchg so pts must have the
+ * current entry loaded. The pts is updated with the installed entry.
+ *
+ * This must not be called if pt_can_have_table() == false.
+ *
+ * Returns: true if the table was installed successfully.
+ */
+static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs);
+
+/**
+ * pt_item_oa() - Output Address for this leaf item
+ * @pts: Item to query
+ *
+ * Return the output address for this item. If the item is part of a contiguous
+ * entry it returns the value of the OA for this individual sub item.
+ *
+ * See pt_entry_oa(). The format should implement one of these two functions
+ * depending on how it stores the OA's in the table.
+ */
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts);
+
+/**
+ * pt_load_entry_raw() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Return the type of entry that was loaded. pts->entry will be filled in with
+ * the entry's content. See pt_load_entry()
+ */
+static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts);
+
+/**
+ * pt_max_oa_lg2() - Return the maximum OA the table format can hold
+ * @common: Page table to query
+ *
+ * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the
+ * OA. This is the absolute maximum address the table can hold. struct pt_common
+ * max_oasz_lg2 sets a lower dynamic maximum based on HW capability.
+ */
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common);
+
+/**
+ * pt_num_items_lg2() - Return the number of items in this table level
+ * @pts: The current level
+ *
+ * The number of items in a table level defines the number of bits this level
+ * decodes from the VA. This function is not called for the top level,
+ * so it does not need to compute a special value for the top case. The
+ * result for the top is based on pt_common max_vasz_lg2.
+ *
+ * The value is used as part of determining the table indexes via the
+ * equation::
+ *
+ * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
+ */
+static inline unsigned int pt_num_items_lg2(const struct pt_state *pts);
+
+/**
+ * pt_pgsz_lg2_to_level - Return the level that maps the page size
+ * @common: Page table to query
+ * @pgsize_lg2: Log2 page size
+ *
+ * Returns the table level that will map the given page size. The page
+ * size must be part of the pt_possible_sizes() for some level.
+ */
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2);
+
+/**
+ * pt_possible_sizes() - Return a bitmap of possible output sizes at this level
+ * @pts: The current level
+ *
+ * Each level has a list of possible output sizes that can be installed as
+ * leaf entries. If pt_can_have_leaf() is false returns zero.
+ *
+ * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating
+ * that a non-contiguous single item leaf entry is supported. The following
+ * pt_num_items_lg2() number of bits can be set indicating contiguous entries
+ * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be
+ * set, contiguous entries cannot span the entire table.
+ *
+ * The OR of pt_possible_sizes() of all levels is the typical bitmask of all
+ * supported sizes in the entire table.
+ */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts);
+
+/**
+ * pt_table_item_lg2sz() - Size of a single item entry in this table level
+ * @pts: The current level
+ *
+ * The size of the item specifies how much VA and OA a single item occupies.
+ *
+ * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous
+ * entries.
+ */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+/**
+ * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table
+ * @pts: The current level
+ *
+ * Return the size of VA decoded by the entire table level.
+ */
+static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts)
+{
+ if (pts->range->top_level == pts->level)
+ return pts->range->max_vasz_lg2;
+ return min_t(unsigned int, pts->range->common->max_vasz_lg2,
+ pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts));
+}
+
+/**
+ * pt_table_pa() - Return the CPU physical address of the table entry
+ * @pts: Entry to query
+ *
+ * This is only ever called on PT_ENTRY_TABLE entries. Must return the same
+ * value passed to pt_install_table().
+ */
+static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts);
+
+/**
+ * pt_table_ptr() - Return a CPU pointer for a table item
+ * @pts: Entry to query
+ *
+ * Same as pt_table_pa() but returns a CPU pointer.
+ */
+static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
+{
+ return __va(pt_table_pa(pts));
+}
+
+/**
+ * pt_max_sw_bit() - Return the maximum software bit usable for any level and
+ * entry
+ * @common: Page table
+ *
+ * The swbit can be passed as bitnr to the other sw_bit functions.
+ */
+static inline unsigned int pt_max_sw_bit(struct pt_common *common);
+
+/**
+ * pt_test_sw_bit_acquire() - Read a software bit in an item
+ * @pts: Entry to read
+ * @bitnr: Bit to read
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a test bit and acquire operation.
+ */
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_set_sw_bit_release() - Set a software bit in an item
+ * @pts: Entry to set
+ * @bitnr: Bit to set
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a set bit and release operation.
+ */
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_load_entry() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Set the type of entry that was loaded. pts->entry and pts->table_lower
+ * will be filled in with the entry's content.
+ */
+static inline void pt_load_entry(struct pt_state *pts)
+{
+ pts->type = pt_load_entry_raw(pts);
+ if (pts->type == PT_ENTRY_TABLE)
+ pts->table_lower = pt_table_ptr(pts);
+}
+#endif
diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h
new file mode 100644
index 000000000000..c25544d72f97
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_defs.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included before the format. It contains definitions
+ * that are required to compile the format. The header order is:
+ * pt_defs.h
+ * fmt_XX.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_DEFS_H
+#define __GENERIC_PT_DEFS_H
+
+#include <linux/generic_pt/common.h>
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/limits.h>
+#include <linux/bug.h>
+#include <linux/kconfig.h>
+#include "pt_log2.h"
+
+/* Header self-compile default defines */
+#ifndef pt_write_attrs
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+#endif
+
+struct pt_table_p;
+
+enum {
+ PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32,
+ PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32,
+};
+
+/*
+ * The format instantiation can have features wired off or on to optimize the
+ * code gen. Supported features are just a reflection of what the current set of
+ * kernel users want to use.
+ */
+#ifndef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES 0
+#endif
+
+/*
+ * When in debug mode we compile all formats with all features. This allows the
+ * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or
+ * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+enum {
+ PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
+ PT_DEBUG_SUPPORTED_FEATURES =
+ UINT_MAX &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ?
+ 0 :
+ BIT(PT_FEAT_DMA_INCOHERENT))) &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
+ BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
+ BIT(PT_FEAT_SIGN_EXTEND)),
+};
+#undef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES
+#endif
+
+#ifndef PT_FORCE_ENABLED_FEATURES
+#define PT_FORCE_ENABLED_FEATURES 0
+#endif
+
+/**
+ * DOC: Generic Page Table Language
+ *
+ * Language used in Generic Page Table
+ * VA
+ * The input address to the page table, often the virtual address.
+ * OA
+ * The output address from the page table, often the physical address.
+ * leaf
+ * An entry that results in an output address.
+ * start/end
+ * An half-open range, e.g. [0,0) refers to no VA.
+ * start/last
+ * An inclusive closed range, e.g. [0,0] refers to the VA 0
+ * common
+ * The generic page table container struct pt_common
+ * level
+ * Level 0 is always a table of only leaves with no futher table pointers.
+ * Increasing levels increase the size of the table items. The least
+ * significant VA bits used to index page tables are used to index the Level
+ * 0 table. The various labels for table levels used by HW descriptions are
+ * not used.
+ * top_level
+ * The inclusive highest level of the table. A two-level table
+ * has a top level of 1.
+ * table
+ * A linear array of translation items for that level.
+ * index
+ * The position in a table of an element: item = table[index]
+ * item
+ * A single index in a table
+ * entry
+ * A single logical element in a table. If contiguous pages are not
+ * supported then item and entry are the same thing, otherwise entry refers
+ * to all the items that comprise a single contiguous translation.
+ * item/entry_size
+ * The number of bytes of VA the table index translates for.
+ * If the item is a table entry then the next table covers
+ * this size. If the entry translates to an output address then the
+ * full OA is: OA | (VA % entry_size)
+ * contig_count
+ * The number of consecutive items fused into a single entry.
+ * item_size * contig_count is the size of that entry's translation.
+ * lg2
+ * Indicates the value is encoded as log2, i.e. 1<<x is the actual value.
+ * Normally the compiler is fine to optimize divide and mod with log2 values
+ * automatically when inlining, however if the values are not constant
+ * expressions it can't. So we do it by hand; we want to avoid 64-bit
+ * divmod.
+ */
+
+/* Returned by pt_load_entry() and for_each_pt_level_entry() */
+enum pt_entry_type {
+ PT_ENTRY_EMPTY,
+ /* Entry is valid and points to a lower table level */
+ PT_ENTRY_TABLE,
+ /* Entry is valid and returns an output address */
+ PT_ENTRY_OA,
+};
+
+struct pt_range {
+ struct pt_common *common;
+ struct pt_table_p *top_table;
+ pt_vaddr_t va;
+ pt_vaddr_t last_va;
+ u8 top_level;
+ u8 max_vasz_lg2;
+};
+
+/*
+ * Similar to xa_state, this records information about an in-progress parse at a
+ * single level.
+ */
+struct pt_state {
+ struct pt_range *range;
+ struct pt_table_p *table;
+ struct pt_table_p *table_lower;
+ u64 entry;
+ enum pt_entry_type type;
+ unsigned short index;
+ unsigned short end_index;
+ u8 level;
+};
+
+#define pt_cur_table(pts, type) ((type *)((pts)->table))
+
+/*
+ * Try to install a new table pointer. The locking methodology requires this to
+ * be atomic (multiple threads can race to install a pointer). The losing
+ * threads will fail the atomic and return false. They should free any memory
+ * and reparse the table level again.
+ */
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
+{
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ bool ret;
+
+ /*
+ * Ensure the zero'd table content itself is visible before its PTE can
+ * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg64_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
+#endif
+
+static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry)
+{
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ bool ret;
+
+ /*
+ * Ensure the zero'd table content itself is visible before its PTE can
+ * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
+
+#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr))
+
+static inline bool pt_feature(const struct pt_common *common,
+ unsigned int feature_nr)
+{
+ if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr))
+ return true;
+ if (!PT_SUPPORTED_FEATURE(feature_nr))
+ return false;
+ return common->features & BIT(feature_nr);
+}
+
+static inline bool pts_feature(const struct pt_state *pts,
+ unsigned int feature_nr)
+{
+ return pt_feature(pts->range->common, feature_nr);
+}
+
+/*
+ * PT_WARN_ON is used for invariants that the kunit should be checking can't
+ * happen.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+#define PT_WARN_ON WARN_ON
+#else
+static inline bool PT_WARN_ON(bool condition)
+{
+ return false;
+}
+#endif
+
+/* These all work on the VA type */
+#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2)
+#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2)
+#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2)
+#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2)
+#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2)
+#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2)
+#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2)
+#define vaffs(a) ffs_t(pt_vaddr_t, a)
+#define vafls(a) fls_t(pt_vaddr_t, a)
+#define vaffz(a) ffz_t(pt_vaddr_t, a)
+
+/*
+ * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and
+ * generate a useful defined result. The non-fva versions will malfunction at
+ * this extreme.
+ */
+static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return 0;
+ return log2_div_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return a;
+ return log2_mod_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b,
+ unsigned int c_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2)
+ return true;
+ return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val,
+ unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return val;
+ return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return PT_VADDR_MAX;
+ return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2);
+}
+
+/* These all work on the OA type */
+#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2)
+#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2)
+#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2)
+#define oaffs(a) ffs_t(pt_oaddr_t, a)
+#define oafls(a) fls_t(pt_oaddr_t, a)
+#define oaffz(a) ffz_t(pt_oaddr_t, a)
+
+static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ return top_level | (uintptr_t)table_mem;
+}
+
+static inline void pt_top_set(struct pt_common *common,
+ struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level));
+}
+
+static inline void pt_top_set_level(struct pt_common *common,
+ unsigned int top_level)
+{
+ pt_top_set(common, NULL, top_level);
+}
+
+static inline unsigned int pt_top_get_level(const struct pt_common *common)
+{
+ return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS);
+}
+
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2);
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
new file mode 100644
index 000000000000..69fb7c2314ca
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Default definitions for formats that don't define these functions.
+ */
+#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
+#define __GENERIC_PT_PT_FMT_DEFAULTS_H
+
+#include "pt_defs.h"
+#include <linux/log2.h>
+
+/* Header self-compile default defines */
+#ifndef pt_load_entry_raw
+#include "fmt/amdv1.h"
+#endif
+
+/*
+ * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and
+ * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top.
+ */
+#ifndef pt_table_item_lg2sz
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts)
+{
+ return PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level;
+}
+#endif
+
+#ifndef pt_pgsz_lg2_to_level
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2)
+{
+ return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) /
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
+}
+#endif
+
+/*
+ * If not supplied by the format then contiguous pages are not supported.
+ *
+ * If contiguous pages are supported then the format must also provide
+ * pt_contig_count_lg2() if it supports a single contiguous size per level,
+ * or pt_possible_sizes() if it supports multiple sizes per level.
+ */
+#ifndef pt_entry_num_contig_lg2
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+
+/*
+ * Return the number of contiguous OA items forming an entry at this table level
+ */
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+#endif
+
+/* If not supplied by the format then dirty tracking is not supported */
+#ifndef pt_entry_is_write_dirty
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ return false;
+}
+
+static inline void pt_entry_make_write_clean(struct pt_state *pts)
+{
+}
+
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return false;
+}
+#else
+/* If not supplied then dirty tracking is always enabled */
+#ifndef pt_dirty_supported
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return true;
+}
+#endif
+#endif
+
+#ifndef pt_entry_make_write_dirty
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ return false;
+}
+#endif
+
+/*
+ * Format supplies either:
+ * pt_entry_oa - OA is at the start of a contiguous entry
+ * or
+ * pt_item_oa - OA is adjusted for every item in a contiguous entry
+ *
+ * Build the missing one
+ *
+ * The internal helper _pt_entry_oa_fast() allows generating
+ * an efficient pt_entry_oa_exact(), it doesn't care which
+ * option is selected.
+ */
+#ifdef pt_entry_oa
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts)
+{
+ return pt_entry_oa(pts) |
+ log2_mul(pts->index, pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_entry_oa
+#endif
+
+#ifdef pt_item_oa
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts)
+{
+ return log2_set_mod(pt_item_oa(pts), 0,
+ pt_entry_num_contig_lg2(pts) +
+ pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_item_oa
+#endif
+
+/*
+ * If not supplied by the format then use the constant
+ * PT_MAX_OUTPUT_ADDRESS_LG2.
+ */
+#ifndef pt_max_oa_lg2
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common)
+{
+ return PT_MAX_OUTPUT_ADDRESS_LG2;
+}
+#endif
+
+#ifndef pt_has_system_page_size
+static inline bool pt_has_system_page_size(const struct pt_common *common)
+{
+ return PT_GRANULE_LG2SZ == PAGE_SHIFT;
+}
+#endif
+
+/*
+ * If not supplied by the format then assume only one contiguous size determined
+ * by pt_contig_count_lg2()
+ */
+#ifndef pt_possible_sizes
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts);
+
+/* Return a bitmap of possible leaf page sizes at this level */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+ return log2_to_int(isz_lg2) |
+ log2_to_int(pt_contig_count_lg2(pts) + isz_lg2);
+}
+#endif
+
+/* If not supplied by the format then use 0. */
+#ifndef pt_full_va_prefix
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common)
+{
+ return 0;
+}
+#endif
+
+/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */
+#ifndef pt_clear_entries
+static inline void pt_clear_entries64(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries32(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u32 *tablep = pt_cur_table(pts, u32) + pts->index;
+ u32 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ if (PT_ITEM_WORD_SIZE == sizeof(u32))
+ pt_clear_entries32(pts, num_contig_lg2);
+ else
+ pt_clear_entries64(pts, num_contig_lg2);
+}
+#define pt_clear_entries pt_clear_entries
+#endif
+
+/* If not supplied then SW bits are not supported */
+#ifdef pt_sw_bit
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ /* Acquire, pairs with pt_set_sw_bit_release() */
+ smp_mb();
+ /* For a contiguous entry the sw bit is only stored in the first item. */
+ return pts->entry & pt_sw_bit(bitnr);
+}
+#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+ if (PT_ITEM_WORD_SIZE == sizeof(u64)) {
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ u64 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ return;
+ }
+#endif
+ if (PT_ITEM_WORD_SIZE == sizeof(u32)) {
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ u32 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ } else
+ BUILD_BUG();
+}
+#define pt_set_sw_bit_release pt_set_sw_bit_release
+#else
+static inline unsigned int pt_max_sw_bit(struct pt_common *common)
+{
+ return 0;
+}
+
+extern void __pt_no_sw_bit(void);
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+ return false;
+}
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+}
+#endif
+
+/*
+ * Format can call in the pt_install_leaf_entry() to check the arguments are all
+ * aligned correctly.
+ */
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2)))
+ return false;
+
+#ifdef pt_possible_sizes
+ if (PT_WARN_ON(isz_lg2 > oasz_lg2 ||
+ oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts)))
+ return false;
+#else
+ if (PT_WARN_ON(oasz_lg2 != isz_lg2 &&
+ oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts)))
+ return false;
+#endif
+
+ if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2)))
+ return false;
+ return true;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h
new file mode 100644
index 000000000000..c0d8617cce29
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_iter.h
@@ -0,0 +1,636 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Iterators for Generic Page Table
+ */
+#ifndef __GENERIC_PT_PT_ITER_H
+#define __GENERIC_PT_PT_ITER_H
+
+#include "pt_common.h"
+
+#include <linux/errno.h>
+
+/*
+ * Use to mangle symbols so that backtraces and the symbol table are
+ * understandable. Any non-inlined function should get mangled like this.
+ */
+#define NS(fn) CONCATENATE(PTPFX, fn)
+
+/**
+ * pt_check_range() - Validate the range can be iterated
+ * @range: Range to validate
+ *
+ * Check that VA and last_va fall within the permitted range of VAs. If the
+ * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension
+ * is correct.
+ */
+static inline int pt_check_range(struct pt_range *range)
+{
+ pt_vaddr_t prefix;
+
+ PT_WARN_ON(!range->max_vasz_lg2);
+
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) {
+ PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2);
+ prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ?
+ PT_VADDR_MAX :
+ 0;
+ } else {
+ prefix = pt_full_va_prefix(range->common);
+ }
+
+ if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) ||
+ !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2))
+ return -ERANGE;
+ return 0;
+}
+
+/**
+ * pt_index_to_va() - Update range->va to the current pts->index
+ * @pts: Iteration State
+ *
+ * Adjust range->va to match the current index. This is done in a lazy manner
+ * since computing the VA takes several instructions and is rarely required.
+ */
+static inline void pt_index_to_va(struct pt_state *pts)
+{
+ pt_vaddr_t lower_va;
+
+ lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts));
+ pts->range->va = fvalog2_set_mod(pts->range->va, lower_va,
+ pt_table_oa_lg2sz(pts));
+}
+
+/*
+ * Add index_count_lg2 number of entries to pts's VA and index. The VA will be
+ * adjusted to the end of the contiguous block if it is currently in the middle.
+ */
+static inline void _pt_advance(struct pt_state *pts,
+ unsigned int index_count_lg2)
+{
+ pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
+ index_count_lg2);
+}
+
+/**
+ * pt_entry_fully_covered() - Check if the item or entry is entirely contained
+ * within pts->range
+ * @pts: Iteration State
+ * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or
+ * pt_entry_oa_lg2sz()
+ *
+ * Returns: true if the item is fully enclosed by the pts->range.
+ */
+static inline bool pt_entry_fully_covered(const struct pt_state *pts,
+ unsigned int oasz_lg2)
+{
+ struct pt_range *range = pts->range;
+
+ /* Range begins at the start of the entry */
+ if (log2_mod(pts->range->va, oasz_lg2))
+ return false;
+
+ /* Range ends past the end of the entry */
+ if (!log2_div_eq(range->va, range->last_va, oasz_lg2))
+ return true;
+
+ /* Range ends at the end of the entry */
+ return log2_mod_eq_max(range->last_va, oasz_lg2);
+}
+
+/**
+ * pt_range_to_index() - Starting index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: the starting index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ PT_WARN_ON(pts->level > pts->range->top_level);
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->va,
+ pts->range->max_vasz_lg2),
+ isz_lg2);
+ return log2_mod(log2_div(pts->range->va, isz_lg2),
+ pt_num_items_lg2(pts));
+}
+
+/**
+ * pt_range_to_end_index() - Ending index iteration
+ * @pts: Iteration State
+ *
+ * Return: the last index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_range *range = pts->range;
+ unsigned int num_entries_lg2;
+
+ if (range->va == range->last_va)
+ return pts->index + 1;
+
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->last_va,
+ pts->range->max_vasz_lg2),
+ isz_lg2) +
+ 1;
+
+ num_entries_lg2 = pt_num_items_lg2(pts);
+
+ /* last_va falls within this table */
+ if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2))
+ return log2_mod(log2_div(pts->range->last_va, isz_lg2),
+ num_entries_lg2) +
+ 1;
+
+ return log2_to_int(num_entries_lg2);
+}
+
+static inline void _pt_iter_first(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pts->end_index = pt_range_to_end_index(pts);
+ PT_WARN_ON(pts->index > pts->end_index);
+}
+
+static inline bool _pt_iter_load(struct pt_state *pts)
+{
+ if (pts->index >= pts->end_index)
+ return false;
+ pt_load_entry(pts);
+ return true;
+}
+
+/**
+ * pt_next_entry() - Advance pts to the next entry
+ * @pts: Iteration State
+ *
+ * Update pts to go to the next index at this level. If pts is pointing at a
+ * contiguous entry then the index may advance my more than one.
+ */
+static inline void pt_next_entry(struct pt_state *pts)
+{
+ if (pts->type == PT_ENTRY_OA &&
+ !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0))
+ _pt_advance(pts, pt_entry_num_contig_lg2(pts));
+ else
+ pts->index++;
+ pt_index_to_va(pts);
+}
+
+/**
+ * for_each_pt_level_entry() - For loop wrapper over entries in the range
+ * @pts: Iteration State
+ *
+ * This is the basic iteration primitive. It iterates over all the entries in
+ * pts->range that fall within the pts's current table level. Each step does
+ * pt_load_entry(pts).
+ */
+#define for_each_pt_level_entry(pts) \
+ for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts))
+
+/**
+ * pt_load_single_entry() - Version of pt_load_entry() usable within a walker
+ * @pts: Iteration State
+ *
+ * Alternative to for_each_pt_level_entry() if the walker function uses only a
+ * single entry.
+ */
+static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pt_load_entry(pts);
+ return pts->type;
+}
+
+static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = {
+ .common = common,
+ .top_table =
+ (struct pt_table_p *)(top_of_table &
+ ~(uintptr_t)PT_TOP_LEVEL_MASK),
+ .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
+ };
+ struct pt_state pts = { .range = &range, .level = range.top_level };
+ unsigned int max_vasz_lg2;
+
+ max_vasz_lg2 = common->max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ pts.level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2,
+ pt_num_items_lg2(&pts) +
+ pt_table_item_lg2sz(&pts));
+
+ /*
+ * The top range will default to the lower region only with sign extend.
+ */
+ range.max_vasz_lg2 = max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ max_vasz_lg2--;
+
+ range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2);
+ range.last_va =
+ fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_top_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static __always_inline struct pt_range pt_top_range(struct pt_common *common)
+{
+ /*
+ * The top pointer can change without locking. We capture the value and
+ * it's level here and are safe to walk it so long as both values are
+ * captured without tearing.
+ */
+ return _pt_top_range(common, READ_ONCE(common->top_of_table));
+}
+
+/**
+ * pt_all_range() - Return a range that spans the entire page table
+ * @common: Table
+ *
+ * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND
+ * is supported range->va and range->last_va will be incorrect during the
+ * iteration and must not be accessed.
+ */
+static inline struct pt_range pt_all_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ /*
+ * Pretend the table is linear from 0 without a sign extension. This
+ * generates the correct indexes for iteration.
+ */
+ range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_upper_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static inline struct pt_range pt_upper_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1);
+ range.last_va = PT_VADDR_MAX;
+ return range;
+}
+
+/**
+ * pt_make_range() - Return a range that spans part of the table
+ * @common: Table
+ * @va: Start address
+ * @last_va: Last address
+ *
+ * The caller must validate the range with pt_check_range() before using it.
+ */
+static __always_inline struct pt_range
+pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va)
+{
+ struct pt_range range =
+ _pt_top_range(common, READ_ONCE(common->top_of_table));
+
+ range.va = va;
+ range.last_va = last_va;
+
+ return range;
+}
+
+/*
+ * Span a slice of the table starting at a lower table level from an active
+ * walk.
+ */
+static __always_inline struct pt_range
+pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va,
+ pt_vaddr_t last_va)
+{
+ struct pt_range range = *parent;
+
+ range.va = va;
+ range.last_va = last_va;
+
+ PT_WARN_ON(last_va < va);
+ PT_WARN_ON(pt_check_range(&range));
+
+ return range;
+}
+
+/**
+ * pt_init() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ * @level: Table level for the state
+ * @table: Pointer to the table memory at level
+ *
+ * Helper to initialize the on-stack pt_state from walker arguments.
+ */
+static __always_inline struct pt_state
+pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = {
+ .range = range,
+ .table = table,
+ .level = level,
+ };
+ return pts;
+}
+
+/**
+ * pt_init_top() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ *
+ * The pt_state points to the top most level.
+ */
+static __always_inline struct pt_state pt_init_top(struct pt_range *range)
+{
+ return pt_init(range, range->top_level, range->top_table);
+}
+
+typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table);
+
+/**
+ * pt_descend() - Recursively invoke the walker for the lower level
+ * @pts: Iteration State
+ * @arg: Value to pass to the function
+ * @fn: Walker function to call
+ *
+ * pts must point to a table item. Invoke fn as a walker on the table
+ * pts points to.
+ */
+static __always_inline int pt_descend(struct pt_state *pts, void *arg,
+ pt_level_fn_t fn)
+{
+ int ret;
+
+ if (PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower);
+ return ret;
+}
+
+/**
+ * pt_walk_range() - Walk over a VA range
+ * @range: Range pointer
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * Walk over a VA range. The caller should have done a validity check, at
+ * least calling pt_check_range(), when building range. The walk will
+ * start at the top most table.
+ */
+static __always_inline int pt_walk_range(struct pt_range *range,
+ pt_level_fn_t fn, void *arg)
+{
+ return fn(range, arg, range->top_level, range->top_table);
+}
+
+/*
+ * pt_walk_descend() - Recursively invoke the walker for a slice of a lower
+ * level
+ * @pts: Iteration State
+ * @va: Start address
+ * @last_va: Last address
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and over a slice of the
+ * lower table. The caller must ensure that va/last_va are within the table
+ * item. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int pt_walk_descend(const struct pt_state *pts,
+ pt_vaddr_t va, pt_vaddr_t last_va,
+ pt_level_fn_t fn, void *arg)
+{
+ struct pt_range range = pt_make_child_range(pts->range, va, last_va);
+
+ if (PT_WARN_ON(!pt_can_have_table(pts)) ||
+ PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ return fn(&range, arg, pts->level - 1, pts->table_lower);
+}
+
+/*
+ * pt_walk_descend_all() - Recursively invoke the walker for a table item
+ * @parent_pts: Iteration State
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and over the entire lower
+ * table. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int
+pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
+ void *arg)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts);
+
+ return pt_walk_descend(parent_pts,
+ log2_set_mod(parent_pts->range->va, 0, isz_lg2),
+ log2_set_mod_max(parent_pts->range->va, isz_lg2),
+ fn, arg);
+}
+
+/**
+ * pt_range_slice() - Return a range that spans indexes
+ * @pts: Iteration State
+ * @start_index: Starting index within pts
+ * @end_index: Ending index within pts
+ *
+ * Create a range than spans an index range of the current table level
+ * pt_state points at.
+ */
+static inline struct pt_range pt_range_slice(const struct pt_state *pts,
+ unsigned int start_index,
+ unsigned int end_index)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ pt_vaddr_t last_va;
+ pt_vaddr_t va;
+
+ va = fvalog2_set_mod(pts->range->va,
+ log2_mul(start_index, pt_table_item_lg2sz(pts)),
+ table_lg2sz);
+ last_va = fvalog2_set_mod(
+ pts->range->va,
+ log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz);
+ return pt_make_child_range(pts->range, va, last_va);
+}
+
+/**
+ * pt_top_memsize_lg2()
+ * @common: Table
+ * @top_of_table: Top of table value from _pt_top_set()
+ *
+ * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this
+ * will compute the top size assuming the table will grow.
+ */
+static inline unsigned int pt_top_memsize_lg2(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = _pt_top_range(common, top_of_table);
+ struct pt_state pts = pt_init_top(&range);
+ unsigned int num_items_lg2;
+
+ num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts);
+ if (range.top_level != PT_MAX_TOP_LEVEL &&
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts));
+
+ /* Round up the allocation size to the minimum alignment */
+ return max(ffs_t(u64, PT_TOP_PHYS_MASK),
+ num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE));
+}
+
+/**
+ * pt_compute_best_pgsize() - Determine the best page size for leaf entries
+ * @pgsz_bitmap: Permitted page sizes
+ * @va: Starting virtual address for the leaf entry
+ * @last_va: Last virtual address for the leaf entry, sets the max page size
+ * @oa: Starting output address for the leaf entry
+ *
+ * Compute the largest page size for va, last_va, and oa together and return it
+ * in lg2. The largest page size depends on the format's supported page sizes at
+ * this level, and the relative alignment of the VA and OA addresses. 0 means
+ * the OA cannot be stored with the provided pgsz_bitmap.
+ */
+static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
+ pt_vaddr_t va,
+ pt_vaddr_t last_va,
+ pt_oaddr_t oa)
+{
+ unsigned int best_pgsz_lg2;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t len = last_va - va + 1;
+ pt_vaddr_t mask;
+
+ if (PT_WARN_ON(va >= last_va))
+ return 0;
+
+ /*
+ * Given a VA/OA pair the best page size is the largest page size
+ * where:
+ *
+ * 1) VA and OA start at the page. Bitwise this is the count of least
+ * significant 0 bits.
+ * This also implies that last_va/oa has the same prefix as va/oa.
+ */
+ mask = va | oa;
+
+ /*
+ * 2) The page size is not larger than the last_va (length). Since page
+ * sizes are always power of two this can't be larger than the
+ * largest power of two factor of the length.
+ */
+ mask |= log2_to_int(vafls(len) - 1);
+
+ best_pgsz_lg2 = vaffs(mask);
+
+ /* Choose the highest bit <= best_pgsz_lg2 */
+ if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1)
+ pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1);
+
+ pgsz_lg2 = vafls(pgsz_bitmap);
+ if (!pgsz_lg2)
+ return 0;
+
+ pgsz_lg2--;
+
+ PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0);
+ PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0);
+ PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va);
+ PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ PT_WARN_ON(
+ !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ return pgsz_lg2;
+}
+
+#define _PT_MAKE_CALL_LEVEL(fn) \
+ static __always_inline int fn(struct pt_range *range, void *arg, \
+ unsigned int level, \
+ struct pt_table_p *table) \
+ { \
+ static_assert(PT_MAX_TOP_LEVEL <= 5); \
+ if (level == 0) \
+ return CONCATENATE(fn, 0)(range, arg, 0, table); \
+ if (level == 1 || PT_MAX_TOP_LEVEL == 1) \
+ return CONCATENATE(fn, 1)(range, arg, 1, table); \
+ if (level == 2 || PT_MAX_TOP_LEVEL == 2) \
+ return CONCATENATE(fn, 2)(range, arg, 2, table); \
+ if (level == 3 || PT_MAX_TOP_LEVEL == 3) \
+ return CONCATENATE(fn, 3)(range, arg, 3, table); \
+ if (level == 4 || PT_MAX_TOP_LEVEL == 4) \
+ return CONCATENATE(fn, 4)(range, arg, 4, table); \
+ return CONCATENATE(fn, 5)(range, arg, 5, table); \
+ }
+
+static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg,
+ unsigned int unused_level,
+ struct pt_table_p *table)
+{
+ static_assert(PT_MAX_TOP_LEVEL <= 5);
+ return -EPROTOTYPE;
+}
+
+#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \
+ static inline int fn(struct pt_range *range, void *arg, \
+ unsigned int unused_level, \
+ struct pt_table_p *table) \
+ { \
+ return do_fn(range, arg, level, table, descend_fn); \
+ }
+
+/**
+ * PT_MAKE_LEVELS() - Build an unwound walker
+ * @fn: Name of the walker function
+ * @do_fn: Function to call at each level
+ *
+ * This builds a function call tree that can be fully inlined.
+ * The caller must provide a function body in an __always_inline function::
+ *
+ * static __always_inline int do_fn(struct pt_range *range, void *arg,
+ * unsigned int level, struct pt_table_p *table,
+ * pt_level_fn_t descend_fn)
+ *
+ * An inline function will be created for each table level that calls do_fn with
+ * a compile time constant for level and a pointer to the next lower function.
+ * This generates an optimally inlined walk where each of the functions sees a
+ * constant level and can codegen the exact constants/etc for that level.
+ *
+ * Note this can produce a lot of code!
+ */
+#define PT_MAKE_LEVELS(fn, do_fn) \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \
+ do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \
+ _PT_MAKE_CALL_LEVEL(fn)
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h
new file mode 100644
index 000000000000..6dbbed119238
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_log2.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Helper macros for working with log2 values
+ *
+ */
+#ifndef __GENERIC_PT_LOG2_H
+#define __GENERIC_PT_LOG2_H
+#include <linux/bitops.h>
+#include <linux/limits.h>
+
+/* Compute a */
+#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2)))
+static_assert(log2_to_int_t(unsigned int, 0) == 1);
+
+/* Compute a - 1 (aka all low bits set) */
+#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
+
+/* Compute a / b */
+#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2)))
+static_assert(log2_div_t(unsigned int, 4, 2) == 1);
+
+/*
+ * Compute:
+ * a / c == b / c
+ * aka the high bits are equal
+ */
+#define log2_div_eq_t(type, a, b, c_lg2) \
+ (log2_div_t(type, (a) ^ (b), c_lg2) == 0)
+static_assert(log2_div_eq_t(unsigned int, 1, 1, 2));
+
+/* Compute a % b */
+#define log2_mod_t(type, a, b_lg2) \
+ ((type)(((type)a) & log2_to_max_int_t(type, b_lg2)))
+static_assert(log2_mod_t(unsigned int, 1, 2) == 1);
+
+/*
+ * Compute:
+ * a % b == b - 1
+ * aka the low bits are all 1s
+ */
+#define log2_mod_eq_max_t(type, a, b_lg2) \
+ (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2))
+static_assert(log2_mod_eq_max_t(unsigned int, 3, 2));
+
+/*
+ * Return a value such that:
+ * a / b == ret / b
+ * ret % b == val
+ * aka set the low bits to val. val must be < b
+ */
+#define log2_set_mod_t(type, a, val, b_lg2) \
+ ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val)))
+static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1);
+
+/* Return a value such that:
+ * a / b == ret / b
+ * ret % b == b - 1
+ * aka set the low bits to all 1s
+ */
+#define log2_set_mod_max_t(type, a, b_lg2) \
+ (((type)(a)) | log2_to_max_int_t(type, b_lg2))
+static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3);
+
+/* Compute a * b */
+#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2)))
+static_assert(log2_mul_t(unsigned int, 2, 2) == 8);
+
+#define _dispatch_sz(type, fn, a) \
+ (sizeof(type) == 4 ? fn##32((u32)a) : fn##64(a))
+
+/*
+ * Return the highest value such that:
+ * fls_t(u32, 0) == 0
+ * fls_t(u3, 1) == 1
+ * a >= log2_to_int(ret - 1)
+ * aka find last set bit
+ */
+static inline unsigned int fls32(u32 a)
+{
+ return fls(a);
+}
+#define fls_t(type, a) _dispatch_sz(type, fls, a)
+
+/*
+ * Return the highest value such that:
+ * ffs_t(u32, 0) == UNDEFINED
+ * ffs_t(u32, 1) == 0
+ * log_mod(a, ret) == 0
+ * aka find first set bit
+ */
+static inline unsigned int __ffs32(u32 a)
+{
+ return __ffs(a);
+}
+#define ffs_t(type, a) _dispatch_sz(type, __ffs, a)
+
+/*
+ * Return the highest value such that:
+ * ffz_t(u32, U32_MAX) == UNDEFINED
+ * ffz_t(u32, 0) == 0
+ * ffz_t(u32, 1) == 1
+ * log_mod(a, ret) == log_to_max_int(ret)
+ * aka find first zero bit
+ */
+static inline unsigned int ffz32(u32 a)
+{
+ return ffz(a);
+}
+static inline unsigned int ffz64(u64 a)
+{
+ if (sizeof(u64) == sizeof(unsigned long))
+ return ffz(a);
+
+ if ((u32)a == U32_MAX)
+ return ffz32(a >> 32) + 32;
+ return ffz32(a);
+}
+#define ffz_t(type, a) _dispatch_sz(type, ffz, a)
+
+#endif
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index f2f538c70650..5471f814e073 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -13,6 +13,10 @@ config INTEL_IOMMU
bool "Support for Intel IOMMU using DMA Remapping Devices"
depends on PCI_MSI && ACPI && X86
select IOMMU_API
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_X86_64
+ select IOMMU_PT_VTDSS
select IOMMU_IOVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
@@ -66,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON
config INTEL_IOMMU_FLOPPY_WA
def_bool y
- depends on X86
+ depends on X86 && BLK_DEV_FD
help
Floppy disk drivers are known to bypass DMA API calls
thereby failing to work when IOMMU is enabled. This
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e236c7ec221f..4e888867e85c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -45,16 +45,9 @@
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
-#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
- to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
- __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
static void __init check_tylersburg_isoch(void);
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
static int rwbf_quirk;
#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
@@ -217,7 +210,6 @@ static int disable_igfx_iommu;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
-static const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -285,13 +277,6 @@ static int __init intel_iommu_setup(char *str)
}
__setup("intel_iommu=", intel_iommu_setup);
-static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
-{
- int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-
- return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
-}
-
/*
* Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
@@ -353,23 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}
-/* Return the super pagesize bitmap if supported. */
-static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
-{
- unsigned long bitmap = 0;
-
- /*
- * 1-level super page supports page size of 2MiB, 2-level super page
- * supports page size of both 2MiB and 1GiB.
- */
- if (domain->iommu_superpage == 1)
- bitmap |= SZ_2M;
- else if (domain->iommu_superpage == 2)
- bitmap |= SZ_2M | SZ_1G;
-
- return bitmap;
-}
-
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
u8 devfn, int alloc)
{
@@ -556,13 +524,6 @@ out:
return iommu;
}
-static void domain_flush_cache(struct dmar_domain *domain,
- void *addr, int size)
-{
- if (!domain->iommu_coherency)
- clflush_cache_range(addr, size);
-}
-
static void free_context_table(struct intel_iommu *iommu)
{
struct context_entry *context;
@@ -707,280 +668,6 @@ pgtable_walk:
}
#endif
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn, int *target_level,
- gfp_t gfp)
-{
- struct dma_pte *parent, *pte;
- int level = agaw_to_level(domain->agaw);
- int offset;
-
- if (!domain_pfn_supported(domain, pfn))
- /* Address beyond IOMMU's addressing capabilities. */
- return NULL;
-
- parent = domain->pgd;
-
- while (1) {
- void *tmp_page;
-
- offset = pfn_level_offset(pfn, level);
- pte = &parent[offset];
- if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
- break;
- if (level == *target_level)
- break;
-
- if (!dma_pte_present(pte)) {
- uint64_t pteval, tmp;
-
- tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
- SZ_4K);
-
- if (!tmp_page)
- return NULL;
-
- domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
- pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
- DMA_PTE_WRITE;
- if (domain->use_first_level)
- pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
-
- tmp = 0ULL;
- if (!try_cmpxchg64(&pte->val, &tmp, pteval))
- /* Someone else set it while we were thinking; use theirs. */
- iommu_free_pages(tmp_page);
- else
- domain_flush_cache(domain, pte, sizeof(*pte));
- }
- if (level == 1)
- break;
-
- parent = phys_to_virt(dma_pte_addr(pte));
- level--;
- }
-
- if (!*target_level)
- *target_level = level;
-
- return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
- unsigned long pfn,
- int level, int *large_page)
-{
- struct dma_pte *parent, *pte;
- int total = agaw_to_level(domain->agaw);
- int offset;
-
- parent = domain->pgd;
- while (level <= total) {
- offset = pfn_level_offset(pfn, total);
- pte = &parent[offset];
- if (level == total)
- return pte;
-
- if (!dma_pte_present(pte)) {
- *large_page = total;
- break;
- }
-
- if (dma_pte_superpage(pte)) {
- *large_page = total;
- return pte;
- }
-
- parent = phys_to_virt(dma_pte_addr(pte));
- total--;
- }
- return NULL;
-}
-
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned int large_page;
- struct dma_pte *first_pte, *pte;
-
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- do {
- large_page = 1;
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
- if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, large_page + 1);
- continue;
- }
- do {
- dma_clear_pte(pte);
- start_pfn += lvl_to_nr_pages(large_page);
- pte++;
- } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
-
- } while (start_pfn && start_pfn <= last_pfn);
-}
-
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
- int retain_level, struct dma_pte *pte,
- unsigned long pfn, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
- struct dma_pte *level_pte;
-
- if (!dma_pte_present(pte) || dma_pte_superpage(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
- level_pte = phys_to_virt(dma_pte_addr(pte));
-
- if (level > 2) {
- dma_pte_free_level(domain, level - 1, retain_level,
- level_pte, level_pfn, start_pfn,
- last_pfn);
- }
-
- /*
- * Free the page table if we're below the level we want to
- * retain and the range covers the entire table.
- */
- if (level < retain_level && !(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
- domain_flush_cache(domain, pte, sizeof(*pte));
- iommu_free_pages(level_pte);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- int retain_level)
-{
- dma_pte_clear_range(domain, start_pfn, last_pfn);
-
- /* We don't need lock here; nobody else touches the iova range */
- dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
- domain->pgd, 0, start_pfn, last_pfn);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_free_pages(domain->pgd);
- domain->pgd = NULL;
- }
-}
-
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
-static void dma_pte_list_pagetables(struct dmar_domain *domain,
- int level, struct dma_pte *parent_pte,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
-
- iommu_pages_list_add(freelist, pte);
-
- if (level == 1)
- return;
-
- do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
- pte++;
- } while (!first_pte_in_page(pte));
-}
-
-static void dma_pte_clear_level(struct dmar_domain *domain, int level,
- struct dma_pte *pte, unsigned long pfn,
- unsigned long start_pfn, unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn = pfn & level_mask(level);
-
- if (!dma_pte_present(pte))
- goto next;
-
- /* If range covers entire pagetable, free it */
- if (start_pfn <= level_pfn &&
- last_pfn >= level_pfn + level_size(level) - 1) {
- /* These suborbinate page tables are going away entirely. Don't
- bother to clear them; we're just going to *free* them. */
- if (level > 1 && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
- dma_clear_pte(pte);
- if (!first_pte)
- first_pte = pte;
- last_pte = pte;
- } else if (level > 1) {
- /* Recurse down into a level that isn't *entirely* obsolete */
- dma_pte_clear_level(domain, level - 1,
- phys_to_virt(dma_pte_addr(pte)),
- level_pfn, start_pfn, last_pfn,
- freelist);
- }
-next:
- pfn = level_pfn + level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
- if (first_pte)
- domain_flush_cache(domain, first_pte,
- (void *)++last_pte - (void *)first_pte);
-}
-
-/* We can't just free the pages because the IOMMU may still be walking
- the page tables, and may have cached the intermediate levels. The
- pages can only be freed after the IOTLB flush has been done. */
-static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
- unsigned long last_pfn,
- struct iommu_pages_list *freelist)
-{
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
- domain->pgd, 0, start_pfn, last_pfn, freelist);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_pages_list_add(freelist, domain->pgd);
- domain->pgd = NULL;
- }
-}
-
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
@@ -1460,13 +1147,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
domain_lookup_dev_info(domain, iommu, bus, devfn);
u16 did = domain_id_iommu(domain, iommu);
int translation = CONTEXT_TT_MULTI_LEVEL;
- struct dma_pte *pgd = domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
struct context_entry *context;
int ret;
if (WARN_ON(!intel_domain_is_ss_paging(domain)))
return -EINVAL;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+
pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1489,8 +1178,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
else
translation = CONTEXT_TT_MULTI_LEVEL;
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_address_width(context, domain->agaw);
+ context_set_address_root(context, pt_info.ssptptr);
+ context_set_address_width(context, pt_info.aw);
context_set_translation_type(context, translation);
context_set_fault_enable(context);
context_set_present(context);
@@ -1537,177 +1226,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev)
return 0;
}
-/* Return largest possible superpage level for a given mapping */
-static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phy_pfn, unsigned long pages)
-{
- int support, level = 1;
- unsigned long pfnmerge;
-
- support = domain->iommu_superpage;
-
- /* To use a large page, the virtual *and* physical addresses
- must be aligned to 2MiB/1GiB/etc. Lower bits set in either
- of them will mean we have to use smaller pages. So just
- merge them and check both at once. */
- pfnmerge = iov_pfn | phy_pfn;
-
- while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
- pages >>= VTD_STRIDE_SHIFT;
- if (!pages)
- break;
- pfnmerge >>= VTD_STRIDE_SHIFT;
- level++;
- support--;
- }
- return level;
-}
-
-/*
- * Ensure that old small page tables are removed to make room for superpage(s).
- * We're going to add new large pages, so make sure we don't remove their parent
- * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
- */
-static void switch_to_super_page(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long end_pfn, int level)
-{
- unsigned long lvl_pages = lvl_to_nr_pages(level);
- struct dma_pte *pte = NULL;
-
- if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
- !IS_ALIGNED(end_pfn + 1, lvl_pages)))
- return;
-
- while (start_pfn <= end_pfn) {
- if (!pte)
- pte = pfn_to_dma_pte(domain, start_pfn, &level,
- GFP_ATOMIC);
-
- if (dma_pte_present(pte)) {
- dma_pte_free_pagetable(domain, start_pfn,
- start_pfn + lvl_pages - 1,
- level + 1);
-
- cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
- end_pfn << VTD_PAGE_SHIFT, 0);
- }
-
- pte++;
- start_pfn += lvl_pages;
- if (first_pte_in_page(pte))
- pte = NULL;
- }
-}
-
-static int
-__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phys_pfn, unsigned long nr_pages, int prot,
- gfp_t gfp)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- unsigned int largepage_lvl = 0;
- unsigned long lvl_pages = 0;
- phys_addr_t pteval;
- u64 attr;
-
- if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
- return -EINVAL;
-
- if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
- return -EINVAL;
-
- if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
- pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
- return -EINVAL;
- }
-
- attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
- if (domain->use_first_level) {
- attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
- if (prot & DMA_PTE_WRITE)
- attr |= DMA_FL_PTE_DIRTY;
- }
-
- domain->has_mappings = true;
-
- pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
-
- while (nr_pages > 0) {
- uint64_t tmp;
-
- if (!pte) {
- largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
- phys_pfn, nr_pages);
-
- pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
- gfp);
- if (!pte)
- return -ENOMEM;
- first_pte = pte;
-
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- /* It is large page*/
- if (largepage_lvl > 1) {
- unsigned long end_pfn;
- unsigned long pages_to_remove;
-
- pteval |= DMA_PTE_LARGE_PAGE;
- pages_to_remove = min_t(unsigned long,
- round_down(nr_pages, lvl_pages),
- nr_pte_to_next_page(pte) * lvl_pages);
- end_pfn = iov_pfn + pages_to_remove - 1;
- switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
- } else {
- pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
- }
-
- }
- /* We don't need lock here, nobody else
- * touches the iova range
- */
- tmp = 0ULL;
- if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
- static int dumps = 5;
- pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
- iov_pfn, tmp, (unsigned long long)pteval);
- if (dumps) {
- dumps--;
- debug_dma_dump_mappings(NULL);
- }
- WARN_ON(1);
- }
-
- nr_pages -= lvl_pages;
- iov_pfn += lvl_pages;
- phys_pfn += lvl_pages;
- pteval += lvl_pages * VTD_PAGE_SIZE;
-
- /* If the next PTE would be the first in a new page, then we
- * need to flush the cache on the entries we've just written.
- * And then we'll need to recalculate 'pte', so clear it and
- * let it get set again in the if (!pte) block above.
- *
- * If we're done (!nr_pages) we need to flush the cache too.
- *
- * Also if we've been setting superpages, we may need to
- * recalculate 'pte' and switch back to smaller pages for the
- * end of the mapping, if the trailing size is not enough to
- * use another superpage (i.e. nr_pages < lvl_pages).
- */
- pte++;
- if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && nr_pages < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
- }
-
- return 0;
-}
-
static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
struct intel_iommu *iommu = info->iommu;
@@ -1769,22 +1287,26 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
struct device *dev,
u32 pasid, struct iommu_domain *old)
{
- struct dma_pte *pgd = domain->pgd;
- int level, flags = 0;
+ struct pt_iommu_x86_64_hw_info pt_info;
+ unsigned int flags = 0;
- level = agaw_to_level(domain->agaw);
- if (level != 4 && level != 5)
+ pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
+ if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
return -EINVAL;
- if (level == 5)
+ if (pt_info.levels == 5)
flags |= PASID_FLAG_FL5LP;
if (domain->force_snooping)
flags |= PASID_FLAG_PAGE_SNOOP;
+ if (!(domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ flags |= PASID_FLAG_PWSNP;
+
return __domain_setup_first_level(iommu, dev, pasid,
domain_id_iommu(domain, iommu),
- __pa(pgd), flags, old);
+ pt_info.gcr3_pt, flags, old);
}
static int dmar_domain_attach_device(struct dmar_domain *domain,
@@ -3230,7 +2752,8 @@ void device_block_translation(struct device *dev)
}
static int blocking_domain_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
@@ -3251,23 +2774,9 @@ static struct iommu_domain blocking_domain = {
}
};
-static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
-{
- if (!intel_iommu_superpage)
- return 0;
-
- if (first_stage)
- return cap_fl1gp_support(iommu->cap) ? 2 : 1;
-
- return fls(cap_super_page_val(iommu->cap));
-}
-
-static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
+static struct dmar_domain *paging_domain_alloc(void)
{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct intel_iommu *iommu = info->iommu;
struct dmar_domain *domain;
- int addr_width;
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
@@ -3282,56 +2791,38 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st
INIT_LIST_HEAD(&domain->s1_domains);
spin_lock_init(&domain->s1_lock);
- domain->nid = dev_to_node(dev);
- domain->use_first_level = first_stage;
-
- domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
-
- /* calculate the address width */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
- domain->gaw = addr_width;
- domain->agaw = iommu->agaw;
- domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
-
- /* iommu memory access coherency */
- domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
+ return domain;
+}
- /* pagesize bitmap */
- domain->domain.pgsize_bitmap = SZ_4K;
- domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
- domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
+static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
+ unsigned int *top_level)
+{
+ unsigned int mgaw = cap_mgaw(iommu->cap);
/*
- * IOVA aperture: First-level translation restricts the input-address
- * to a canonical address (i.e., address bits 63:N have the same value
- * as address bit [N-1], where N is 48-bits with 4-level paging and
- * 57-bits with 5-level paging). Hence, skip bit [N-1].
+ * Spec 3.6 First-Stage Translation:
+ *
+ * Software must limit addresses to less than the minimum of MGAW
+ * and the lower canonical address width implied by FSPM (i.e.,
+ * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
*/
- domain->domain.geometry.force_aperture = true;
- domain->domain.geometry.aperture_start = 0;
- if (first_stage)
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
- else
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
-
- /* always allocate the top pgd */
- domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
- if (!domain->pgd) {
- kfree(domain);
- return ERR_PTR(-ENOMEM);
+ if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
+ *top_level = 4;
+ return min(57, mgaw);
}
- domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
- return domain;
+ /* Four level is always supported */
+ *top_level = 3;
+ return min(48, mgaw);
}
static struct iommu_domain *
intel_iommu_domain_alloc_first_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_x86_64_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ int ret;
if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
@@ -3340,10 +2831,20 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, true);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ cfg.common.hw_max_vasz_lg2 =
+ compute_vasz_lg2_fs(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
+ BIT(PT_FEAT_FLUSH_RANGE);
+ /* First stage always uses scalable mode */
+ if (!ecap_smpwc(iommu->ecap))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
/*
* iotlb sync for map is only needed for legacy implementations that
@@ -3353,14 +2854,58 @@ intel_iommu_domain_alloc_first_stage(struct device *dev,
if (rwbf_required(iommu))
dmar_domain->iotlb_sync_map = true;
+ ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ if (!cap_fl1gp_support(iommu->cap))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
+
return &dmar_domain->domain;
}
+static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
+ unsigned int *top_level)
+{
+ unsigned int sagaw = cap_sagaw(iommu->cap);
+ unsigned int mgaw = cap_mgaw(iommu->cap);
+
+ /*
+ * Find the largest table size that both the mgaw and sagaw support.
+ * This sets the valid range of IOVA and the top starting level.
+ * Some HW may only support a 4 or 5 level walk but must limit IOVA to
+ * 3 levels.
+ */
+ if (mgaw > 48 && sagaw >= BIT(3)) {
+ *top_level = 4;
+ return min(57, mgaw);
+ } else if (mgaw > 39 && sagaw >= BIT(2)) {
+ *top_level = 3 + ffs(sagaw >> 3);
+ return min(48, mgaw);
+ } else if (mgaw > 30 && sagaw >= BIT(1)) {
+ *top_level = 2 + ffs(sagaw >> 2);
+ return min(39, mgaw);
+ }
+ return 0;
+}
+
+static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(vtdss),
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+};
+
static struct iommu_domain *
intel_iommu_domain_alloc_second_stage(struct device *dev,
struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_vtdss_cfg cfg = {};
struct dmar_domain *dmar_domain;
+ unsigned int sslps;
+ int ret;
if (flags &
(~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
@@ -3377,15 +2922,46 @@ intel_iommu_domain_alloc_second_stage(struct device *dev,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
- dmar_domain = paging_domain_alloc(dev, false);
+ dmar_domain = paging_domain_alloc();
if (IS_ERR(dmar_domain))
return ERR_CAST(dmar_domain);
+ cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
+
+ /*
+ * Read-only mapping is disallowed on the domain which serves as the
+ * parent in a nested configuration, due to HW errata
+ * (ERRATA_772415_SPR17)
+ */
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
+ cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
+
+ if (!iommu_paging_structure_coherency(iommu))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
- dmar_domain->domain.dirty_ops = &intel_dirty_ops;
+ dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
+
+ ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ /* Adjust the supported page sizes to HW capability */
+ sslps = cap_super_page_val(iommu->cap);
+ if (!(sslps & BIT(0)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
+ if (!(sslps & BIT(1)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
/*
* Besides the internal write buffer flush, the caching mode used for
@@ -3427,14 +3003,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain)
if (WARN_ON(!list_empty(&dmar_domain->devices)))
return;
- if (dmar_domain->pgd) {
- struct iommu_pages_list freelist =
- IOMMU_PAGES_LIST_INIT(freelist);
-
- domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
- &freelist);
- iommu_put_pages_list(&freelist);
- }
+ pt_iommu_deinit(&dmar_domain->iommu);
kfree(dmar_domain->qi_batch);
kfree(dmar_domain);
@@ -3451,6 +3020,16 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return -EINVAL;
+ if (!ecap_smpwc(iommu->ecap) &&
+ !(dmar_domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+ /* Supports the number of table levels */
+ if (!cap_fl5lp_support(iommu->cap) &&
+ dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
+ return -EINVAL;
+
/* Same page size support */
if (!cap_fl1gp_support(iommu->cap) &&
(dmar_domain->domain.pgsize_bitmap & SZ_1G))
@@ -3467,7 +3046,11 @@ static int
paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
struct intel_iommu *iommu)
{
+ unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
unsigned int sslps = cap_super_page_val(iommu->cap);
+ struct pt_iommu_vtdss_hw_info pt_info;
+
+ pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
return -EINVAL;
@@ -3478,6 +3061,19 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
return -EINVAL;
+ if (!iommu_paging_structure_coherency(iommu) &&
+ !(dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
+
+ /* Address width falls within the capability */
+ if (cap_mgaw(iommu->cap) < vasz_lg2)
+ return -EINVAL;
+
+ /* Page table level is supported. */
+ if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
+ return -EINVAL;
+
/* Same page size support */
if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
return -EINVAL;
@@ -3489,6 +3085,14 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
!dmar_domain->iotlb_sync_map)
return -EINVAL;
+ /*
+ * FIXME this is locked wrong, it needs to be under the
+ * dmar_domain->lock
+ */
+ if ((dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
+ !ecap_sc_support(iommu->ecap))
+ return -EINVAL;
return 0;
}
@@ -3498,7 +3102,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu = info->iommu;
int ret = -EINVAL;
- int addr_width;
if (intel_domain_is_fs_paging(dmar_domain))
ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
@@ -3509,26 +3112,6 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
if (ret)
return ret;
- /*
- * FIXME this is locked wrong, it needs to be under the
- * dmar_domain->lock
- */
- if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
- return -EINVAL;
-
- if (dmar_domain->iommu_coherency !=
- iommu_paging_structure_coherency(iommu))
- return -EINVAL;
-
-
- /* check if this iommu agaw is sufficient for max mapped address */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
-
- if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
- return -EINVAL;
-
if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
context_copied(iommu, info->bus, info->devfn))
return intel_pasid_setup_sm_context(dev);
@@ -3537,7 +3120,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret;
@@ -3558,110 +3142,6 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
return ret;
}
-static int intel_iommu_map(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot, gfp_t gfp)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- u64 max_addr;
- int prot = 0;
-
- if (iommu_prot & IOMMU_READ)
- prot |= DMA_PTE_READ;
- if (iommu_prot & IOMMU_WRITE)
- prot |= DMA_PTE_WRITE;
- if (dmar_domain->set_pte_snp)
- prot |= DMA_PTE_SNP;
-
- max_addr = iova + size;
- if (dmar_domain->max_addr < max_addr) {
- u64 end;
-
- /* check if minimum agaw is sufficient for mapped address */
- end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
- if (end < max_addr) {
- pr_err("%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, dmar_domain->gaw, max_addr);
- return -EFAULT;
- }
- dmar_domain->max_addr = max_addr;
- }
- /* Round up size to next multiple of PAGE_SIZE, if it and
- the low bits of hpa would take us onto the next page */
- size = aligned_nrpages(hpa, size);
- return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
- hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
-}
-
-static int intel_iommu_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
- int ret;
-
- if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
- return -EINVAL;
-
- if (!IS_ALIGNED(iova | paddr, pgsize))
- return -EINVAL;
-
- ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
- if (!ret && mapped)
- *mapped = size;
-
- return ret;
-}
-
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- struct iommu_iotlb_gather *gather)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long start_pfn, last_pfn;
- int level = 0;
-
- /* Cope with horrid API which requires us to unmap more than the
- size argument if it happens to be a large-page mapping. */
- if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
- &level, GFP_ATOMIC)))
- return 0;
-
- if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
- size = VTD_PAGE_SIZE << level_to_offset_bits(level);
-
- start_pfn = iova >> VTD_PAGE_SHIFT;
- last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
-
- domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
-
- if (dmar_domain->max_addr == iova + size)
- dmar_domain->max_addr = iova;
-
- /*
- * We do not use page-selective IOTLB invalidation in flush queue,
- * so there is no need to track page and sync iotlb.
- */
- if (!iommu_iotlb_gather_queued(gather))
- iommu_iotlb_gather_add_page(domain, gather, iova, size);
-
- return size;
-}
-
-static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
-
- return intel_iommu_unmap(domain, iova, size, gather);
-}
-
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
@@ -3671,24 +3151,6 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain,
iommu_put_pages_list(&gather->freelist);
}
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct dma_pte *pte;
- int level = 0;
- u64 phys = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
- GFP_ATOMIC);
- if (pte && dma_pte_present(pte))
- phys = dma_pte_addr(pte) +
- (iova & (BIT_MASK(level_to_offset_bits(level) +
- VTD_PAGE_SHIFT) - 1));
-
- return phys;
-}
-
static bool domain_support_force_snooping(struct dmar_domain *domain)
{
struct device_domain_info *info;
@@ -3730,15 +3192,15 @@ static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
guard(spinlock_irqsave)(&dmar_domain->lock);
- if (!domain_support_force_snooping(dmar_domain) ||
- dmar_domain->has_mappings)
+ if (!domain_support_force_snooping(dmar_domain))
return false;
/*
* Second level page table supports per-PTE snoop control. The
* iommu_map() interface will handle this by setting SNP bit.
*/
- dmar_domain->set_pte_snp = true;
+ dmar_domain->sspt.vtdss_pt.common.features |=
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
dmar_domain->force_snooping = true;
return true;
}
@@ -4302,49 +3764,6 @@ err_unwind:
return ret;
}
-static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long end = iova + size - 1;
- unsigned long pgsize;
-
- /*
- * IOMMUFD core calls into a dirty tracking disabled domain without an
- * IOVA bitmap set in order to clean dirty bits in all PTEs that might
- * have occurred when we stopped dirty tracking. This ensures that we
- * never inherit dirtied bits from a previous cycle.
- */
- if (!dmar_domain->dirty_tracking && dirty->bitmap)
- return -EINVAL;
-
- do {
- struct dma_pte *pte;
- int lvl = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
- GFP_ATOMIC);
- pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
- if (!pte || !dma_pte_present(pte)) {
- iova += pgsize;
- continue;
- }
-
- if (dma_sl_pte_test_and_clear_dirty(pte, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-static const struct iommu_dirty_ops intel_dirty_ops = {
- .set_dirty_tracking = intel_iommu_set_dirty_tracking,
- .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
-};
-
static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
@@ -4401,7 +3820,9 @@ static int device_setup_pass_through(struct device *dev)
context_setup_pass_through_cb, dev);
}
-static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int identity_domain_attach_dev(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
@@ -4462,27 +3883,23 @@ static struct iommu_domain identity_domain = {
};
const struct iommu_domain_ops intel_fs_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
};
const struct iommu_domain_ops intel_ss_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(vtdss),
.attach_dev = intel_iommu_attach_device,
.set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.flush_iotlb_all = intel_flush_iotlb_all,
.iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
.free = intel_iommu_domain_free,
.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
};
@@ -4797,3 +4214,5 @@ err:
return ret;
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 3056583d7f56..25c5e22096d4 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -23,8 +23,8 @@
#include <linux/xarray.h>
#include <linux/perf_event.h>
#include <linux/pci.h>
+#include <linux/generic_pt/iommu.h>
-#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <uapi/linux/iommufd.h>
@@ -595,22 +595,20 @@ struct qi_batch {
};
struct dmar_domain {
- int nid; /* node id */
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ /* First stage page table */
+ struct pt_iommu_x86_64 fspt;
+ /* Second stage page table */
+ struct pt_iommu_vtdss sspt;
+ };
+
struct xarray iommu_array; /* Attached IOMMU array */
- u8 iommu_coherency: 1; /* indicate coherency of iommu access */
- u8 force_snooping : 1; /* Create IOPTEs with snoop control */
- u8 set_pte_snp:1;
- u8 use_first_level:1; /* DMA translation for the domain goes
- * through the first level page table,
- * otherwise, goes through the second
- * level.
- */
+ u8 force_snooping:1; /* Create PASID entry with snoop control */
u8 dirty_tracking:1; /* Dirty tracking is enabled */
u8 nested_parent:1; /* Has other domains nested on it */
- u8 has_mappings:1; /* Has mappings configured through
- * iommu_map() interface.
- */
u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write
* buffer when creating mappings.
*/
@@ -623,26 +621,9 @@ struct dmar_domain {
struct list_head cache_tags; /* Cache tag list */
struct qi_batch *qi_batch; /* Batched QI descriptors */
- int iommu_superpage;/* Level of superpages supported:
- 0 == 4KiB (no superpages), 1 == 2MiB,
- 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
union {
/* DMA remapping domain */
struct {
- /* virtual address */
- struct dma_pte *pgd;
- /* max guest address width */
- int gaw;
- /*
- * adjusted guest address width:
- * 0: level 2 30-bit
- * 1: level 3 39-bit
- * 2: level 4 48-bit
- * 3: level 5 57-bit
- */
- int agaw;
- /* maximum mapped address */
- u64 max_addr;
/* Protect the s1_domains list */
spinlock_t s1_lock;
/* Track s1_domains nested on this domain */
@@ -664,10 +645,10 @@ struct dmar_domain {
struct mmu_notifier notifier;
};
};
-
- struct iommu_domain domain; /* generic domain data structure for
- iommu core */
};
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain);
/*
* In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
@@ -866,11 +847,6 @@ struct dma_pte {
u64 val;
};
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
- pte->val = 0;
-}
-
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
@@ -886,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
-static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
- unsigned long flags)
-{
- if (flags & IOMMU_DIRTY_NO_CLEAR)
- return (pte->val & DMA_SL_PTE_DIRTY) != 0;
-
- return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
- (unsigned long *)&pte->val);
-}
-
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
}
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
- return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
- return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
- (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
static inline bool context_present(struct context_entry *context)
{
return (context->lo & 1);
@@ -927,11 +882,6 @@ static inline int agaw_to_level(int agaw)
return agaw + 2;
}
-static inline int agaw_to_width(int agaw)
-{
- return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
static inline int width_to_agaw(int width)
{
return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
@@ -947,25 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level)
return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}
-static inline u64 level_mask(int level)
-{
- return -1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 level_size(int level)
-{
- return 1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 align_to_level(u64 pfn, int level)
-{
- return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
- return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
static inline void context_set_present(struct context_entry *context)
{
@@ -1097,7 +1028,7 @@ static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
struct qi_desc *desc)
{
u8 dw = 0, dr = 0;
- int ih = 0;
+ int ih = addr & 1;
if (cap_write_drain(iommu->cap))
dw = 1;
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 1b6ad9c900a5..a3fb8c193ca6 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -19,7 +19,7 @@
#include "pasid.h"
static int intel_nested_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
@@ -29,11 +29,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
device_block_translation(dev);
- if (iommu->agaw < dmar_domain->s2_domain->agaw) {
- dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
- return -ENODEV;
- }
-
/*
* Stage-1 domain cannot work alone, it is nested on a s2_domain.
* The s2_domain will be used in nested translation, hence needs
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 52f678975da7..3e2255057079 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -366,7 +366,7 @@ static void pasid_pte_config_first_level(struct intel_iommu *iommu,
pasid_set_domain_id(pte, did);
pasid_set_address_width(pte, iommu->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP);
/* Setup Present and PASID Granular Transfer Type: */
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
@@ -461,19 +461,22 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu,
*/
static void pasid_pte_config_second_level(struct intel_iommu *iommu,
struct pasid_entry *pte,
- u64 pgd_val, int agaw, u16 did,
- bool dirty_tracking)
+ struct dmar_domain *domain, u16 did)
{
+ struct pt_iommu_vtdss_hw_info pt_info;
+
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
pasid_clear_entry(pte);
pasid_set_domain_id(pte, did);
- pasid_set_slptr(pte, pgd_val);
- pasid_set_address_width(pte, agaw);
+ pasid_set_slptr(pte, pt_info.ssptptr);
+ pasid_set_address_width(pte, pt_info.aw);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- if (dirty_tracking)
+ pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
+ if (domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_present(pte);
@@ -484,10 +487,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct device *dev, u32 pasid)
{
struct pasid_entry *pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
+
/*
* If hardware advertises no support for second level
* translation, return directly.
@@ -498,8 +500,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
spin_lock(&iommu->lock);
@@ -514,8 +514,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EBUSY;
}
- pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw,
- did, domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, pte, domain, did);
spin_unlock(&iommu->lock);
pasid_flush_caches(iommu, pte, pasid, did);
@@ -529,8 +528,6 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
u32 pasid)
{
struct pasid_entry *pte, new_pte;
- struct dma_pte *pgd;
- u64 pgd_val;
u16 did;
/*
@@ -543,13 +540,9 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
- pasid_pte_config_second_level(iommu, &new_pte, pgd_val,
- domain->agaw, did,
- domain->dirty_tracking);
+ pasid_pte_config_second_level(iommu, &new_pte, domain, did);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
@@ -747,10 +740,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
struct dmar_domain *s2_domain,
u16 did)
{
- struct dma_pte *pgd = s2_domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
lockdep_assert_held(&iommu->lock);
+ pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info);
+
pasid_clear_entry(pte);
if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
@@ -770,11 +765,12 @@ static void pasid_pte_config_nestd(struct intel_iommu *iommu,
if (s2_domain->force_snooping)
pasid_set_pgsnp(pte);
- pasid_set_slptr(pte, virt_to_phys(pgd));
+ pasid_set_slptr(pte, pt_info.ssptptr);
pasid_set_fault_enable(pte);
pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, s2_domain->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_address_width(pte, pt_info.aw);
+ pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
if (s2_domain->dirty_tracking)
pasid_set_ssade(pte);
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index a771a77d4239..b4c85242dc79 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -24,6 +24,7 @@
#define PASID_FLAG_NESTED BIT(1)
#define PASID_FLAG_PAGE_SNOOP BIT(2)
+#define PASID_FLAG_PWSNP BIT(2)
/*
* The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index e147f71f91b7..71de7947971f 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -170,6 +170,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
/* Setup the pasid table: */
sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ sflags |= PASID_FLAG_PWSNP;
ret = __domain_setup_first_level(iommu, dev, pasid,
FLPT_DEFAULT_DID, __pa(mm->pgd),
sflags, old);
diff --git a/drivers/iommu/io-pgtable-arm-selftests.c b/drivers/iommu/io-pgtable-arm-selftests.c
new file mode 100644
index 000000000000..334e70350924
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-selftests.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU-agnostic ARM page table allocator.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt
+
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+
+#include "io-pgtable-arm.h"
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_flush(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+ WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ dummy_tlb_flush(iova, granule, granule, cookie);
+}
+
+static const struct iommu_flush_ops dummy_tlb_ops = {
+ .tlb_flush_all = dummy_tlb_flush_all,
+ .tlb_flush_walk = dummy_tlb_flush,
+ .tlb_add_page = dummy_tlb_add_page,
+};
+
+#define __FAIL(test, i) ({ \
+ KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \
+ -EFAULT; \
+})
+
+static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg)
+{
+ static const enum io_pgtable_fmt fmts[] = {
+ ARM_64_LPAE_S1,
+ ARM_64_LPAE_S2,
+ };
+
+ int i, j;
+ unsigned long iova;
+ size_t size, mapped;
+ struct io_pgtable_ops *ops;
+
+ for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
+ cfg_cookie = cfg;
+ ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
+ if (!ops) {
+ kunit_err(test, "failed to allocate io pgtable ops\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Initial sanity checks.
+ * Empty page tables shouldn't provide any translations.
+ */
+ if (ops->iova_to_phys(ops, 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_1G + 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_2G + 42))
+ return __FAIL(test, i);
+
+ /*
+ * Distinct mappings of different granule sizes.
+ */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ /* Overlapping mappings */
+ if (!ops->map_pages(ops, iova, iova + size, size, 1,
+ IOMMU_READ | IOMMU_NOEXEC,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /* Full unmap */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42))
+ return __FAIL(test, i);
+
+ /* Remap full block */
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_WRITE, GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /*
+ * Map/unmap the last largest supported page of the IAS, this can
+ * trigger corner cases in the concatednated page tables.
+ */
+ mapped = 0;
+ size = 1UL << __fls(cfg->pgsize_bitmap);
+ iova = (1UL << cfg->ias) - size;
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+ if (mapped != size)
+ return __FAIL(test, i);
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ free_io_pgtable_ops(ops);
+ }
+
+ return 0;
+}
+
+static void arm_lpae_do_selftests(struct kunit *test)
+{
+ static const unsigned long pgsize[] = {
+ SZ_4K | SZ_2M | SZ_1G,
+ SZ_16K | SZ_32M,
+ SZ_64K | SZ_512M,
+ };
+
+ static const unsigned int address_size[] = {
+ 32, 36, 40, 42, 44, 48,
+ };
+
+ int i, j, k, pass = 0, fail = 0;
+ struct device *dev;
+ struct io_pgtable_cfg cfg = {
+ .tlb = &dummy_tlb_ops,
+ .coherent_walk = true,
+ .quirks = IO_PGTABLE_QUIRK_NO_WARN,
+ };
+
+ dev = kunit_device_register(test, "io-pgtable-test");
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev);
+ if (IS_ERR_OR_NULL(dev))
+ return;
+
+ cfg.iommu_dev = dev;
+
+ for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
+ for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
+ /* Don't use ias > oas as it is not valid for stage-2. */
+ for (k = 0; k <= j; ++k) {
+ cfg.pgsize_bitmap = pgsize[i];
+ cfg.ias = address_size[k];
+ cfg.oas = address_size[j];
+ kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
+ pgsize[i], cfg.ias, cfg.oas);
+ if (arm_lpae_run_tests(test, &cfg))
+ fail++;
+ else
+ pass++;
+ }
+ }
+ }
+
+ kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail);
+}
+
+static struct kunit_case io_pgtable_arm_test_cases[] = {
+ KUNIT_CASE(arm_lpae_do_selftests),
+ {},
+};
+
+static struct kunit_suite io_pgtable_arm_test = {
+ .name = "io-pgtable-arm-test",
+ .test_cases = io_pgtable_arm_test_cases,
+};
+
+kunit_test_suite(io_pgtable_arm_test);
+
+MODULE_DESCRIPTION("io-pgtable-arm library kunit tests");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 7e8e2216c294..e6626004b323 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -12,8 +12,6 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/device/faux.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -1267,204 +1265,3 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
.alloc = arm_mali_lpae_alloc_pgtable,
.free = arm_lpae_free_pgtable,
};
-
-#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
-
-static struct io_pgtable_cfg *cfg_cookie __initdata;
-
-static void __init dummy_tlb_flush_all(void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
-}
-
-static void __init dummy_tlb_flush(unsigned long iova, size_t size,
- size_t granule, void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
- WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
-}
-
-static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t granule,
- void *cookie)
-{
- dummy_tlb_flush(iova, granule, granule, cookie);
-}
-
-static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
- .tlb_flush_all = dummy_tlb_flush_all,
- .tlb_flush_walk = dummy_tlb_flush,
- .tlb_add_page = dummy_tlb_add_page,
-};
-
-static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
-{
- struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
-
- pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n",
- cfg->pgsize_bitmap, cfg->ias);
- pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n",
- ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data),
- ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd);
-}
-
-#define __FAIL(ops, i) ({ \
- WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \
- arm_lpae_dump_ops(ops); \
- -EFAULT; \
-})
-
-static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
-{
- static const enum io_pgtable_fmt fmts[] __initconst = {
- ARM_64_LPAE_S1,
- ARM_64_LPAE_S2,
- };
-
- int i, j;
- unsigned long iova;
- size_t size, mapped;
- struct io_pgtable_ops *ops;
-
- for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
- cfg_cookie = cfg;
- ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
- if (!ops) {
- pr_err("selftest: failed to allocate io pgtable ops\n");
- return -ENOMEM;
- }
-
- /*
- * Initial sanity checks.
- * Empty page tables shouldn't provide any translations.
- */
- if (ops->iova_to_phys(ops, 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_1G + 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_2G + 42))
- return __FAIL(ops, i);
-
- /*
- * Distinct mappings of different granule sizes.
- */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_READ | IOMMU_WRITE |
- IOMMU_NOEXEC | IOMMU_CACHE,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- /* Overlapping mappings */
- if (!ops->map_pages(ops, iova, iova + size, size, 1,
- IOMMU_READ | IOMMU_NOEXEC,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- /* Full unmap */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42))
- return __FAIL(ops, i);
-
- /* Remap full block */
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_WRITE, GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- /*
- * Map/unmap the last largest supported page of the IAS, this can
- * trigger corner cases in the concatednated page tables.
- */
- mapped = 0;
- size = 1UL << __fls(cfg->pgsize_bitmap);
- iova = (1UL << cfg->ias) - size;
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_READ | IOMMU_WRITE |
- IOMMU_NOEXEC | IOMMU_CACHE,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
- if (mapped != size)
- return __FAIL(ops, i);
- if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- free_io_pgtable_ops(ops);
- }
-
- return 0;
-}
-
-static int __init arm_lpae_do_selftests(void)
-{
- static const unsigned long pgsize[] __initconst = {
- SZ_4K | SZ_2M | SZ_1G,
- SZ_16K | SZ_32M,
- SZ_64K | SZ_512M,
- };
-
- static const unsigned int address_size[] __initconst = {
- 32, 36, 40, 42, 44, 48,
- };
-
- int i, j, k, pass = 0, fail = 0;
- struct faux_device *dev;
- struct io_pgtable_cfg cfg = {
- .tlb = &dummy_tlb_ops,
- .coherent_walk = true,
- .quirks = IO_PGTABLE_QUIRK_NO_WARN,
- };
-
- dev = faux_device_create("io-pgtable-test", NULL, 0);
- if (!dev)
- return -ENOMEM;
-
- cfg.iommu_dev = &dev->dev;
-
- for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
- for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
- /* Don't use ias > oas as it is not valid for stage-2. */
- for (k = 0; k <= j; ++k) {
- cfg.pgsize_bitmap = pgsize[i];
- cfg.ias = address_size[k];
- cfg.oas = address_size[j];
- pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
- pgsize[i], cfg.ias, cfg.oas);
- if (arm_lpae_run_tests(&cfg))
- fail++;
- else
- pass++;
- }
- }
- }
-
- pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail);
- faux_device_destroy(dev);
-
- return fail ? -EFAULT : 0;
-}
-subsys_initcall(arm_lpae_do_selftests);
-#endif
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 8841c1487f00..843fec8e8a51 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
#endif
-#ifdef CONFIG_AMD_IOMMU
- [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
- [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
-#endif
};
static int check_custom_allocator(enum io_pgtable_fmt fmt,
diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c
index 238c09e5166b..3bab175d8557 100644
--- a/drivers/iommu/iommu-pages.c
+++ b/drivers/iommu/iommu-pages.c
@@ -4,6 +4,7 @@
* Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#include "iommu-pages.h"
+#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>
@@ -22,6 +23,11 @@ IOPTDESC_MATCH(memcg_data, memcg_data);
#undef IOPTDESC_MATCH
static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
+static inline size_t ioptdesc_mem_size(struct ioptdesc *desc)
+{
+ return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT);
+}
+
/**
* iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from
* specific NUMA node
@@ -36,6 +42,7 @@ static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
*/
void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
{
+ struct ioptdesc *iopt;
unsigned long pgcnt;
struct folio *folio;
unsigned int order;
@@ -60,6 +67,9 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
if (unlikely(!folio))
return NULL;
+ iopt = folio_ioptdesc(folio);
+ iopt->incoherent = false;
+
/*
* All page allocations that should be reported to as "iommu-pagetables"
* to userspace must use one of the functions below. This includes
@@ -80,7 +90,10 @@ EXPORT_SYMBOL_GPL(iommu_alloc_pages_node_sz);
static void __iommu_free_desc(struct ioptdesc *iopt)
{
struct folio *folio = ioptdesc_folio(iopt);
- const unsigned long pgcnt = 1UL << folio_order(folio);
+ const unsigned long pgcnt = folio_nr_pages(folio);
+
+ if (IOMMU_PAGES_USE_DMA_API)
+ WARN_ON_ONCE(iopt->incoherent);
mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
@@ -117,3 +130,124 @@ void iommu_put_pages_list(struct iommu_pages_list *list)
__iommu_free_desc(iopt);
}
EXPORT_SYMBOL_GPL(iommu_put_pages_list);
+
+/**
+ * iommu_pages_start_incoherent - Setup the page for cache incoherent operation
+ * @virt: The page to setup
+ * @dma_dev: The iommu device
+ *
+ * For incoherent memory this will use the DMA API to manage the cache flushing
+ * on some arches. This is a lot of complexity compared to just calling
+ * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers
+ * have been doing. The DMA API requires keeping track of the DMA map and
+ * freeing it when required. This keeps track of the dma map inside the ioptdesc
+ * so that error paths are simple for the caller.
+ */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+ dma_addr_t dma;
+
+ if (WARN_ON(iopt->incoherent))
+ return -EINVAL;
+
+ if (!IOMMU_PAGES_USE_DMA_API) {
+ iommu_pages_flush_incoherent(dma_dev, virt, 0,
+ ioptdesc_mem_size(iopt));
+ } else {
+ dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, dma))
+ return -EINVAL;
+
+ /*
+ * The DMA API is not allowed to do anything other than DMA
+ * direct. It would be nice to also check
+ * dev_is_dma_coherent(dma_dev));
+ */
+ if (WARN_ON(dma != virt_to_phys(virt))) {
+ dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ iopt->incoherent = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent);
+
+/**
+ * iommu_pages_start_incoherent_list - Make a list of pages incoherent
+ * @list: The list of pages to setup
+ * @dma_dev: The iommu device
+ *
+ * Perform iommu_pages_start_incoherent() across all of list.
+ *
+ * If this fails the caller must call iommu_pages_stop_incoherent_list().
+ */
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+ int ret;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ if (WARN_ON(cur->incoherent))
+ continue;
+
+ ret = iommu_pages_start_incoherent(
+ folio_address(ioptdesc_folio(cur)), dma_dev);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list);
+
+/**
+ * iommu_pages_stop_incoherent_list - Undo incoherence across a list
+ * @list: The list of pages to release
+ * @dma_dev: The iommu device
+ *
+ * Revert iommu_pages_start_incoherent() across all of the list. Pages that did
+ * not call or succeed iommu_pages_start_incoherent() will be ignored.
+ */
+#if IOMMU_PAGES_USE_DMA_API
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ struct folio *folio = ioptdesc_folio(cur);
+
+ if (!cur->incoherent)
+ continue;
+ dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)),
+ ioptdesc_mem_size(cur), DMA_TO_DEVICE);
+ cur->incoherent = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list);
+
+/**
+ * iommu_pages_free_incoherent - Free an incoherent page
+ * @virt: virtual address of the page to be freed.
+ * @dma_dev: The iommu device
+ *
+ * If the page is incoherent it made coherent again then freed.
+ */
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+
+ if (iopt->incoherent) {
+ dma_unmap_single(dma_dev, virt_to_phys(virt),
+ ioptdesc_mem_size(iopt), DMA_TO_DEVICE);
+ iopt->incoherent = 0;
+ }
+ __iommu_free_desc(iopt);
+}
+EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent);
+#endif
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index b3af2813ed0c..ae9da4f571f6 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -21,7 +21,10 @@ struct ioptdesc {
struct list_head iopt_freelist_elm;
unsigned long __page_mapping;
- pgoff_t __index;
+ union {
+ u8 incoherent;
+ pgoff_t __index;
+ };
void *_private;
unsigned int __page_type;
@@ -98,4 +101,48 @@ static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size)
return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size);
}
-#endif /* __IOMMU_PAGES_H */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev);
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+
+#ifdef CONFIG_X86
+#define IOMMU_PAGES_USE_DMA_API 0
+#include <linux/cacheflush.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ clflush_cache_range(virt + offset, len);
+}
+static inline void
+iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ /*
+ * For performance leave the incoherent flag alone which turns this into
+ * a NOP. For X86 the rest of the stop/free flow ignores the flag.
+ */
+}
+static inline void iommu_pages_free_incoherent(void *virt,
+ struct device *dma_dev)
+{
+ iommu_free_pages(virt);
+}
+#else
+#define IOMMU_PAGES_USE_DMA_API 1
+#include <linux/dma-mapping.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
+{
+ dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len,
+ DMA_TO_DEVICE);
+}
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev);
+#endif
+
+#endif /* __IOMMU_PAGES_H */
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 59244c744eab..2ca990dfbb88 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static void iommu_release_device(struct device *dev);
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev);
+ struct device *dev, struct iommu_domain *old);
static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev,
@@ -114,6 +114,7 @@ enum {
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags);
static int __iommu_group_set_domain_internal(struct iommu_group *group,
struct iommu_domain *new_domain,
@@ -542,8 +543,21 @@ static void iommu_deinit_device(struct device *dev)
* Regardless, if a delayed attach never occurred, then the release
* should still avoid touching any hardware configuration either.
*/
- if (!dev->iommu->attach_deferred && ops->release_domain)
- ops->release_domain->ops->attach_dev(ops->release_domain, dev);
+ if (!dev->iommu->attach_deferred && ops->release_domain) {
+ struct iommu_domain *release_domain = ops->release_domain;
+
+ /*
+ * If the device requires direct mappings then it should not
+ * be parked on a BLOCKED domain during release as that would
+ * break the direct mappings.
+ */
+ if (dev->iommu->require_direct && ops->identity_domain &&
+ release_domain == ops->blocked_domain)
+ release_domain = ops->identity_domain;
+
+ release_domain->ops->attach_dev(release_domain, dev,
+ group->domain);
+ }
if (ops->release_device)
ops->release_device(dev);
@@ -628,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
if (group->default_domain)
iommu_create_device_direct_mappings(group->default_domain, dev);
if (group->domain) {
- ret = __iommu_device_set_domain(group, dev, group->domain, 0);
+ ret = __iommu_device_set_domain(group, dev, group->domain, NULL,
+ 0);
if (ret)
goto err_remove_gdev;
} else if (!group->default_domain && !group_list) {
@@ -2115,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
}
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
int ret;
if (unlikely(domain->ops->attach_dev == NULL))
return -ENODEV;
- ret = domain->ops->attach_dev(domain, dev);
+ ret = domain->ops->attach_dev(domain, dev, old);
if (ret)
return ret;
dev->iommu->attach_deferred = 0;
@@ -2171,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
{
if (dev->iommu && dev->iommu->attach_deferred)
- return __iommu_attach_device(domain, dev);
+ return __iommu_attach_device(domain, dev, NULL);
return 0;
}
@@ -2284,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group);
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags)
{
int ret;
@@ -2309,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group,
dev->iommu->attach_deferred = 0;
}
- ret = __iommu_attach_device(new_domain, dev);
+ ret = __iommu_attach_device(new_domain, dev, old_domain);
if (ret) {
/*
* If we have a blocking domain then try to attach that in hopes
@@ -2319,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group,
if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) &&
group->blocking_domain &&
group->blocking_domain != new_domain)
- __iommu_attach_device(group->blocking_domain, dev);
+ __iommu_attach_device(group->blocking_domain, dev,
+ old_domain);
return ret;
}
return 0;
@@ -2366,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
result = 0;
for_each_group_device(group, gdev) {
ret = __iommu_device_set_domain(group, gdev->dev, new_domain,
- flags);
+ group->domain, flags);
if (ret) {
result = ret;
/*
@@ -2391,6 +2408,9 @@ err_revert:
*/
last_gdev = gdev;
for_each_group_device(group, gdev) {
+ /* No need to revert the last gdev that failed to set domain */
+ if (gdev == last_gdev)
+ break;
/*
* A NULL domain can happen only for first probe, in which case
* we leave group->domain as NULL and let release clean
@@ -2398,10 +2418,8 @@ err_revert:
*/
if (group->domain)
WARN_ON(__iommu_device_set_domain(
- group, gdev->dev, group->domain,
+ group, gdev->dev, group->domain, new_domain,
IOMMU_SET_DOMAIN_MUST_SUCCEED));
- if (gdev == last_gdev)
- break;
}
return ret;
}
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 2beeb4f60ee5..eae3f03629b0 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -41,6 +41,7 @@ config IOMMUFD_TEST
depends on DEBUG_KERNEL
depends on FAULT_INJECTION
depends on RUNTIME_TESTING_MENU
+ depends on IOMMU_PT_AMDV1
select IOMMUFD_DRIVER
default n
help
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 75d60f2ad900..54cf4d856179 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -8,8 +8,10 @@
* The datastructure uses the iopt_pages to optimize the storage of the PFNs
* between the domains and xarray.
*/
+#include <linux/dma-buf.h>
#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/file.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
@@ -284,6 +286,9 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
case IOPT_ADDRESS_FILE:
start = elm->start_byte + elm->pages->start;
break;
+ case IOPT_ADDRESS_DMABUF:
+ start = elm->start_byte + elm->pages->dmabuf.start;
+ break;
}
rc = iopt_alloc_iova(iopt, dst_iova, start, length);
if (rc)
@@ -468,25 +473,53 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
* @iopt: io_pagetable to act on
* @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
* the chosen iova on output. Otherwise is the iova to map to on input
- * @file: file to map
+ * @fd: fdno of a file to map
* @start: map file starting at this byte offset
* @length: Number of bytes to map
* @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
* @flags: IOPT_ALLOC_IOVA or zero
*/
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
- unsigned long *iova, struct file *file,
- unsigned long start, unsigned long length,
- int iommu_prot, unsigned int flags)
+ unsigned long *iova, int fd, unsigned long start,
+ unsigned long length, int iommu_prot,
+ unsigned int flags)
{
struct iopt_pages *pages;
+ struct dma_buf *dmabuf;
+ unsigned long start_byte;
+ unsigned long last;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(start, length - 1, &last))
+ return -EOVERFLOW;
+
+ start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
+ dmabuf = dma_buf_get(fd);
+ if (!IS_ERR(dmabuf)) {
+ pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
+ length,
+ iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages)) {
+ dma_buf_put(dmabuf);
+ return PTR_ERR(pages);
+ }
+ } else {
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ pages = iopt_alloc_file_pages(file, start_byte, start, length,
+ iommu_prot & IOMMU_WRITE);
+ fput(file);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ }
- pages = iopt_alloc_file_pages(file, start, length,
- iommu_prot & IOMMU_WRITE);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
return iopt_map_common(ictx, iopt, pages, iova, length,
- start - pages->start, iommu_prot, flags);
+ start_byte, iommu_prot, flags);
}
struct iova_bitmap_fn_arg {
@@ -961,9 +994,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(!area->storage_domain);
if (area->storage_domain == domain)
area->storage_domain = storage_domain;
+ if (iopt_is_dmabuf(pages)) {
+ if (!iopt_dmabuf_revoked(pages))
+ iopt_area_unmap_domain(area, domain);
+ iopt_dmabuf_untrack_domain(pages, area, domain);
+ }
mutex_unlock(&pages->mutex);
- iopt_area_unmap_domain(area, domain);
+ if (!iopt_is_dmabuf(pages))
+ iopt_area_unmap_domain(area, domain);
}
return;
}
@@ -980,6 +1019,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(area->storage_domain != domain);
area->storage_domain = NULL;
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
}
@@ -1009,10 +1050,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
if (!pages)
continue;
- mutex_lock(&pages->mutex);
+ guard(mutex)(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto out_unfill;
+ }
rc = iopt_area_fill_domain(area, domain);
if (rc) {
- mutex_unlock(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
goto out_unfill;
}
if (!area->storage_domain) {
@@ -1021,7 +1068,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
interval_tree_insert(&area->pages_node,
&pages->domains_itree);
}
- mutex_unlock(&pages->mutex);
}
return 0;
@@ -1042,6 +1088,8 @@ out_unfill:
area->storage_domain = NULL;
}
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
return rc;
@@ -1252,6 +1300,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (!pages || area->prevent_access)
return -EBUSY;
+ /* Maintaining the domains_itree below is a bit complicated */
+ if (iopt_is_dmabuf(pages))
+ return -EOPNOTSUPP;
+
if (new_start & (alignment - 1) ||
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index b6064f4ce4af..14cd052fd320 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -5,6 +5,7 @@
#ifndef __IO_PAGETABLE_H
#define __IO_PAGETABLE_H
+#include <linux/dma-buf.h>
#include <linux/interval_tree.h>
#include <linux/kref.h>
#include <linux/mutex.h>
@@ -69,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
void iopt_area_unmap_domain(struct iopt_area *area,
struct iommu_domain *domain);
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain);
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain);
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+
static inline unsigned long iopt_area_index(struct iopt_area *area)
{
return area->pages_node.start;
@@ -179,7 +190,22 @@ enum {
enum iopt_address_type {
IOPT_ADDRESS_USER = 0,
- IOPT_ADDRESS_FILE = 1,
+ IOPT_ADDRESS_FILE,
+ IOPT_ADDRESS_DMABUF,
+};
+
+struct iopt_pages_dmabuf_track {
+ struct iommu_domain *domain;
+ struct iopt_area *area;
+ struct list_head elm;
+};
+
+struct iopt_pages_dmabuf {
+ struct dma_buf_attachment *attach;
+ struct dma_buf_phys_vec phys;
+ /* Always PAGE_SIZE aligned */
+ unsigned long start;
+ struct list_head tracker;
};
/*
@@ -209,6 +235,8 @@ struct iopt_pages {
struct file *file;
unsigned long start;
};
+ /* IOPT_ADDRESS_DMABUF */
+ struct iopt_pages_dmabuf dmabuf;
};
bool writable:1;
u8 account_mode;
@@ -220,10 +248,32 @@ struct iopt_pages {
struct rb_root_cached domains_itree;
};
+static inline bool iopt_is_dmabuf(struct iopt_pages *pages)
+{
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return false;
+ return pages->type == IOPT_ADDRESS_DMABUF;
+}
+
+static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages)
+{
+ lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ return pages->dmabuf.phys.len == 0;
+ return false;
+}
+
struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
unsigned long length, bool writable);
-struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable);
void iopt_release_pages(struct kref *kref);
static inline void iopt_put_pages(struct iopt_pages *pages)
{
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 459a7c516915..f4721afedadc 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -207,7 +207,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
unsigned long iova = cmd->iova;
struct iommufd_ioas *ioas;
unsigned int flags = 0;
- struct file *file;
int rc;
if (cmd->flags &
@@ -229,11 +228,7 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
flags = IOPT_ALLOC_IOVA;
- file = fget(cmd->fd);
- if (!file)
- return -EBADF;
-
- rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
+ rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd,
cmd->start, cmd->length,
conv_iommu_prot(cmd->flags), flags);
if (rc)
@@ -243,7 +238,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put:
iommufd_put_object(ucmd->ictx, &ioas->obj);
- fput(file);
return rc;
}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 85d0843ed07b..eb6d1a70f673 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -19,6 +19,8 @@ struct iommu_domain;
struct iommu_group;
struct iommu_option;
struct iommufd_device;
+struct dma_buf_attachment;
+struct dma_buf_phys_vec;
struct iommufd_sw_msi_map {
struct list_head sw_msi_item;
@@ -108,7 +110,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long length, int iommu_prot,
unsigned int flags);
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
- unsigned long *iova, struct file *file,
+ unsigned long *iova, int fd,
unsigned long start, unsigned long length,
int iommu_prot, unsigned int flags);
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
@@ -504,6 +506,8 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj);
void iommufd_device_destroy(struct iommufd_object *obj);
int iommufd_get_hw_info(struct iommufd_ucmd *ucmd);
+struct device *iommufd_global_device(void);
+
struct iommufd_access {
struct iommufd_object obj;
struct iommufd_ctx *ictx;
@@ -713,6 +717,8 @@ bool iommufd_should_fail(void);
int __init iommufd_test_init(void);
void iommufd_test_exit(void);
bool iommufd_selftest_is_mock_dev(struct device *dev);
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys);
#else
static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
unsigned int ioas_id,
@@ -734,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev)
{
return false;
}
+static inline int
+iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ return -EOPNOTSUPP;
+}
#endif
#endif
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 8fc618b2bcf9..73e73e1ec158 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -29,11 +29,22 @@ enum {
IOMMU_TEST_OP_PASID_REPLACE,
IOMMU_TEST_OP_PASID_DETACH,
IOMMU_TEST_OP_PASID_CHECK_HWPT,
+ IOMMU_TEST_OP_DMABUF_GET,
+ IOMMU_TEST_OP_DMABUF_REVOKE,
};
enum {
+ MOCK_IOMMUPT_DEFAULT = 0,
+ MOCK_IOMMUPT_HUGE,
+ MOCK_IOMMUPT_AMDV1,
+};
+
+/* These values are true for MOCK_IOMMUPT_DEFAULT */
+enum {
MOCK_APERTURE_START = 1UL << 24,
MOCK_APERTURE_LAST = (1UL << 31) - 1,
+ MOCK_PAGE_SIZE = 2048,
+ MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE,
};
enum {
@@ -52,7 +63,6 @@ enum {
enum {
MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
- MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1,
MOCK_FLAGS_DEVICE_PASID = 1 << 2,
};
@@ -176,6 +186,14 @@ struct iommu_test_cmd {
__u32 hwpt_id;
/* @id is stdev_id */
} pasid_check;
+ struct {
+ __u32 length;
+ __u32 open_flags;
+ } dmabuf_get;
+ struct {
+ __s32 dmabuf_fd;
+ __u32 revoked;
+ } dmabuf_revoke;
};
__u32 last;
};
@@ -205,6 +223,7 @@ struct iommu_test_hw_info {
*/
struct iommu_hwpt_selftest {
__u32 iotlb;
+ __u32 pagetable_type;
};
/* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index ce775fbbae94..5cc4b08c25f5 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -751,6 +751,15 @@ static struct miscdevice vfio_misc_dev = {
.mode = 0666,
};
+/*
+ * Used only by DMABUF, returns a valid struct device to use as a dummy struct
+ * device for attachment.
+ */
+struct device *iommufd_global_device(void)
+{
+ return iommu_misc_dev.this_device;
+}
+
static int __init iommufd_init(void)
{
int ret;
@@ -794,5 +803,6 @@ MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS("IOMMUFD_INTERNAL");
MODULE_IMPORT_NS("IOMMUFD");
+MODULE_IMPORT_NS("DMA_BUF");
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index c3433b845561..dbe51ecb9a20 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,6 +45,8 @@
* last_iova + 1 can overflow. An iopt_pages index will always be much less than
* ULONG_MAX so last_index + 1 cannot overflow.
*/
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
@@ -53,6 +55,7 @@
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
+#include <linux/vfio_pci_core.h>
#include "double_span.h"
#include "io_pagetable.h"
@@ -258,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
return container_of(node, struct iopt_area, pages_node);
}
+enum batch_kind {
+ BATCH_CPU_MEMORY = 0,
+ BATCH_MMIO,
+};
+
/*
* A simple datastructure to hold a vector of PFNs, optimized for contiguous
* PFNs. This is used as a temporary holding memory for shuttling pfns from one
@@ -271,7 +279,9 @@ struct pfn_batch {
unsigned int array_size;
unsigned int end;
unsigned int total_pfns;
+ enum batch_kind kind;
};
+enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) };
static void batch_clear(struct pfn_batch *batch)
{
@@ -348,11 +358,17 @@ static void batch_destroy(struct pfn_batch *batch, void *backup)
}
static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn,
- u32 nr)
+ u32 nr, enum batch_kind kind)
{
- const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
unsigned int end = batch->end;
+ if (batch->kind != kind) {
+ /* One kind per batch */
+ if (batch->end != 0)
+ return false;
+ batch->kind = kind;
+ }
+
if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] &&
nr <= MAX_NPFNS - batch->npfns[end - 1]) {
batch->npfns[end - 1] += nr;
@@ -379,7 +395,7 @@ static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr)
/* true if the pfn was added, false otherwise */
static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
{
- return batch_add_pfn_num(batch, pfn, 1);
+ return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY);
}
/*
@@ -492,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
{
bool disable_large_pages = area->iopt->disable_large_pages;
unsigned long last_iova = iopt_area_last_iova(area);
+ int iommu_prot = area->iommu_prot;
unsigned int page_offset = 0;
unsigned long start_iova;
unsigned long next_iova;
@@ -499,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
unsigned long iova;
int rc;
+ if (batch->kind == BATCH_MMIO) {
+ iommu_prot &= ~IOMMU_CACHE;
+ iommu_prot |= IOMMU_MMIO;
+ }
+
/* The first index might be a partial page */
if (start_index == iopt_area_index(area))
page_offset = area->page_offset;
@@ -512,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
rc = batch_iommu_map_small(
domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot);
+ next_iova - iova, iommu_prot);
else
rc = iommu_map(domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot,
+ next_iova - iova, iommu_prot,
GFP_KERNEL_ACCOUNT);
if (rc)
goto err_unmap;
@@ -652,7 +674,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
nr = min(nr, npages);
npages -= nr;
- if (!batch_add_pfn_num(batch, pfn, nr))
+ if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY))
break;
if (nr > 1) {
rc = folio_add_pins(folio, nr - 1);
@@ -1054,6 +1076,41 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
return iopt_pages_update_pinned(pages, npages, inc, user);
}
+struct pfn_reader_dmabuf {
+ struct dma_buf_phys_vec phys;
+ unsigned long start_offset;
+};
+
+static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf,
+ struct iopt_pages *pages)
+{
+ /* Callers must not get here if the dmabuf was already revoked */
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return -EINVAL;
+
+ dmabuf->phys = pages->dmabuf.phys;
+ dmabuf->start_offset = pages->dmabuf.start;
+ return 0;
+}
+
+static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf,
+ struct pfn_batch *batch,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE;
+
+ /*
+ * start/last_index and start are all PAGE_SIZE aligned, the batch is
+ * always filled using page size aligned PFNs just like the other types.
+ * If the dmabuf has been sliced on a sub page offset then the common
+ * batch to domain code will adjust it before mapping to the domain.
+ */
+ batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start),
+ last_index - start_index + 1, BATCH_MMIO);
+ return 0;
+}
+
/*
* PFNs are stored in three places, in order of preference:
* - The iopt_pages xarray. This is only populated if there is a
@@ -1072,7 +1129,10 @@ struct pfn_reader {
unsigned long batch_end_index;
unsigned long last_index;
- struct pfn_reader_user user;
+ union {
+ struct pfn_reader_user user;
+ struct pfn_reader_dmabuf dmabuf;
+ };
};
static int pfn_reader_update_pinned(struct pfn_reader *pfns)
@@ -1108,7 +1168,7 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
struct interval_tree_double_span_iter *span = &pfns->span;
unsigned long start_index = pfns->batch_end_index;
- struct pfn_reader_user *user = &pfns->user;
+ struct pfn_reader_user *user;
unsigned long npages;
struct iopt_area *area;
int rc;
@@ -1140,8 +1200,13 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
return 0;
}
- if (start_index >= pfns->user.upages_end) {
- rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
+ if (iopt_is_dmabuf(pfns->pages))
+ return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch,
+ start_index, span->last_hole);
+
+ user = &pfns->user;
+ if (start_index >= user->upages_end) {
+ rc = pfn_reader_user_pin(user, pfns->pages, start_index,
span->last_hole);
if (rc)
return rc;
@@ -1209,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
pfns->batch_start_index = start_index;
pfns->batch_end_index = start_index;
pfns->last_index = last_index;
- pfn_reader_user_init(&pfns->user, pages);
+ if (iopt_is_dmabuf(pages))
+ pfn_reader_dmabuf_init(&pfns->dmabuf, pages);
+ else
+ pfn_reader_user_init(&pfns->user, pages);
rc = batch_init(&pfns->batch, last_index - start_index + 1);
if (rc)
return rc;
@@ -1230,8 +1298,12 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
struct iopt_pages *pages = pfns->pages;
- struct pfn_reader_user *user = &pfns->user;
+ struct pfn_reader_user *user;
+
+ if (iopt_is_dmabuf(pages))
+ return;
+ user = &pfns->user;
if (user->upages_end > pfns->batch_end_index) {
/* Any pages not transferred to the batch are just unpinned */
@@ -1261,7 +1333,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns)
struct iopt_pages *pages = pfns->pages;
pfn_reader_release_pins(pfns);
- pfn_reader_user_destroy(&pfns->user, pfns->pages);
+ if (!iopt_is_dmabuf(pfns->pages))
+ pfn_reader_user_destroy(&pfns->user, pfns->pages);
batch_destroy(&pfns->batch, NULL);
WARN_ON(pages->last_npinned != pages->npinned);
}
@@ -1340,26 +1413,234 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
return pages;
}
-struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
unsigned long length, bool writable)
{
struct iopt_pages *pages;
- unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE);
- unsigned long end;
- if (length && check_add_overflow(start, length - 1, &end))
- return ERR_PTR(-EOVERFLOW);
-
- pages = iopt_alloc_pages(start - start_down, length, writable);
+ pages = iopt_alloc_pages(start_byte, length, writable);
if (IS_ERR(pages))
return pages;
pages->file = get_file(file);
- pages->start = start_down;
+ pages->start = start - start_byte;
pages->type = IOPT_ADDRESS_FILE;
return pages;
}
+static void iopt_revoke_notify(struct dma_buf_attachment *attach)
+{
+ struct iopt_pages *pages = attach->importer_priv;
+ struct iopt_pages_dmabuf_track *track;
+
+ guard(mutex)(&pages->mutex);
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ struct iopt_area *area = track->area;
+
+ iopt_area_unmap_domain_range(area, track->domain,
+ iopt_area_index(area),
+ iopt_area_last_index(area));
+ }
+ pages->dmabuf.phys.len = 0;
+}
+
+static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = {
+ .allow_peer2peer = true,
+ .move_notify = iopt_revoke_notify,
+};
+
+/*
+ * iommufd and vfio have a circular dependency. Future work for a phys
+ * based private interconnect will remove this.
+ */
+static int
+sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ typeof(&vfio_pci_dma_buf_iommufd_map) fn;
+ int rc;
+
+ rc = iommufd_test_dma_buf_iommufd_map(attachment, phys);
+ if (rc != -EOPNOTSUPP)
+ return rc;
+
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF))
+ return -EOPNOTSUPP;
+
+ fn = symbol_get(vfio_pci_dma_buf_iommufd_map);
+ if (!fn)
+ return -EOPNOTSUPP;
+ rc = fn(attachment, phys);
+ symbol_put(vfio_pci_dma_buf_iommufd_map);
+ return rc;
+}
+
+static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages,
+ struct dma_buf *dmabuf)
+{
+ struct dma_buf_attachment *attach;
+ int rc;
+
+ attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(),
+ &iopt_dmabuf_attach_revoke_ops, pages);
+ if (IS_ERR(attach))
+ return PTR_ERR(attach);
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ /*
+ * Lock ordering requires the mutex to be taken inside the reservation,
+ * make sure lockdep sees this.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ mutex_lock(&pages->mutex);
+ mutex_unlock(&pages->mutex);
+ }
+
+ rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys);
+ if (rc)
+ goto err_detach;
+
+ dma_resv_unlock(dmabuf->resv);
+
+ /* On success iopt_release_pages() will detach and put the dmabuf. */
+ pages->dmabuf.attach = attach;
+ return 0;
+
+err_detach:
+ dma_resv_unlock(dmabuf->resv);
+ dma_buf_detach(dmabuf, attach);
+ return rc;
+}
+
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable)
+{
+ static struct lock_class_key pages_dmabuf_mutex_key;
+ struct iopt_pages *pages;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (dmabuf->size <= (start + length - 1) ||
+ length / PAGE_SIZE >= MAX_NPFNS)
+ return ERR_PTR(-EINVAL);
+
+ pages = iopt_alloc_pages(start_byte, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+
+ /*
+ * The mmap_lock can be held when obtaining the dmabuf reservation lock
+ * which creates a locking cycle with the pages mutex which is held
+ * while obtaining the mmap_lock. This locking path is not present for
+ * IOPT_ADDRESS_DMABUF so split the lock class.
+ */
+ lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key);
+
+ /* dmabuf does not use pinned page accounting. */
+ pages->account_mode = IOPT_PAGES_ACCOUNT_NONE;
+ pages->type = IOPT_ADDRESS_DMABUF;
+ pages->dmabuf.start = start - start_byte;
+ INIT_LIST_HEAD(&pages->dmabuf.tracker);
+
+ rc = iopt_map_dmabuf(ictx, pages, dmabuf);
+ if (rc) {
+ iopt_put_pages(pages);
+ return ERR_PTR(rc);
+ }
+
+ return pages;
+}
+
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ if (WARN_ON(!iopt_is_dmabuf(pages)))
+ return -EINVAL;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->domain == domain && track->area == area))
+ return -EINVAL;
+
+ track = kzalloc(sizeof(*track), GFP_KERNEL);
+ if (!track)
+ return -ENOMEM;
+ track->domain = domain;
+ track->area = area;
+ list_add_tail(&track->elm, &pages->dmabuf.tracker);
+
+ return 0;
+}
+
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ WARN_ON(!iopt_is_dmabuf(pages));
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ if (track->domain == domain && track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ return;
+ }
+ }
+ WARN_ON(true);
+}
+
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iommu_domain *domain;
+ unsigned long index;
+ int rc;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->area == area))
+ return -EINVAL;
+
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto err_untrack;
+ }
+ return 0;
+err_untrack:
+ iopt_dmabuf_untrack_all_domains(area, pages);
+ return rc;
+}
+
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iopt_pages_dmabuf_track *tmp;
+
+ list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker,
+ elm) {
+ if (track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ }
+ }
+}
+
void iopt_release_pages(struct kref *kref)
{
struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
@@ -1372,8 +1653,15 @@ void iopt_release_pages(struct kref *kref)
mutex_destroy(&pages->mutex);
put_task_struct(pages->source_task);
free_uid(pages->source_user);
- if (pages->type == IOPT_ADDRESS_FILE)
+ if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) {
+ struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf;
+
+ dma_buf_detach(dmabuf, pages->dmabuf.attach);
+ dma_buf_put(dmabuf);
+ WARN_ON(!list_empty(&pages->dmabuf.tracker));
+ } else if (pages->type == IOPT_ADDRESS_FILE) {
fput(pages->file);
+ }
kfree(pages);
}
@@ -1451,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area,
lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return;
+ iopt_area_unmap_domain_range(area, domain, start_index,
+ last_index);
+ return;
+ }
+
/*
* For security we must not unpin something that is still DMA mapped,
* so this must unmap any IOVA before we go ahead and unpin the pages.
@@ -1526,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain)
void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
struct iommu_domain *domain)
{
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
__iopt_area_unfill_domain(area, pages, domain,
iopt_area_last_index(area));
}
@@ -1546,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
lockdep_assert_held(&area->pages->mutex);
+ if (iopt_dmabuf_revoked(area->pages))
+ return 0;
+
rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
iopt_area_last_index(area));
if (rc)
@@ -1605,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
return 0;
mutex_lock(&pages->mutex);
- rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
- iopt_area_last_index(area));
- if (rc)
- goto out_unlock;
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_all_domains(area, pages);
+ if (rc)
+ goto out_unlock;
+ }
- while (!pfn_reader_done(&pfns)) {
- done_first_end_index = pfns.batch_end_index;
- done_all_end_index = pfns.batch_start_index;
- xa_for_each(&area->iopt->domains, index, domain) {
- rc = batch_to_domain(&pfns.batch, domain, area,
- pfns.batch_start_index);
+ if (!iopt_dmabuf_revoked(pages)) {
+ rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ goto out_untrack;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_first_end_index = pfns.batch_end_index;
+ done_all_end_index = pfns.batch_start_index;
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ }
+ done_all_end_index = done_first_end_index;
+
+ rc = pfn_reader_next(&pfns);
if (rc)
goto out_unmap;
}
- done_all_end_index = done_first_end_index;
-
- rc = pfn_reader_next(&pfns);
+ rc = pfn_reader_update_pinned(&pfns);
if (rc)
goto out_unmap;
+
+ pfn_reader_destroy(&pfns);
}
- rc = pfn_reader_update_pinned(&pfns);
- if (rc)
- goto out_unmap;
area->storage_domain = xa_load(&area->iopt->domains, 0);
interval_tree_insert(&area->pages_node, &pages->domains_itree);
- goto out_destroy;
+ mutex_unlock(&pages->mutex);
+ return 0;
out_unmap:
pfn_reader_release_pins(&pfns);
@@ -1658,8 +1971,10 @@ out_unmap:
end_index);
}
}
-out_destroy:
pfn_reader_destroy(&pfns);
+out_untrack:
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
out_unlock:
mutex_unlock(&pages->mutex);
return rc;
@@ -1685,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
if (!area->storage_domain)
goto out_unlock;
- xa_for_each(&iopt->domains, index, domain)
- if (domain != area->storage_domain)
+ xa_for_each(&iopt->domains, index, domain) {
+ if (domain == area->storage_domain)
+ continue;
+
+ if (!iopt_dmabuf_revoked(pages))
iopt_area_unmap_domain_range(
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
+ }
if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
area->storage_domain = NULL;
out_unlock:
mutex_unlock(&pages->mutex);
@@ -2031,15 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
return -EPERM;
- if (pages->type == IOPT_ADDRESS_FILE)
+ if (iopt_is_dmabuf(pages))
+ return -EINVAL;
+
+ if (pages->type != IOPT_ADDRESS_USER)
return iopt_pages_rw_slow(pages, start_index, last_index,
start_byte % PAGE_SIZE, data, length,
flags);
- if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
- WARN_ON(pages->type != IOPT_ADDRESS_USER))
- return -EINVAL;
-
if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
if (start_index == last_index)
return iopt_pages_rw_page(pages, start_index,
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index de178827a078..c4322fd26f93 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -5,6 +5,8 @@
*/
#include <linux/anon_inodes.h>
#include <linux/debugfs.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/fault-inject.h>
#include <linux/file.h>
#include <linux/iommu.h>
@@ -12,6 +14,8 @@
#include <linux/slab.h>
#include <linux/xarray.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
+#include "../iommu-pages.h"
#include "../iommu-priv.h"
#include "io_pagetable.h"
@@ -41,21 +45,6 @@ static DEFINE_IDA(mock_dev_ida);
enum {
MOCK_DIRTY_TRACK = 1,
- MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
- MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE,
-
- /*
- * Like a real page table alignment requires the low bits of the address
- * to be zero. xarray also requires the high bit to be zero, so we store
- * the pfns shifted. The upper bits are used for metadata.
- */
- MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE,
-
- _MOCK_PFN_START = MOCK_PFN_MASK + 1,
- MOCK_PFN_START_IOVA = _MOCK_PFN_START,
- MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
- MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
- MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2,
};
static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain);
@@ -124,10 +113,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
}
struct mock_iommu_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ };
unsigned long flags;
- struct iommu_domain domain;
- struct xarray pfns;
};
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain);
static inline struct mock_iommu_domain *
to_mock_domain(struct iommu_domain *domain)
@@ -216,7 +210,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
}
static int mock_domain_nop_attach(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct mock_dev *mdev = to_mock_dev(dev);
struct mock_viommu *new_viommu = NULL;
@@ -344,74 +338,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock,
- unsigned long iova, size_t page_size,
- unsigned long flags)
-{
- unsigned long cur, end = iova + page_size - 1;
- bool dirty = false;
- void *ent, *old;
-
- for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
- if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA))
- continue;
-
- dirty = true;
- /* Clear dirty */
- if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
- unsigned long val;
-
- val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- }
- }
-
- return dirty;
-}
-
-static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- unsigned long end = iova + size;
- void *ent;
-
- if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
- return -EINVAL;
-
- do {
- unsigned long pgsize = MOCK_IO_PAGE_SIZE;
- unsigned long head;
-
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent) {
- iova += pgsize;
- continue;
- }
-
- if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA)
- pgsize = MOCK_HUGE_PAGE_SIZE;
- head = iova & ~(pgsize - 1);
-
- /* Clear dirty */
- if (mock_test_and_clear_dirty(mock, head, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-static const struct iommu_dirty_ops dirty_ops = {
- .set_dirty_tracking = mock_domain_set_dirty_tracking,
- .read_and_clear_dirty = mock_domain_read_and_clear_dirty,
-};
-
static struct mock_iommu_domain_nested *
__mock_domain_alloc_nested(const struct iommu_user_data *user_data)
{
@@ -446,7 +372,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
- if (!parent || parent->ops != mock_ops.default_domain_ops)
+ if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING))
return ERR_PTR(-EINVAL);
mock_parent = to_mock_domain(parent);
@@ -459,159 +385,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
return &mock_nested->domain;
}
-static struct iommu_domain *
-mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
- const struct iommu_user_data *user_data)
-{
- bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
- IOMMU_HWPT_ALLOC_NEST_PARENT |
- IOMMU_HWPT_ALLOC_PASID;
- struct mock_dev *mdev = to_mock_dev(dev);
- bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
- struct mock_iommu_domain *mock;
-
- if (user_data)
- return ERR_PTR(-EOPNOTSUPP);
- if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
- return ERR_PTR(-EOPNOTSUPP);
-
- mock = kzalloc(sizeof(*mock), GFP_KERNEL);
- if (!mock)
- return ERR_PTR(-ENOMEM);
- mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
- mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
- mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
- if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA)
- mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE;
- mock->domain.ops = mock_ops.default_domain_ops;
- mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
- xa_init(&mock->pfns);
-
- if (has_dirty_flag)
- mock->domain.dirty_ops = &dirty_ops;
- return &mock->domain;
-}
-
static void mock_domain_free(struct iommu_domain *domain)
{
struct mock_iommu_domain *mock = to_mock_domain(domain);
- WARN_ON(!xa_empty(&mock->pfns));
+ pt_iommu_deinit(&mock->iommu);
kfree(mock);
}
-static int mock_domain_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount, int prot,
- gfp_t gfp, size_t *mapped)
+static void mock_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- unsigned long flags = MOCK_PFN_START_IOVA;
- unsigned long start_iova = iova;
+ iommu_put_pages_list(&gather->freelist);
+}
- /*
- * xarray does not reliably work with fault injection because it does a
- * retry allocation, so put our own failure point.
- */
- if (iommufd_should_fail())
- return -ENOENT;
+static const struct iommu_domain_ops amdv1_mock_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
- for (; pgcount; pgcount--) {
- size_t cur;
+static const struct iommu_domain_ops amdv1_mock_huge_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
+#undef pt_iommu_amdv1_mock_map_pages
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- void *old;
+static const struct iommu_dirty_ops amdv1_mock_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1_mock),
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
- if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
- flags = MOCK_PFN_LAST_IOVA;
- if (pgsize != MOCK_IO_PAGE_SIZE) {
- flags |= MOCK_PFN_HUGE_IOVA;
- }
- old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
- xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
- flags),
- gfp);
- if (xa_is_err(old)) {
- for (; start_iova != iova;
- start_iova += MOCK_IO_PAGE_SIZE)
- xa_erase(&mock->pfns,
- start_iova /
- MOCK_IO_PAGE_SIZE);
- return xa_err(old);
- }
- WARN_ON(old);
- iova += MOCK_IO_PAGE_SIZE;
- paddr += MOCK_IO_PAGE_SIZE;
- *mapped += MOCK_IO_PAGE_SIZE;
- flags = 0;
- }
- }
- return 0;
-}
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
-static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
- unsigned long iova, size_t pgsize,
- size_t pgcount,
- struct iommu_iotlb_gather *iotlb_gather)
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
+
+static struct mock_iommu_domain *
+mock_domain_alloc_pgtable(struct device *dev,
+ const struct iommu_hwpt_selftest *user_cfg, u32 flags)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- bool first = true;
- size_t ret = 0;
- void *ent;
+ struct mock_iommu_domain *mock;
+ int rc;
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+ mock = kzalloc(sizeof(*mock), GFP_KERNEL);
+ if (!mock)
+ return ERR_PTR(-ENOMEM);
+ mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
- for (; pgcount; pgcount--) {
- size_t cur;
+ mock->amdv1.iommu.nid = NUMA_NO_NODE;
+
+ switch (user_cfg->pagetable_type) {
+ case MOCK_IOMMUPT_DEFAULT:
+ case MOCK_IOMMUPT_HUGE: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ /* The mock version has a 2k page size */
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.common.hw_max_oasz_lg2 = 51;
+ cfg.starting_level = 2;
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.ops = &amdv1_mock_huge_ops;
+ else
+ mock->domain.ops = &amdv1_mock_ops;
+ rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+
+ /*
+ * In huge mode userspace should only provide huge pages, we
+ * have to include PAGE_SIZE for the domain to be accepted by
+ * iommufd.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE |
+ PAGE_SIZE;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_mock_dirty_ops;
+ break;
+ }
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ case MOCK_IOMMUPT_AMDV1: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ cfg.common.hw_max_vasz_lg2 = 64;
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+ cfg.starting_level = 2;
+ mock->domain.ops = &amdv1_ops;
+ rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_dirty_ops;
+ break;
+ }
+ default:
+ rc = -EOPNOTSUPP;
+ goto err_free;
+ }
- /*
- * iommufd generates unmaps that must be a strict
- * superset of the map's performend So every
- * starting/ending IOVA should have been an iova passed
- * to map.
- *
- * This simple logic doesn't work when the HUGE_PAGE is
- * turned on since the core code will automatically
- * switch between the two page sizes creating a break in
- * the unmap calls. The break can land in the middle of
- * contiguous IOVA.
- */
- if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) {
- if (first) {
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_START_IOVA));
- first = false;
- }
- if (pgcount == 1 &&
- cur + MOCK_IO_PAGE_SIZE == pgsize)
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_LAST_IOVA));
- }
+ /*
+ * Override the real aperture to the MOCK aperture for test purposes.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) {
+ WARN_ON(mock->domain.geometry.aperture_start != 0);
+ WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST);
- iova += MOCK_IO_PAGE_SIZE;
- ret += MOCK_IO_PAGE_SIZE;
- }
+ mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
+ mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
}
- return ret;
+
+ return mock;
+err_free:
+ kfree(mock);
+ return ERR_PTR(rc);
}
-static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
+static struct iommu_domain *
+mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
{
- struct mock_iommu_domain *mock = to_mock_domain(domain);
- void *ent;
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_PASID;
+ struct mock_dev *mdev = to_mock_dev(dev);
+ bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_hwpt_selftest user_cfg = {};
+ struct mock_iommu_domain *mock;
+ int rc;
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- WARN_ON(!ent);
- return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
+ if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST &&
+ user_data->type != IOMMU_HWPT_DATA_NONE))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (user_data) {
+ rc = iommu_copy_struct_from_user(
+ &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
+ }
+
+ mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags);
+ if (IS_ERR(mock))
+ return ERR_CAST(mock);
+ return &mock->domain;
}
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
@@ -955,15 +892,6 @@ static const struct iommu_ops mock_ops = {
.user_pasid_table = true,
.get_viommu_size = mock_get_viommu_size,
.viommu_init = mock_viommu_init,
- .default_domain_ops =
- &(struct iommu_domain_ops){
- .free = mock_domain_free,
- .attach_dev = mock_domain_nop_attach,
- .map_pages = mock_domain_map_pages,
- .unmap_pages = mock_domain_unmap_pages,
- .iova_to_phys = mock_domain_iova_to_phys,
- .set_dev_pasid = mock_domain_set_dev_pasid_nop,
- },
};
static void mock_domain_free_nested(struct iommu_domain *domain)
@@ -1047,7 +975,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
if (IS_ERR(hwpt))
return hwpt;
if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
- hwpt->domain->ops != mock_ops.default_domain_ops) {
+ hwpt->domain->owner != &mock_ops) {
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
@@ -1088,7 +1016,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{},
};
const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY |
- MOCK_FLAGS_DEVICE_HUGE_IOVA |
MOCK_FLAGS_DEVICE_PASID;
struct mock_dev *mdev;
int rc, i;
@@ -1277,23 +1204,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
{
struct iommufd_hw_pagetable *hwpt;
struct mock_iommu_domain *mock;
+ unsigned int page_size;
uintptr_t end;
int rc;
- if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
- (uintptr_t)uptr % MOCK_IO_PAGE_SIZE ||
- check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
- return -EINVAL;
-
hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- for (; length; length -= MOCK_IO_PAGE_SIZE) {
+ page_size = 1 << __ffs(mock->domain.pgsize_bitmap);
+ if (iova % page_size || length % page_size ||
+ (uintptr_t)uptr % page_size ||
+ check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
+ return -EINVAL;
+
+ for (; length; length -= page_size) {
struct page *pages[1];
+ phys_addr_t io_phys;
unsigned long pfn;
long npages;
- void *ent;
npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
pages);
@@ -1308,15 +1237,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
pfn = page_to_pfn(pages[0]);
put_page(pages[0]);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent ||
- (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
- pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
+ io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova);
+ if (io_phys !=
+ pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
rc = -EINVAL;
goto out_put;
}
- iova += MOCK_IO_PAGE_SIZE;
- uptr += MOCK_IO_PAGE_SIZE;
+ iova += page_size;
+ uptr += page_size;
}
rc = 0;
@@ -1795,7 +1723,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+ if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) {
rc = -EINVAL;
goto out_put;
}
@@ -1814,22 +1742,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
}
for (i = 0; i < max; i++) {
- unsigned long cur = iova + i * page_size;
- void *ent, *old;
-
if (!test_bit(i, (unsigned long *)tmp))
continue;
-
- ent = xa_load(&mock->pfns, cur / page_size);
- if (ent) {
- unsigned long val;
-
- val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / page_size,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- count++;
- }
+ mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size);
+ count++;
}
cmd->dirty.out_nr_dirty = count;
@@ -2031,6 +1947,140 @@ void iommufd_selftest_destroy(struct iommufd_object *obj)
}
}
+struct iommufd_test_dma_buf {
+ void *memory;
+ size_t length;
+ bool revoked;
+};
+
+static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ return 0;
+}
+
+static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+}
+
+static struct sg_table *
+iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+}
+
+static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct iommufd_test_dma_buf *priv = dmabuf->priv;
+
+ kfree(priv->memory);
+ kfree(priv);
+}
+
+static const struct dma_buf_ops iommufd_test_dmabuf_ops = {
+ .attach = iommufd_test_dma_buf_attach,
+ .detach = iommufd_test_dma_buf_detach,
+ .map_dma_buf = iommufd_test_dma_buf_map,
+ .release = iommufd_test_dma_buf_release,
+ .unmap_dma_buf = iommufd_test_dma_buf_unmap,
+};
+
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv;
+
+ dma_resv_assert_held(attachment->dmabuf->resv);
+
+ if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops)
+ return -EOPNOTSUPP;
+
+ if (priv->revoked)
+ return -ENODEV;
+
+ phys->paddr = virt_to_phys(priv->memory);
+ phys->len = priv->length;
+ return 0;
+}
+
+static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd,
+ unsigned int open_flags,
+ size_t len)
+{
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc;
+
+ len = ALIGN(len, PAGE_SIZE);
+ if (len == 0 || len > PAGE_SIZE * 512)
+ return -EINVAL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->length = len;
+ priv->memory = kzalloc(len, GFP_KERNEL);
+ if (!priv->memory) {
+ rc = -ENOMEM;
+ goto err_free;
+ }
+
+ exp_info.ops = &iommufd_test_dmabuf_ops;
+ exp_info.size = len;
+ exp_info.flags = open_flags;
+ exp_info.priv = priv;
+
+ dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(dmabuf)) {
+ rc = PTR_ERR(dmabuf);
+ goto err_free;
+ }
+
+ return dma_buf_fd(dmabuf, open_flags);
+
+err_free:
+ kfree(priv->memory);
+ kfree(priv);
+ return rc;
+}
+
+static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd,
+ bool revoked)
+{
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc = 0;
+
+ dmabuf = dma_buf_get(fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &iommufd_test_dmabuf_ops) {
+ rc = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ priv = dmabuf->priv;
+ dma_resv_lock(dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(dmabuf);
+ dma_resv_unlock(dmabuf->resv);
+
+err_put:
+ dma_buf_put(dmabuf);
+ return rc;
+}
+
int iommufd_test(struct iommufd_ucmd *ucmd)
{
struct iommu_test_cmd *cmd = ucmd->cmd;
@@ -2109,6 +2159,13 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return iommufd_test_pasid_detach(ucmd, cmd);
case IOMMU_TEST_OP_PASID_CHECK_HWPT:
return iommufd_test_pasid_check_hwpt(ucmd, cmd);
+ case IOMMU_TEST_OP_DMABUF_GET:
+ return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags,
+ cmd->dmabuf_get.length);
+ case IOMMU_TEST_OP_DMABUF_REVOKE:
+ return iommufd_test_dmabuf_revoke(ucmd,
+ cmd->dmabuf_revoke.dmabuf_fd,
+ cmd->dmabuf_revoke.revoked);
default:
return -EOPNOTSUPP;
}
@@ -2202,3 +2259,5 @@ void iommufd_test_exit(void)
platform_device_unregister(selftest_iommu_dev);
debugfs_remove_recursive(dbgfs_root);
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ffa892f65714..ca848288dbf2 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain)
}
static int ipmmu_attach_device(struct iommu_domain *io_domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
@@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain,
}
static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_domain *domain;
unsigned int i;
- if (io_domain == identity_domain || !io_domain)
+ if (old == identity_domain || !old)
return 0;
- domain = to_vmsa_domain(io_domain);
+ domain = to_vmsa_domain(old);
for (i = 0; i < fwspec->num_ids; ++i)
ipmmu_utlb_disable(domain, fwspec->ids[i]);
@@ -720,6 +720,8 @@ static int ipmmu_init_platform_device(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev));
+ put_device(&ipmmu_pdev->dev);
+
return 0;
}
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 43a61ba021a5..819add75a665 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev)
return &iommu->iommu;
}
-static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
unsigned long flags;
@@ -441,19 +442,19 @@ fail:
}
static int msm_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct msm_priv *priv;
unsigned long flags;
struct msm_iommu_dev *iommu;
struct msm_iommu_ctx_dev *master;
int ret = 0;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- priv = to_msm_priv(domain);
+ priv = to_msm_priv(old);
free_io_pgtable_ops(priv->iop);
spin_lock_irqsave(&msm_iommu_lock, flags);
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 0e0285348d2b..60fcd3d3b5eb 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -139,6 +139,7 @@
/* 2 bits: iommu type */
#define MTK_IOMMU_TYPE_MM (0x0 << 13)
#define MTK_IOMMU_TYPE_INFRA (0x1 << 13)
+#define MTK_IOMMU_TYPE_APU (0x2 << 13)
#define MTK_IOMMU_TYPE_MASK (0x3 << 13)
/* PM and clock always on. e.g. infra iommu */
#define PM_CLK_AO BIT(15)
@@ -147,6 +148,7 @@
#define TF_PORT_TO_ADDR_MT8173 BIT(18)
#define INT_ID_PORT_WIDTH_6 BIT(19)
#define CFG_IFA_MASTER_IN_ATF BIT(20)
+#define DL_WITH_MULTI_LARB BIT(21)
#define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \
((((pdata)->flags) & (mask)) == (_x))
@@ -172,6 +174,7 @@ enum mtk_iommu_plat {
M4U_MT8183,
M4U_MT8186,
M4U_MT8188,
+ M4U_MT8189,
M4U_MT8192,
M4U_MT8195,
M4U_MT8365,
@@ -335,6 +338,8 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data, unsigned int ban
*/
#define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL
+static LIST_HEAD(apulist); /* List the apu iommu HWs */
+static LIST_HEAD(infralist); /* List the iommu_infra HW */
static LIST_HEAD(m4ulist); /* List all the M4U HWs */
#define for_each_m4u(data, head) list_for_each_entry(data, head, list)
@@ -350,6 +355,15 @@ static const struct mtk_iommu_iova_region single_domain[] = {
#define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? \
MT8192_MULTI_REGION_NR_MAX : 1)
+static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = {
+ { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */
+#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
+ { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */
+ { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */
+ { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */
+#endif
+};
+
static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = {
{ .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */
#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
@@ -705,7 +719,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
}
static int mtk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata;
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
@@ -773,12 +787,12 @@ err_unlock:
}
static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
mtk_iommu_config(data, dev, false, 0);
@@ -865,6 +879,7 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
struct device_link *link;
struct device *larbdev;
+ unsigned long larbid_msk = 0;
unsigned int larbid, larbidx, i;
if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
@@ -872,30 +887,50 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
/*
* Link the consumer device with the smi-larb device(supplier).
- * The device that connects with each a larb is a independent HW.
- * All the ports in each a device should be in the same larbs.
+ * w/DL_WITH_MULTI_LARB: the master may connect with multi larbs,
+ * we should create device link with each larb.
+ * w/o DL_WITH_MULTI_LARB: the master must connect with one larb,
+ * otherwise fail.
*/
larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
if (larbid >= MTK_LARB_NR_MAX)
return ERR_PTR(-EINVAL);
+ larbid_msk |= BIT(larbid);
+
for (i = 1; i < fwspec->num_ids; i++) {
larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]);
- if (larbid != larbidx) {
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) {
+ larbid_msk |= BIT(larbidx);
+ } else if (larbid != larbidx) {
dev_err(dev, "Can only use one larb. Fail@larb%d-%d.\n",
larbid, larbidx);
return ERR_PTR(-EINVAL);
}
}
- larbdev = data->larb_imu[larbid].dev;
- if (!larbdev)
- return ERR_PTR(-EINVAL);
- link = device_link_add(dev, larbdev,
- DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
- if (!link)
- dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ for_each_set_bit(larbid, &larbid_msk, 32) {
+ larbdev = data->larb_imu[larbid].dev;
+ if (!larbdev)
+ return ERR_PTR(-EINVAL);
+
+ link = device_link_add(dev, larbdev,
+ DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
+ if (!link) {
+ dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ goto link_remove;
+ }
+ }
+
return &data->iommu;
+
+link_remove:
+ for_each_set_bit(i, &larbid_msk, larbid) {
+ larbdev = data->larb_imu[i].dev;
+ device_link_remove(dev, larbdev);
+ }
+
+ return ERR_PTR(-ENODEV);
}
static void mtk_iommu_release_device(struct device *dev)
@@ -903,11 +938,19 @@ static void mtk_iommu_release_device(struct device *dev)
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct mtk_iommu_data *data;
struct device *larbdev;
- unsigned int larbid;
+ unsigned int larbid, i;
+ unsigned long larbid_msk = 0;
data = dev_iommu_priv_get(dev);
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
- larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
+ if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ return;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ larbid = MTK_M4U_TO_LARB(fwspec->ids[i]);
+ larbid_msk |= BIT(larbid);
+ }
+
+ for_each_set_bit(larbid, &larbid_msk, 32) {
larbdev = data->larb_imu[larbid].dev;
device_link_remove(dev, larbdev);
}
@@ -974,6 +1017,8 @@ static int mtk_iommu_of_xlate(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -1211,16 +1256,19 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
}
component_match_add(dev, match, component_compare_dev, &plarbdev->dev);
- platform_device_put(plarbdev);
}
- if (!frst_avail_smicomm_node)
- return -EINVAL;
+ if (!frst_avail_smicomm_node) {
+ ret = -EINVAL;
+ goto err_larbdev_put;
+ }
pcommdev = of_find_device_by_node(frst_avail_smicomm_node);
of_node_put(frst_avail_smicomm_node);
- if (!pcommdev)
- return -ENODEV;
+ if (!pcommdev) {
+ ret = -ENODEV;
+ goto err_larbdev_put;
+ }
data->smicomm_dev = &pcommdev->dev;
link = device_link_add(data->smicomm_dev, dev,
@@ -1228,16 +1276,16 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
platform_device_put(pcommdev);
if (!link) {
dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev));
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_larbdev_put;
}
return 0;
err_larbdev_put:
- for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) {
- if (!data->larb_imu[i].dev)
- continue;
+ /* id mapping may not be linear, loop the whole array */
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
put_device(data->larb_imu[i].dev);
- }
+
return ret;
}
@@ -1400,8 +1448,12 @@ out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
out_list_del:
list_del(&data->list);
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, dev);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+ }
out_runtime_disable:
pm_runtime_disable(dev);
return ret;
@@ -1421,6 +1473,9 @@ static void mtk_iommu_remove(struct platform_device *pdev)
if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, &pdev->dev);
component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
pm_runtime_disable(&pdev->dev);
for (i = 0; i < data->plat_data->banks_num; i++) {
@@ -1695,6 +1750,66 @@ static const struct mtk_iommu_plat_data mt8188_data_vpp = {
27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}},
};
+static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = {
+ [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */
+ [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */
+ [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */
+ [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_apu = {
+ .m4u_plat = M4U_MT8189,
+ .flags = IOVA_34_EN | DCM_DISABLE |
+ MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN,
+ .hw_list = &apulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .iova_region = mt8189_multi_dom_apu,
+ .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu),
+ .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}},
+ .iova_region_larb_msk = mt8189_apu_region_msk,
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_infra = {
+ .m4u_plat = M4U_MT8189,
+ .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA |
+ CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN,
+ .hw_list = &infralist,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .iova_region = single_domain,
+ .iova_region_nr = ARRAY_SIZE(single_domain),
+};
+
+static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = {
+ [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */
+ [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */
+ [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */
+ [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */
+ [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */
+ [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */
+ [21] = ~0}, /* Region2: larb21 fake GCE larb */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_mm = {
+ .m4u_plat = M4U_MT8189,
+ .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN |
+ WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM |
+ PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB,
+ .hw_list = &m4ulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 5,
+ .banks_enable = {true, false, false, false, false},
+ .iova_region = mt8192_multi_dom,
+ .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom),
+ .iova_region_larb_msk = mt8189_larb_region_msk,
+ .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2},
+ {19, 20, 9, 11}, {7}, {4},
+ {13, 17}, {14, 16}},
+};
+
static const struct mtk_iommu_plat_data mt8192_data = {
.m4u_plat = M4U_MT8192,
.flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN |
@@ -1796,6 +1911,9 @@ static const struct of_device_id mtk_iommu_of_ids[] = {
{ .compatible = "mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra},
{ .compatible = "mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo},
{ .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp},
+ { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu},
+ { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra},
+ { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm},
{ .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data},
{ .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra},
{ .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo},
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 10cc0b1197e8..c8d8eff5373d 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain)
kfree(to_mtk_domain(domain));
}
-static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev)
+static int mtk_iommu_v1_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain);
@@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device
}
static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
@@ -435,6 +438,8 @@ static int mtk_iommu_v1_create_mapping(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
ret = iommu_fwspec_add_ids(dev, args->args, 1);
@@ -641,13 +646,18 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
if (larb_nr < 0)
return larb_nr;
+ if (larb_nr > MTK_LARB_NR_MAX)
+ return -EINVAL;
+
for (i = 0; i < larb_nr; i++) {
struct device_node *larbnode;
struct platform_device *plarbdev;
larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i);
- if (!larbnode)
- return -EINVAL;
+ if (!larbnode) {
+ ret = -EINVAL;
+ goto out_put_larbs;
+ }
if (!of_device_is_available(larbnode)) {
of_node_put(larbnode);
@@ -657,11 +667,14 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
plarbdev = of_find_device_by_node(larbnode);
if (!plarbdev) {
of_node_put(larbnode);
- return -ENODEV;
+ ret = -ENODEV;
+ goto out_put_larbs;
}
if (!plarbdev->dev.driver) {
of_node_put(larbnode);
- return -EPROBE_DEFER;
+ put_device(&plarbdev->dev);
+ ret = -EPROBE_DEFER;
+ goto out_put_larbs;
}
data->larb_imu[i].dev = &plarbdev->dev;
@@ -673,7 +686,7 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
ret = mtk_iommu_v1_hw_init(data);
if (ret)
- return ret;
+ goto out_put_larbs;
ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
dev_name(&pdev->dev));
@@ -695,12 +708,17 @@ out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
out_clk_unprepare:
clk_disable_unprepare(data->bclk);
+out_put_larbs:
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+
return ret;
}
static void mtk_iommu_v1_remove(struct platform_device *pdev)
{
struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev);
+ int i;
iommu_device_sysfs_remove(&data->iommu);
iommu_device_unregister(&data->iommu);
@@ -708,6 +726,9 @@ static void mtk_iommu_v1_remove(struct platform_device *pdev)
clk_disable_unprepare(data->bclk);
devm_free_irq(&pdev->dev, data->irq, data);
component_master_del(&pdev->dev, &mtk_iommu_v1_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev)
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 5c6f5943f44b..768973b7e511 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain)
odomain->iommus = NULL;
}
-static int
-omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int omap_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
@@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
}
static int omap_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct omap_iommu_domain *omap_domain;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- omap_domain = to_omap_domain(domain);
+ omap_domain = to_omap_domain(old);
spin_lock(&omap_domain->lock);
_omap_iommu_detach_dev(omap_domain, dev);
spin_unlock(&omap_domain->lock);
@@ -1668,23 +1668,20 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev)
}
pdev = of_find_device_by_node(np);
+ of_node_put(np);
if (!pdev) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-ENODEV);
}
oiommu = platform_get_drvdata(pdev);
+ put_device(&pdev->dev);
if (!oiommu) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-EINVAL);
}
tmp->iommu_dev = oiommu;
- tmp->dev = &pdev->dev;
-
- of_node_put(np);
}
dev_iommu_priv_set(dev, arch_data);
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
index 27697109ec79..50b39be61abc 100644
--- a/drivers/iommu/omap-iommu.h
+++ b/drivers/iommu/omap-iommu.h
@@ -88,7 +88,6 @@ struct omap_iommu {
/**
* struct omap_iommu_arch_data - omap iommu private data
* @iommu_dev: handle of the OMAP iommu device
- * @dev: handle of the iommu device
*
* This is an omap iommu private data object, which binds an iommu user
* to its iommu device. This object should be placed at the iommu user's
@@ -97,7 +96,6 @@ struct omap_iommu {
*/
struct omap_iommu_arch_data {
struct omap_iommu *iommu_dev;
- struct device *dev;
};
struct cr_regs {
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index ebb22979075d..d9429097a2b5 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m
}
static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
@@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
}
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
@@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = {
};
static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct riscv_iommu_device *iommu = dev_to_iommu(dev);
struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 0861dd469bd8..85f3667e797c 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -960,7 +960,8 @@ out_disable_clocks:
}
static int rk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain;
@@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = {
};
static int rk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
@@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- ret = rk_iommu_identity_attach(&rk_identity_domain, dev);
+ ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old);
if (ret)
return ret;
@@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
return 0;
ret = rk_iommu_enable(iommu);
- if (ret)
- WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev));
+ if (ret) {
+ /*
+ * Note rk_iommu_identity_attach() might fail before physically
+ * attaching the dev to iommu->domain, in which case the actual
+ * old domain for this revert should be rk_identity_domain v.s.
+ * iommu->domain. Since rk_iommu_identity_attach() does not care
+ * about the old domain argument for now, this is not a problem.
+ */
+ WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev,
+ iommu->domain));
+ }
pm_runtime_put(iommu->dev);
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index aa576736d60b..fe679850af28 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status)
}
static int blocking_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain *s390_domain;
@@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain,
}
static int s390_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
@@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
domain->geometry.aperture_end < zdev->start_dma))
return -EINVAL;
- blocking_domain_attach_device(&blocking_domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev, old);
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
@@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void)
subsys_initcall(s390_iommu_init);
static int s390_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct zpci_dev *zdev = to_zpci_dev(dev);
u8 status;
int cc;
- blocking_domain_attach_device(&blocking_domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev, old);
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index c7ca1d8a0b15..555d4505c747 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain)
}
static int sprd_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev);
struct sprd_iommu_domain *dom = to_sprd_domain(domain);
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index de10b569d9a9..90b26fe21817 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu,
}
static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu *iommu = dev_iommu_priv_get(dev);
struct sun50i_iommu_domain *sun50i_domain;
@@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = {
};
static int sun50i_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
struct sun50i_iommu *iommu;
@@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev);
+ sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old);
sun50i_iommu_attach_domain(iommu, sun50i_domain);
@@ -839,6 +841,8 @@ static int sun50i_iommu_of_xlate(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev));
+ put_device(&iommu_pdev->dev);
+
return iommu_fwspec_add_ids(dev, &id, 1);
}
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 36cdd5fbab07..c391e7f2cde6 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
}
static int tegra_smmu_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
@@ -524,9 +524,9 @@ disable:
}
static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu_as *as;
struct tegra_smmu *smmu;
@@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
if (!fwspec)
return -ENODEV;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- as = to_smmu_as(domain);
+ as = to_smmu_as(old);
smmu = as->smmu;
for (index = 0; index < fwspec->num_ids; index++) {
tegra_smmu_disable(smmu, fwspec->ids[index], as->id);
@@ -830,10 +830,9 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np)
return NULL;
mc = platform_get_drvdata(pdev);
- if (!mc) {
- put_device(&pdev->dev);
+ put_device(&pdev->dev);
+ if (!mc)
return NULL;
- }
return mc->smmu;
}
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index b39d6f134ab2..d314fa5cd847 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev)
return domain;
}
-static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
struct virtio_iommu_req_attach req;
@@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
}
static int viommu_attach_identity_domain(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
struct virtio_iommu_req_attach req;
diff --git a/drivers/net/ethernet/broadcom/bnge/Makefile b/drivers/net/ethernet/broadcom/bnge/Makefile
index 6142d9c57f49..ea6596854e5c 100644
--- a/drivers/net/ethernet/broadcom/bnge/Makefile
+++ b/drivers/net/ethernet/broadcom/bnge/Makefile
@@ -9,4 +9,5 @@ bng_en-y := bnge_core.o \
bnge_rmem.o \
bnge_resc.o \
bnge_netdev.o \
- bnge_ethtool.o
+ bnge_ethtool.o \
+ bnge_auxr.o
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge.h b/drivers/net/ethernet/broadcom/bnge/bnge.h
index 7aed5f81cd51..411744894349 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge.h
@@ -11,6 +11,7 @@
#include <linux/bnxt/hsi.h>
#include "bnge_rmem.h"
#include "bnge_resc.h"
+#include "bnge_auxr.h"
#define DRV_VER_MAJ 1
#define DRV_VER_MIN 15
@@ -22,6 +23,12 @@ enum board_idx {
BCM57708,
};
+struct bnge_auxr_priv {
+ struct auxiliary_device aux_dev;
+ struct bnge_auxr_dev *auxr_dev;
+ int id;
+};
+
struct bnge_pf_info {
u16 fw_fid;
u16 port_id;
@@ -197,6 +204,9 @@ struct bnge_dev {
struct bnge_irq *irq_tbl;
u16 irqs_acquired;
+
+ struct bnge_auxr_priv *aux_priv;
+ struct bnge_auxr_dev *auxr_dev;
};
static inline bool bnge_is_roce_en(struct bnge_dev *bd)
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c
new file mode 100644
index 000000000000..d64592b64e17
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <asm/byteorder.h>
+#include <linux/bitmap.h>
+#include <linux/auxiliary_bus.h>
+#include <linux/bnxt/hsi.h>
+
+#include "bnge.h"
+#include "bnge_hwrm.h"
+#include "bnge_auxr.h"
+
+static DEFINE_IDA(bnge_aux_dev_ids);
+
+static void bnge_fill_msix_vecs(struct bnge_dev *bd,
+ struct bnge_msix_info *info)
+{
+ struct bnge_auxr_dev *auxr_dev = bd->auxr_dev;
+ int num_msix, i;
+
+ if (!auxr_dev->auxr_info->msix_requested) {
+ dev_warn(bd->dev, "Requested MSI-X vectors not allocated\n");
+ return;
+ }
+ num_msix = auxr_dev->auxr_info->msix_requested;
+ for (i = 0; i < num_msix; i++) {
+ info[i].vector = bd->irq_tbl[i].vector;
+ info[i].db_offset = bd->db_offset;
+ info[i].ring_idx = i;
+ }
+}
+
+int bnge_register_dev(struct bnge_auxr_dev *auxr_dev,
+ void *handle)
+{
+ struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev);
+ struct bnge_auxr_info *auxr_info;
+ int rc = 0;
+
+ netdev_lock(bd->netdev);
+ mutex_lock(&auxr_dev->auxr_dev_lock);
+ if (!bd->irq_tbl) {
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ if (!bnge_aux_has_enough_resources(bd)) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ auxr_info = auxr_dev->auxr_info;
+ auxr_info->handle = handle;
+
+ auxr_info->msix_requested = bd->aux_num_msix;
+
+ bnge_fill_msix_vecs(bd, bd->auxr_dev->msix_info);
+ auxr_dev->flags |= BNGE_ARDEV_MSIX_ALLOC;
+
+exit:
+ mutex_unlock(&auxr_dev->auxr_dev_lock);
+ netdev_unlock(bd->netdev);
+ return rc;
+}
+EXPORT_SYMBOL(bnge_register_dev);
+
+void bnge_unregister_dev(struct bnge_auxr_dev *auxr_dev)
+{
+ struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev);
+ struct bnge_auxr_info *auxr_info;
+
+ auxr_info = auxr_dev->auxr_info;
+ netdev_lock(bd->netdev);
+ mutex_lock(&auxr_dev->auxr_dev_lock);
+ if (auxr_info->msix_requested)
+ auxr_dev->flags &= ~BNGE_ARDEV_MSIX_ALLOC;
+ auxr_info->msix_requested = 0;
+
+ mutex_unlock(&auxr_dev->auxr_dev_lock);
+ netdev_unlock(bd->netdev);
+}
+EXPORT_SYMBOL(bnge_unregister_dev);
+
+int bnge_send_msg(struct bnge_auxr_dev *auxr_dev, struct bnge_fw_msg *fw_msg)
+{
+ struct bnge_dev *bd = pci_get_drvdata(auxr_dev->pdev);
+ struct output *resp;
+ struct input *req;
+ u32 resp_len;
+ int rc;
+
+ rc = bnge_hwrm_req_init(bd, req, 0 /* don't care */);
+ if (rc)
+ return rc;
+
+ rc = bnge_hwrm_req_replace(bd, req, fw_msg->msg, fw_msg->msg_len);
+ if (rc)
+ goto drop_req;
+
+ bnge_hwrm_req_timeout(bd, req, fw_msg->timeout);
+ resp = bnge_hwrm_req_hold(bd, req);
+ rc = bnge_hwrm_req_send(bd, req);
+ resp_len = le16_to_cpu(resp->resp_len);
+ if (resp_len) {
+ if (fw_msg->resp_max_len < resp_len)
+ resp_len = fw_msg->resp_max_len;
+
+ memcpy(fw_msg->resp, resp, resp_len);
+ }
+drop_req:
+ bnge_hwrm_req_drop(bd, req);
+ return rc;
+}
+EXPORT_SYMBOL(bnge_send_msg);
+
+void bnge_rdma_aux_device_uninit(struct bnge_dev *bd)
+{
+ struct bnge_auxr_priv *aux_priv;
+ struct auxiliary_device *adev;
+
+ /* Skip if no auxiliary device init was done. */
+ if (!bd->aux_priv)
+ return;
+
+ aux_priv = bd->aux_priv;
+ adev = &aux_priv->aux_dev;
+ auxiliary_device_uninit(adev);
+}
+
+static void bnge_aux_dev_release(struct device *dev)
+{
+ struct bnge_auxr_priv *aux_priv =
+ container_of(dev, struct bnge_auxr_priv, aux_dev.dev);
+ struct bnge_dev *bd = pci_get_drvdata(aux_priv->auxr_dev->pdev);
+
+ ida_free(&bnge_aux_dev_ids, aux_priv->id);
+ kfree(aux_priv->auxr_dev->auxr_info);
+ bd->auxr_dev = NULL;
+ kfree(aux_priv->auxr_dev);
+ kfree(aux_priv);
+ bd->aux_priv = NULL;
+}
+
+void bnge_rdma_aux_device_del(struct bnge_dev *bd)
+{
+ if (!bd->auxr_dev)
+ return;
+
+ auxiliary_device_delete(&bd->aux_priv->aux_dev);
+}
+
+static void bnge_set_auxr_dev_info(struct bnge_auxr_dev *auxr_dev,
+ struct bnge_dev *bd)
+{
+ auxr_dev->pdev = bd->pdev;
+ auxr_dev->l2_db_size = bd->db_size;
+ auxr_dev->l2_db_size_nc = bd->db_size;
+ auxr_dev->l2_db_offset = bd->db_offset;
+ mutex_init(&auxr_dev->auxr_dev_lock);
+
+ if (bd->flags & BNGE_EN_ROCE_V1)
+ auxr_dev->flags |= BNGE_ARDEV_ROCEV1_SUPP;
+ if (bd->flags & BNGE_EN_ROCE_V2)
+ auxr_dev->flags |= BNGE_ARDEV_ROCEV2_SUPP;
+
+ auxr_dev->chip_num = bd->chip_num;
+ auxr_dev->hw_ring_stats_size = bd->hw_ring_stats_size;
+ auxr_dev->pf_port_id = bd->pf.port_id;
+ auxr_dev->en_state = bd->state;
+ auxr_dev->bar0 = bd->bar0;
+}
+
+void bnge_rdma_aux_device_add(struct bnge_dev *bd)
+{
+ struct auxiliary_device *aux_dev;
+ int rc;
+
+ if (!bd->auxr_dev)
+ return;
+
+ aux_dev = &bd->aux_priv->aux_dev;
+ rc = auxiliary_device_add(aux_dev);
+ if (rc) {
+ dev_warn(bd->dev, "Failed to add auxiliary device for ROCE\n");
+ auxiliary_device_uninit(aux_dev);
+ bd->flags &= ~BNGE_EN_ROCE;
+ }
+
+ bd->auxr_dev->net = bd->netdev;
+}
+
+void bnge_rdma_aux_device_init(struct bnge_dev *bd)
+{
+ struct auxiliary_device *aux_dev;
+ struct bnge_auxr_info *auxr_info;
+ struct bnge_auxr_priv *aux_priv;
+ struct bnge_auxr_dev *auxr_dev;
+ int rc;
+
+ if (!bnge_is_roce_en(bd))
+ return;
+
+ aux_priv = kzalloc(sizeof(*aux_priv), GFP_KERNEL);
+ if (!aux_priv)
+ goto exit;
+
+ aux_priv->id = ida_alloc(&bnge_aux_dev_ids, GFP_KERNEL);
+ if (aux_priv->id < 0) {
+ dev_warn(bd->dev, "ida alloc failed for aux device\n");
+ kfree(aux_priv);
+ goto exit;
+ }
+
+ aux_dev = &aux_priv->aux_dev;
+ aux_dev->id = aux_priv->id;
+ aux_dev->name = "rdma";
+ aux_dev->dev.parent = &bd->pdev->dev;
+ aux_dev->dev.release = bnge_aux_dev_release;
+
+ rc = auxiliary_device_init(aux_dev);
+ if (rc) {
+ ida_free(&bnge_aux_dev_ids, aux_priv->id);
+ kfree(aux_priv);
+ goto exit;
+ }
+ bd->aux_priv = aux_priv;
+
+ auxr_dev = kzalloc(sizeof(*auxr_dev), GFP_KERNEL);
+ if (!auxr_dev)
+ goto aux_dev_uninit;
+
+ aux_priv->auxr_dev = auxr_dev;
+
+ auxr_info = kzalloc(sizeof(*auxr_info), GFP_KERNEL);
+ if (!auxr_info)
+ goto aux_dev_uninit;
+
+ auxr_dev->auxr_info = auxr_info;
+ bd->auxr_dev = auxr_dev;
+ bnge_set_auxr_dev_info(auxr_dev, bd);
+
+ return;
+
+aux_dev_uninit:
+ auxiliary_device_uninit(aux_dev);
+exit:
+ bd->flags &= ~BNGE_EN_ROCE;
+}
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h
new file mode 100644
index 000000000000..6c5c15ef2b0a
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_auxr.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Broadcom */
+
+#ifndef _BNGE_AUXR_H_
+#define _BNGE_AUXR_H_
+
+#include <linux/auxiliary_bus.h>
+
+#define BNGE_MIN_ROCE_CP_RINGS 2
+#define BNGE_MIN_ROCE_STAT_CTXS 1
+
+#define BNGE_MAX_ROCE_MSIX 64
+
+struct hwrm_async_event_cmpl;
+struct bnge;
+
+struct bnge_msix_info {
+ u32 vector;
+ u32 ring_idx;
+ u32 db_offset;
+};
+
+struct bnge_fw_msg {
+ void *msg;
+ int msg_len;
+ void *resp;
+ int resp_max_len;
+ int timeout;
+};
+
+struct bnge_auxr_info {
+ void *handle;
+ u16 msix_requested;
+};
+
+enum {
+ BNGE_ARDEV_ROCEV1_SUPP = BIT(0),
+ BNGE_ARDEV_ROCEV2_SUPP = BIT(1),
+ BNGE_ARDEV_MSIX_ALLOC = BIT(2),
+};
+
+#define BNGE_ARDEV_ROCE_SUPP (BNGE_ARDEV_ROCEV1_SUPP | \
+ BNGE_ARDEV_ROCEV2_SUPP)
+
+struct bnge_auxr_dev {
+ struct net_device *net;
+ struct pci_dev *pdev;
+ void __iomem *bar0;
+
+ struct bnge_msix_info msix_info[BNGE_MAX_ROCE_MSIX];
+
+ u32 flags;
+
+ struct bnge_auxr_info *auxr_info;
+
+ /* Doorbell BAR size in bytes mapped by L2 driver. */
+ int l2_db_size;
+ /* Doorbell BAR size in bytes mapped as non-cacheable. */
+ int l2_db_size_nc;
+ /* Doorbell offset in bytes within l2_db_size_nc. */
+ int l2_db_offset;
+
+ u16 chip_num;
+ u16 hw_ring_stats_size;
+ u16 pf_port_id;
+ unsigned long en_state;
+
+ u16 auxr_num_msix_vec;
+ u16 auxr_num_ctxs;
+
+ /* serialize auxr operations */
+ struct mutex auxr_dev_lock;
+};
+
+void bnge_rdma_aux_device_uninit(struct bnge_dev *bdev);
+void bnge_rdma_aux_device_del(struct bnge_dev *bdev);
+void bnge_rdma_aux_device_add(struct bnge_dev *bdev);
+void bnge_rdma_aux_device_init(struct bnge_dev *bdev);
+int bnge_register_dev(struct bnge_auxr_dev *adev,
+ void *handle);
+void bnge_unregister_dev(struct bnge_auxr_dev *adev);
+int bnge_send_msg(struct bnge_auxr_dev *adev, struct bnge_fw_msg *fw_msg);
+
+#endif /* _BNGE_AUXR_H_ */
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_core.c b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
index 2c72dd34d50d..c94e132bebc8 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_core.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
@@ -41,6 +41,11 @@ static void bnge_print_device_info(struct pci_dev *pdev, enum board_idx idx)
bool bnge_aux_registered(struct bnge_dev *bd)
{
+ struct bnge_auxr_dev *ba_dev = bd->auxr_dev;
+
+ if (ba_dev && ba_dev->auxr_info->msix_requested)
+ return true;
+
return false;
}
@@ -312,16 +317,20 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent)
spin_lock_init(&bd->db_lock);
#endif
+ bnge_rdma_aux_device_init(bd);
+
rc = bnge_alloc_irqs(bd);
if (rc) {
dev_err(&pdev->dev, "Error IRQ allocation rc = %d\n", rc);
- goto err_config_uninit;
+ goto err_uninit_auxr;
}
rc = bnge_netdev_alloc(bd, max_irqs);
if (rc)
goto err_free_irq;
+ bnge_rdma_aux_device_add(bd);
+
pci_save_state(pdev);
return 0;
@@ -329,6 +338,9 @@ static int bnge_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent)
err_free_irq:
bnge_free_irqs(bd);
+err_uninit_auxr:
+ bnge_rdma_aux_device_uninit(bd);
+
err_config_uninit:
bnge_net_uninit_dflt_config(bd);
@@ -354,10 +366,14 @@ static void bnge_remove_one(struct pci_dev *pdev)
{
struct bnge_dev *bd = pci_get_drvdata(pdev);
+ bnge_rdma_aux_device_del(bd);
+
bnge_netdev_free(bd);
bnge_free_irqs(bd);
+ bnge_rdma_aux_device_uninit(bd);
+
bnge_net_uninit_dflt_config(bd);
bnge_devlink_unregister(bd);
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c
index 0f971af24142..c3087e5cd875 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.c
@@ -98,6 +98,46 @@ void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t gfp)
ctx->gfp = gfp;
}
+int bnge_hwrm_req_replace(struct bnge_dev *bd, void *req, void *new_req,
+ u32 len)
+{
+ struct bnge_hwrm_ctx *ctx = __hwrm_ctx_get(bd, req);
+ struct input *internal_req = req;
+ u16 req_type;
+
+ if (!ctx)
+ return -EINVAL;
+
+ if (len > BNGE_HWRM_CTX_OFFSET)
+ return -E2BIG;
+
+ /* free any existing slices */
+ ctx->allocated = BNGE_HWRM_DMA_SIZE - BNGE_HWRM_CTX_OFFSET;
+ if (ctx->slice_addr) {
+ dma_free_coherent(bd->dev, ctx->slice_size,
+ ctx->slice_addr, ctx->slice_handle);
+ ctx->slice_addr = NULL;
+ }
+ ctx->gfp = GFP_KERNEL;
+
+ if ((bd->fw_cap & BNGE_FW_CAP_SHORT_CMD) || len > BNGE_HWRM_MAX_REQ_LEN) {
+ memcpy(internal_req, new_req, len);
+ } else {
+ internal_req->req_type = ((struct input *)new_req)->req_type;
+ ctx->req = new_req;
+ }
+
+ ctx->req_len = len;
+ ctx->req->resp_addr = cpu_to_le64(ctx->dma_handle +
+ BNGE_HWRM_RESP_OFFSET);
+
+ /* update sentinel for potentially new request type */
+ req_type = le16_to_cpu(internal_req->req_type);
+ ctx->sentinel = bnge_cal_sentinel(ctx, req_type);
+
+ return 0;
+}
+
void bnge_hwrm_req_flags(struct bnge_dev *bd, void *req,
enum bnge_hwrm_ctx_flags flags)
{
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
index 83794a12cc81..6df629761d95 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
@@ -107,4 +107,6 @@ int bnge_hwrm_req_send_silent(struct bnge_dev *bd, void *req);
void bnge_hwrm_req_alloc_flags(struct bnge_dev *bd, void *req, gfp_t flags);
void *bnge_hwrm_req_dma_slice(struct bnge_dev *bd, void *req, u32 size,
dma_addr_t *dma);
+int bnge_hwrm_req_replace(struct bnge_dev *bd, void *req, void *new_req,
+ u32 len);
#endif /* _BNGE_HWRM_H_ */
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c
index 62ebe03a0dcf..943df5f60f01 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.c
@@ -34,6 +34,18 @@ static unsigned int bnge_get_max_func_stat_ctxs(struct bnge_dev *bd)
return bd->hw_resc.max_stat_ctxs;
}
+bool bnge_aux_has_enough_resources(struct bnge_dev *bd)
+{
+ unsigned int max_stat_ctxs;
+
+ max_stat_ctxs = bnge_get_max_func_stat_ctxs(bd);
+ if (max_stat_ctxs <= BNGE_MIN_ROCE_STAT_CTXS ||
+ bd->nq_nr_rings == max_stat_ctxs)
+ return false;
+
+ return true;
+}
+
static unsigned int bnge_get_max_func_cp_rings(struct bnge_dev *bd)
{
return bd->hw_resc.max_cp_rings;
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h
index 0d6213b27580..b62a634669f6 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_resc.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_resc.h
@@ -74,6 +74,7 @@ void bnge_net_uninit_dflt_config(struct bnge_dev *bd);
void bnge_aux_init_dflt_config(struct bnge_dev *bd);
u32 bnge_get_rxfh_indir_size(struct bnge_dev *bd);
int bnge_cal_nr_rss_ctxs(u16 rx_rings);
+bool bnge_aux_has_enough_resources(struct bnge_dev *bd);
static inline u32
bnge_adjust_pow_two(u32 total_ent, u16 ent_per_blk)
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index cb1011f6fd30..805daae9dd36 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6444,7 +6444,6 @@ bnx2_reset_task(struct work_struct *work)
if (!(pcicmd & PCI_COMMAND_MEMORY)) {
/* in case PCI block has reset */
pci_restore_state(bp->pdev);
- pci_save_state(bp->pdev);
}
rc = bnx2_init_nic(bp, 1);
if (rc) {
@@ -8718,7 +8717,6 @@ static pci_ers_result_t bnx2_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (netif_running(dev))
err = bnx2_init_nic(bp, 1);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index aca4267babc8..6a1cc2032bf3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -14216,7 +14216,6 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (netif_running(dev))
bnx2x_set_power_state(bp, PCI_D0);
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index e21f7c6a6de7..75f66587983d 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -18337,7 +18337,6 @@ static pci_ers_result_t tg3_io_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!netdev || !netif_running(netdev)) {
rc = PCI_ERS_RESULT_RECOVERED;
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index f92a3550e480..3b1321c8ed14 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -2933,7 +2933,6 @@ static int t3_reenable_adapter(struct adapter *adapter)
}
pci_set_master(adapter->pdev);
pci_restore_state(adapter->pdev);
- pci_save_state(adapter->pdev);
/* Free sge resources */
t3_free_sge_resources(adapter);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 66b8854e059f..043733c5c812 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5458,7 +5458,6 @@ static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev)
if (!adap) {
pci_restore_state(pdev);
- pci_save_state(pdev);
return PCI_ERS_RESULT_RECOVERED;
}
@@ -5473,7 +5472,6 @@ static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (t4_wait_dev_ready(adap->regs) < 0)
return PCI_ERS_RESULT_DISCONNECT;
diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
index e11495b7ee98..7234618e8e81 100644
--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
@@ -160,7 +160,6 @@ static pci_ers_result_t hbg_pci_err_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
hbg_err_reset(priv);
return PCI_ERS_RESULT_RECOVERED;
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 116f3c92b5bc..ddbe2f7d8112 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -7195,7 +7195,6 @@ static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev)
"Cannot re-enable PCI device after reset.\n");
result = PCI_ERS_RESULT_DISCONNECT;
} else {
- pdev->state_saved = true;
pci_restore_state(pdev);
pci_set_master(pdev);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index ae5fe34659cf..d75b8a50413d 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -2423,12 +2423,6 @@ static pci_ers_result_t fm10k_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
-
- /* After second error pci->state_saved is false, this
- * resets it so EEH doesn't break.
- */
- pci_save_state(pdev);
-
pci_wake_from_d3(pdev, false);
result = PCI_ERS_RESULT_RECOVERED;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 50be0a60ae13..d8192aa23254 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -16455,7 +16455,6 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
reg = rd32(&pf->hw, I40E_GLGEN_RTRIG);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 2533876f1a2f..4bb68e7a00f5 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -5653,7 +5653,6 @@ static int ice_resume(struct device *dev)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!pci_device_is_present(pdev))
return -ENODEV;
@@ -5753,7 +5752,6 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
/* Check for life */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 85f9589cc568..dbea37269d2c 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -9599,7 +9599,6 @@ static int __igb_resume(struct device *dev, bool rpm)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!pci_device_is_present(pdev))
return -ENODEV;
@@ -9754,7 +9753,6 @@ static pci_ers_result_t igb_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_enable_wake(pdev, PCI_D3hot, 0);
pci_enable_wake(pdev, PCI_D3cold, 0);
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 728d7ca5338b..7aafa60ba0c8 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -7530,7 +7530,6 @@ static int __igc_resume(struct device *dev, bool rpm)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (!pci_device_is_present(pdev))
return -ENODEV;
@@ -7667,7 +7666,6 @@ static pci_ers_result_t igc_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_enable_wake(pdev, PCI_D3hot, 0);
pci_enable_wake(pdev, PCI_D3cold, 0);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4af3b3e71ff1..034618e79169 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -12298,7 +12298,6 @@ static pci_ers_result_t ixgbe_io_slot_reset(struct pci_dev *pdev)
adapter->hw.hw_addr = adapter->io_addr;
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
pci_wake_from_d3(pdev, false);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 2de226951e19..4293f8e33f44 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -4368,7 +4368,6 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
return PCI_ERS_RESULT_RECOVERED;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 024339ce41f1..1ab569ce3fcf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -2137,7 +2137,6 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
err = wait_vital(pdev);
if (err) {
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index 861d98099c44..9240673c7533 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -581,7 +581,6 @@ static pci_ers_result_t fbnic_err_slot_reset(struct pci_dev *pdev)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
if (pci_enable_device_mem(pdev)) {
dev_err(&pdev->dev,
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index 9d70b51ca91d..e4c542fc6c2b 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -3915,7 +3915,6 @@ static int lan743x_pm_resume(struct device *dev)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- pci_save_state(pdev);
/* Restore HW_CFG that was saved during pm suspend */
if (adapter->is_pci11x1x)
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index e611ff7fa3fa..7be30a8df268 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -3416,10 +3416,6 @@ static void myri10ge_watchdog(struct work_struct *work)
* nic was resumed from power saving mode.
*/
pci_restore_state(mgp->pdev);
-
- /* save state again for accounting reasons */
- pci_save_state(mgp->pdev);
-
} else {
/* if we get back -1's from our slot, perhaps somebody
* powered off our card. Don't try to reset it in
diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
index 5026b0263d43..1e55ccb4822b 100644
--- a/drivers/net/ethernet/neterion/s2io.c
+++ b/drivers/net/ethernet/neterion/s2io.c
@@ -3425,7 +3425,6 @@ static void s2io_reset(struct s2io_nic *sp)
/* Restore the PCI state saved during initialization. */
pci_restore_state(sp->pdev);
- pci_save_state(sp->pdev);
pci_read_config_word(sp->pdev, 0x2, &val16);
if (check_pci_device_id(val16) != (u16)PCI_ANY_ID)
break;
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 67647f1880fb..f3c81c892786 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -4,7 +4,7 @@
obj-$(CONFIG_PCI) += access.o bus.o probe.o host-bridge.o \
remove.o pci.o pci-driver.o search.o \
- rom.o setup-res.o irq.o vpd.o \
+ rebar.o rom.o setup-res.o irq.o vpd.o \
setup-bus.o vc.o mmap.o devres.o
obj-$(CONFIG_PCI) += msi/
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index f26aec6ff588..9daf13ed3714 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -357,6 +357,9 @@ void pci_bus_add_device(struct pci_dev *dev)
pci_proc_attach_device(dev);
pci_bridge_d3_update(dev);
+ /* Save config space for error recoverability */
+ pci_save_state(dev);
+
/*
* If the PCI device is associated with a pwrctrl device with a
* power supply, create a device link between the PCI device and
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index 41748d083b93..c254d2b8bf17 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -146,7 +146,7 @@ config PCIE_HISI_ERR
config PCI_IXP4XX
bool "Intel IXP4xx PCI controller"
- depends on ARM && OF
+ depends on OF
depends on ARCH_IXP4XX || COMPILE_TEST
default ARCH_IXP4XX
help
@@ -259,12 +259,20 @@ config PCIE_RCAR_EP
config PCI_RCAR_GEN2
bool "Renesas R-Car Gen2 Internal PCI controller"
- depends on ARCH_RENESAS || COMPILE_TEST
- depends on ARM
+ depends on (ARCH_RENESAS && ARM) || COMPILE_TEST
help
Say Y here if you want internal PCI support on R-Car Gen2 SoC.
- There are 3 internal PCI controllers available with a single
- built-in EHCI/OHCI host controller present on each one.
+ Each internal PCI controller contains a single built-in EHCI/OHCI
+ host controller.
+
+config PCIE_RENESAS_RZG3S_HOST
+ bool "Renesas RZ/G3S PCIe host controller"
+ depends on ARCH_RENESAS || COMPILE_TEST
+ select MFD_SYSCON
+ select IRQ_MSI_LIB
+ help
+ Say Y here if you want PCIe host controller support on Renesas RZ/G3S
+ SoC.
config PCIE_ROCKCHIP
bool
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index 038ccbd9e3ba..229929a945c2 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o
obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar.o pcie-rcar-host.o
obj-$(CONFIG_PCIE_RCAR_EP) += pcie-rcar.o pcie-rcar-ep.o
+obj-$(CONFIG_PCIE_RENESAS_RZG3S_HOST) += pcie-rzg3s-host.o
obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o
obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o
obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o
diff --git a/drivers/pci/controller/cadence/Kconfig b/drivers/pci/controller/cadence/Kconfig
index 02a639e55fd8..9e651d545973 100644
--- a/drivers/pci/controller/cadence/Kconfig
+++ b/drivers/pci/controller/cadence/Kconfig
@@ -19,10 +19,10 @@ config PCIE_CADENCE_EP
select PCIE_CADENCE
config PCIE_CADENCE_PLAT
- bool
+ tristate
config PCIE_CADENCE_PLAT_HOST
- bool "Cadence platform PCIe controller (host mode)"
+ tristate "Cadence platform PCIe controller (host mode)"
depends on OF
select PCIE_CADENCE_HOST
select PCIE_CADENCE_PLAT
@@ -32,7 +32,7 @@ config PCIE_CADENCE_PLAT_HOST
vendors SoCs.
config PCIE_CADENCE_PLAT_EP
- bool "Cadence platform PCIe controller (endpoint mode)"
+ tristate "Cadence platform PCIe controller (endpoint mode)"
depends on OF
depends on PCI_ENDPOINT
select PCIE_CADENCE_EP
@@ -42,6 +42,21 @@ config PCIE_CADENCE_PLAT_EP
endpoint mode. This PCIe controller may be embedded into many
different vendors SoCs.
+config PCI_SKY1_HOST
+ tristate "CIX SKY1 PCIe controller (host mode)"
+ depends on OF && (ARCH_CIX || COMPILE_TEST)
+ select PCIE_CADENCE_HOST
+ select PCI_ECAM
+ help
+ Say Y here if you want to support the CIX SKY1 PCIe platform
+ controller in host mode. CIX SKY1 PCIe controller uses Cadence
+ HPA (High Performance Architecture IP [Second generation of
+ Cadence PCIe IP])
+
+ This driver requires Cadence PCIe core infrastructure
+ (PCIE_CADENCE_HOST) and hardware platform adaptation layer
+ to function.
+
config PCIE_SG2042_HOST
tristate "Sophgo SG2042 PCIe controller (host mode)"
depends on OF && (ARCH_SOPHGO || COMPILE_TEST)
diff --git a/drivers/pci/controller/cadence/Makefile b/drivers/pci/controller/cadence/Makefile
index 5e23f8539ecc..b8ec1cecfaa8 100644
--- a/drivers/pci/controller/cadence/Makefile
+++ b/drivers/pci/controller/cadence/Makefile
@@ -1,7 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_PCIE_CADENCE) += pcie-cadence.o
-obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
-obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
+pcie-cadence-mod-y := pcie-cadence-hpa.o pcie-cadence.o
+pcie-cadence-host-mod-y := pcie-cadence-host-common.o pcie-cadence-host.o pcie-cadence-host-hpa.o
+pcie-cadence-ep-mod-y := pcie-cadence-ep.o
+
+obj-$(CONFIG_PCIE_CADENCE) = pcie-cadence-mod.o
+obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host-mod.o
+obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep-mod.o
obj-$(CONFIG_PCIE_CADENCE_PLAT) += pcie-cadence-plat.o
obj-$(CONFIG_PCI_J721E) += pci-j721e.o
obj-$(CONFIG_PCIE_SG2042_HOST) += pcie-sg2042.o
+obj-$(CONFIG_PCI_SKY1_HOST) += pci-sky1.o
diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c
index 5bc5ab20aa6d..ecd1b0312400 100644
--- a/drivers/pci/controller/cadence/pci-j721e.c
+++ b/drivers/pci/controller/cadence/pci-j721e.c
@@ -477,9 +477,7 @@ static int j721e_pcie_probe(struct platform_device *pdev)
struct j721e_pcie *pcie;
struct cdns_pcie_rc *rc = NULL;
struct cdns_pcie_ep *ep = NULL;
- struct gpio_desc *gpiod;
void __iomem *base;
- struct clk *clk;
u32 num_lanes;
u32 mode;
int ret;
@@ -590,12 +588,12 @@ static int j721e_pcie_probe(struct platform_device *pdev)
switch (mode) {
case PCI_MODE_RC:
- gpiod = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
- if (IS_ERR(gpiod)) {
- ret = dev_err_probe(dev, PTR_ERR(gpiod), "Failed to get reset GPIO\n");
+ pcie->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
+ if (IS_ERR(pcie->reset_gpio)) {
+ ret = dev_err_probe(dev, PTR_ERR(pcie->reset_gpio),
+ "Failed to get reset GPIO\n");
goto err_get_sync;
}
- pcie->reset_gpio = gpiod;
ret = cdns_pcie_init_phy(dev, cdns_pcie);
if (ret) {
@@ -603,19 +601,13 @@ static int j721e_pcie_probe(struct platform_device *pdev)
goto err_get_sync;
}
- clk = devm_clk_get_optional(dev, "pcie_refclk");
- if (IS_ERR(clk)) {
- ret = dev_err_probe(dev, PTR_ERR(clk), "failed to get pcie_refclk\n");
+ pcie->refclk = devm_clk_get_optional_enabled(dev, "pcie_refclk");
+ if (IS_ERR(pcie->refclk)) {
+ ret = dev_err_probe(dev, PTR_ERR(pcie->refclk),
+ "failed to enable pcie_refclk\n");
goto err_pcie_setup;
}
- ret = clk_prepare_enable(clk);
- if (ret) {
- dev_err_probe(dev, ret, "failed to enable pcie_refclk\n");
- goto err_pcie_setup;
- }
- pcie->refclk = clk;
-
/*
* Section 2.2 of the PCI Express Card Electromechanical
* Specification (Revision 5.1) mandates that the deassertion
@@ -623,16 +615,14 @@ static int j721e_pcie_probe(struct platform_device *pdev)
* This shall ensure that the power and the reference clock
* are stable.
*/
- if (gpiod) {
+ if (pcie->reset_gpio) {
msleep(PCIE_T_PVPERL_MS);
- gpiod_set_value_cansleep(gpiod, 1);
+ gpiod_set_value_cansleep(pcie->reset_gpio, 1);
}
ret = cdns_pcie_host_setup(rc);
- if (ret < 0) {
- clk_disable_unprepare(pcie->refclk);
+ if (ret < 0)
goto err_pcie_setup;
- }
break;
case PCI_MODE_EP:
@@ -679,7 +669,6 @@ static void j721e_pcie_remove(struct platform_device *pdev)
gpiod_set_value_cansleep(pcie->reset_gpio, 0);
- clk_disable_unprepare(pcie->refclk);
cdns_pcie_disable_phy(cdns_pcie);
j721e_pcie_disable_link_irq(pcie);
pm_runtime_put(dev);
diff --git a/drivers/pci/controller/cadence/pci-sky1.c b/drivers/pci/controller/cadence/pci-sky1.c
new file mode 100644
index 000000000000..d8c216dc120d
--- /dev/null
+++ b/drivers/pci/controller/cadence/pci-sky1.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe controller driver for CIX's sky1 SoCs
+ *
+ * Copyright 2025 Cix Technology Group Co., Ltd.
+ * Author: Hans Zhang <hans.zhang@cixtech.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/pci.h>
+#include <linux/pci-ecam.h>
+#include <linux/pci_ids.h>
+
+#include "pcie-cadence.h"
+#include "pcie-cadence-host-common.h"
+
+#define PCI_VENDOR_ID_CIX 0x1f6c
+#define PCI_DEVICE_ID_CIX_SKY1 0x0001
+
+#define STRAP_REG(n) ((n) * 0x04)
+#define STATUS_REG(n) ((n) * 0x04)
+#define LINK_TRAINING_ENABLE BIT(0)
+#define LINK_COMPLETE BIT(0)
+
+#define SKY1_IP_REG_BANK 0x1000
+#define SKY1_IP_CFG_CTRL_REG_BANK 0x4c00
+#define SKY1_IP_AXI_MASTER_COMMON 0xf000
+#define SKY1_AXI_SLAVE 0x9000
+#define SKY1_AXI_MASTER 0xb000
+#define SKY1_AXI_HLS_REGISTERS 0xc000
+#define SKY1_AXI_RAS_REGISTERS 0xe000
+#define SKY1_DTI_REGISTERS 0xd000
+
+#define IP_REG_I_DBG_STS_0 0x420
+
+struct sky1_pcie {
+ struct cdns_pcie *cdns_pcie;
+ struct cdns_pcie_rc *cdns_pcie_rc;
+
+ struct resource *cfg_res;
+ struct resource *msg_res;
+ struct pci_config_window *cfg;
+ void __iomem *strap_base;
+ void __iomem *status_base;
+ void __iomem *reg_base;
+ void __iomem *cfg_base;
+ void __iomem *msg_base;
+};
+
+static int sky1_pcie_resource_get(struct platform_device *pdev,
+ struct sky1_pcie *pcie)
+{
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ void __iomem *base;
+
+ base = devm_platform_ioremap_resource_byname(pdev, "reg");
+ if (IS_ERR(base))
+ return dev_err_probe(dev, PTR_ERR(base),
+ "unable to find \"reg\" registers\n");
+ pcie->reg_base = base;
+
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
+ if (!res)
+ return dev_err_probe(dev, -ENODEV, "unable to get \"cfg\" resource\n");
+ pcie->cfg_res = res;
+
+ base = devm_platform_ioremap_resource_byname(pdev, "rcsu_strap");
+ if (IS_ERR(base))
+ return dev_err_probe(dev, PTR_ERR(base),
+ "unable to find \"rcsu_strap\" registers\n");
+ pcie->strap_base = base;
+
+ base = devm_platform_ioremap_resource_byname(pdev, "rcsu_status");
+ if (IS_ERR(base))
+ return dev_err_probe(dev, PTR_ERR(base),
+ "unable to find \"rcsu_status\" registers\n");
+ pcie->status_base = base;
+
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "msg");
+ if (!res)
+ return dev_err_probe(dev, -ENODEV, "unable to get \"msg\" resource\n");
+ pcie->msg_res = res;
+ pcie->msg_base = devm_ioremap_resource(dev, res);
+ if (IS_ERR(pcie->msg_base)) {
+ return dev_err_probe(dev, PTR_ERR(pcie->msg_base),
+ "unable to ioremap msg resource\n");
+ }
+
+ return 0;
+}
+
+static int sky1_pcie_start_link(struct cdns_pcie *cdns_pcie)
+{
+ struct sky1_pcie *pcie = dev_get_drvdata(cdns_pcie->dev);
+ u32 val;
+
+ val = readl(pcie->strap_base + STRAP_REG(1));
+ val |= LINK_TRAINING_ENABLE;
+ writel(val, pcie->strap_base + STRAP_REG(1));
+
+ return 0;
+}
+
+static void sky1_pcie_stop_link(struct cdns_pcie *cdns_pcie)
+{
+ struct sky1_pcie *pcie = dev_get_drvdata(cdns_pcie->dev);
+ u32 val;
+
+ val = readl(pcie->strap_base + STRAP_REG(1));
+ val &= ~LINK_TRAINING_ENABLE;
+ writel(val, pcie->strap_base + STRAP_REG(1));
+}
+
+static bool sky1_pcie_link_up(struct cdns_pcie *cdns_pcie)
+{
+ u32 val;
+
+ val = cdns_pcie_hpa_readl(cdns_pcie, REG_BANK_IP_REG,
+ IP_REG_I_DBG_STS_0);
+ return val & LINK_COMPLETE;
+}
+
+static const struct cdns_pcie_ops sky1_pcie_ops = {
+ .start_link = sky1_pcie_start_link,
+ .stop_link = sky1_pcie_stop_link,
+ .link_up = sky1_pcie_link_up,
+};
+
+static int sky1_pcie_probe(struct platform_device *pdev)
+{
+ struct cdns_plat_pcie_of_data *reg_off;
+ struct device *dev = &pdev->dev;
+ struct pci_host_bridge *bridge;
+ struct cdns_pcie *cdns_pcie;
+ struct resource_entry *bus;
+ struct cdns_pcie_rc *rc;
+ struct sky1_pcie *pcie;
+ int ret;
+
+ pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
+ if (!pcie)
+ return -ENOMEM;
+
+ bridge = devm_pci_alloc_host_bridge(dev, sizeof(*rc));
+ if (!bridge)
+ return -ENOMEM;
+
+ ret = sky1_pcie_resource_get(pdev, pcie);
+ if (ret < 0)
+ return ret;
+
+ bus = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (!bus)
+ return -ENODEV;
+
+ pcie->cfg = pci_ecam_create(dev, pcie->cfg_res, bus->res,
+ &pci_generic_ecam_ops);
+ if (IS_ERR(pcie->cfg))
+ return PTR_ERR(pcie->cfg);
+
+ bridge->ops = (struct pci_ops *)&pci_generic_ecam_ops.pci_ops;
+ rc = pci_host_bridge_priv(bridge);
+ rc->ecam_supported = 1;
+ rc->cfg_base = pcie->cfg->win;
+ rc->cfg_res = &pcie->cfg->res;
+
+ cdns_pcie = &rc->pcie;
+ cdns_pcie->dev = dev;
+ cdns_pcie->ops = &sky1_pcie_ops;
+ cdns_pcie->reg_base = pcie->reg_base;
+ cdns_pcie->msg_res = pcie->msg_res;
+ cdns_pcie->is_rc = 1;
+
+ reg_off = devm_kzalloc(dev, sizeof(*reg_off), GFP_KERNEL);
+ if (!reg_off)
+ return -ENOMEM;
+
+ reg_off->ip_reg_bank_offset = SKY1_IP_REG_BANK;
+ reg_off->ip_cfg_ctrl_reg_offset = SKY1_IP_CFG_CTRL_REG_BANK;
+ reg_off->axi_mstr_common_offset = SKY1_IP_AXI_MASTER_COMMON;
+ reg_off->axi_slave_offset = SKY1_AXI_SLAVE;
+ reg_off->axi_master_offset = SKY1_AXI_MASTER;
+ reg_off->axi_hls_offset = SKY1_AXI_HLS_REGISTERS;
+ reg_off->axi_ras_offset = SKY1_AXI_RAS_REGISTERS;
+ reg_off->axi_dti_offset = SKY1_DTI_REGISTERS;
+ cdns_pcie->cdns_pcie_reg_offsets = reg_off;
+
+ pcie->cdns_pcie = cdns_pcie;
+ pcie->cdns_pcie_rc = rc;
+ pcie->cfg_base = rc->cfg_base;
+ bridge->sysdata = pcie->cfg;
+
+ rc->vendor_id = PCI_VENDOR_ID_CIX;
+ rc->device_id = PCI_DEVICE_ID_CIX_SKY1;
+ rc->no_inbound_map = 1;
+
+ dev_set_drvdata(dev, pcie);
+
+ ret = cdns_pcie_hpa_host_setup(rc);
+ if (ret < 0) {
+ pci_ecam_free(pcie->cfg);
+ return ret;
+ }
+
+ return 0;
+}
+
+static const struct of_device_id of_sky1_pcie_match[] = {
+ { .compatible = "cix,sky1-pcie-host", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, of_sky1_pcie_match);
+
+static void sky1_pcie_remove(struct platform_device *pdev)
+{
+ struct sky1_pcie *pcie = platform_get_drvdata(pdev);
+
+ pci_ecam_free(pcie->cfg);
+}
+
+static struct platform_driver sky1_pcie_driver = {
+ .probe = sky1_pcie_probe,
+ .remove = sky1_pcie_remove,
+ .driver = {
+ .name = "sky1-pcie",
+ .of_match_table = of_sky1_pcie_match,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+};
+module_platform_driver(sky1_pcie_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PCIe controller driver for CIX's sky1 SoCs");
+MODULE_AUTHOR("Hans Zhang <hans.zhang@cixtech.com>");
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-common.c b/drivers/pci/controller/cadence/pcie-cadence-host-common.c
new file mode 100644
index 000000000000..15415d7f35ee
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-host-common.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cadence PCIe host controller library.
+ *
+ * Copyright (c) 2017 Cadence
+ * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com>
+ */
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/list_sort.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/platform_device.h>
+
+#include "pcie-cadence.h"
+#include "pcie-cadence-host-common.h"
+
+#define LINK_RETRAIN_TIMEOUT HZ
+
+u64 bar_max_size[] = {
+ [RP_BAR0] = _ULL(128 * SZ_2G),
+ [RP_BAR1] = SZ_2G,
+ [RP_NO_BAR] = _BITULL(63),
+};
+EXPORT_SYMBOL_GPL(bar_max_size);
+
+int cdns_pcie_host_training_complete(struct cdns_pcie *pcie)
+{
+ u32 pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
+ unsigned long end_jiffies;
+ u16 lnk_stat;
+
+ /* Wait for link training to complete. Exit after timeout. */
+ end_jiffies = jiffies + LINK_RETRAIN_TIMEOUT;
+ do {
+ lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
+ if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
+ break;
+ usleep_range(0, 1000);
+ } while (time_before(jiffies, end_jiffies));
+
+ if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
+ return 0;
+
+ return -ETIMEDOUT;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_training_complete);
+
+int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie,
+ cdns_pcie_linkup_func pcie_link_up)
+{
+ struct device *dev = pcie->dev;
+ int retries;
+
+ /* Check if the link is up or not */
+ for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
+ if (pcie_link_up(pcie)) {
+ dev_info(dev, "Link up\n");
+ return 0;
+ }
+ usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
+ }
+
+ return -ETIMEDOUT;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_wait_for_link);
+
+int cdns_pcie_retrain(struct cdns_pcie *pcie,
+ cdns_pcie_linkup_func pcie_link_up)
+{
+ u32 lnk_cap_sls, pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
+ u16 lnk_stat, lnk_ctl;
+ int ret = 0;
+
+ /*
+ * Set retrain bit if current speed is 2.5 GB/s,
+ * but the PCIe root port support is > 2.5 GB/s.
+ */
+
+ lnk_cap_sls = cdns_pcie_readl(pcie, (CDNS_PCIE_RP_BASE + pcie_cap_off +
+ PCI_EXP_LNKCAP));
+ if ((lnk_cap_sls & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB)
+ return ret;
+
+ lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
+ if ((lnk_stat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) {
+ lnk_ctl = cdns_pcie_rp_readw(pcie,
+ pcie_cap_off + PCI_EXP_LNKCTL);
+ lnk_ctl |= PCI_EXP_LNKCTL_RL;
+ cdns_pcie_rp_writew(pcie, pcie_cap_off + PCI_EXP_LNKCTL,
+ lnk_ctl);
+
+ ret = cdns_pcie_host_training_complete(pcie);
+ if (ret)
+ return ret;
+
+ ret = cdns_pcie_host_wait_for_link(pcie, pcie_link_up);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_retrain);
+
+int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc,
+ cdns_pcie_linkup_func pcie_link_up)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ int ret;
+
+ ret = cdns_pcie_host_wait_for_link(pcie, pcie_link_up);
+
+ /*
+ * Retrain link for Gen2 training defect
+ * if quirk flag is set.
+ */
+ if (!ret && rc->quirk_retrain_flag)
+ ret = cdns_pcie_retrain(pcie, pcie_link_up);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_start_link);
+
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size)
+{
+ enum cdns_pcie_rp_bar bar, sel_bar;
+
+ sel_bar = RP_BAR_UNDEFINED;
+ for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
+ if (!rc->avail_ib_bar[bar])
+ continue;
+
+ if (size <= bar_max_size[bar]) {
+ if (sel_bar == RP_BAR_UNDEFINED) {
+ sel_bar = bar;
+ continue;
+ }
+
+ if (bar_max_size[bar] < bar_max_size[sel_bar])
+ sel_bar = bar;
+ }
+ }
+
+ return sel_bar;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_find_min_bar);
+
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size)
+{
+ enum cdns_pcie_rp_bar bar, sel_bar;
+
+ sel_bar = RP_BAR_UNDEFINED;
+ for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
+ if (!rc->avail_ib_bar[bar])
+ continue;
+
+ if (size >= bar_max_size[bar]) {
+ if (sel_bar == RP_BAR_UNDEFINED) {
+ sel_bar = bar;
+ continue;
+ }
+
+ if (bar_max_size[bar] > bar_max_size[sel_bar])
+ sel_bar = bar;
+ }
+ }
+
+ return sel_bar;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_find_max_bar);
+
+int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct resource_entry *entry1, *entry2;
+
+ entry1 = container_of(a, struct resource_entry, node);
+ entry2 = container_of(b, struct resource_entry, node);
+
+ return resource_size(entry2->res) - resource_size(entry1->res);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_host_dma_ranges_cmp);
+
+int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc,
+ struct resource_entry *entry,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct device *dev = pcie->dev;
+ u64 cpu_addr, size, winsize;
+ enum cdns_pcie_rp_bar bar;
+ unsigned long flags;
+ int ret;
+
+ cpu_addr = entry->res->start;
+ flags = entry->res->flags;
+ size = resource_size(entry->res);
+
+ while (size > 0) {
+ /*
+ * Try to find a minimum BAR whose size is greater than
+ * or equal to the remaining resource_entry size. This will
+ * fail if the size of each of the available BARs is less than
+ * the remaining resource_entry size.
+ *
+ * If a minimum BAR is found, IB ATU will be configured and
+ * exited.
+ */
+ bar = cdns_pcie_host_find_min_bar(rc, size);
+ if (bar != RP_BAR_UNDEFINED) {
+ ret = pci_host_ib_config(rc, bar, cpu_addr, size, flags);
+ if (ret)
+ dev_err(dev, "IB BAR: %d config failed\n", bar);
+ return ret;
+ }
+
+ /*
+ * If the control reaches here, it would mean the remaining
+ * resource_entry size cannot be fitted in a single BAR. So we
+ * find a maximum BAR whose size is less than or equal to the
+ * remaining resource_entry size and split the resource entry
+ * so that part of resource entry is fitted inside the maximum
+ * BAR. The remaining size would be fitted during the next
+ * iteration of the loop.
+ *
+ * If a maximum BAR is not found, there is no way we can fit
+ * this resource_entry, so we error out.
+ */
+ bar = cdns_pcie_host_find_max_bar(rc, size);
+ if (bar == RP_BAR_UNDEFINED) {
+ dev_err(dev, "No free BAR to map cpu_addr %llx\n",
+ cpu_addr);
+ return -EINVAL;
+ }
+
+ winsize = bar_max_size[bar];
+ ret = pci_host_ib_config(rc, bar, cpu_addr, winsize, flags);
+ if (ret) {
+ dev_err(dev, "IB BAR: %d config failed\n", bar);
+ return ret;
+ }
+
+ size -= winsize;
+ cpu_addr += winsize;
+ }
+
+ return 0;
+}
+
+int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct device *dev = pcie->dev;
+ struct device_node *np = dev->of_node;
+ struct pci_host_bridge *bridge;
+ struct resource_entry *entry;
+ u32 no_bar_nbits = 32;
+ int err;
+
+ bridge = pci_host_bridge_from_priv(rc);
+ if (!bridge)
+ return -ENOMEM;
+
+ if (list_empty(&bridge->dma_ranges)) {
+ of_property_read_u32(np, "cdns,no-bar-match-nbits",
+ &no_bar_nbits);
+ err = pci_host_ib_config(rc, RP_NO_BAR, 0x0, (u64)1 << no_bar_nbits, 0);
+ if (err)
+ dev_err(dev, "IB BAR: %d config failed\n", RP_NO_BAR);
+ return err;
+ }
+
+ list_sort(NULL, &bridge->dma_ranges, cdns_pcie_host_dma_ranges_cmp);
+
+ resource_list_for_each_entry(entry, &bridge->dma_ranges) {
+ err = cdns_pcie_host_bar_config(rc, entry, pci_host_ib_config);
+ if (err) {
+ dev_err(dev, "Fail to configure IB using dma-ranges\n");
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe host controller driver");
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-common.h b/drivers/pci/controller/cadence/pcie-cadence-host-common.h
new file mode 100644
index 000000000000..fe7d4202a8b6
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-host-common.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cadence PCIe Host controller driver.
+ *
+ * Copyright (c) 2017 Cadence
+ * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com>
+ */
+#ifndef _PCIE_CADENCE_HOST_COMMON_H
+#define _PCIE_CADENCE_HOST_COMMON_H
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+extern u64 bar_max_size[];
+
+typedef int (*cdns_pcie_host_bar_ib_cfg)(struct cdns_pcie_rc *,
+ enum cdns_pcie_rp_bar,
+ u64,
+ u64,
+ unsigned long);
+typedef bool (*cdns_pcie_linkup_func)(struct cdns_pcie *);
+
+int cdns_pcie_host_training_complete(struct cdns_pcie *pcie);
+int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie,
+ cdns_pcie_linkup_func pcie_link_up);
+int cdns_pcie_retrain(struct cdns_pcie *pcie, cdns_pcie_linkup_func pcie_linkup_func);
+int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc,
+ cdns_pcie_linkup_func pcie_link_up);
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size);
+enum cdns_pcie_rp_bar
+cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size);
+int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b);
+int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
+ enum cdns_pcie_rp_bar bar,
+ u64 cpu_addr,
+ u64 size,
+ unsigned long flags);
+int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc,
+ struct resource_entry *entry,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config);
+int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc,
+ cdns_pcie_host_bar_ib_cfg pci_host_ib_config);
+
+#endif /* _PCIE_CADENCE_HOST_COMMON_H */
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c b/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c
new file mode 100644
index 000000000000..0f540bed58e8
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-host-hpa.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cadence PCIe host controller driver.
+ *
+ * Copyright (c) 2024, Cadence Design Systems
+ * Author: Manikandan K Pillai <mpillai@cadence.com>
+ */
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/list_sort.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+
+#include "pcie-cadence.h"
+#include "pcie-cadence-host-common.h"
+
+static u8 bar_aperture_mask[] = {
+ [RP_BAR0] = 0x3F,
+ [RP_BAR1] = 0x3F,
+};
+
+void __iomem *cdns_pci_hpa_map_bus(struct pci_bus *bus, unsigned int devfn,
+ int where)
+{
+ struct pci_host_bridge *bridge = pci_find_host_bridge(bus);
+ struct cdns_pcie_rc *rc = pci_host_bridge_priv(bridge);
+ struct cdns_pcie *pcie = &rc->pcie;
+ unsigned int busn = bus->number;
+ u32 addr0, desc0, desc1, ctrl0;
+ u32 regval;
+
+ if (pci_is_root_bus(bus)) {
+ /*
+ * Only the root port (devfn == 0) is connected to this bus.
+ * All other PCI devices are behind some bridge hence on another
+ * bus.
+ */
+ if (devfn)
+ return NULL;
+
+ return pcie->reg_base + (where & 0xfff);
+ }
+
+ /* Clear AXI link-down status */
+ regval = cdns_pcie_hpa_readl(pcie, REG_BANK_AXI_SLAVE, CDNS_PCIE_HPA_AT_LINKDOWN);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE, CDNS_PCIE_HPA_AT_LINKDOWN,
+ (regval & ~GENMASK(0, 0)));
+
+ /* Update Output registers for AXI region 0 */
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(12) |
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) |
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS(busn);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(0), addr0);
+
+ desc1 = cdns_pcie_hpa_readl(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0));
+ desc1 &= ~CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK;
+ desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0);
+ ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS |
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN;
+
+ if (busn == bridge->busnr + 1)
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0;
+ else
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1;
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(0), desc0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0), desc1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(0), ctrl0);
+
+ return rc->cfg_base + (where & 0xfff);
+}
+
+static struct pci_ops cdns_pcie_hpa_host_ops = {
+ .map_bus = cdns_pci_hpa_map_bus,
+ .read = pci_generic_config_read,
+ .write = pci_generic_config_write,
+};
+
+static void cdns_pcie_hpa_host_enable_ptm_response(struct cdns_pcie *pcie)
+{
+ u32 val;
+
+ val = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_LM_PTM_CTRL);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_LM_PTM_CTRL,
+ val | CDNS_PCIE_HPA_LM_PTM_CTRL_PTMRSEN);
+}
+
+static int cdns_pcie_hpa_host_bar_ib_config(struct cdns_pcie_rc *rc,
+ enum cdns_pcie_rp_bar bar,
+ u64 cpu_addr, u64 size,
+ unsigned long flags)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ u32 addr0, addr1, aperture, value;
+
+ if (!rc->avail_ib_bar[bar])
+ return -ENODEV;
+
+ rc->avail_ib_bar[bar] = false;
+
+ aperture = ilog2(size);
+ if (bar == RP_NO_BAR) {
+ addr0 = CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS(aperture) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+ } else {
+ addr0 = lower_32_bits(cpu_addr);
+ addr1 = upper_32_bits(cpu_addr);
+ }
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_MASTER,
+ CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0(bar), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_MASTER,
+ CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR1(bar), addr1);
+
+ if (bar == RP_NO_BAR)
+ bar = (enum cdns_pcie_rp_bar)BAR_0;
+
+ value = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_HPA_LM_RC_BAR_CFG);
+ value &= ~(HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) |
+ HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) |
+ HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) |
+ HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) |
+ HPA_LM_RC_BAR_CFG_APERTURE(bar, bar_aperture_mask[bar] + 7));
+ if (size + cpu_addr >= SZ_4G) {
+ value |= HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar);
+ if ((flags & IORESOURCE_PREFETCH))
+ value |= HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar);
+ } else {
+ value |= HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar);
+ if ((flags & IORESOURCE_PREFETCH))
+ value |= HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar);
+ }
+
+ value |= HPA_LM_RC_BAR_CFG_APERTURE(bar, aperture);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_HPA_LM_RC_BAR_CFG, value);
+
+ return 0;
+}
+
+static int cdns_pcie_hpa_host_init_root_port(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ u32 value, ctrl;
+
+ /*
+ * Set the root port BAR configuration register:
+ * - disable both BAR0 and BAR1
+ * - enable Prefetchable Memory Base and Limit registers in type 1
+ * config space (64 bits)
+ * - enable IO Base and Limit registers in type 1 config
+ * space (32 bits)
+ */
+
+ ctrl = CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED;
+ value = CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL(ctrl) |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL(ctrl) |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_ENABLE |
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_32BITS;
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG,
+ CDNS_PCIE_HPA_LM_RC_BAR_CFG, value);
+
+ if (rc->vendor_id != 0xffff)
+ cdns_pcie_hpa_rp_writew(pcie, PCI_VENDOR_ID, rc->vendor_id);
+
+ if (rc->device_id != 0xffff)
+ cdns_pcie_hpa_rp_writew(pcie, PCI_DEVICE_ID, rc->device_id);
+
+ cdns_pcie_hpa_rp_writeb(pcie, PCI_CLASS_REVISION, 0);
+ cdns_pcie_hpa_rp_writeb(pcie, PCI_CLASS_PROG, 0);
+ cdns_pcie_hpa_rp_writew(pcie, PCI_CLASS_DEVICE, PCI_CLASS_BRIDGE_PCI);
+
+ /* Enable bus mastering */
+ value = cdns_pcie_hpa_readl(pcie, REG_BANK_RP, PCI_COMMAND);
+ value |= (PCI_COMMAND_MEMORY | PCI_COMMAND_IO | PCI_COMMAND_MASTER);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_RP, PCI_COMMAND, value);
+ return 0;
+}
+
+static void cdns_pcie_hpa_create_region_for_cfg(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(rc);
+ struct resource *cfg_res = rc->cfg_res;
+ struct resource_entry *entry;
+ u64 cpu_addr = cfg_res->start;
+ u32 addr0, addr1, desc1;
+ int busnr = 0;
+
+ entry = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (entry)
+ busnr = entry->res->start;
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_TAG_MANAGEMENT, 0x01000000);
+ /*
+ * Reserve region 0 for PCI configure space accesses:
+ * OB_REGION_PCI_ADDR0 and OB_REGION_DESC0 are updated dynamically by
+ * cdns_pci_map_bus(), other region registers are set here once for all
+ */
+ desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(0), 0x0);
+ /* Type-1 CFG */
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(0), 0x05000000);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(0), desc1);
+
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(12) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(0), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(0), addr1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(0), 0x06000000);
+}
+
+static int cdns_pcie_hpa_host_init_address_translation(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(rc);
+ struct resource_entry *entry;
+ int r = 0, busnr = 0;
+
+ if (!rc->ecam_supported)
+ cdns_pcie_hpa_create_region_for_cfg(rc);
+
+ entry = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (entry)
+ busnr = entry->res->start;
+
+ r++;
+ if (pcie->msg_res) {
+ cdns_pcie_hpa_set_outbound_region_for_normal_msg(pcie, busnr, 0, r,
+ pcie->msg_res->start);
+
+ r++;
+ }
+ resource_list_for_each_entry(entry, &bridge->windows) {
+ struct resource *res = entry->res;
+ u64 pci_addr = res->start - entry->offset;
+
+ if (resource_type(res) == IORESOURCE_IO)
+ cdns_pcie_hpa_set_outbound_region(pcie, busnr, 0, r,
+ true,
+ pci_pio_to_address(res->start),
+ pci_addr,
+ resource_size(res));
+ else
+ cdns_pcie_hpa_set_outbound_region(pcie, busnr, 0, r,
+ false,
+ res->start,
+ pci_addr,
+ resource_size(res));
+
+ r++;
+ }
+
+ if (rc->no_inbound_map)
+ return 0;
+ else
+ return cdns_pcie_host_map_dma_ranges(rc, cdns_pcie_hpa_host_bar_ib_config);
+}
+
+static int cdns_pcie_hpa_host_init(struct cdns_pcie_rc *rc)
+{
+ int err;
+
+ err = cdns_pcie_hpa_host_init_root_port(rc);
+ if (err)
+ return err;
+
+ return cdns_pcie_hpa_host_init_address_translation(rc);
+}
+
+int cdns_pcie_hpa_host_link_setup(struct cdns_pcie_rc *rc)
+{
+ struct cdns_pcie *pcie = &rc->pcie;
+ struct device *dev = rc->pcie.dev;
+ int ret;
+
+ if (rc->quirk_detect_quiet_flag)
+ cdns_pcie_hpa_detect_quiet_min_delay_set(&rc->pcie);
+
+ cdns_pcie_hpa_host_enable_ptm_response(pcie);
+
+ ret = cdns_pcie_start_link(pcie);
+ if (ret) {
+ dev_err(dev, "Failed to start link\n");
+ return ret;
+ }
+
+ ret = cdns_pcie_host_wait_for_link(pcie, cdns_pcie_hpa_link_up);
+ if (ret)
+ dev_dbg(dev, "PCIe link never came up\n");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_host_link_setup);
+
+int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc)
+{
+ struct device *dev = rc->pcie.dev;
+ struct platform_device *pdev = to_platform_device(dev);
+ struct pci_host_bridge *bridge;
+ enum cdns_pcie_rp_bar bar;
+ struct cdns_pcie *pcie;
+ struct resource *res;
+ int ret;
+
+ bridge = pci_host_bridge_from_priv(rc);
+ if (!bridge)
+ return -ENOMEM;
+
+ pcie = &rc->pcie;
+ pcie->is_rc = true;
+
+ if (!pcie->reg_base) {
+ pcie->reg_base = devm_platform_ioremap_resource_byname(pdev, "reg");
+ if (IS_ERR(pcie->reg_base)) {
+ dev_err(dev, "missing \"reg\"\n");
+ return PTR_ERR(pcie->reg_base);
+ }
+ }
+
+ /* ECAM config space is remapped at glue layer */
+ if (!rc->cfg_base) {
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
+ rc->cfg_base = devm_pci_remap_cfg_resource(dev, res);
+ if (IS_ERR(rc->cfg_base))
+ return PTR_ERR(rc->cfg_base);
+ rc->cfg_res = res;
+ }
+
+ /* Put EROM Bar aperture to 0 */
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_CFG_CTRL_REG, CDNS_PCIE_EROM, 0x0);
+
+ ret = cdns_pcie_hpa_host_link_setup(rc);
+ if (ret)
+ return ret;
+
+ for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++)
+ rc->avail_ib_bar[bar] = true;
+
+ ret = cdns_pcie_hpa_host_init(rc);
+ if (ret)
+ return ret;
+
+ if (!bridge->ops)
+ bridge->ops = &cdns_pcie_hpa_host_ops;
+
+ return pci_host_probe(bridge);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_host_setup);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe host controller driver");
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host.c b/drivers/pci/controller/cadence/pcie-cadence-host.c
index fffd63d6665e..db3154c1eccb 100644
--- a/drivers/pci/controller/cadence/pcie-cadence-host.c
+++ b/drivers/pci/controller/cadence/pcie-cadence-host.c
@@ -12,14 +12,7 @@
#include <linux/platform_device.h>
#include "pcie-cadence.h"
-
-#define LINK_RETRAIN_TIMEOUT HZ
-
-static u64 bar_max_size[] = {
- [RP_BAR0] = _ULL(128 * SZ_2G),
- [RP_BAR1] = SZ_2G,
- [RP_NO_BAR] = _BITULL(63),
-};
+#include "pcie-cadence-host-common.h"
static u8 bar_aperture_mask[] = {
[RP_BAR0] = 0x1F,
@@ -81,77 +74,6 @@ static struct pci_ops cdns_pcie_host_ops = {
.write = pci_generic_config_write,
};
-static int cdns_pcie_host_training_complete(struct cdns_pcie *pcie)
-{
- u32 pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
- unsigned long end_jiffies;
- u16 lnk_stat;
-
- /* Wait for link training to complete. Exit after timeout. */
- end_jiffies = jiffies + LINK_RETRAIN_TIMEOUT;
- do {
- lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
- if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
- break;
- usleep_range(0, 1000);
- } while (time_before(jiffies, end_jiffies));
-
- if (!(lnk_stat & PCI_EXP_LNKSTA_LT))
- return 0;
-
- return -ETIMEDOUT;
-}
-
-static int cdns_pcie_host_wait_for_link(struct cdns_pcie *pcie)
-{
- struct device *dev = pcie->dev;
- int retries;
-
- /* Check if the link is up or not */
- for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
- if (cdns_pcie_link_up(pcie)) {
- dev_info(dev, "Link up\n");
- return 0;
- }
- usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
- }
-
- return -ETIMEDOUT;
-}
-
-static int cdns_pcie_retrain(struct cdns_pcie *pcie)
-{
- u32 lnk_cap_sls, pcie_cap_off = CDNS_PCIE_RP_CAP_OFFSET;
- u16 lnk_stat, lnk_ctl;
- int ret = 0;
-
- /*
- * Set retrain bit if current speed is 2.5 GB/s,
- * but the PCIe root port support is > 2.5 GB/s.
- */
-
- lnk_cap_sls = cdns_pcie_readl(pcie, (CDNS_PCIE_RP_BASE + pcie_cap_off +
- PCI_EXP_LNKCAP));
- if ((lnk_cap_sls & PCI_EXP_LNKCAP_SLS) <= PCI_EXP_LNKCAP_SLS_2_5GB)
- return ret;
-
- lnk_stat = cdns_pcie_rp_readw(pcie, pcie_cap_off + PCI_EXP_LNKSTA);
- if ((lnk_stat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) {
- lnk_ctl = cdns_pcie_rp_readw(pcie,
- pcie_cap_off + PCI_EXP_LNKCTL);
- lnk_ctl |= PCI_EXP_LNKCTL_RL;
- cdns_pcie_rp_writew(pcie, pcie_cap_off + PCI_EXP_LNKCTL,
- lnk_ctl);
-
- ret = cdns_pcie_host_training_complete(pcie);
- if (ret)
- return ret;
-
- ret = cdns_pcie_host_wait_for_link(pcie);
- }
- return ret;
-}
-
static void cdns_pcie_host_disable_ptm_response(struct cdns_pcie *pcie)
{
u32 val;
@@ -168,23 +90,6 @@ static void cdns_pcie_host_enable_ptm_response(struct cdns_pcie *pcie)
cdns_pcie_writel(pcie, CDNS_PCIE_LM_PTM_CTRL, val | CDNS_PCIE_LM_TPM_CTRL_PTMRSEN);
}
-static int cdns_pcie_host_start_link(struct cdns_pcie_rc *rc)
-{
- struct cdns_pcie *pcie = &rc->pcie;
- int ret;
-
- ret = cdns_pcie_host_wait_for_link(pcie);
-
- /*
- * Retrain link for Gen2 training defect
- * if quirk flag is set.
- */
- if (!ret && rc->quirk_retrain_flag)
- ret = cdns_pcie_retrain(pcie);
-
- return ret;
-}
-
static void cdns_pcie_host_deinit_root_port(struct cdns_pcie_rc *rc)
{
struct cdns_pcie *pcie = &rc->pcie;
@@ -245,10 +150,11 @@ static int cdns_pcie_host_init_root_port(struct cdns_pcie_rc *rc)
return 0;
}
-static int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
- enum cdns_pcie_rp_bar bar,
- u64 cpu_addr, u64 size,
- unsigned long flags)
+int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
+ enum cdns_pcie_rp_bar bar,
+ u64 cpu_addr,
+ u64 size,
+ unsigned long flags)
{
struct cdns_pcie *pcie = &rc->pcie;
u32 addr0, addr1, aperture, value;
@@ -290,137 +196,6 @@ static int cdns_pcie_host_bar_ib_config(struct cdns_pcie_rc *rc,
return 0;
}
-static enum cdns_pcie_rp_bar
-cdns_pcie_host_find_min_bar(struct cdns_pcie_rc *rc, u64 size)
-{
- enum cdns_pcie_rp_bar bar, sel_bar;
-
- sel_bar = RP_BAR_UNDEFINED;
- for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
- if (!rc->avail_ib_bar[bar])
- continue;
-
- if (size <= bar_max_size[bar]) {
- if (sel_bar == RP_BAR_UNDEFINED) {
- sel_bar = bar;
- continue;
- }
-
- if (bar_max_size[bar] < bar_max_size[sel_bar])
- sel_bar = bar;
- }
- }
-
- return sel_bar;
-}
-
-static enum cdns_pcie_rp_bar
-cdns_pcie_host_find_max_bar(struct cdns_pcie_rc *rc, u64 size)
-{
- enum cdns_pcie_rp_bar bar, sel_bar;
-
- sel_bar = RP_BAR_UNDEFINED;
- for (bar = RP_BAR0; bar <= RP_NO_BAR; bar++) {
- if (!rc->avail_ib_bar[bar])
- continue;
-
- if (size >= bar_max_size[bar]) {
- if (sel_bar == RP_BAR_UNDEFINED) {
- sel_bar = bar;
- continue;
- }
-
- if (bar_max_size[bar] > bar_max_size[sel_bar])
- sel_bar = bar;
- }
- }
-
- return sel_bar;
-}
-
-static int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc,
- struct resource_entry *entry)
-{
- u64 cpu_addr, pci_addr, size, winsize;
- struct cdns_pcie *pcie = &rc->pcie;
- struct device *dev = pcie->dev;
- enum cdns_pcie_rp_bar bar;
- unsigned long flags;
- int ret;
-
- cpu_addr = entry->res->start;
- pci_addr = entry->res->start - entry->offset;
- flags = entry->res->flags;
- size = resource_size(entry->res);
-
- if (entry->offset) {
- dev_err(dev, "PCI addr: %llx must be equal to CPU addr: %llx\n",
- pci_addr, cpu_addr);
- return -EINVAL;
- }
-
- while (size > 0) {
- /*
- * Try to find a minimum BAR whose size is greater than
- * or equal to the remaining resource_entry size. This will
- * fail if the size of each of the available BARs is less than
- * the remaining resource_entry size.
- * If a minimum BAR is found, IB ATU will be configured and
- * exited.
- */
- bar = cdns_pcie_host_find_min_bar(rc, size);
- if (bar != RP_BAR_UNDEFINED) {
- ret = cdns_pcie_host_bar_ib_config(rc, bar, cpu_addr,
- size, flags);
- if (ret)
- dev_err(dev, "IB BAR: %d config failed\n", bar);
- return ret;
- }
-
- /*
- * If the control reaches here, it would mean the remaining
- * resource_entry size cannot be fitted in a single BAR. So we
- * find a maximum BAR whose size is less than or equal to the
- * remaining resource_entry size and split the resource entry
- * so that part of resource entry is fitted inside the maximum
- * BAR. The remaining size would be fitted during the next
- * iteration of the loop.
- * If a maximum BAR is not found, there is no way we can fit
- * this resource_entry, so we error out.
- */
- bar = cdns_pcie_host_find_max_bar(rc, size);
- if (bar == RP_BAR_UNDEFINED) {
- dev_err(dev, "No free BAR to map cpu_addr %llx\n",
- cpu_addr);
- return -EINVAL;
- }
-
- winsize = bar_max_size[bar];
- ret = cdns_pcie_host_bar_ib_config(rc, bar, cpu_addr, winsize,
- flags);
- if (ret) {
- dev_err(dev, "IB BAR: %d config failed\n", bar);
- return ret;
- }
-
- size -= winsize;
- cpu_addr += winsize;
- }
-
- return 0;
-}
-
-static int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a,
- const struct list_head *b)
-{
- struct resource_entry *entry1, *entry2;
-
- entry1 = container_of(a, struct resource_entry, node);
- entry2 = container_of(b, struct resource_entry, node);
-
- return resource_size(entry2->res) - resource_size(entry1->res);
-}
-
static void cdns_pcie_host_unmap_dma_ranges(struct cdns_pcie_rc *rc)
{
struct cdns_pcie *pcie = &rc->pcie;
@@ -447,43 +222,6 @@ static void cdns_pcie_host_unmap_dma_ranges(struct cdns_pcie_rc *rc)
}
}
-static int cdns_pcie_host_map_dma_ranges(struct cdns_pcie_rc *rc)
-{
- struct cdns_pcie *pcie = &rc->pcie;
- struct device *dev = pcie->dev;
- struct device_node *np = dev->of_node;
- struct pci_host_bridge *bridge;
- struct resource_entry *entry;
- u32 no_bar_nbits = 32;
- int err;
-
- bridge = pci_host_bridge_from_priv(rc);
- if (!bridge)
- return -ENOMEM;
-
- if (list_empty(&bridge->dma_ranges)) {
- of_property_read_u32(np, "cdns,no-bar-match-nbits",
- &no_bar_nbits);
- err = cdns_pcie_host_bar_ib_config(rc, RP_NO_BAR, 0x0,
- (u64)1 << no_bar_nbits, 0);
- if (err)
- dev_err(dev, "IB BAR: %d config failed\n", RP_NO_BAR);
- return err;
- }
-
- list_sort(NULL, &bridge->dma_ranges, cdns_pcie_host_dma_ranges_cmp);
-
- resource_list_for_each_entry(entry, &bridge->dma_ranges) {
- err = cdns_pcie_host_bar_config(rc, entry);
- if (err) {
- dev_err(dev, "Fail to configure IB using dma-ranges\n");
- return err;
- }
- }
-
- return 0;
-}
-
static void cdns_pcie_host_deinit_address_translation(struct cdns_pcie_rc *rc)
{
struct cdns_pcie *pcie = &rc->pcie;
@@ -561,7 +299,7 @@ static int cdns_pcie_host_init_address_translation(struct cdns_pcie_rc *rc)
r++;
}
- return cdns_pcie_host_map_dma_ranges(rc);
+ return cdns_pcie_host_map_dma_ranges(rc, cdns_pcie_host_bar_ib_config);
}
static void cdns_pcie_host_deinit(struct cdns_pcie_rc *rc)
@@ -607,7 +345,7 @@ int cdns_pcie_host_link_setup(struct cdns_pcie_rc *rc)
return ret;
}
- ret = cdns_pcie_host_start_link(rc);
+ ret = cdns_pcie_host_start_link(rc, cdns_pcie_link_up);
if (ret)
dev_dbg(dev, "PCIe link never came up\n");
diff --git a/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h b/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h
new file mode 100644
index 000000000000..026e131600de
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-hpa-regs.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cadence PCIe controller driver.
+ *
+ * Copyright (c) 2024, Cadence Design Systems
+ * Author: Manikandan K Pillai <mpillai@cadence.com>
+ */
+#ifndef _PCIE_CADENCE_HPA_REGS_H
+#define _PCIE_CADENCE_HPA_REGS_H
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/pci-epf.h>
+#include <linux/phy/phy.h>
+#include <linux/bitfield.h>
+
+/* High Performance Architecture (HPA) PCIe controller registers */
+#define CDNS_PCIE_HPA_IP_REG_BANK 0x01000000
+#define CDNS_PCIE_HPA_IP_CFG_CTRL_REG_BANK 0x01003C00
+#define CDNS_PCIE_HPA_IP_AXI_MASTER_COMMON 0x02020000
+
+/* Address Translation Registers */
+#define CDNS_PCIE_HPA_AXI_SLAVE 0x03000000
+#define CDNS_PCIE_HPA_AXI_MASTER 0x03002000
+
+/* Root Port register base address */
+#define CDNS_PCIE_HPA_RP_BASE 0x0
+
+#define CDNS_PCIE_HPA_LM_ID 0x1420
+
+/* Endpoint Function BARs */
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_3) ? CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG0(fn) : \
+ CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG0(pfn) (0x4000 * (pfn))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG1(pfn) ((0x4000 * (pfn)) + 0x04)
+#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_3) ? CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG0(fn) : \
+ CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG0(vfn) ((0x4000 * (vfn)) + 0x08)
+#define CDNS_PCIE_HPA_LM_EP_VFUNC_BAR_CFG1(vfn) ((0x4000 * (vfn)) + 0x0C)
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(f) \
+ (GENMASK(5, 0) << (0x4 + (f) * 10))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \
+ (((a) << (4 + ((b) * 10))) & (CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b)))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(f) \
+ (GENMASK(3, 0) << ((f) * 10))
+#define CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \
+ (((c) << ((b) * 10)) & (CDNS_PCIE_HPA_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b)))
+
+/* Endpoint Function Configuration Register */
+#define CDNS_PCIE_HPA_LM_EP_FUNC_CFG 0x02C0
+
+/* Root Complex BAR Configuration Register */
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG 0x14
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(9, 4)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE(a) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_APERTURE_MASK, a)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(3, 0)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL(c) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR0_CTRL_MASK, c)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(19, 14)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE(a) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_APERTURE_MASK, a)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(13, 10)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL(c) \
+ FIELD_PREP(CDNS_PCIE_HPA_LM_RC_BAR_CFG_BAR1_CTRL_MASK, c)
+
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(20)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(21)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_ENABLE BIT(22)
+#define CDNS_PCIE_HPA_LM_RC_BAR_CFG_IO_32BITS BIT(23)
+
+/* BAR control values applicable to both Endpoint Function and Root Complex */
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED 0x0
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_IO_32BITS 0x3
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_32BITS 0x1
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x9
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_64BITS 0x5
+#define CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0xD
+
+#define HPA_LM_RC_BAR_CFG_CTRL_DISABLED(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_DISABLED << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_IO_32BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_32BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_MEM_64BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \
+ (CDNS_PCIE_HPA_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << ((bar) * 10))
+#define HPA_LM_RC_BAR_CFG_APERTURE(bar, aperture) \
+ (((aperture) - 7) << (((bar) * 10) + 4))
+
+#define CDNS_PCIE_HPA_LM_PTM_CTRL 0x0520
+#define CDNS_PCIE_HPA_LM_PTM_CTRL_PTMRSEN BIT(17)
+
+/* Root Port Registers PCI config space for root port function */
+#define CDNS_PCIE_HPA_RP_CAP_OFFSET 0xC0
+
+/* Region r Outbound AXI to PCIe Address Translation Register 0 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r) (0x1010 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(23, 16)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK, devfn)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(31, 24)
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS(bus) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_BUS_MASK, bus)
+
+/* Region r Outbound AXI to PCIe Address Translation Register 1 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r) (0x1014 + ((r) & 0x1F) * 0x0080)
+
+/* Region r Outbound PCIe Descriptor Register */
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r) (0x1008 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(28, 24)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MEM \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x0)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_IO \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x2)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x4)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x5)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MASK, 0x10)
+
+/* Region r Outbound PCIe Descriptor Register */
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r) (0x100C + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS_MASK GENMASK(31, 24)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(bus) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS_MASK, bus)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK GENMASK(23, 16)
+#define CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(devfn) \
+ FIELD_PREP(CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN_MASK, devfn)
+
+#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r) (0x1018 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS BIT(26)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN BIT(25)
+
+/* Region r AXI Region Base Address Register 0 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r) (0x1000 + ((r) & 0x1F) * 0x0080)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS_MASK)
+
+/* Region r AXI Region Base Address Register 1 */
+#define CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r) (0x1004 + ((r) & 0x1F) * 0x0080)
+
+/* Root Port BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0(bar) (((bar) * 0x0008))
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_HPA_AT_IB_RP_BAR_ADDR1(bar) (0x04 + ((bar) * 0x0008))
+
+/* AXI link down register */
+#define CDNS_PCIE_HPA_AT_LINKDOWN 0x04
+
+/*
+ * Physical Layer Configuration Register 0
+ * This register contains the parameters required for functional setup
+ * of Physical Layer.
+ */
+#define CDNS_PCIE_HPA_PHY_LAYER_CFG0 0x0400
+#define CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK GENMASK(26, 24)
+#define CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY(delay) \
+ FIELD_PREP(CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK, delay)
+#define CDNS_PCIE_HPA_LINK_TRNG_EN_MASK GENMASK(27, 27)
+
+#define CDNS_PCIE_HPA_PHY_DBG_STS_REG0 0x0420
+
+#define CDNS_PCIE_HPA_RP_MAX_IB 0x3
+#define CDNS_PCIE_HPA_MAX_OB 15
+
+/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_HPA_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) (((fn) * 0x0080) + ((bar) * 0x0008))
+#define CDNS_PCIE_HPA_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) (0x4 + ((fn) * 0x0080) + ((bar) * 0x0008))
+
+/* Miscellaneous offsets definitions */
+#define CDNS_PCIE_HPA_TAG_MANAGEMENT 0x0
+#define CDNS_PCIE_HPA_SLAVE_RESP 0x100
+
+#define I_ROOT_PORT_REQ_ID_REG 0x141c
+#define LM_HAL_SBSA_CTRL 0x1170
+
+#define I_PCIE_BUS_NUMBERS (CDNS_PCIE_HPA_RP_BASE + 0x18)
+#define CDNS_PCIE_EROM 0x18
+#endif /* _PCIE_CADENCE_HPA_REGS_H */
diff --git a/drivers/pci/controller/cadence/pcie-cadence-hpa.c b/drivers/pci/controller/cadence/pcie-cadence-hpa.c
new file mode 100644
index 000000000000..f60a16938265
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-hpa.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cadence PCIe controller driver.
+ *
+ * Copyright (c) 2024, Cadence Design Systems
+ * Author: Manikandan K Pillai <mpillai@cadence.com>
+ */
+#include <linux/kernel.h>
+#include <linux/of.h>
+
+#include "pcie-cadence.h"
+
+bool cdns_pcie_hpa_link_up(struct cdns_pcie *pcie)
+{
+ u32 pl_reg_val;
+
+ pl_reg_val = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG, CDNS_PCIE_HPA_PHY_DBG_STS_REG0);
+ if (pl_reg_val & GENMASK(0, 0))
+ return true;
+ return false;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_link_up);
+
+void cdns_pcie_hpa_detect_quiet_min_delay_set(struct cdns_pcie *pcie)
+{
+ u32 delay = 0x3;
+ u32 ltssm_control_cap;
+
+ /* Set the LTSSM Detect Quiet state min. delay to 2ms */
+ ltssm_control_cap = cdns_pcie_hpa_readl(pcie, REG_BANK_IP_REG,
+ CDNS_PCIE_HPA_PHY_LAYER_CFG0);
+ ltssm_control_cap = ((ltssm_control_cap &
+ ~CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY_MASK) |
+ CDNS_PCIE_HPA_DETECT_QUIET_MIN_DELAY(delay));
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_IP_REG,
+ CDNS_PCIE_HPA_PHY_LAYER_CFG0, ltssm_control_cap);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_detect_quiet_min_delay_set);
+
+void cdns_pcie_hpa_set_outbound_region(struct cdns_pcie *pcie, u8 busnr, u8 fn,
+ u32 r, bool is_io,
+ u64 cpu_addr, u64 pci_addr, size_t size)
+{
+ /*
+ * roundup_pow_of_two() returns an unsigned long, which is not suited
+ * for 64bit values
+ */
+ u64 sz = 1ULL << fls64(size - 1);
+ int nbits = ilog2(sz);
+ u32 addr0, addr1, desc0, desc1, ctrl0;
+
+ if (nbits < 8)
+ nbits = 8;
+
+ /* Set the PCI address */
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) |
+ (lower_32_bits(pci_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(pci_addr);
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r), addr1);
+
+ /* Set the PCIe header descriptor */
+ if (is_io)
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_IO;
+ else
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_MEM;
+ desc1 = 0;
+ ctrl0 = 0;
+
+ /*
+ * Whether Bit [26] is set or not inside DESC0 register of the outbound
+ * PCIe descriptor, the PCI function number must be set into
+ * Bits [31:24] of DESC1 anyway.
+ *
+ * In Root Complex mode, the function number is always 0 but in Endpoint
+ * mode, the PCIe controller may support more than one function. This
+ * function number needs to be set properly into the outbound PCIe
+ * descriptor.
+ *
+ * Besides, setting Bit [26] is mandatory when in Root Complex mode:
+ * then the driver must provide the bus, resp. device, number in
+ * Bits [31:24] of DESC1, resp. Bits[23:16] of DESC0. Like the function
+ * number, the device number is always 0 in Root Complex mode.
+ *
+ * However when in Endpoint mode, we can clear Bit [26] of DESC0, hence
+ * the PCIe controller will use the captured values for the bus and
+ * device numbers.
+ */
+ if (pcie->is_rc) {
+ /* The device and function numbers are always 0 */
+ desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr) |
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0);
+ ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS |
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN;
+ } else {
+ /*
+ * Use captured values for bus and device numbers but still
+ * need to set the function number
+ */
+ desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(fn);
+ }
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r), desc0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r), desc1);
+
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r), addr1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r), ctrl0);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_set_outbound_region);
+
+void cdns_pcie_hpa_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie,
+ u8 busnr, u8 fn,
+ u32 r, u64 cpu_addr)
+{
+ u32 addr0, addr1, desc0, desc1, ctrl0;
+
+ desc0 = CDNS_PCIE_HPA_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG;
+ desc1 = 0;
+ ctrl0 = 0;
+
+ /* See cdns_pcie_set_outbound_region() comments above */
+ if (pcie->is_rc) {
+ desc1 = CDNS_PCIE_HPA_AT_OB_REGION_DESC1_BUS(busnr) |
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(0);
+ ctrl0 = CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_BUS |
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0_SUPPLY_DEV_FN;
+ } else {
+ desc1 |= CDNS_PCIE_HPA_AT_OB_REGION_DESC1_DEVFN(fn);
+ }
+
+ addr0 = CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0_NBITS(17) |
+ (lower_32_bits(cpu_addr) & GENMASK(31, 8));
+ addr1 = upper_32_bits(cpu_addr);
+
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR0(r), 0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_PCI_ADDR1(r), 0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC0(r), desc0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_DESC1(r), desc1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR0(r), addr0);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CPU_ADDR1(r), addr1);
+ cdns_pcie_hpa_writel(pcie, REG_BANK_AXI_SLAVE,
+ CDNS_PCIE_HPA_AT_OB_REGION_CTRL0(r), ctrl0);
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_hpa_set_outbound_region_for_normal_msg);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe controller driver");
diff --git a/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h b/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h
new file mode 100644
index 000000000000..857b2140c5d2
--- /dev/null
+++ b/drivers/pci/controller/cadence/pcie-cadence-lga-regs.h
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Cadence PCIe controller driver.
+ *
+ * Copyright (c) 2017 Cadence
+ * Author: Cyrille Pitchen <cyrille.pitchen@free-electrons.com>
+ */
+#ifndef _PCIE_CADENCE_LGA_REGS_H
+#define _PCIE_CADENCE_LGA_REGS_H
+
+#include <linux/bitfield.h>
+
+/* Parameters for the waiting for link up routine */
+#define LINK_WAIT_MAX_RETRIES 10
+#define LINK_WAIT_USLEEP_MIN 90000
+#define LINK_WAIT_USLEEP_MAX 100000
+
+/* Local Management Registers */
+#define CDNS_PCIE_LM_BASE 0x00100000
+
+/* Vendor ID Register */
+#define CDNS_PCIE_LM_ID (CDNS_PCIE_LM_BASE + 0x0044)
+#define CDNS_PCIE_LM_ID_VENDOR_MASK GENMASK(15, 0)
+#define CDNS_PCIE_LM_ID_VENDOR_SHIFT 0
+#define CDNS_PCIE_LM_ID_VENDOR(vid) \
+ (((vid) << CDNS_PCIE_LM_ID_VENDOR_SHIFT) & CDNS_PCIE_LM_ID_VENDOR_MASK)
+#define CDNS_PCIE_LM_ID_SUBSYS_MASK GENMASK(31, 16)
+#define CDNS_PCIE_LM_ID_SUBSYS_SHIFT 16
+#define CDNS_PCIE_LM_ID_SUBSYS(sub) \
+ (((sub) << CDNS_PCIE_LM_ID_SUBSYS_SHIFT) & CDNS_PCIE_LM_ID_SUBSYS_MASK)
+
+/* Root Port Requester ID Register */
+#define CDNS_PCIE_LM_RP_RID (CDNS_PCIE_LM_BASE + 0x0228)
+#define CDNS_PCIE_LM_RP_RID_MASK GENMASK(15, 0)
+#define CDNS_PCIE_LM_RP_RID_SHIFT 0
+#define CDNS_PCIE_LM_RP_RID_(rid) \
+ (((rid) << CDNS_PCIE_LM_RP_RID_SHIFT) & CDNS_PCIE_LM_RP_RID_MASK)
+
+/* Endpoint Bus and Device Number Register */
+#define CDNS_PCIE_LM_EP_ID (CDNS_PCIE_LM_BASE + 0x022C)
+#define CDNS_PCIE_LM_EP_ID_DEV_MASK GENMASK(4, 0)
+#define CDNS_PCIE_LM_EP_ID_DEV_SHIFT 0
+#define CDNS_PCIE_LM_EP_ID_BUS_MASK GENMASK(15, 8)
+#define CDNS_PCIE_LM_EP_ID_BUS_SHIFT 8
+
+/* Endpoint Function f BAR b Configuration Registers */
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0240 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0244 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG(bar, fn) \
+ (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn))
+#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0280 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn) \
+ (CDNS_PCIE_LM_BASE + 0x0284 + (fn) * 0x0008)
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b) \
+ (GENMASK(4, 0) << ((b) * 8))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \
+ (((a) << ((b) * 8)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b) \
+ (GENMASK(7, 5) << ((b) * 8))
+#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \
+ (((c) << ((b) * 8 + 5)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b))
+
+/* Endpoint Function Configuration Register */
+#define CDNS_PCIE_LM_EP_FUNC_CFG (CDNS_PCIE_LM_BASE + 0x02C0)
+
+/* Root Complex BAR Configuration Register */
+#define CDNS_PCIE_LM_RC_BAR_CFG (CDNS_PCIE_LM_BASE + 0x0300)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(5, 0)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE(a) \
+ (((a) << 0) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(8, 6)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL(c) \
+ (((c) << 6) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(13, 9)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE(a) \
+ (((a) << 9) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(16, 14)
+#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL(c) \
+ (((c) << 14) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK)
+#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(17)
+#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_32BITS 0
+#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(18)
+#define CDNS_PCIE_LM_RC_BAR_CFG_IO_ENABLE BIT(19)
+#define CDNS_PCIE_LM_RC_BAR_CFG_IO_16BITS 0
+#define CDNS_PCIE_LM_RC_BAR_CFG_IO_32BITS BIT(20)
+#define CDNS_PCIE_LM_RC_BAR_CFG_CHECK_ENABLE BIT(31)
+
+/* BAR control values applicable to both Endpoint Function and Root Complex */
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED 0x0
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS 0x1
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS 0x4
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x5
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS 0x6
+#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0x7
+
+#define LM_RC_BAR_CFG_CTRL_DISABLED(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \
+ (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << (((bar) * 8) + 6))
+#define LM_RC_BAR_CFG_APERTURE(bar, aperture) \
+ (((aperture) - 2) << ((bar) * 8))
+
+/* PTM Control Register */
+#define CDNS_PCIE_LM_PTM_CTRL (CDNS_PCIE_LM_BASE + 0x0DA8)
+#define CDNS_PCIE_LM_TPM_CTRL_PTMRSEN BIT(17)
+
+/*
+ * Endpoint Function Registers (PCI configuration space for endpoint functions)
+ */
+#define CDNS_PCIE_EP_FUNC_BASE(fn) (((fn) << 12) & GENMASK(19, 12))
+
+#define CDNS_PCIE_EP_FUNC_MSI_CAP_OFFSET 0x90
+#define CDNS_PCIE_EP_FUNC_MSIX_CAP_OFFSET 0xB0
+#define CDNS_PCIE_EP_FUNC_DEV_CAP_OFFSET 0xC0
+#define CDNS_PCIE_EP_FUNC_SRIOV_CAP_OFFSET 0x200
+
+/* Endpoint PF Registers */
+#define CDNS_PCIE_CORE_PF_I_ARI_CAP_AND_CTRL(fn) (0x144 + (fn) * 0x1000)
+#define CDNS_PCIE_ARI_CAP_NFN_MASK GENMASK(15, 8)
+
+/* Root Port Registers (PCI configuration space for the root port function) */
+#define CDNS_PCIE_RP_BASE 0x00200000
+#define CDNS_PCIE_RP_CAP_OFFSET 0xC0
+
+/* Address Translation Registers */
+#define CDNS_PCIE_AT_BASE 0x00400000
+
+/* Region r Outbound AXI to PCIe Address Translation Register 0 */
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0(r) \
+ (CDNS_PCIE_AT_BASE + 0x0000 + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(19, 12)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \
+ (((devfn) << 12) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(27, 20)
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS(bus) \
+ (((bus) << 20) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK)
+
+/* Region r Outbound AXI to PCIe Address Translation Register 1 */
+#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR1(r) \
+ (CDNS_PCIE_AT_BASE + 0x0004 + ((r) & 0x1F) * 0x0020)
+
+/* Region r Outbound PCIe Descriptor Register 0 */
+#define CDNS_PCIE_AT_OB_REGION_DESC0(r) \
+ (CDNS_PCIE_AT_BASE + 0x0008 + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(3, 0)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MEM 0x2
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_IO 0x6
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 0xA
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 0xB
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG 0xC
+#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_VENDOR_MSG 0xD
+/* Bit 23 MUST be set in RC mode. */
+#define CDNS_PCIE_AT_OB_REGION_DESC0_HARDCODED_RID BIT(23)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK GENMASK(31, 24)
+#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN(devfn) \
+ (((devfn) << 24) & CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK)
+
+/* Region r Outbound PCIe Descriptor Register 1 */
+#define CDNS_PCIE_AT_OB_REGION_DESC1(r) \
+ (CDNS_PCIE_AT_BASE + 0x000C + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK GENMASK(7, 0)
+#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS(bus) \
+ ((bus) & CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK)
+
+/* Region r AXI Region Base Address Register 0 */
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0(r) \
+ (CDNS_PCIE_AT_BASE + 0x0018 + ((r) & 0x1F) * 0x0020)
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK)
+
+/* Region r AXI Region Base Address Register 1 */
+#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR1(r) \
+ (CDNS_PCIE_AT_BASE + 0x001C + ((r) & 0x1F) * 0x0020)
+
+/* Root Port BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0(bar) \
+ (CDNS_PCIE_AT_BASE + 0x0800 + (bar) * 0x0008)
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0)
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \
+ (((nbits) - 1) & CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK)
+#define CDNS_PCIE_AT_IB_RP_BAR_ADDR1(bar) \
+ (CDNS_PCIE_AT_BASE + 0x0804 + (bar) * 0x0008)
+
+/* AXI link down register */
+#define CDNS_PCIE_AT_LINKDOWN (CDNS_PCIE_AT_BASE + 0x0824)
+
+/* LTSSM Capabilities register */
+#define CDNS_PCIE_LTSSM_CONTROL_CAP (CDNS_PCIE_LM_BASE + 0x0054)
+#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK GENMASK(2, 1)
+#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT 1
+#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY(delay) \
+ (((delay) << CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT) & \
+ CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK)
+
+#define CDNS_PCIE_RP_MAX_IB 0x3
+#define CDNS_PCIE_MAX_OB 32
+
+/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */
+#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) \
+ (CDNS_PCIE_AT_BASE + 0x0840 + (fn) * 0x0040 + (bar) * 0x0008)
+#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) \
+ (CDNS_PCIE_AT_BASE + 0x0844 + (fn) * 0x0040 + (bar) * 0x0008)
+
+/* Normal/Vendor specific message access: offset inside some outbound region */
+#define CDNS_PCIE_NORMAL_MSG_ROUTING_MASK GENMASK(7, 5)
+#define CDNS_PCIE_NORMAL_MSG_ROUTING(route) \
+ (((route) << 5) & CDNS_PCIE_NORMAL_MSG_ROUTING_MASK)
+#define CDNS_PCIE_NORMAL_MSG_CODE_MASK GENMASK(15, 8)
+#define CDNS_PCIE_NORMAL_MSG_CODE(code) \
+ (((code) << 8) & CDNS_PCIE_NORMAL_MSG_CODE_MASK)
+#define CDNS_PCIE_MSG_NO_DATA BIT(16)
+
+#endif /* _PCIE_CADENCE_LGA_REGS_H */
diff --git a/drivers/pci/controller/cadence/pcie-cadence-plat.c b/drivers/pci/controller/cadence/pcie-cadence-plat.c
index 0456845dabb9..b067a3296dd3 100644
--- a/drivers/pci/controller/cadence/pcie-cadence-plat.c
+++ b/drivers/pci/controller/cadence/pcie-cadence-plat.c
@@ -22,10 +22,6 @@ struct cdns_plat_pcie {
struct cdns_pcie *pcie;
};
-struct cdns_plat_pcie_of_data {
- bool is_rc;
-};
-
static const struct of_device_id cdns_plat_pcie_of_match[];
static u64 cdns_plat_cpu_addr_fixup(struct cdns_pcie *pcie, u64 cpu_addr)
@@ -177,4 +173,7 @@ static struct platform_driver cdns_plat_pcie_driver = {
.probe = cdns_plat_pcie_probe,
.shutdown = cdns_plat_pcie_shutdown,
};
-builtin_platform_driver(cdns_plat_pcie_driver);
+module_platform_driver(cdns_plat_pcie_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cadence PCIe controller platform driver");
diff --git a/drivers/pci/controller/cadence/pcie-cadence.c b/drivers/pci/controller/cadence/pcie-cadence.c
index bd683d0fecb2..e6f1a4ac0fb7 100644
--- a/drivers/pci/controller/cadence/pcie-cadence.c
+++ b/drivers/pci/controller/cadence/pcie-cadence.c
@@ -23,6 +23,17 @@ u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap)
}
EXPORT_SYMBOL_GPL(cdns_pcie_find_ext_capability);
+bool cdns_pcie_linkup(struct cdns_pcie *pcie)
+{
+ u32 pl_reg_val;
+
+ pl_reg_val = cdns_pcie_readl(pcie, CDNS_PCIE_LM_BASE);
+ if (pl_reg_val & GENMASK(0, 0))
+ return true;
+ return false;
+}
+EXPORT_SYMBOL_GPL(cdns_pcie_linkup);
+
void cdns_pcie_detect_quiet_min_delay_set(struct cdns_pcie *pcie)
{
u32 delay = 0x3;
@@ -293,6 +304,7 @@ const struct dev_pm_ops cdns_pcie_pm_ops = {
NOIRQ_SYSTEM_SLEEP_PM_OPS(cdns_pcie_suspend_noirq,
cdns_pcie_resume_noirq)
};
+EXPORT_SYMBOL_GPL(cdns_pcie_pm_ops);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Cadence PCIe controller driver");
diff --git a/drivers/pci/controller/cadence/pcie-cadence.h b/drivers/pci/controller/cadence/pcie-cadence.h
index e2a853d2c0ab..443033c607d7 100644
--- a/drivers/pci/controller/cadence/pcie-cadence.h
+++ b/drivers/pci/controller/cadence/pcie-cadence.h
@@ -7,211 +7,12 @@
#define _PCIE_CADENCE_H
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci-epf.h>
#include <linux/phy/phy.h>
-
-/* Parameters for the waiting for link up routine */
-#define LINK_WAIT_MAX_RETRIES 10
-#define LINK_WAIT_USLEEP_MIN 90000
-#define LINK_WAIT_USLEEP_MAX 100000
-
-/*
- * Local Management Registers
- */
-#define CDNS_PCIE_LM_BASE 0x00100000
-
-/* Vendor ID Register */
-#define CDNS_PCIE_LM_ID (CDNS_PCIE_LM_BASE + 0x0044)
-#define CDNS_PCIE_LM_ID_VENDOR_MASK GENMASK(15, 0)
-#define CDNS_PCIE_LM_ID_VENDOR_SHIFT 0
-#define CDNS_PCIE_LM_ID_VENDOR(vid) \
- (((vid) << CDNS_PCIE_LM_ID_VENDOR_SHIFT) & CDNS_PCIE_LM_ID_VENDOR_MASK)
-#define CDNS_PCIE_LM_ID_SUBSYS_MASK GENMASK(31, 16)
-#define CDNS_PCIE_LM_ID_SUBSYS_SHIFT 16
-#define CDNS_PCIE_LM_ID_SUBSYS(sub) \
- (((sub) << CDNS_PCIE_LM_ID_SUBSYS_SHIFT) & CDNS_PCIE_LM_ID_SUBSYS_MASK)
-
-/* Root Port Requester ID Register */
-#define CDNS_PCIE_LM_RP_RID (CDNS_PCIE_LM_BASE + 0x0228)
-#define CDNS_PCIE_LM_RP_RID_MASK GENMASK(15, 0)
-#define CDNS_PCIE_LM_RP_RID_SHIFT 0
-#define CDNS_PCIE_LM_RP_RID_(rid) \
- (((rid) << CDNS_PCIE_LM_RP_RID_SHIFT) & CDNS_PCIE_LM_RP_RID_MASK)
-
-/* Endpoint Bus and Device Number Register */
-#define CDNS_PCIE_LM_EP_ID (CDNS_PCIE_LM_BASE + 0x022c)
-#define CDNS_PCIE_LM_EP_ID_DEV_MASK GENMASK(4, 0)
-#define CDNS_PCIE_LM_EP_ID_DEV_SHIFT 0
-#define CDNS_PCIE_LM_EP_ID_BUS_MASK GENMASK(15, 8)
-#define CDNS_PCIE_LM_EP_ID_BUS_SHIFT 8
-
-/* Endpoint Function f BAR b Configuration Registers */
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG(bar, fn) \
- (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG0(fn) \
- (CDNS_PCIE_LM_BASE + 0x0240 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG1(fn) \
- (CDNS_PCIE_LM_BASE + 0x0244 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG(bar, fn) \
- (((bar) < BAR_4) ? CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) : CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn))
-#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG0(fn) \
- (CDNS_PCIE_LM_BASE + 0x0280 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_VFUNC_BAR_CFG1(fn) \
- (CDNS_PCIE_LM_BASE + 0x0284 + (fn) * 0x0008)
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b) \
- (GENMASK(4, 0) << ((b) * 8))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE(b, a) \
- (((a) << ((b) * 8)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_APERTURE_MASK(b))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b) \
- (GENMASK(7, 5) << ((b) * 8))
-#define CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL(b, c) \
- (((c) << ((b) * 8 + 5)) & CDNS_PCIE_LM_EP_FUNC_BAR_CFG_BAR_CTRL_MASK(b))
-
-/* Endpoint Function Configuration Register */
-#define CDNS_PCIE_LM_EP_FUNC_CFG (CDNS_PCIE_LM_BASE + 0x02c0)
-
-/* Root Complex BAR Configuration Register */
-#define CDNS_PCIE_LM_RC_BAR_CFG (CDNS_PCIE_LM_BASE + 0x0300)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK GENMASK(5, 0)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE(a) \
- (((a) << 0) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_APERTURE_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK GENMASK(8, 6)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL(c) \
- (((c) << 6) & CDNS_PCIE_LM_RC_BAR_CFG_BAR0_CTRL_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK GENMASK(13, 9)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE(a) \
- (((a) << 9) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_APERTURE_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK GENMASK(16, 14)
-#define CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL(c) \
- (((c) << 14) & CDNS_PCIE_LM_RC_BAR_CFG_BAR1_CTRL_MASK)
-#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_ENABLE BIT(17)
-#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_32BITS 0
-#define CDNS_PCIE_LM_RC_BAR_CFG_PREFETCH_MEM_64BITS BIT(18)
-#define CDNS_PCIE_LM_RC_BAR_CFG_IO_ENABLE BIT(19)
-#define CDNS_PCIE_LM_RC_BAR_CFG_IO_16BITS 0
-#define CDNS_PCIE_LM_RC_BAR_CFG_IO_32BITS BIT(20)
-#define CDNS_PCIE_LM_RC_BAR_CFG_CHECK_ENABLE BIT(31)
-
-/* BAR control values applicable to both Endpoint Function and Root Complex */
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED 0x0
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS 0x1
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS 0x4
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS 0x5
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS 0x6
-#define CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS 0x7
-
-#define LM_RC_BAR_CFG_CTRL_DISABLED(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_DISABLED << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_IO_32BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_IO_32BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_MEM_32BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_32BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_PREF_MEM_32BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_32BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_MEM_64BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_MEM_64BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_CTRL_PREF_MEM_64BITS(bar) \
- (CDNS_PCIE_LM_BAR_CFG_CTRL_PREFETCH_MEM_64BITS << (((bar) * 8) + 6))
-#define LM_RC_BAR_CFG_APERTURE(bar, aperture) \
- (((aperture) - 2) << ((bar) * 8))
-
-/* PTM Control Register */
-#define CDNS_PCIE_LM_PTM_CTRL (CDNS_PCIE_LM_BASE + 0x0da8)
-#define CDNS_PCIE_LM_TPM_CTRL_PTMRSEN BIT(17)
-
-/*
- * Endpoint Function Registers (PCI configuration space for endpoint functions)
- */
-#define CDNS_PCIE_EP_FUNC_BASE(fn) (((fn) << 12) & GENMASK(19, 12))
-
-/*
- * Endpoint PF Registers
- */
-#define CDNS_PCIE_CORE_PF_I_ARI_CAP_AND_CTRL(fn) (0x144 + (fn) * 0x1000)
-#define CDNS_PCIE_ARI_CAP_NFN_MASK GENMASK(15, 8)
-
-/*
- * Root Port Registers (PCI configuration space for the root port function)
- */
-#define CDNS_PCIE_RP_BASE 0x00200000
-#define CDNS_PCIE_RP_CAP_OFFSET 0xc0
-
-/*
- * Address Translation Registers
- */
-#define CDNS_PCIE_AT_BASE 0x00400000
-
-/* Region r Outbound AXI to PCIe Address Translation Register 0 */
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0(r) \
- (CDNS_PCIE_AT_BASE + 0x0000 + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK GENMASK(5, 0)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS(nbits) \
- (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_NBITS_MASK)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK GENMASK(19, 12)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN(devfn) \
- (((devfn) << 12) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_DEVFN_MASK)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK GENMASK(27, 20)
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS(bus) \
- (((bus) << 20) & CDNS_PCIE_AT_OB_REGION_PCI_ADDR0_BUS_MASK)
-
-/* Region r Outbound AXI to PCIe Address Translation Register 1 */
-#define CDNS_PCIE_AT_OB_REGION_PCI_ADDR1(r) \
- (CDNS_PCIE_AT_BASE + 0x0004 + ((r) & 0x1f) * 0x0020)
-
-/* Region r Outbound PCIe Descriptor Register 0 */
-#define CDNS_PCIE_AT_OB_REGION_DESC0(r) \
- (CDNS_PCIE_AT_BASE + 0x0008 + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MASK GENMASK(3, 0)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_MEM 0x2
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_IO 0x6
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE0 0xa
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_CONF_TYPE1 0xb
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_NORMAL_MSG 0xc
-#define CDNS_PCIE_AT_OB_REGION_DESC0_TYPE_VENDOR_MSG 0xd
-/* Bit 23 MUST be set in RC mode. */
-#define CDNS_PCIE_AT_OB_REGION_DESC0_HARDCODED_RID BIT(23)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK GENMASK(31, 24)
-#define CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN(devfn) \
- (((devfn) << 24) & CDNS_PCIE_AT_OB_REGION_DESC0_DEVFN_MASK)
-
-/* Region r Outbound PCIe Descriptor Register 1 */
-#define CDNS_PCIE_AT_OB_REGION_DESC1(r) \
- (CDNS_PCIE_AT_BASE + 0x000c + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK GENMASK(7, 0)
-#define CDNS_PCIE_AT_OB_REGION_DESC1_BUS(bus) \
- ((bus) & CDNS_PCIE_AT_OB_REGION_DESC1_BUS_MASK)
-
-/* Region r AXI Region Base Address Register 0 */
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0(r) \
- (CDNS_PCIE_AT_BASE + 0x0018 + ((r) & 0x1f) * 0x0020)
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK GENMASK(5, 0)
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS(nbits) \
- (((nbits) - 1) & CDNS_PCIE_AT_OB_REGION_CPU_ADDR0_NBITS_MASK)
-
-/* Region r AXI Region Base Address Register 1 */
-#define CDNS_PCIE_AT_OB_REGION_CPU_ADDR1(r) \
- (CDNS_PCIE_AT_BASE + 0x001c + ((r) & 0x1f) * 0x0020)
-
-/* Root Port BAR Inbound PCIe to AXI Address Translation Register */
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0(bar) \
- (CDNS_PCIE_AT_BASE + 0x0800 + (bar) * 0x0008)
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK GENMASK(5, 0)
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS(nbits) \
- (((nbits) - 1) & CDNS_PCIE_AT_IB_RP_BAR_ADDR0_NBITS_MASK)
-#define CDNS_PCIE_AT_IB_RP_BAR_ADDR1(bar) \
- (CDNS_PCIE_AT_BASE + 0x0804 + (bar) * 0x0008)
-
-/* AXI link down register */
-#define CDNS_PCIE_AT_LINKDOWN (CDNS_PCIE_AT_BASE + 0x0824)
-
-/* LTSSM Capabilities register */
-#define CDNS_PCIE_LTSSM_CONTROL_CAP (CDNS_PCIE_LM_BASE + 0x0054)
-#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK GENMASK(2, 1)
-#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT 1
-#define CDNS_PCIE_DETECT_QUIET_MIN_DELAY(delay) \
- (((delay) << CDNS_PCIE_DETECT_QUIET_MIN_DELAY_SHIFT) & \
- CDNS_PCIE_DETECT_QUIET_MIN_DELAY_MASK)
+#include "pcie-cadence-lga-regs.h"
+#include "pcie-cadence-hpa-regs.h"
enum cdns_pcie_rp_bar {
RP_BAR_UNDEFINED = -1,
@@ -220,42 +21,63 @@ enum cdns_pcie_rp_bar {
RP_NO_BAR
};
-#define CDNS_PCIE_RP_MAX_IB 0x3
-#define CDNS_PCIE_MAX_OB 32
-
struct cdns_pcie_rp_ib_bar {
u64 size;
bool free;
};
-/* Endpoint Function BAR Inbound PCIe to AXI Address Translation Register */
-#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR0(fn, bar) \
- (CDNS_PCIE_AT_BASE + 0x0840 + (fn) * 0x0040 + (bar) * 0x0008)
-#define CDNS_PCIE_AT_IB_EP_FUNC_BAR_ADDR1(fn, bar) \
- (CDNS_PCIE_AT_BASE + 0x0844 + (fn) * 0x0040 + (bar) * 0x0008)
-
-/* Normal/Vendor specific message access: offset inside some outbound region */
-#define CDNS_PCIE_NORMAL_MSG_ROUTING_MASK GENMASK(7, 5)
-#define CDNS_PCIE_NORMAL_MSG_ROUTING(route) \
- (((route) << 5) & CDNS_PCIE_NORMAL_MSG_ROUTING_MASK)
-#define CDNS_PCIE_NORMAL_MSG_CODE_MASK GENMASK(15, 8)
-#define CDNS_PCIE_NORMAL_MSG_CODE(code) \
- (((code) << 8) & CDNS_PCIE_NORMAL_MSG_CODE_MASK)
-#define CDNS_PCIE_MSG_DATA BIT(16)
-
struct cdns_pcie;
+struct cdns_pcie_rc;
+
+enum cdns_pcie_reg_bank {
+ REG_BANK_RP,
+ REG_BANK_IP_REG,
+ REG_BANK_IP_CFG_CTRL_REG,
+ REG_BANK_AXI_MASTER_COMMON,
+ REG_BANK_AXI_MASTER,
+ REG_BANK_AXI_SLAVE,
+ REG_BANK_AXI_HLS,
+ REG_BANK_AXI_RAS,
+ REG_BANK_AXI_DTI,
+ REG_BANKS_MAX,
+};
struct cdns_pcie_ops {
- int (*start_link)(struct cdns_pcie *pcie);
- void (*stop_link)(struct cdns_pcie *pcie);
- bool (*link_up)(struct cdns_pcie *pcie);
+ int (*start_link)(struct cdns_pcie *pcie);
+ void (*stop_link)(struct cdns_pcie *pcie);
+ bool (*link_up)(struct cdns_pcie *pcie);
u64 (*cpu_addr_fixup)(struct cdns_pcie *pcie, u64 cpu_addr);
};
/**
+ * struct cdns_plat_pcie_of_data - Register bank offset for a platform
+ * @is_rc: controller is a RC
+ * @ip_reg_bank_offset: ip register bank start offset
+ * @ip_cfg_ctrl_reg_offset: ip config control register start offset
+ * @axi_mstr_common_offset: AXI master common register start offset
+ * @axi_slave_offset: AXI slave start offset
+ * @axi_master_offset: AXI master start offset
+ * @axi_hls_offset: AXI HLS offset start
+ * @axi_ras_offset: AXI RAS offset
+ * @axi_dti_offset: AXI DTI offset
+ */
+struct cdns_plat_pcie_of_data {
+ u32 is_rc:1;
+ u32 ip_reg_bank_offset;
+ u32 ip_cfg_ctrl_reg_offset;
+ u32 axi_mstr_common_offset;
+ u32 axi_slave_offset;
+ u32 axi_master_offset;
+ u32 axi_hls_offset;
+ u32 axi_ras_offset;
+ u32 axi_dti_offset;
+};
+
+/**
* struct cdns_pcie - private data for Cadence PCIe controller drivers
* @reg_base: IO mapped register base
* @mem_res: start/end offsets in the physical system memory to map PCI accesses
+ * @msg_res: Region for send message to map PCI accesses
* @dev: PCIe controller
* @is_rc: tell whether the PCIe controller mode is Root Complex or Endpoint.
* @phy_count: number of supported PHY devices
@@ -263,16 +85,19 @@ struct cdns_pcie_ops {
* @link: list of pointers to corresponding device link representations
* @ops: Platform-specific ops to control various inputs from Cadence PCIe
* wrapper
+ * @cdns_pcie_reg_offsets: Register bank offsets for different SoC
*/
struct cdns_pcie {
- void __iomem *reg_base;
- struct resource *mem_res;
- struct device *dev;
- bool is_rc;
- int phy_count;
- struct phy **phy;
- struct device_link **link;
- const struct cdns_pcie_ops *ops;
+ void __iomem *reg_base;
+ struct resource *mem_res;
+ struct resource *msg_res;
+ struct device *dev;
+ bool is_rc;
+ int phy_count;
+ struct phy **phy;
+ struct device_link **link;
+ const struct cdns_pcie_ops *ops;
+ const struct cdns_plat_pcie_of_data *cdns_pcie_reg_offsets;
};
/**
@@ -288,6 +113,8 @@ struct cdns_pcie {
* available
* @quirk_retrain_flag: Retrain link as quirk for PCIe Gen2
* @quirk_detect_quiet_flag: LTSSM Detect Quiet min delay set as quirk
+ * @ecam_supported: Whether the ECAM is supported
+ * @no_inbound_map: Whether inbound mapping is supported
*/
struct cdns_pcie_rc {
struct cdns_pcie pcie;
@@ -298,6 +125,8 @@ struct cdns_pcie_rc {
bool avail_ib_bar[CDNS_PCIE_RP_MAX_IB];
unsigned int quirk_retrain_flag:1;
unsigned int quirk_detect_quiet_flag:1;
+ unsigned int ecam_supported:1;
+ unsigned int no_inbound_map:1;
};
/**
@@ -350,6 +179,43 @@ struct cdns_pcie_ep {
unsigned int quirk_disable_flr:1;
};
+static inline u32 cdns_reg_bank_to_off(struct cdns_pcie *pcie, enum cdns_pcie_reg_bank bank)
+{
+ u32 offset = 0x0;
+
+ switch (bank) {
+ case REG_BANK_RP:
+ offset = 0;
+ break;
+ case REG_BANK_IP_REG:
+ offset = pcie->cdns_pcie_reg_offsets->ip_reg_bank_offset;
+ break;
+ case REG_BANK_IP_CFG_CTRL_REG:
+ offset = pcie->cdns_pcie_reg_offsets->ip_cfg_ctrl_reg_offset;
+ break;
+ case REG_BANK_AXI_MASTER_COMMON:
+ offset = pcie->cdns_pcie_reg_offsets->axi_mstr_common_offset;
+ break;
+ case REG_BANK_AXI_MASTER:
+ offset = pcie->cdns_pcie_reg_offsets->axi_master_offset;
+ break;
+ case REG_BANK_AXI_SLAVE:
+ offset = pcie->cdns_pcie_reg_offsets->axi_slave_offset;
+ break;
+ case REG_BANK_AXI_HLS:
+ offset = pcie->cdns_pcie_reg_offsets->axi_hls_offset;
+ break;
+ case REG_BANK_AXI_RAS:
+ offset = pcie->cdns_pcie_reg_offsets->axi_ras_offset;
+ break;
+ case REG_BANK_AXI_DTI:
+ offset = pcie->cdns_pcie_reg_offsets->axi_dti_offset;
+ break;
+ default:
+ break;
+ }
+ return offset;
+}
/* Register access */
static inline void cdns_pcie_writel(struct cdns_pcie *pcie, u32 reg, u32 value)
@@ -362,6 +228,27 @@ static inline u32 cdns_pcie_readl(struct cdns_pcie *pcie, u32 reg)
return readl(pcie->reg_base + reg);
}
+static inline void cdns_pcie_hpa_writel(struct cdns_pcie *pcie,
+ enum cdns_pcie_reg_bank bank,
+ u32 reg,
+ u32 value)
+{
+ u32 offset = cdns_reg_bank_to_off(pcie, bank);
+
+ reg += offset;
+ writel(value, pcie->reg_base + reg);
+}
+
+static inline u32 cdns_pcie_hpa_readl(struct cdns_pcie *pcie,
+ enum cdns_pcie_reg_bank bank,
+ u32 reg)
+{
+ u32 offset = cdns_reg_bank_to_off(pcie, bank);
+
+ reg += offset;
+ return readl(pcie->reg_base + reg);
+}
+
static inline u16 cdns_pcie_readw(struct cdns_pcie *pcie, u32 reg)
{
return readw(pcie->reg_base + reg);
@@ -457,6 +344,29 @@ static inline u16 cdns_pcie_rp_readw(struct cdns_pcie *pcie, u32 reg)
return cdns_pcie_read_sz(addr, 0x2);
}
+static inline void cdns_pcie_hpa_rp_writeb(struct cdns_pcie *pcie,
+ u32 reg, u8 value)
+{
+ void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg;
+
+ cdns_pcie_write_sz(addr, 0x1, value);
+}
+
+static inline void cdns_pcie_hpa_rp_writew(struct cdns_pcie *pcie,
+ u32 reg, u16 value)
+{
+ void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg;
+
+ cdns_pcie_write_sz(addr, 0x2, value);
+}
+
+static inline u16 cdns_pcie_hpa_rp_readw(struct cdns_pcie *pcie, u32 reg)
+{
+ void __iomem *addr = pcie->reg_base + CDNS_PCIE_HPA_RP_BASE + reg;
+
+ return cdns_pcie_read_sz(addr, 0x2);
+}
+
/* Endpoint Function register access */
static inline void cdns_pcie_ep_fn_writeb(struct cdns_pcie *pcie, u8 fn,
u32 reg, u8 value)
@@ -521,6 +431,7 @@ int cdns_pcie_host_setup(struct cdns_pcie_rc *rc);
void cdns_pcie_host_disable(struct cdns_pcie_rc *rc);
void __iomem *cdns_pci_map_bus(struct pci_bus *bus, unsigned int devfn,
int where);
+int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc);
#else
static inline int cdns_pcie_host_link_setup(struct cdns_pcie_rc *rc)
{
@@ -537,6 +448,11 @@ static inline int cdns_pcie_host_setup(struct cdns_pcie_rc *rc)
return 0;
}
+static inline int cdns_pcie_hpa_host_setup(struct cdns_pcie_rc *rc)
+{
+ return 0;
+}
+
static inline void cdns_pcie_host_disable(struct cdns_pcie_rc *rc)
{
}
@@ -551,6 +467,7 @@ static inline void __iomem *cdns_pci_map_bus(struct pci_bus *bus, unsigned int d
#if IS_ENABLED(CONFIG_PCIE_CADENCE_EP)
int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep);
void cdns_pcie_ep_disable(struct cdns_pcie_ep *ep);
+int cdns_pcie_hpa_ep_setup(struct cdns_pcie_ep *ep);
#else
static inline int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep)
{
@@ -560,10 +477,17 @@ static inline int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep)
static inline void cdns_pcie_ep_disable(struct cdns_pcie_ep *ep)
{
}
+
+static inline int cdns_pcie_hpa_ep_setup(struct cdns_pcie_ep *ep)
+{
+ return 0;
+}
+
#endif
-u8 cdns_pcie_find_capability(struct cdns_pcie *pcie, u8 cap);
-u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap);
+u8 cdns_pcie_find_capability(struct cdns_pcie *pcie, u8 cap);
+u16 cdns_pcie_find_ext_capability(struct cdns_pcie *pcie, u8 cap);
+bool cdns_pcie_linkup(struct cdns_pcie *pcie);
void cdns_pcie_detect_quiet_min_delay_set(struct cdns_pcie *pcie);
@@ -577,8 +501,23 @@ void cdns_pcie_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie,
void cdns_pcie_reset_outbound_region(struct cdns_pcie *pcie, u32 r);
void cdns_pcie_disable_phy(struct cdns_pcie *pcie);
-int cdns_pcie_enable_phy(struct cdns_pcie *pcie);
-int cdns_pcie_init_phy(struct device *dev, struct cdns_pcie *pcie);
+int cdns_pcie_enable_phy(struct cdns_pcie *pcie);
+int cdns_pcie_init_phy(struct device *dev, struct cdns_pcie *pcie);
+void cdns_pcie_hpa_detect_quiet_min_delay_set(struct cdns_pcie *pcie);
+void cdns_pcie_hpa_set_outbound_region(struct cdns_pcie *pcie, u8 busnr, u8 fn,
+ u32 r, bool is_io,
+ u64 cpu_addr, u64 pci_addr, size_t size);
+void cdns_pcie_hpa_set_outbound_region_for_normal_msg(struct cdns_pcie *pcie,
+ u8 busnr, u8 fn,
+ u32 r, u64 cpu_addr);
+int cdns_pcie_hpa_host_link_setup(struct cdns_pcie_rc *rc);
+void __iomem *cdns_pci_hpa_map_bus(struct pci_bus *bus, unsigned int devfn,
+ int where);
+int cdns_pcie_hpa_host_start_link(struct cdns_pcie_rc *rc);
+int cdns_pcie_hpa_start_link(struct cdns_pcie *pcie);
+void cdns_pcie_hpa_stop_link(struct cdns_pcie *pcie);
+bool cdns_pcie_hpa_link_up(struct cdns_pcie *pcie);
+
extern const struct dev_pm_ops cdns_pcie_pm_ops;
#endif /* _PCIE_CADENCE_H */
diff --git a/drivers/pci/controller/cadence/pcie-sg2042.c b/drivers/pci/controller/cadence/pcie-sg2042.c
index a077b28d4894..0c50c74d03ee 100644
--- a/drivers/pci/controller/cadence/pcie-sg2042.c
+++ b/drivers/pci/controller/cadence/pcie-sg2042.c
@@ -74,15 +74,12 @@ static int sg2042_pcie_probe(struct platform_device *pdev)
static void sg2042_pcie_remove(struct platform_device *pdev)
{
struct cdns_pcie *pcie = platform_get_drvdata(pdev);
- struct device *dev = &pdev->dev;
struct cdns_pcie_rc *rc;
rc = container_of(pcie, struct cdns_pcie_rc, pcie);
cdns_pcie_host_disable(rc);
cdns_pcie_disable_phy(pcie);
-
- pm_runtime_disable(dev);
}
static int sg2042_pcie_suspend_noirq(struct device *dev)
diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig
index 349d4657393c..519b59422b47 100644
--- a/drivers/pci/controller/dwc/Kconfig
+++ b/drivers/pci/controller/dwc/Kconfig
@@ -256,6 +256,16 @@ config PCIE_TEGRA194_EP
in order to enable device-specific features PCIE_TEGRA194_EP must be
selected. This uses the DesignWare core.
+config PCIE_NXP_S32G
+ bool "NXP S32G PCIe controller (host mode)"
+ depends on ARCH_S32 || COMPILE_TEST
+ select PCIE_DW_HOST
+ help
+ Enable support for the PCIe controller in NXP S32G based boards to
+ work in Host mode. The controller is based on DesignWare IP and
+ can work either as RC or EP. In order to enable host-specific
+ features PCIE_NXP_S32G must be selected.
+
config PCIE_DW_PLAT
bool
@@ -416,6 +426,19 @@ config PCIE_SOPHGO_DW
Say Y here if you want PCIe host controller support on
Sophgo SoCs.
+config PCIE_SPACEMIT_K1
+ tristate "SpacemiT K1 PCIe controller (host mode)"
+ depends on ARCH_SPACEMIT || COMPILE_TEST
+ depends on HAS_IOMEM
+ select PCIE_DW_HOST
+ select PCI_PWRCTRL_SLOT
+ default ARCH_SPACEMIT
+ help
+ Enables support for the DesignWare based PCIe controller in
+ the SpacemiT K1 SoC operating in host mode. Three controllers
+ are available on the K1 SoC; the first of these shares a PHY
+ with a USB 3.0 host controller (one or the other can be used).
+
config PCIE_SPEAR13XX
bool "STMicroelectronics SPEAr PCIe controller"
depends on ARCH_SPEAR13XX || COMPILE_TEST
@@ -482,15 +505,21 @@ config PCI_DRA7XX_EP
to enable device-specific features PCI_DRA7XX_EP must be selected.
This uses the DesignWare core.
+# ARM32 platforms use hook_fault_code() and cannot support loadable module.
config PCI_KEYSTONE
bool
+# On non-ARM32 platforms, loadable module can be supported.
+config PCI_KEYSTONE_TRISTATE
+ tristate
+
config PCI_KEYSTONE_HOST
- bool "TI Keystone PCIe controller (host mode)"
+ tristate "TI Keystone PCIe controller (host mode)"
depends on ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST
depends on PCI_MSI
select PCIE_DW_HOST
- select PCI_KEYSTONE
+ select PCI_KEYSTONE if ARM
+ select PCI_KEYSTONE_TRISTATE if !ARM
help
Enables support for the PCIe controller in the Keystone SoC to
work in host mode. The PCI controller on Keystone is based on
@@ -498,11 +527,12 @@ config PCI_KEYSTONE_HOST
DesignWare core functions to implement the driver.
config PCI_KEYSTONE_EP
- bool "TI Keystone PCIe controller (endpoint mode)"
+ tristate "TI Keystone PCIe controller (endpoint mode)"
depends on ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST
depends on PCI_ENDPOINT
select PCIE_DW_EP
- select PCI_KEYSTONE
+ select PCI_KEYSTONE if ARM
+ select PCI_KEYSTONE_TRISTATE if !ARM
help
Enables support for the PCIe controller in the Keystone SoC to
work in endpoint mode. The PCI controller on Keystone is based
diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile
index 7ae28f3b0fb3..67ba59c02038 100644
--- a/drivers/pci/controller/dwc/Makefile
+++ b/drivers/pci/controller/dwc/Makefile
@@ -10,8 +10,12 @@ obj-$(CONFIG_PCI_DRA7XX) += pci-dra7xx.o
obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o
obj-$(CONFIG_PCIE_FU740) += pcie-fu740.o
obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
+obj-$(CONFIG_PCIE_NXP_S32G) += pcie-nxp-s32g.o
obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o
+# ARM32 platforms use hook_fault_code() and cannot support loadable module.
obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o
+# On non-ARM32 platforms, loadable module can be supported.
+obj-$(CONFIG_PCI_KEYSTONE_TRISTATE) += pci-keystone.o
obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o
obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o
obj-$(CONFIG_PCIE_QCOM_COMMON) += pcie-qcom-common.o
@@ -31,6 +35,7 @@ obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o
obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o
obj-$(CONFIG_PCIE_VISCONTI_HOST) += pcie-visconti.o
obj-$(CONFIG_PCIE_RCAR_GEN4) += pcie-rcar-gen4.o
+obj-$(CONFIG_PCIE_SPACEMIT_K1) += pcie-spacemit-k1.o
obj-$(CONFIG_PCIE_STM32_HOST) += pcie-stm32.o
obj-$(CONFIG_PCIE_STM32_EP) += pcie-stm32-ep.o
diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c
index eb00aa380722..f86d9111f863 100644
--- a/drivers/pci/controller/dwc/pci-keystone.c
+++ b/drivers/pci/controller/dwc/pci-keystone.c
@@ -17,6 +17,7 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>
#include <linux/mfd/syscon.h>
+#include <linux/module.h>
#include <linux/msi.h>
#include <linux/of.h>
#include <linux/of_irq.h>
@@ -777,29 +778,7 @@ err:
return ret;
}
-#ifdef CONFIG_ARM
-/*
- * When a PCI device does not exist during config cycles, keystone host
- * gets a bus error instead of returning 0xffffffff (PCI_ERROR_RESPONSE).
- * This handler always returns 0 for this kind of fault.
- */
-static int ks_pcie_fault(unsigned long addr, unsigned int fsr,
- struct pt_regs *regs)
-{
- unsigned long instr = *(unsigned long *) instruction_pointer(regs);
-
- if ((instr & 0x0e100090) == 0x00100090) {
- int reg = (instr >> 12) & 15;
-
- regs->uregs[reg] = -1;
- regs->ARM_pc += 4;
- }
-
- return 0;
-}
-#endif
-
-static int __init ks_pcie_init_id(struct keystone_pcie *ks_pcie)
+static int ks_pcie_init_id(struct keystone_pcie *ks_pcie)
{
int ret;
unsigned int id;
@@ -831,7 +810,7 @@ static int __init ks_pcie_init_id(struct keystone_pcie *ks_pcie)
return 0;
}
-static int __init ks_pcie_host_init(struct dw_pcie_rp *pp)
+static int ks_pcie_host_init(struct dw_pcie_rp *pp)
{
struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
struct keystone_pcie *ks_pcie = to_keystone_pcie(pci);
@@ -861,15 +840,6 @@ static int __init ks_pcie_host_init(struct dw_pcie_rp *pp)
if (ret < 0)
return ret;
-#ifdef CONFIG_ARM
- /*
- * PCIe access errors that result into OCP errors are caught by ARM as
- * "External aborts"
- */
- hook_fault_code(17, ks_pcie_fault, SIGBUS, 0,
- "Asynchronous external abort");
-#endif
-
return 0;
}
@@ -1134,6 +1104,7 @@ static const struct of_device_id ks_pcie_of_match[] = {
},
{ },
};
+MODULE_DEVICE_TABLE(of, ks_pcie_of_match);
static int ks_pcie_probe(struct platform_device *pdev)
{
@@ -1337,6 +1308,8 @@ static int ks_pcie_probe(struct platform_device *pdev)
break;
default:
dev_err(dev, "INVALID device type %d\n", mode);
+ ret = -EINVAL;
+ goto err_get_sync;
}
ks_pcie_enable_error_irq(ks_pcie);
@@ -1379,4 +1352,45 @@ static struct platform_driver ks_pcie_driver = {
.of_match_table = ks_pcie_of_match,
},
};
+
+#ifdef CONFIG_ARM
+/*
+ * When a PCI device does not exist during config cycles, keystone host
+ * gets a bus error instead of returning 0xffffffff (PCI_ERROR_RESPONSE).
+ * This handler always returns 0 for this kind of fault.
+ */
+static int ks_pcie_fault(unsigned long addr, unsigned int fsr,
+ struct pt_regs *regs)
+{
+ unsigned long instr = *(unsigned long *)instruction_pointer(regs);
+
+ if ((instr & 0x0e100090) == 0x00100090) {
+ int reg = (instr >> 12) & 15;
+
+ regs->uregs[reg] = -1;
+ regs->ARM_pc += 4;
+ }
+
+ return 0;
+}
+
+static int __init ks_pcie_init(void)
+{
+ /*
+ * PCIe access errors that result into OCP errors are caught by ARM as
+ * "External aborts"
+ */
+ if (of_find_matching_node(NULL, ks_pcie_of_match))
+ hook_fault_code(17, ks_pcie_fault, SIGBUS, 0,
+ "Asynchronous external abort");
+
+ return platform_driver_register(&ks_pcie_driver);
+}
+device_initcall(ks_pcie_init);
+#else
builtin_platform_driver(ks_pcie_driver);
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PCIe controller driver for Texas Instruments Keystone SoCs");
+MODULE_AUTHOR("Murali Karicheri <m-karicheri2@ti.com>");
diff --git a/drivers/pci/controller/dwc/pci-meson.c b/drivers/pci/controller/dwc/pci-meson.c
index 787469d1b396..54b6a4196f17 100644
--- a/drivers/pci/controller/dwc/pci-meson.c
+++ b/drivers/pci/controller/dwc/pci-meson.c
@@ -108,10 +108,22 @@ static int meson_pcie_get_mems(struct platform_device *pdev,
struct meson_pcie *mp)
{
struct dw_pcie *pci = &mp->pci;
+ struct resource *res;
- pci->dbi_base = devm_platform_ioremap_resource_byname(pdev, "elbi");
- if (IS_ERR(pci->dbi_base))
- return PTR_ERR(pci->dbi_base);
+ /*
+ * For the broken DTs that supply 'dbi' as 'elbi', parse the 'elbi'
+ * region and assign it to both 'pci->elbi_base' and 'pci->dbi_space' so
+ * that the DWC core can skip parsing both regions.
+ */
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
+ if (res) {
+ pci->elbi_base = devm_pci_remap_cfg_resource(pci->dev, res);
+ if (IS_ERR(pci->elbi_base))
+ return PTR_ERR(pci->elbi_base);
+
+ pci->dbi_base = pci->elbi_base;
+ pci->dbi_phys_addr = res->start;
+ }
mp->cfg_base = devm_platform_ioremap_resource_byname(pdev, "cfg");
if (IS_ERR(mp->cfg_base))
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 7f2112c2fb21..19571ac2b961 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -797,6 +797,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
return 0;
}
+EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_msix_irq);
/**
* dw_pcie_ep_cleanup - Cleanup DWC EP resources after fundamental reset
diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index e92513c5bda5..372207c33a85 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -233,6 +233,7 @@ int dw_pcie_allocate_domains(struct dw_pcie_rp *pp)
return 0;
}
+EXPORT_SYMBOL_GPL(dw_pcie_allocate_domains);
void dw_pcie_free_msi(struct dw_pcie_rp *pp)
{
@@ -856,10 +857,19 @@ static void __iomem *dw_pcie_ecam_conf_map_bus(struct pci_bus *bus, unsigned int
return pci->dbi_base + where;
}
+static int dw_pcie_op_assert_perst(struct pci_bus *bus, bool assert)
+{
+ struct dw_pcie_rp *pp = bus->sysdata;
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+
+ return dw_pcie_assert_perst(pci, assert);
+}
+
static struct pci_ops dw_pcie_ops = {
.map_bus = dw_pcie_own_conf_map_bus,
.read = pci_generic_config_read,
.write = pci_generic_config_write,
+ .assert_perst = dw_pcie_op_assert_perst,
};
static struct pci_ops dw_pcie_ecam_ops = {
@@ -1080,6 +1090,8 @@ int dw_pcie_setup_rc(struct dw_pcie_rp *pp)
PCI_COMMAND_MASTER | PCI_COMMAND_SERR;
dw_pcie_writel_dbi(pci, PCI_COMMAND, val);
+ dw_pcie_hide_unsupported_l1ss(pci);
+
dw_pcie_config_presets(pp);
/*
* If the platform provides its own child bus config accesses, it means
diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c
index c644216995f6..75fc8b767fcc 100644
--- a/drivers/pci/controller/dwc/pcie-designware.c
+++ b/drivers/pci/controller/dwc/pcie-designware.c
@@ -168,11 +168,13 @@ int dw_pcie_get_resources(struct dw_pcie *pci)
}
/* ELBI is an optional resource */
- res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
- if (res) {
- pci->elbi_base = devm_ioremap_resource(pci->dev, res);
- if (IS_ERR(pci->elbi_base))
- return PTR_ERR(pci->elbi_base);
+ if (!pci->elbi_base) {
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
+ if (res) {
+ pci->elbi_base = devm_ioremap_resource(pci->dev, res);
+ if (IS_ERR(pci->elbi_base))
+ return PTR_ERR(pci->elbi_base);
+ }
}
/* LLDD is supposed to manually switch the clocks and resets state */
@@ -1081,6 +1083,30 @@ void dw_pcie_edma_remove(struct dw_pcie *pci)
dw_edma_remove(&pci->edma);
}
+void dw_pcie_hide_unsupported_l1ss(struct dw_pcie *pci)
+{
+ u16 l1ss;
+ u32 l1ss_cap;
+
+ if (pci->l1ss_support)
+ return;
+
+ l1ss = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
+ if (!l1ss)
+ return;
+
+ /*
+ * Unless the driver claims "l1ss_support", don't advertise L1 PM
+ * Substates because they require CLKREQ# and possibly other
+ * device-specific configuration.
+ */
+ l1ss_cap = dw_pcie_readl_dbi(pci, l1ss + PCI_L1SS_CAP);
+ l1ss_cap &= ~(PCI_L1SS_CAP_PCIPM_L1_1 | PCI_L1SS_CAP_ASPM_L1_1 |
+ PCI_L1SS_CAP_PCIPM_L1_2 | PCI_L1SS_CAP_ASPM_L1_2 |
+ PCI_L1SS_CAP_L1_PM_SS);
+ dw_pcie_writel_dbi(pci, l1ss + PCI_L1SS_CAP, l1ss_cap);
+}
+
void dw_pcie_setup(struct dw_pcie *pci)
{
u32 val;
diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
index e995f692a1ec..31685951a080 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -97,7 +97,7 @@
#define PORT_LANE_SKEW_INSERT_MASK GENMASK(23, 0)
#define PCIE_PORT_DEBUG0 0x728
-#define PORT_LOGIC_LTSSM_STATE_MASK 0x1f
+#define PORT_LOGIC_LTSSM_STATE_MASK 0x3f
#define PORT_LOGIC_LTSSM_STATE_L0 0x11
#define PCIE_PORT_DEBUG1 0x72C
#define PCIE_PORT_DEBUG1_LINK_UP BIT(4)
@@ -121,6 +121,7 @@
#define GEN3_RELATED_OFF 0x890
#define GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL BIT(0)
+#define GEN3_RELATED_OFF_EQ_PHASE_2_3 BIT(9)
#define GEN3_RELATED_OFF_RXEQ_RGRDLESS_RXTS BIT(13)
#define GEN3_RELATED_OFF_GEN3_EQ_DISABLE BIT(16)
#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT 24
@@ -138,6 +139,13 @@
#define GEN3_EQ_FMDC_MAX_PRE_CURSOR_DELTA GENMASK(13, 10)
#define GEN3_EQ_FMDC_MAX_POST_CURSOR_DELTA GENMASK(17, 14)
+#define COHERENCY_CONTROL_1_OFF 0x8E0
+#define CFG_MEMTYPE_BOUNDARY_LOW_ADDR_MASK GENMASK(31, 2)
+#define CFG_MEMTYPE_VALUE BIT(0)
+
+#define COHERENCY_CONTROL_2_OFF 0x8E4
+#define COHERENCY_CONTROL_3_OFF 0x8E8
+
#define PCIE_PORT_MULTI_LANE_CTRL 0x8C0
#define PORT_MLTI_UPCFG_SUPPORT BIT(7)
@@ -485,6 +493,7 @@ struct dw_pcie_ops {
enum dw_pcie_ltssm (*get_ltssm)(struct dw_pcie *pcie);
int (*start_link)(struct dw_pcie *pcie);
void (*stop_link)(struct dw_pcie *pcie);
+ int (*assert_perst)(struct dw_pcie *pcie, bool assert);
};
struct debugfs_info {
@@ -516,6 +525,7 @@ struct dw_pcie {
int max_link_speed;
u8 n_fts[2];
struct dw_edma_chip edma;
+ bool l1ss_support; /* L1 PM Substates support */
struct clk_bulk_data app_clks[DW_PCIE_NUM_APP_CLKS];
struct clk_bulk_data core_clks[DW_PCIE_NUM_CORE_CLKS];
struct reset_control_bulk_data app_rsts[DW_PCIE_NUM_APP_RSTS];
@@ -573,6 +583,7 @@ int dw_pcie_prog_ep_inbound_atu(struct dw_pcie *pci, u8 func_no, int index,
int type, u64 parent_bus_addr,
u8 bar, size_t size);
void dw_pcie_disable_atu(struct dw_pcie *pci, u32 dir, int index);
+void dw_pcie_hide_unsupported_l1ss(struct dw_pcie *pci);
void dw_pcie_setup(struct dw_pcie *pci);
void dw_pcie_iatu_detect(struct dw_pcie *pci);
int dw_pcie_edma_detect(struct dw_pcie *pci);
@@ -787,6 +798,14 @@ static inline void dw_pcie_stop_link(struct dw_pcie *pci)
pci->ops->stop_link(pci);
}
+static inline int dw_pcie_assert_perst(struct dw_pcie *pci, bool assert)
+{
+ if (pci->ops && pci->ops->assert_perst)
+ return pci->ops->assert_perst(pci, assert);
+
+ return 0;
+}
+
static inline enum dw_pcie_ltssm dw_pcie_get_ltssm(struct dw_pcie *pci)
{
u32 val;
diff --git a/drivers/pci/controller/dwc/pcie-dw-rockchip.c b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
index 3e2752c7dd09..f8605fe61a41 100644
--- a/drivers/pci/controller/dwc/pcie-dw-rockchip.c
+++ b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
@@ -62,6 +62,12 @@
/* Interrupt Mask Register Related to Miscellaneous Operation */
#define PCIE_CLIENT_INTR_MASK_MISC 0x24
+/* Power Management Control Register */
+#define PCIE_CLIENT_POWER_CON 0x2c
+#define PCIE_CLKREQ_READY FIELD_PREP_WM16(BIT(0), 1)
+#define PCIE_CLKREQ_NOT_READY FIELD_PREP_WM16(BIT(0), 0)
+#define PCIE_CLKREQ_PULL_DOWN FIELD_PREP_WM16(GENMASK(13, 12), 1)
+
/* Hot Reset Control Register */
#define PCIE_CLIENT_HOT_RESET_CTRL 0x180
#define PCIE_LTSSM_APP_DLY2_EN BIT(1)
@@ -82,9 +88,9 @@ struct rockchip_pcie {
unsigned int clk_cnt;
struct reset_control *rst;
struct gpio_desc *rst_gpio;
- struct regulator *vpcie3v3;
struct irq_domain *irq_domain;
const struct rockchip_pcie_of_data *data;
+ bool supports_clkreq;
};
struct rockchip_pcie_of_data {
@@ -200,6 +206,35 @@ static bool rockchip_pcie_link_up(struct dw_pcie *pci)
return FIELD_GET(PCIE_LINKUP_MASK, val) == PCIE_LINKUP;
}
+/*
+ * See e.g. section '11.6.6.4 L1 Substate' in the RK3588 TRM V1.0 for the steps
+ * needed to support L1 substates. Currently, just enable L1 substates for RC
+ * mode if CLKREQ# is properly connected and supports-clkreq is present in DT.
+ * For EP mode, there are more things should be done to actually save power in
+ * L1 substates, so disable L1 substates until there is proper support.
+ */
+static void rockchip_pcie_configure_l1ss(struct dw_pcie *pci)
+{
+ struct rockchip_pcie *rockchip = to_rockchip_pcie(pci);
+
+ /* Enable L1 substates if CLKREQ# is properly connected */
+ if (rockchip->supports_clkreq) {
+ rockchip_pcie_writel_apb(rockchip, PCIE_CLKREQ_READY,
+ PCIE_CLIENT_POWER_CON);
+ pci->l1ss_support = true;
+ return;
+ }
+
+ /*
+ * Otherwise, assert CLKREQ# unconditionally. Since
+ * pci->l1ss_support is not set, the DWC core will prevent L1
+ * Substates support from being advertised.
+ */
+ rockchip_pcie_writel_apb(rockchip,
+ PCIE_CLKREQ_PULL_DOWN | PCIE_CLKREQ_NOT_READY,
+ PCIE_CLIENT_POWER_CON);
+}
+
static void rockchip_pcie_enable_l0s(struct dw_pcie *pci)
{
u32 cap, lnkcap;
@@ -264,6 +299,7 @@ static int rockchip_pcie_host_init(struct dw_pcie_rp *pp)
irq_set_chained_handler_and_data(irq, rockchip_pcie_intx_handler,
rockchip);
+ rockchip_pcie_configure_l1ss(pci);
rockchip_pcie_enable_l0s(pci);
return 0;
@@ -412,6 +448,9 @@ static int rockchip_pcie_resource_get(struct platform_device *pdev,
return dev_err_probe(&pdev->dev, PTR_ERR(rockchip->rst),
"failed to get reset lines\n");
+ rockchip->supports_clkreq = of_property_read_bool(pdev->dev.of_node,
+ "supports-clkreq");
+
return 0;
}
@@ -652,22 +691,15 @@ static int rockchip_pcie_probe(struct platform_device *pdev)
return ret;
/* DON'T MOVE ME: must be enable before PHY init */
- rockchip->vpcie3v3 = devm_regulator_get_optional(dev, "vpcie3v3");
- if (IS_ERR(rockchip->vpcie3v3)) {
- if (PTR_ERR(rockchip->vpcie3v3) != -ENODEV)
- return dev_err_probe(dev, PTR_ERR(rockchip->vpcie3v3),
- "failed to get vpcie3v3 regulator\n");
- rockchip->vpcie3v3 = NULL;
- } else {
- ret = regulator_enable(rockchip->vpcie3v3);
- if (ret)
- return dev_err_probe(dev, ret,
- "failed to enable vpcie3v3 regulator\n");
- }
+ ret = devm_regulator_get_enable_optional(dev, "vpcie3v3");
+ if (ret < 0 && ret != -ENODEV)
+ return dev_err_probe(dev, ret,
+ "failed to enable vpcie3v3 regulator\n");
ret = rockchip_pcie_phy_init(rockchip);
if (ret)
- goto disable_regulator;
+ return dev_err_probe(dev, ret,
+ "failed to initialize the phy\n");
ret = reset_control_deassert(rockchip->rst);
if (ret)
@@ -700,9 +732,6 @@ deinit_clk:
clk_bulk_disable_unprepare(rockchip->clk_cnt, rockchip->clks);
deinit_phy:
rockchip_pcie_phy_deinit(rockchip);
-disable_regulator:
- if (rockchip->vpcie3v3)
- regulator_disable(rockchip->vpcie3v3);
return ret;
}
diff --git a/drivers/pci/controller/dwc/pcie-nxp-s32g.c b/drivers/pci/controller/dwc/pcie-nxp-s32g.c
new file mode 100644
index 000000000000..47745749f75c
--- /dev/null
+++ b/drivers/pci/controller/dwc/pcie-nxp-s32g.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe host controller driver for NXP S32G SoCs
+ *
+ * Copyright 2019-2025 NXP
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/of_address.h>
+#include <linux/pci.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include "pcie-designware.h"
+
+/* PCIe controller Sub-System */
+
+/* PCIe controller 0 General Control 1 */
+#define PCIE_S32G_PE0_GEN_CTRL_1 0x50
+#define DEVICE_TYPE_MASK GENMASK(3, 0)
+#define SRIS_MODE BIT(8)
+
+/* PCIe controller 0 General Control 3 */
+#define PCIE_S32G_PE0_GEN_CTRL_3 0x58
+#define LTSSM_EN BIT(0)
+
+/* PCIe Controller 0 Interrupt Status */
+#define PCIE_S32G_PE0_INT_STS 0xE8
+#define HP_INT_STS BIT(6)
+
+/* Boundary between peripheral space and physical memory space */
+#define S32G_MEMORY_BOUNDARY_ADDR 0x80000000
+
+struct s32g_pcie_port {
+ struct list_head list;
+ struct phy *phy;
+};
+
+struct s32g_pcie {
+ struct dw_pcie pci;
+ void __iomem *ctrl_base;
+ struct list_head ports;
+};
+
+#define to_s32g_from_dw_pcie(x) \
+ container_of(x, struct s32g_pcie, pci)
+
+static void s32g_pcie_writel_ctrl(struct s32g_pcie *s32g_pp, u32 reg, u32 val)
+{
+ writel(val, s32g_pp->ctrl_base + reg);
+}
+
+static u32 s32g_pcie_readl_ctrl(struct s32g_pcie *s32g_pp, u32 reg)
+{
+ return readl(s32g_pp->ctrl_base + reg);
+}
+
+static void s32g_pcie_enable_ltssm(struct s32g_pcie *s32g_pp)
+{
+ u32 reg;
+
+ reg = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3);
+ reg |= LTSSM_EN;
+ s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3, reg);
+}
+
+static void s32g_pcie_disable_ltssm(struct s32g_pcie *s32g_pp)
+{
+ u32 reg;
+
+ reg = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3);
+ reg &= ~LTSSM_EN;
+ s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_3, reg);
+}
+
+static int s32g_pcie_start_link(struct dw_pcie *pci)
+{
+ struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci);
+
+ s32g_pcie_enable_ltssm(s32g_pp);
+
+ return 0;
+}
+
+static void s32g_pcie_stop_link(struct dw_pcie *pci)
+{
+ struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci);
+
+ s32g_pcie_disable_ltssm(s32g_pp);
+}
+
+static struct dw_pcie_ops s32g_pcie_ops = {
+ .start_link = s32g_pcie_start_link,
+ .stop_link = s32g_pcie_stop_link,
+};
+
+/* Configure the AMBA AXI Coherency Extensions (ACE) interface */
+static void s32g_pcie_reset_mstr_ace(struct dw_pcie *pci)
+{
+ u32 ddr_base_low = lower_32_bits(S32G_MEMORY_BOUNDARY_ADDR);
+ u32 ddr_base_high = upper_32_bits(S32G_MEMORY_BOUNDARY_ADDR);
+
+ dw_pcie_dbi_ro_wr_en(pci);
+ dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_3_OFF, 0x0);
+
+ /*
+ * Ncore is a cache-coherent interconnect module that enables the
+ * integration of heterogeneous coherent and non-coherent agents in
+ * the chip. Ncore transactions to peripheral should be non-coherent
+ * or it might drop them.
+ *
+ * One example where this is needed are PCIe MSIs, which use NoSnoop=0
+ * and might end up routed to Ncore. PCIe coherent traffic (e.g. MSIs)
+ * that targets peripheral space will be dropped by Ncore because
+ * peripherals on S32G are not coherent as slaves. We add a hard
+ * boundary in the PCIe controller coherency control registers to
+ * separate physical memory space from peripheral space.
+ *
+ * Define the start of DDR as seen by Linux as this boundary between
+ * "memory" and "peripherals", with peripherals being below.
+ */
+ dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_1_OFF,
+ (ddr_base_low & CFG_MEMTYPE_BOUNDARY_LOW_ADDR_MASK));
+ dw_pcie_writel_dbi(pci, COHERENCY_CONTROL_2_OFF, ddr_base_high);
+ dw_pcie_dbi_ro_wr_dis(pci);
+}
+
+static int s32g_init_pcie_controller(struct dw_pcie_rp *pp)
+{
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+ struct s32g_pcie *s32g_pp = to_s32g_from_dw_pcie(pci);
+ u32 val;
+
+ /* Set RP mode */
+ val = s32g_pcie_readl_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_1);
+ val &= ~DEVICE_TYPE_MASK;
+ val |= FIELD_PREP(DEVICE_TYPE_MASK, PCI_EXP_TYPE_ROOT_PORT);
+
+ /* Use default CRNS */
+ val &= ~SRIS_MODE;
+
+ s32g_pcie_writel_ctrl(s32g_pp, PCIE_S32G_PE0_GEN_CTRL_1, val);
+
+ /*
+ * Make sure we use the coherency defaults (just in case the settings
+ * have been changed from their reset values)
+ */
+ s32g_pcie_reset_mstr_ace(pci);
+
+ dw_pcie_dbi_ro_wr_en(pci);
+
+ val = dw_pcie_readl_dbi(pci, PCIE_PORT_FORCE);
+ val |= PORT_FORCE_DO_DESKEW_FOR_SRIS;
+ dw_pcie_writel_dbi(pci, PCIE_PORT_FORCE, val);
+
+ val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
+ val |= GEN3_RELATED_OFF_EQ_PHASE_2_3;
+ dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val);
+
+ dw_pcie_dbi_ro_wr_dis(pci);
+
+ return 0;
+}
+
+static const struct dw_pcie_host_ops s32g_pcie_host_ops = {
+ .init = s32g_init_pcie_controller,
+};
+
+static int s32g_init_pcie_phy(struct s32g_pcie *s32g_pp)
+{
+ struct dw_pcie *pci = &s32g_pp->pci;
+ struct device *dev = pci->dev;
+ struct s32g_pcie_port *port, *tmp;
+ int ret;
+
+ list_for_each_entry(port, &s32g_pp->ports, list) {
+ ret = phy_init(port->phy);
+ if (ret) {
+ dev_err(dev, "Failed to init serdes PHY\n");
+ goto err_phy_revert;
+ }
+
+ ret = phy_set_mode_ext(port->phy, PHY_MODE_PCIE, 0);
+ if (ret) {
+ dev_err(dev, "Failed to set mode on serdes PHY\n");
+ goto err_phy_exit;
+ }
+
+ ret = phy_power_on(port->phy);
+ if (ret) {
+ dev_err(dev, "Failed to power on serdes PHY\n");
+ goto err_phy_exit;
+ }
+ }
+
+ return 0;
+
+err_phy_exit:
+ phy_exit(port->phy);
+
+err_phy_revert:
+ list_for_each_entry_continue_reverse(port, &s32g_pp->ports, list) {
+ phy_power_off(port->phy);
+ phy_exit(port->phy);
+ }
+
+ list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list)
+ list_del(&port->list);
+
+ return ret;
+}
+
+static void s32g_deinit_pcie_phy(struct s32g_pcie *s32g_pp)
+{
+ struct s32g_pcie_port *port, *tmp;
+
+ list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list) {
+ phy_power_off(port->phy);
+ phy_exit(port->phy);
+ list_del(&port->list);
+ }
+}
+
+static int s32g_pcie_init(struct device *dev, struct s32g_pcie *s32g_pp)
+{
+ s32g_pcie_disable_ltssm(s32g_pp);
+
+ return s32g_init_pcie_phy(s32g_pp);
+}
+
+static void s32g_pcie_deinit(struct s32g_pcie *s32g_pp)
+{
+ s32g_pcie_disable_ltssm(s32g_pp);
+
+ s32g_deinit_pcie_phy(s32g_pp);
+}
+
+static int s32g_pcie_parse_port(struct s32g_pcie *s32g_pp, struct device_node *node)
+{
+ struct device *dev = s32g_pp->pci.dev;
+ struct s32g_pcie_port *port;
+ int num_lanes;
+
+ port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+
+ port->phy = devm_of_phy_get(dev, node, NULL);
+ if (IS_ERR(port->phy))
+ return dev_err_probe(dev, PTR_ERR(port->phy),
+ "Failed to get serdes PHY\n");
+
+ INIT_LIST_HEAD(&port->list);
+ list_add_tail(&port->list, &s32g_pp->ports);
+
+ /*
+ * The DWC core initialization code cannot yet parse the num-lanes
+ * attribute in the Root Port node. The S32G only supports one Root
+ * Port for now so its driver can parse the node and set the num_lanes
+ * field of struct dwc_pcie before calling dw_pcie_host_init().
+ */
+ if (!of_property_read_u32(node, "num-lanes", &num_lanes))
+ s32g_pp->pci.num_lanes = num_lanes;
+
+ return 0;
+}
+
+static int s32g_pcie_parse_ports(struct device *dev, struct s32g_pcie *s32g_pp)
+{
+ struct s32g_pcie_port *port, *tmp;
+ int ret = -ENOENT;
+
+ for_each_available_child_of_node_scoped(dev->of_node, of_port) {
+ if (!of_node_is_type(of_port, "pci"))
+ continue;
+
+ ret = s32g_pcie_parse_port(s32g_pp, of_port);
+ if (ret)
+ goto err_port;
+ }
+
+err_port:
+ list_for_each_entry_safe(port, tmp, &s32g_pp->ports, list)
+ list_del(&port->list);
+
+ return ret;
+}
+
+static int s32g_pcie_get_resources(struct platform_device *pdev,
+ struct s32g_pcie *s32g_pp)
+{
+ struct dw_pcie *pci = &s32g_pp->pci;
+ struct device *dev = &pdev->dev;
+ int ret;
+
+ pci->dev = dev;
+ pci->ops = &s32g_pcie_ops;
+
+ s32g_pp->ctrl_base = devm_platform_ioremap_resource_byname(pdev, "ctrl");
+ if (IS_ERR(s32g_pp->ctrl_base))
+ return PTR_ERR(s32g_pp->ctrl_base);
+
+ INIT_LIST_HEAD(&s32g_pp->ports);
+
+ ret = s32g_pcie_parse_ports(dev, s32g_pp);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to parse Root Port: %d\n", ret);
+
+ platform_set_drvdata(pdev, s32g_pp);
+
+ return 0;
+}
+
+static int s32g_pcie_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct s32g_pcie *s32g_pp;
+ struct dw_pcie_rp *pp;
+ int ret;
+
+ s32g_pp = devm_kzalloc(dev, sizeof(*s32g_pp), GFP_KERNEL);
+ if (!s32g_pp)
+ return -ENOMEM;
+
+ ret = s32g_pcie_get_resources(pdev, s32g_pp);
+ if (ret)
+ return ret;
+
+ pm_runtime_no_callbacks(dev);
+ devm_pm_runtime_enable(dev);
+ ret = pm_runtime_get_sync(dev);
+ if (ret < 0)
+ goto err_pm_runtime_put;
+
+ ret = s32g_pcie_init(dev, s32g_pp);
+ if (ret)
+ goto err_pm_runtime_put;
+
+ pp = &s32g_pp->pci.pp;
+ pp->ops = &s32g_pcie_host_ops;
+ pp->use_atu_msg = true;
+
+ ret = dw_pcie_host_init(pp);
+ if (ret)
+ goto err_pcie_deinit;
+
+ return 0;
+
+err_pcie_deinit:
+ s32g_pcie_deinit(s32g_pp);
+err_pm_runtime_put:
+ pm_runtime_put(dev);
+
+ return ret;
+}
+
+static int s32g_pcie_suspend_noirq(struct device *dev)
+{
+ struct s32g_pcie *s32g_pp = dev_get_drvdata(dev);
+ struct dw_pcie *pci = &s32g_pp->pci;
+
+ return dw_pcie_suspend_noirq(pci);
+}
+
+static int s32g_pcie_resume_noirq(struct device *dev)
+{
+ struct s32g_pcie *s32g_pp = dev_get_drvdata(dev);
+ struct dw_pcie *pci = &s32g_pp->pci;
+
+ return dw_pcie_resume_noirq(pci);
+}
+
+static const struct dev_pm_ops s32g_pcie_pm_ops = {
+ NOIRQ_SYSTEM_SLEEP_PM_OPS(s32g_pcie_suspend_noirq,
+ s32g_pcie_resume_noirq)
+};
+
+static const struct of_device_id s32g_pcie_of_match[] = {
+ { .compatible = "nxp,s32g2-pcie" },
+ { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, s32g_pcie_of_match);
+
+static struct platform_driver s32g_pcie_driver = {
+ .driver = {
+ .name = "s32g-pcie",
+ .of_match_table = s32g_pcie_of_match,
+ .suppress_bind_attrs = true,
+ .pm = pm_sleep_ptr(&s32g_pcie_pm_ops),
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+ .probe = s32g_pcie_probe,
+};
+
+builtin_platform_driver(s32g_pcie_driver);
+
+MODULE_AUTHOR("Ionut Vicovan <Ionut.Vicovan@nxp.com>");
+MODULE_DESCRIPTION("NXP S32G PCIe Host controller driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index c48a20602d7f..7b92e7a1c0d9 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -641,6 +641,18 @@ static int qcom_pcie_post_init_1_0_0(struct qcom_pcie *pcie)
return 0;
}
+static int qcom_pcie_assert_perst(struct dw_pcie *pci, bool assert)
+{
+ struct qcom_pcie *pcie = to_qcom_pcie(pci);
+
+ if (assert)
+ qcom_ep_reset_assert(pcie);
+ else
+ qcom_ep_reset_deassert(pcie);
+
+ return 0;
+}
+
static void qcom_pcie_2_3_2_ltssm_enable(struct qcom_pcie *pcie)
{
u32 val;
@@ -1012,6 +1024,8 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie)
val &= ~REQ_NOT_ENTR_L1;
writel(val, pcie->parf + PARF_PM_CTRL);
+ pci->l1ss_support = true;
+
val = readl(pcie->parf + PARF_AXI_MSTR_WR_ADDR_HALT_V2);
val |= EN;
writel(val, pcie->parf + PARF_AXI_MSTR_WR_ADDR_HALT_V2);
@@ -1480,6 +1494,7 @@ static const struct qcom_pcie_cfg cfg_fw_managed = {
static const struct dw_pcie_ops dw_pcie_ops = {
.link_up = qcom_pcie_link_up,
.start_link = qcom_pcie_start_link,
+ .assert_perst = qcom_pcie_assert_perst,
};
static int qcom_pcie_icc_init(struct qcom_pcie *pcie)
@@ -1529,6 +1544,7 @@ static void qcom_pcie_icc_opp_update(struct qcom_pcie *pcie)
{
u32 offset, status, width, speed;
struct dw_pcie *pci = pcie->pci;
+ struct dev_pm_opp_key key = {};
unsigned long freq_kbps;
struct dev_pm_opp *opp;
int ret, freq_mbps;
@@ -1556,8 +1572,20 @@ static void qcom_pcie_icc_opp_update(struct qcom_pcie *pcie)
return;
freq_kbps = freq_mbps * KILO;
- opp = dev_pm_opp_find_freq_exact(pci->dev, freq_kbps * width,
- true);
+ opp = dev_pm_opp_find_level_exact(pci->dev, speed);
+ if (IS_ERR(opp)) {
+ /* opp-level is not defined use only frequency */
+ opp = dev_pm_opp_find_freq_exact(pci->dev, freq_kbps * width,
+ true);
+ } else {
+ /* put opp-level OPP */
+ dev_pm_opp_put(opp);
+
+ key.freq = freq_kbps * width;
+ key.level = speed;
+ key.bw = 0;
+ opp = dev_pm_opp_find_key_exact(pci->dev, &key, true);
+ }
if (!IS_ERR(opp)) {
ret = dev_pm_opp_set_opp(pci->dev, opp);
if (ret)
diff --git a/drivers/pci/controller/dwc/pcie-spacemit-k1.c b/drivers/pci/controller/dwc/pcie-spacemit-k1.c
new file mode 100644
index 000000000000..be20a520255b
--- /dev/null
+++ b/drivers/pci/controller/dwc/pcie-spacemit-k1.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SpacemiT K1 PCIe host driver
+ *
+ * Copyright (C) 2025 by RISCstar Solutions Corporation. All rights reserved.
+ * Copyright (c) 2023, spacemit Corporation.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mod_devicetable.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/reset.h>
+#include <linux/types.h>
+
+#include "pcie-designware.h"
+
+#define PCI_VENDOR_ID_SPACEMIT 0x201f
+#define PCI_DEVICE_ID_SPACEMIT_K1 0x0001
+
+/* Offsets and field definitions for link management registers */
+#define K1_PHY_AHB_IRQ_EN 0x0000
+#define PCIE_INTERRUPT_EN BIT(0)
+
+#define K1_PHY_AHB_LINK_STS 0x0004
+#define SMLH_LINK_UP BIT(1)
+#define RDLH_LINK_UP BIT(12)
+
+#define INTR_ENABLE 0x0014
+#define MSI_CTRL_INT BIT(11)
+
+/* Some controls require APMU regmap access */
+#define SYSCON_APMU "spacemit,apmu"
+
+/* Offsets and field definitions for APMU registers */
+#define PCIE_CLK_RESET_CONTROL 0x0000
+#define LTSSM_EN BIT(6)
+#define PCIE_AUX_PWR_DET BIT(9)
+#define PCIE_RC_PERST BIT(12) /* 1: assert PERST# */
+#define APP_HOLD_PHY_RST BIT(30)
+#define DEVICE_TYPE_RC BIT(31) /* 0: endpoint; 1: RC */
+
+#define PCIE_CONTROL_LOGIC 0x0004
+#define PCIE_SOFT_RESET BIT(0)
+
+struct k1_pcie {
+ struct dw_pcie pci;
+ struct phy *phy;
+ void __iomem *link;
+ struct regmap *pmu; /* Errors ignored; MMIO-backed regmap */
+ u32 pmu_off;
+};
+
+#define to_k1_pcie(dw_pcie) \
+ platform_get_drvdata(to_platform_device((dw_pcie)->dev))
+
+static void k1_pcie_toggle_soft_reset(struct k1_pcie *k1)
+{
+ u32 offset;
+ u32 val;
+
+ /*
+ * Write, then read back to guarantee it has reached the device
+ * before we start the delay.
+ */
+ offset = k1->pmu_off + PCIE_CONTROL_LOGIC;
+ regmap_set_bits(k1->pmu, offset, PCIE_SOFT_RESET);
+ regmap_read(k1->pmu, offset, &val);
+
+ mdelay(2);
+
+ regmap_clear_bits(k1->pmu, offset, PCIE_SOFT_RESET);
+}
+
+/* Enable app clocks, deassert resets */
+static int k1_pcie_enable_resources(struct k1_pcie *k1)
+{
+ struct dw_pcie *pci = &k1->pci;
+ int ret;
+
+ ret = clk_bulk_prepare_enable(ARRAY_SIZE(pci->app_clks), pci->app_clks);
+ if (ret)
+ return ret;
+
+ ret = reset_control_bulk_deassert(ARRAY_SIZE(pci->app_rsts),
+ pci->app_rsts);
+ if (ret)
+ goto err_disable_clks;
+
+ return 0;
+
+err_disable_clks:
+ clk_bulk_disable_unprepare(ARRAY_SIZE(pci->app_clks), pci->app_clks);
+
+ return ret;
+}
+
+/* Assert resets, disable app clocks */
+static void k1_pcie_disable_resources(struct k1_pcie *k1)
+{
+ struct dw_pcie *pci = &k1->pci;
+
+ reset_control_bulk_assert(ARRAY_SIZE(pci->app_rsts), pci->app_rsts);
+ clk_bulk_disable_unprepare(ARRAY_SIZE(pci->app_clks), pci->app_clks);
+}
+
+/* FIXME: Disable ASPM L1 to avoid errors reported on some NVMe drives */
+static void k1_pcie_disable_aspm_l1(struct k1_pcie *k1)
+{
+ struct dw_pcie *pci = &k1->pci;
+ u8 offset;
+ u32 val;
+
+ offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+ offset += PCI_EXP_LNKCAP;
+
+ dw_pcie_dbi_ro_wr_en(pci);
+ val = dw_pcie_readl_dbi(pci, offset);
+ val &= ~PCI_EXP_LNKCAP_ASPM_L1;
+ dw_pcie_writel_dbi(pci, offset, val);
+ dw_pcie_dbi_ro_wr_dis(pci);
+}
+
+static int k1_pcie_init(struct dw_pcie_rp *pp)
+{
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 reset_ctrl;
+ u32 val;
+ int ret;
+
+ k1_pcie_toggle_soft_reset(k1);
+
+ ret = k1_pcie_enable_resources(k1);
+ if (ret)
+ return ret;
+
+ /* Set the PCI vendor and device ID */
+ dw_pcie_dbi_ro_wr_en(pci);
+ dw_pcie_writew_dbi(pci, PCI_VENDOR_ID, PCI_VENDOR_ID_SPACEMIT);
+ dw_pcie_writew_dbi(pci, PCI_DEVICE_ID, PCI_DEVICE_ID_SPACEMIT_K1);
+ dw_pcie_dbi_ro_wr_dis(pci);
+
+ /*
+ * Start by asserting fundamental reset (drive PERST# low). The
+ * PCI CEM spec says that PERST# should be deasserted at least
+ * 100ms after the power becomes stable, so we'll insert that
+ * delay first. Write, then read it back to guarantee the write
+ * reaches the device before we start the delay.
+ */
+ reset_ctrl = k1->pmu_off + PCIE_CLK_RESET_CONTROL;
+ regmap_set_bits(k1->pmu, reset_ctrl, PCIE_RC_PERST);
+ regmap_read(k1->pmu, reset_ctrl, &val);
+ mdelay(PCIE_T_PVPERL_MS);
+
+ /*
+ * Put the controller in root complex mode, and indicate that
+ * Vaux (3.3v) is present.
+ */
+ regmap_set_bits(k1->pmu, reset_ctrl, DEVICE_TYPE_RC | PCIE_AUX_PWR_DET);
+
+ ret = phy_init(k1->phy);
+ if (ret) {
+ k1_pcie_disable_resources(k1);
+
+ return ret;
+ }
+
+ /* Deassert fundamental reset (drive PERST# high) */
+ regmap_clear_bits(k1->pmu, reset_ctrl, PCIE_RC_PERST);
+
+ /* Finally, as a workaround, disable ASPM L1 */
+ k1_pcie_disable_aspm_l1(k1);
+
+ return 0;
+}
+
+static void k1_pcie_deinit(struct dw_pcie_rp *pp)
+{
+ struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+
+ /* Assert fundamental reset (drive PERST# low) */
+ regmap_set_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ PCIE_RC_PERST);
+
+ phy_exit(k1->phy);
+
+ k1_pcie_disable_resources(k1);
+}
+
+static const struct dw_pcie_host_ops k1_pcie_host_ops = {
+ .init = k1_pcie_init,
+ .deinit = k1_pcie_deinit,
+};
+
+static bool k1_pcie_link_up(struct dw_pcie *pci)
+{
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 val;
+
+ val = readl_relaxed(k1->link + K1_PHY_AHB_LINK_STS);
+
+ return (val & RDLH_LINK_UP) && (val & SMLH_LINK_UP);
+}
+
+static int k1_pcie_start_link(struct dw_pcie *pci)
+{
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 val;
+
+ /* Stop holding the PHY in reset, and enable link training */
+ regmap_update_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ APP_HOLD_PHY_RST | LTSSM_EN, LTSSM_EN);
+
+ /* Enable the MSI interrupt */
+ writel_relaxed(MSI_CTRL_INT, k1->link + INTR_ENABLE);
+
+ /* Top-level interrupt enable */
+ val = readl_relaxed(k1->link + K1_PHY_AHB_IRQ_EN);
+ val |= PCIE_INTERRUPT_EN;
+ writel_relaxed(val, k1->link + K1_PHY_AHB_IRQ_EN);
+
+ return 0;
+}
+
+static void k1_pcie_stop_link(struct dw_pcie *pci)
+{
+ struct k1_pcie *k1 = to_k1_pcie(pci);
+ u32 val;
+
+ /* Disable interrupts */
+ val = readl_relaxed(k1->link + K1_PHY_AHB_IRQ_EN);
+ val &= ~PCIE_INTERRUPT_EN;
+ writel_relaxed(val, k1->link + K1_PHY_AHB_IRQ_EN);
+
+ writel_relaxed(0, k1->link + INTR_ENABLE);
+
+ /* Disable the link and hold the PHY in reset */
+ regmap_update_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ APP_HOLD_PHY_RST | LTSSM_EN, APP_HOLD_PHY_RST);
+}
+
+static const struct dw_pcie_ops k1_pcie_ops = {
+ .link_up = k1_pcie_link_up,
+ .start_link = k1_pcie_start_link,
+ .stop_link = k1_pcie_stop_link,
+};
+
+static int k1_pcie_parse_port(struct k1_pcie *k1)
+{
+ struct device *dev = k1->pci.dev;
+ struct device_node *root_port;
+ struct phy *phy;
+
+ /* We assume only one root port */
+ root_port = of_get_next_available_child(dev_of_node(dev), NULL);
+ if (!root_port)
+ return -EINVAL;
+
+ phy = devm_of_phy_get(dev, root_port, NULL);
+
+ of_node_put(root_port);
+
+ if (IS_ERR(phy))
+ return PTR_ERR(phy);
+
+ k1->phy = phy;
+
+ return 0;
+}
+
+static int k1_pcie_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct k1_pcie *k1;
+ int ret;
+
+ k1 = devm_kzalloc(dev, sizeof(*k1), GFP_KERNEL);
+ if (!k1)
+ return -ENOMEM;
+
+ k1->pmu = syscon_regmap_lookup_by_phandle_args(dev_of_node(dev),
+ SYSCON_APMU, 1,
+ &k1->pmu_off);
+ if (IS_ERR(k1->pmu))
+ return dev_err_probe(dev, PTR_ERR(k1->pmu),
+ "failed to lookup PMU registers\n");
+
+ k1->link = devm_platform_ioremap_resource_byname(pdev, "link");
+ if (IS_ERR(k1->link))
+ return dev_err_probe(dev, PTR_ERR(k1->link),
+ "failed to map \"link\" registers\n");
+
+ k1->pci.dev = dev;
+ k1->pci.ops = &k1_pcie_ops;
+ k1->pci.pp.num_vectors = MAX_MSI_IRQS;
+ dw_pcie_cap_set(&k1->pci, REQ_RES);
+
+ k1->pci.pp.ops = &k1_pcie_host_ops;
+
+ /* Hold the PHY in reset until we start the link */
+ regmap_set_bits(k1->pmu, k1->pmu_off + PCIE_CLK_RESET_CONTROL,
+ APP_HOLD_PHY_RST);
+
+ ret = devm_regulator_get_enable(dev, "vpcie3v3");
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "failed to get \"vpcie3v3\" supply\n");
+
+ pm_runtime_set_active(dev);
+ pm_runtime_no_callbacks(dev);
+ devm_pm_runtime_enable(dev);
+
+ platform_set_drvdata(pdev, k1);
+
+ ret = k1_pcie_parse_port(k1);
+ if (ret)
+ return dev_err_probe(dev, ret, "failed to parse root port\n");
+
+ ret = dw_pcie_host_init(&k1->pci.pp);
+ if (ret)
+ return dev_err_probe(dev, ret, "failed to initialize host\n");
+
+ return 0;
+}
+
+static void k1_pcie_remove(struct platform_device *pdev)
+{
+ struct k1_pcie *k1 = platform_get_drvdata(pdev);
+
+ dw_pcie_host_deinit(&k1->pci.pp);
+}
+
+static const struct of_device_id k1_pcie_of_match_table[] = {
+ { .compatible = "spacemit,k1-pcie", },
+ { }
+};
+
+static struct platform_driver k1_pcie_driver = {
+ .probe = k1_pcie_probe,
+ .remove = k1_pcie_remove,
+ .driver = {
+ .name = "spacemit-k1-pcie",
+ .of_match_table = k1_pcie_of_match_table,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+};
+module_platform_driver(k1_pcie_driver);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SpacemiT K1 PCIe host driver");
diff --git a/drivers/pci/controller/dwc/pcie-stm32-ep.c b/drivers/pci/controller/dwc/pcie-stm32-ep.c
index 3400c7cd2d88..2cecf32d2b0f 100644
--- a/drivers/pci/controller/dwc/pcie-stm32-ep.c
+++ b/drivers/pci/controller/dwc/pcie-stm32-ep.c
@@ -7,9 +7,9 @@
*/
#include <linux/clk.h>
+#include <linux/gpio/consumer.h>
#include <linux/mfd/syscon.h>
#include <linux/of_platform.h>
-#include <linux/of_gpio.h>
#include <linux/phy/phy.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
@@ -37,36 +37,9 @@ static void stm32_pcie_ep_init(struct dw_pcie_ep *ep)
dw_pcie_ep_reset_bar(pci, bar);
}
-static int stm32_pcie_enable_link(struct dw_pcie *pci)
-{
- struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
-
- regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR,
- STM32MP25_PCIECR_LTSSM_EN,
- STM32MP25_PCIECR_LTSSM_EN);
-
- return dw_pcie_wait_for_link(pci);
-}
-
-static void stm32_pcie_disable_link(struct dw_pcie *pci)
-{
- struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
-
- regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR, STM32MP25_PCIECR_LTSSM_EN, 0);
-}
-
static int stm32_pcie_start_link(struct dw_pcie *pci)
{
struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
- int ret;
-
- dev_dbg(pci->dev, "Enable link\n");
-
- ret = stm32_pcie_enable_link(pci);
- if (ret) {
- dev_err(pci->dev, "PCIe cannot establish link: %d\n", ret);
- return ret;
- }
enable_irq(stm32_pcie->perst_irq);
@@ -77,11 +50,7 @@ static void stm32_pcie_stop_link(struct dw_pcie *pci)
{
struct stm32_pcie *stm32_pcie = to_stm32_pcie(pci);
- dev_dbg(pci->dev, "Disable link\n");
-
disable_irq(stm32_pcie->perst_irq);
-
- stm32_pcie_disable_link(pci);
}
static int stm32_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
@@ -152,6 +121,9 @@ static void stm32_pcie_perst_assert(struct dw_pcie *pci)
dev_dbg(dev, "PERST asserted by host\n");
+ regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR,
+ STM32MP25_PCIECR_LTSSM_EN, 0);
+
pci_epc_deinit_notify(ep->epc);
stm32_pcie_disable_resources(stm32_pcie);
@@ -192,6 +164,11 @@ static void stm32_pcie_perst_deassert(struct dw_pcie *pci)
pci_epc_init_notify(ep->epc);
+ /* Enable link training */
+ regmap_update_bits(stm32_pcie->regmap, SYSCFG_PCIECR,
+ STM32MP25_PCIECR_LTSSM_EN,
+ STM32MP25_PCIECR_LTSSM_EN);
+
return;
err_disable_resources:
@@ -237,6 +214,8 @@ static int stm32_add_pcie_ep(struct stm32_pcie *stm32_pcie,
ep->ops = &stm32_pcie_ep_ops;
+ ep->page_size = stm32_pcie_epc_features.align;
+
ret = dw_pcie_ep_init(ep);
if (ret) {
dev_err(dev, "Failed to initialize ep: %d\n", ret);
diff --git a/drivers/pci/controller/dwc/pcie-stm32.c b/drivers/pci/controller/dwc/pcie-stm32.c
index 96a5fb893af4..a9e77478443b 100644
--- a/drivers/pci/controller/dwc/pcie-stm32.c
+++ b/drivers/pci/controller/dwc/pcie-stm32.c
@@ -7,18 +7,30 @@
*/
#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/irq.h>
#include <linux/mfd/syscon.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/of.h>
#include <linux/of_platform.h>
#include <linux/phy/phy.h>
#include <linux/pinctrl/consumer.h>
#include <linux/platform_device.h>
+#include <linux/pm.h>
#include <linux/pm_runtime.h>
#include <linux/pm_wakeirq.h>
#include <linux/regmap.h>
#include <linux/reset.h>
+#include <linux/stddef.h>
+
+#include "../../pci.h"
+
#include "pcie-designware.h"
#include "pcie-stm32.h"
-#include "../../pci.h"
struct stm32_pcie {
struct dw_pcie pci;
diff --git a/drivers/pci/controller/dwc/pcie-stm32.h b/drivers/pci/controller/dwc/pcie-stm32.h
index 09d39f04e469..419cf1ff669d 100644
--- a/drivers/pci/controller/dwc/pcie-stm32.h
+++ b/drivers/pci/controller/dwc/pcie-stm32.h
@@ -6,6 +6,9 @@
* Author: Christian Bruel <christian.bruel@foss.st.com>
*/
+#include <linux/bits.h>
+#include <linux/device.h>
+
#define to_stm32_pcie(x) dev_get_drvdata((x)->dev)
#define STM32MP25_PCIECR_TYPE_MASK GENMASK(11, 8)
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c
index 10e74458e667..0ddeef70726d 100644
--- a/drivers/pci/controller/dwc/pcie-tegra194.c
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -260,7 +260,6 @@ struct tegra_pcie_dw {
u32 msi_ctrl_int;
u32 num_lanes;
u32 cid;
- u32 cfg_link_cap_l1sub;
u32 ras_des_cap;
u32 pcie_cap_base;
u32 aspm_cmrt;
@@ -475,8 +474,7 @@ static irqreturn_t tegra_pcie_ep_irq_thread(int irq, void *arg)
return IRQ_HANDLED;
/* If EP doesn't advertise L1SS, just return */
- val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub);
- if (!(val & (PCI_L1SS_CAP_ASPM_L1_1 | PCI_L1SS_CAP_ASPM_L1_2)))
+ if (!pci->l1ss_support)
return IRQ_HANDLED;
/* Check if BME is set to '1' */
@@ -608,24 +606,6 @@ static struct pci_ops tegra_pci_ops = {
};
#if defined(CONFIG_PCIEASPM)
-static void disable_aspm_l11(struct tegra_pcie_dw *pcie)
-{
- u32 val;
-
- val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub);
- val &= ~PCI_L1SS_CAP_ASPM_L1_1;
- dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val);
-}
-
-static void disable_aspm_l12(struct tegra_pcie_dw *pcie)
-{
- u32 val;
-
- val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub);
- val &= ~PCI_L1SS_CAP_ASPM_L1_2;
- dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val);
-}
-
static inline u32 event_counter_prog(struct tegra_pcie_dw *pcie, u32 event)
{
u32 val;
@@ -682,10 +662,9 @@ static int aspm_state_cnt(struct seq_file *s, void *data)
static void init_host_aspm(struct tegra_pcie_dw *pcie)
{
struct dw_pcie *pci = &pcie->pci;
- u32 val;
+ u32 l1ss, val;
- val = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
- pcie->cfg_link_cap_l1sub = val + PCI_L1SS_CAP;
+ l1ss = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
pcie->ras_des_cap = dw_pcie_find_ext_capability(&pcie->pci,
PCI_EXT_CAP_ID_VNDR);
@@ -697,11 +676,14 @@ static void init_host_aspm(struct tegra_pcie_dw *pcie)
PCIE_RAS_DES_EVENT_COUNTER_CONTROL, val);
/* Program T_cmrt and T_pwr_on values */
- val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub);
+ val = dw_pcie_readl_dbi(pci, l1ss + PCI_L1SS_CAP);
val &= ~(PCI_L1SS_CAP_CM_RESTORE_TIME | PCI_L1SS_CAP_P_PWR_ON_VALUE);
val |= (pcie->aspm_cmrt << 8);
val |= (pcie->aspm_pwr_on_t << 19);
- dw_pcie_writel_dbi(pci, pcie->cfg_link_cap_l1sub, val);
+ dw_pcie_writel_dbi(pci, l1ss + PCI_L1SS_CAP, val);
+
+ if (pcie->supports_clkreq)
+ pci->l1ss_support = true;
/* Program L0s and L1 entrance latencies */
val = dw_pcie_readl_dbi(pci, PCIE_PORT_AFR);
@@ -726,8 +708,6 @@ static void init_debugfs(struct tegra_pcie_dw *pcie)
aspm_state_cnt);
}
#else
-static inline void disable_aspm_l12(struct tegra_pcie_dw *pcie) { return; }
-static inline void disable_aspm_l11(struct tegra_pcie_dw *pcie) { return; }
static inline void init_host_aspm(struct tegra_pcie_dw *pcie) { return; }
static inline void init_debugfs(struct tegra_pcie_dw *pcie) { return; }
#endif
@@ -931,12 +911,6 @@ static int tegra_pcie_dw_host_init(struct dw_pcie_rp *pp)
init_host_aspm(pcie);
- /* Disable ASPM-L1SS advertisement if there is no CLKREQ routing */
- if (!pcie->supports_clkreq) {
- disable_aspm_l11(pcie);
- disable_aspm_l12(pcie);
- }
-
if (!pcie->of_data->has_l1ss_exit_fix) {
val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL;
@@ -1871,12 +1845,6 @@ static void pex_ep_event_pex_rst_deassert(struct tegra_pcie_dw *pcie)
init_host_aspm(pcie);
- /* Disable ASPM-L1SS advertisement if there is no CLKREQ routing */
- if (!pcie->supports_clkreq) {
- disable_aspm_l11(pcie);
- disable_aspm_l12(pcie);
- }
-
if (!pcie->of_data->has_l1ss_exit_fix) {
val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL;
diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c
index 810d1c8de24e..c473e7c03bac 100644
--- a/drivers/pci/controller/pci-host-common.c
+++ b/drivers/pci/controller/pci-host-common.c
@@ -53,16 +53,12 @@ struct pci_config_window *pci_host_common_ecam_create(struct device *dev,
EXPORT_SYMBOL_GPL(pci_host_common_ecam_create);
int pci_host_common_init(struct platform_device *pdev,
+ struct pci_host_bridge *bridge,
const struct pci_ecam_ops *ops)
{
struct device *dev = &pdev->dev;
- struct pci_host_bridge *bridge;
struct pci_config_window *cfg;
- bridge = devm_pci_alloc_host_bridge(dev, 0);
- if (!bridge)
- return -ENOMEM;
-
of_pci_check_probe_only();
platform_set_drvdata(pdev, bridge);
@@ -85,12 +81,17 @@ EXPORT_SYMBOL_GPL(pci_host_common_init);
int pci_host_common_probe(struct platform_device *pdev)
{
const struct pci_ecam_ops *ops;
+ struct pci_host_bridge *bridge;
ops = of_device_get_match_data(&pdev->dev);
if (!ops)
return -ENODEV;
- return pci_host_common_init(pdev, ops);
+ bridge = devm_pci_alloc_host_bridge(&pdev->dev, 0);
+ if (!bridge)
+ return -ENOMEM;
+
+ return pci_host_common_init(pdev, bridge, ops);
}
EXPORT_SYMBOL_GPL(pci_host_common_probe);
diff --git a/drivers/pci/controller/pci-host-common.h b/drivers/pci/controller/pci-host-common.h
index 51c35ec0cf37..b5075d4bd7eb 100644
--- a/drivers/pci/controller/pci-host-common.h
+++ b/drivers/pci/controller/pci-host-common.h
@@ -14,6 +14,7 @@ struct pci_ecam_ops;
int pci_host_common_probe(struct platform_device *pdev);
int pci_host_common_init(struct platform_device *pdev,
+ struct pci_host_bridge *bridge,
const struct pci_ecam_ops *ops);
void pci_host_common_remove(struct platform_device *pdev);
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 146b43981b27..1e237d3538f9 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -3696,48 +3696,6 @@ static int hv_send_resources_released(struct hv_device *hdev)
return 0;
}
-#define HVPCI_DOM_MAP_SIZE (64 * 1024)
-static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
-
-/*
- * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
- * as invalid for passthrough PCI devices of this driver.
- */
-#define HVPCI_DOM_INVALID 0
-
-/**
- * hv_get_dom_num() - Get a valid PCI domain number
- * Check if the PCI domain number is in use, and return another number if
- * it is in use.
- *
- * @dom: Requested domain number
- *
- * return: domain number on success, HVPCI_DOM_INVALID on failure
- */
-static u16 hv_get_dom_num(u16 dom)
-{
- unsigned int i;
-
- if (test_and_set_bit(dom, hvpci_dom_map) == 0)
- return dom;
-
- for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
- if (test_and_set_bit(i, hvpci_dom_map) == 0)
- return i;
- }
-
- return HVPCI_DOM_INVALID;
-}
-
-/**
- * hv_put_dom_num() - Mark the PCI domain number as free
- * @dom: Domain number to be freed
- */
-static void hv_put_dom_num(u16 dom)
-{
- clear_bit(dom, hvpci_dom_map);
-}
-
/**
* hv_pci_probe() - New VMBus channel probe, for a root PCI bus
* @hdev: VMBus's tracking struct for this root PCI bus
@@ -3750,9 +3708,9 @@ static int hv_pci_probe(struct hv_device *hdev,
{
struct pci_host_bridge *bridge;
struct hv_pcibus_device *hbus;
- u16 dom_req, dom;
+ int ret, dom;
+ u16 dom_req;
char *name;
- int ret;
bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
if (!bridge)
@@ -3779,11 +3737,14 @@ static int hv_pci_probe(struct hv_device *hdev,
* PCI bus (which is actually emulated by the hypervisor) is domain 0.
* (2) There will be no overlap between domains (after fixing possible
* collisions) in the same VM.
+ *
+ * Because Gen1 VMs use domain 0, don't allow picking domain 0 here,
+ * even if bytes 4 and 5 of the instance GUID are both zero. For wider
+ * userspace compatibility, limit the domain ID to a 16-bit value.
*/
dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
- dom = hv_get_dom_num(dom_req);
-
- if (dom == HVPCI_DOM_INVALID) {
+ dom = pci_bus_find_emul_domain_nr(dom_req, 1, U16_MAX);
+ if (dom < 0) {
dev_err(&hdev->device,
"Unable to use dom# 0x%x or other numbers", dom_req);
ret = -EINVAL;
@@ -3917,7 +3878,7 @@ close:
destroy_wq:
destroy_workqueue(hbus->wq);
free_dom:
- hv_put_dom_num(hbus->bridge->domain_nr);
+ pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr);
free_bus:
kfree(hbus);
return ret;
@@ -4042,8 +4003,6 @@ static void hv_pci_remove(struct hv_device *hdev)
irq_domain_remove(hbus->irq_domain);
irq_domain_free_fwnode(hbus->fwnode);
- hv_put_dom_num(hbus->bridge->domain_nr);
-
kfree(hbus);
}
@@ -4217,9 +4176,6 @@ static int __init init_hv_pci_drv(void)
if (ret)
return ret;
- /* Set the invalid domain number's bit, so it will not be used */
- set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
-
/* Initialize PCI block r/w interface */
hvpci_block_ops.read_block = hv_read_config_block;
hvpci_block_ops.write_block = hv_write_config_block;
diff --git a/drivers/pci/controller/pci-ixp4xx.c b/drivers/pci/controller/pci-ixp4xx.c
index acb85e0d5675..9fd401838bad 100644
--- a/drivers/pci/controller/pci-ixp4xx.c
+++ b/drivers/pci/controller/pci-ixp4xx.c
@@ -214,6 +214,7 @@ static u32 ixp4xx_crp_byte_lane_enable_bits(u32 n, int size)
return 0xffffffff;
}
+#ifdef CONFIG_ARM
static int ixp4xx_crp_read_config(struct ixp4xx_pci *p, int where, int size,
u32 *value)
{
@@ -251,6 +252,7 @@ static int ixp4xx_crp_read_config(struct ixp4xx_pci *p, int where, int size,
return PCIBIOS_SUCCESSFUL;
}
+#endif
static int ixp4xx_crp_write_config(struct ixp4xx_pci *p, int where, int size,
u32 value)
@@ -470,6 +472,7 @@ static int ixp4xx_pci_parse_map_dma_ranges(struct ixp4xx_pci *p)
return 0;
}
+#ifdef CONFIG_ARM
/* Only used to get context for abort handling */
static struct ixp4xx_pci *ixp4xx_pci_abort_singleton;
@@ -509,6 +512,7 @@ static int ixp4xx_pci_abort_handler(unsigned long addr, unsigned int fsr,
return 0;
}
+#endif
static int __init ixp4xx_pci_probe(struct platform_device *pdev)
{
@@ -555,10 +559,12 @@ static int __init ixp4xx_pci_probe(struct platform_device *pdev)
dev_info(dev, "controller is in %s mode\n",
p->host_mode ? "host" : "option");
+#ifdef CONFIG_ARM
/* Hook in our fault handler for PCI errors */
ixp4xx_pci_abort_singleton = p;
hook_fault_code(16+6, ixp4xx_pci_abort_handler, SIGBUS, 0,
"imprecise external abort");
+#endif
ret = ixp4xx_pci_parse_map_ranges(p);
if (ret)
diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c
index 0380d300adca..2d92fc79f6dd 100644
--- a/drivers/pci/controller/pcie-apple.c
+++ b/drivers/pci/controller/pcie-apple.c
@@ -187,7 +187,6 @@ struct apple_pcie {
const struct hw_info *hw;
unsigned long *bitmap;
struct list_head ports;
- struct list_head entry;
struct completion event;
struct irq_fwspec fwspec;
u32 nvecs;
@@ -206,9 +205,6 @@ struct apple_pcie_port {
int idx;
};
-static LIST_HEAD(pcie_list);
-static DEFINE_MUTEX(pcie_list_lock);
-
static void rmw_set(u32 set, void __iomem *addr)
{
writel_relaxed(readl_relaxed(addr) | set, addr);
@@ -724,32 +720,9 @@ static int apple_msi_init(struct apple_pcie *pcie)
return 0;
}
-static void apple_pcie_register(struct apple_pcie *pcie)
-{
- guard(mutex)(&pcie_list_lock);
-
- list_add_tail(&pcie->entry, &pcie_list);
-}
-
-static void apple_pcie_unregister(struct apple_pcie *pcie)
-{
- guard(mutex)(&pcie_list_lock);
-
- list_del(&pcie->entry);
-}
-
static struct apple_pcie *apple_pcie_lookup(struct device *dev)
{
- struct apple_pcie *pcie;
-
- guard(mutex)(&pcie_list_lock);
-
- list_for_each_entry(pcie, &pcie_list, entry) {
- if (pcie->dev == dev)
- return pcie;
- }
-
- return NULL;
+ return pci_host_bridge_priv(dev_get_drvdata(dev));
}
static struct apple_pcie_port *apple_pcie_get_port(struct pci_dev *pdev)
@@ -875,13 +848,15 @@ static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = {
static int apple_pcie_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
+ struct pci_host_bridge *bridge;
struct apple_pcie *pcie;
int ret;
- pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
- if (!pcie)
+ bridge = devm_pci_alloc_host_bridge(dev, sizeof(*pcie));
+ if (!bridge)
return -ENOMEM;
+ pcie = pci_host_bridge_priv(bridge);
pcie->dev = dev;
pcie->hw = of_device_get_match_data(dev);
if (!pcie->hw)
@@ -897,13 +872,7 @@ static int apple_pcie_probe(struct platform_device *pdev)
if (ret)
return ret;
- apple_pcie_register(pcie);
-
- ret = pci_host_common_init(pdev, &apple_pcie_cfg_ecam_ops);
- if (ret)
- apple_pcie_unregister(pcie);
-
- return ret;
+ return pci_host_common_init(pdev, bridge, &apple_pcie_cfg_ecam_ops);
}
static const struct of_device_id apple_pcie_of_match[] = {
diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c
index 9afbd02ded35..062f55690012 100644
--- a/drivers/pci/controller/pcie-brcmstb.c
+++ b/drivers/pci/controller/pcie-brcmstb.c
@@ -14,15 +14,18 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/irqchip/irq-msi-lib.h>
#include <linux/irqdomain.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/msi.h>
+#include <linux/notifier.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/of_pci.h>
#include <linux/of_platform.h>
+#include <linux/panic_notifier.h>
#include <linux/pci.h>
#include <linux/pci-ecam.h>
#include <linux/printk.h>
@@ -30,7 +33,9 @@
#include <linux/reset.h>
#include <linux/sizes.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include "../pci.h"
@@ -48,7 +53,6 @@
#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY 0x04dc
#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_MAX_LINK_WIDTH_MASK 0x1f0
-#define PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK 0xc00
#define PCIE_RC_CFG_PRIV1_ROOT_CAP 0x4f8
#define PCIE_RC_CFG_PRIV1_ROOT_CAP_L1SS_MODE_MASK 0xf8
@@ -155,8 +159,40 @@
#define MSI_INT_MASK_SET 0x10
#define MSI_INT_MASK_CLR 0x14
+/* Error report registers */
+#define PCIE_OUTB_ERR_TREAT 0x6000
+#define PCIE_OUTB_ERR_TREAT_CONFIG 0x1
+#define PCIE_OUTB_ERR_TREAT_MEM 0x2
+#define PCIE_OUTB_ERR_VALID 0x6004
+#define PCIE_OUTB_ERR_CLEAR 0x6008
+#define PCIE_OUTB_ERR_ACC_INFO 0x600c
+#define PCIE_OUTB_ERR_ACC_INFO_CFG_ERR BIT(0)
+#define PCIE_OUTB_ERR_ACC_INFO_MEM_ERR BIT(1)
+#define PCIE_OUTB_ERR_ACC_INFO_TYPE_64 BIT(2)
+#define PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE BIT(4)
+#define PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES 0xff00
+#define PCIE_OUTB_ERR_ACC_ADDR 0x6010
+#define PCIE_OUTB_ERR_ACC_ADDR_BUS 0xff00000
+#define PCIE_OUTB_ERR_ACC_ADDR_DEV 0xf8000
+#define PCIE_OUTB_ERR_ACC_ADDR_FUNC 0x7000
+#define PCIE_OUTB_ERR_ACC_ADDR_REG 0xfff
+#define PCIE_OUTB_ERR_CFG_CAUSE 0x6014
+#define PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT BIT(6)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ABORT BIT(5)
+#define PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ BIT(4)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT BIT(2)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED BIT(1)
+#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT BIT(0)
+#define PCIE_OUTB_ERR_MEM_ADDR_LO 0x6018
+#define PCIE_OUTB_ERR_MEM_ADDR_HI 0x601c
+#define PCIE_OUTB_ERR_MEM_CAUSE 0x6020
+#define PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT BIT(6)
+#define PCIE_OUTB_ERR_MEM_CAUSE_ABORT BIT(5)
+#define PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ BIT(4)
+#define PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED BIT(1)
+#define PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR BIT(0)
+
#define PCIE_RGR1_SW_INIT_1_PERST_MASK 0x1
-#define PCIE_RGR1_SW_INIT_1_PERST_SHIFT 0x0
#define RGR1_SW_INIT_1_INIT_GENERIC_MASK 0x2
#define RGR1_SW_INIT_1_INIT_GENERIC_SHIFT 0x1
@@ -259,6 +295,7 @@ struct pcie_cfg_data {
int (*perst_set)(struct brcm_pcie *pcie, u32 val);
int (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val);
int (*post_setup)(struct brcm_pcie *pcie);
+ bool has_err_report;
};
struct subdev_regulators {
@@ -303,6 +340,10 @@ struct brcm_pcie {
struct subdev_regulators *sr;
bool ep_wakeup_capable;
const struct pcie_cfg_data *cfg;
+ bool bridge_in_reset;
+ struct notifier_block die_notifier;
+ struct notifier_block panic_notifier;
+ spinlock_t bridge_lock;
};
static inline bool is_bmips(const struct brcm_pcie *pcie)
@@ -310,6 +351,24 @@ static inline bool is_bmips(const struct brcm_pcie *pcie)
return pcie->cfg->soc_base == BCM7435 || pcie->cfg->soc_base == BCM7425;
}
+static int brcm_pcie_bridge_sw_init_set(struct brcm_pcie *pcie, u32 val)
+{
+ unsigned long flags;
+ int ret;
+
+ if (pcie->cfg->has_err_report)
+ spin_lock_irqsave(&pcie->bridge_lock, flags);
+
+ ret = pcie->cfg->bridge_sw_init_set(pcie, val);
+ /* If we fail, assume the bridge is in reset (off) */
+ pcie->bridge_in_reset = ret ? true : val;
+
+ if (pcie->cfg->has_err_report)
+ spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+
+ return ret;
+}
+
/*
* This is to convert the size of the inbound "BAR" region to the
* non-linear values of PCIE_X_MISC_RC_BAR[123]_CONFIG_LO.SIZE
@@ -1075,13 +1134,13 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
void __iomem *base = pcie->base;
struct pci_host_bridge *bridge;
struct resource_entry *entry;
- u32 tmp, burst, aspm_support, num_lanes, num_lanes_cap;
+ u32 tmp, burst, num_lanes, num_lanes_cap;
u8 num_out_wins = 0;
int num_inbound_wins = 0;
int memc, ret;
/* Reset the bridge */
- ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
if (ret)
return ret;
@@ -1097,7 +1156,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
usleep_range(100, 200);
/* Take the bridge out of reset */
- ret = pcie->cfg->bridge_sw_init_set(pcie, 0);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
if (ret)
return ret;
@@ -1175,12 +1234,9 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
/* Don't advertise L0s capability if 'aspm-no-l0s' */
- aspm_support = PCIE_LINK_STATE_L1;
- if (!of_property_read_bool(pcie->np, "aspm-no-l0s"))
- aspm_support |= PCIE_LINK_STATE_L0S;
tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY);
- u32p_replace_bits(&tmp, aspm_support,
- PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK);
+ if (of_property_read_bool(pcie->np, "aspm-no-l0s"))
+ tmp &= ~PCI_EXP_LNKCAP_ASPM_L0S;
writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY);
/* 'tmp' still holds the contents of PRIV1_LINK_CAPABILITY */
@@ -1565,7 +1621,7 @@ static int brcm_pcie_turn_off(struct brcm_pcie *pcie)
if (!(pcie->cfg->quirks & CFG_QUIRK_AVOID_BRIDGE_SHUTDOWN))
/* Shutdown PCIe bridge */
- ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
return ret;
}
@@ -1653,7 +1709,9 @@ static int brcm_pcie_resume_noirq(struct device *dev)
goto err_reset;
/* Take bridge out of reset so we can access the SERDES reg */
- pcie->cfg->bridge_sw_init_set(pcie, 0);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
+ if (ret)
+ goto err_reset;
/* SERDES_IDDQ = 0 */
tmp = readl(base + HARD_DEBUG(pcie));
@@ -1707,6 +1765,119 @@ err_disable_clk:
return ret;
}
+/* Dump out PCIe errors on die or panic */
+static int brcm_pcie_dump_err(struct brcm_pcie *pcie,
+ const char *type)
+{
+ void __iomem *base = pcie->base;
+ int i, is_cfg_err, is_mem_err, lanes;
+ const char *width_str, *direction_str;
+ u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi;
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie);
+ unsigned long flags;
+ char lanes_str[9];
+
+ spin_lock_irqsave(&pcie->bridge_lock, flags);
+ /* Don't access registers when the bridge is off */
+ if (pcie->bridge_in_reset || readl(base + PCIE_OUTB_ERR_VALID) == 0) {
+ spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+ return NOTIFY_DONE;
+ }
+
+ /* Read all necessary registers so we can release the spinlock ASAP */
+ info = readl(base + PCIE_OUTB_ERR_ACC_INFO);
+ is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR);
+ is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR);
+ if (is_cfg_err) {
+ cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR);
+ cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE);
+ }
+ if (is_mem_err) {
+ mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE);
+ lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO);
+ hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI);
+ }
+ /* We've got all of the info, clear the error */
+ writel(1, base + PCIE_OUTB_ERR_CLEAR);
+ spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+
+ dev_err(pcie->dev, "reporting PCIe info which may be related to %s error\n",
+ type);
+ width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64) ? "64bit" : "32bit";
+ direction_str = str_read_write(!(info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE));
+ lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES, info);
+ for (i = 0, lanes_str[8] = 0; i < 8; i++)
+ lanes_str[i] = (lanes & (1 << i)) ? '1' : '0';
+
+ if (is_cfg_err) {
+ int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS, cfg_addr);
+ int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV, cfg_addr);
+ int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC, cfg_addr);
+ int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG, cfg_addr);
+
+ dev_err(pcie->dev, "Error: CFG Acc, %s, %s (%04x:%02x:%02x.%d) reg=0x%x, lanes=%s\n",
+ width_str, direction_str, bridge->domain_nr, bus, dev,
+ func, reg, lanes_str);
+ dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n",
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED),
+ !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT));
+ }
+
+ if (is_mem_err) {
+ u64 addr = ((u64)hi << 32) | (u64)lo;
+
+ dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n",
+ width_str, direction_str, addr, lanes_str);
+ dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n",
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED),
+ !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR));
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int brcm_pcie_die_notify_cb(struct notifier_block *self,
+ unsigned long v, void *p)
+{
+ struct brcm_pcie *pcie =
+ container_of(self, struct brcm_pcie, die_notifier);
+
+ return brcm_pcie_dump_err(pcie, "Die");
+}
+
+static int brcm_pcie_panic_notify_cb(struct notifier_block *self,
+ unsigned long v, void *p)
+{
+ struct brcm_pcie *pcie =
+ container_of(self, struct brcm_pcie, panic_notifier);
+
+ return brcm_pcie_dump_err(pcie, "Panic");
+}
+
+static void brcm_register_die_notifiers(struct brcm_pcie *pcie)
+{
+ pcie->panic_notifier.notifier_call = brcm_pcie_panic_notify_cb;
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &pcie->panic_notifier);
+
+ pcie->die_notifier.notifier_call = brcm_pcie_die_notify_cb;
+ register_die_notifier(&pcie->die_notifier);
+}
+
+static void brcm_unregister_die_notifiers(struct brcm_pcie *pcie)
+{
+ unregister_die_notifier(&pcie->die_notifier);
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &pcie->panic_notifier);
+}
+
static void __brcm_pcie_remove(struct brcm_pcie *pcie)
{
brcm_msi_remove(pcie);
@@ -1725,6 +1896,9 @@ static void brcm_pcie_remove(struct platform_device *pdev)
pci_stop_root_bus(bridge->bus);
pci_remove_root_bus(bridge->bus);
+ if (pcie->cfg->has_err_report)
+ brcm_unregister_die_notifiers(pcie);
+
__brcm_pcie_remove(pcie);
}
@@ -1825,6 +1999,7 @@ static const struct pcie_cfg_data bcm7216_cfg = {
.bridge_sw_init_set = brcm_pcie_bridge_sw_init_set_7278,
.has_phy = true,
.num_inbound_wins = 3,
+ .has_err_report = true,
};
static const struct pcie_cfg_data bcm7712_cfg = {
@@ -1921,7 +2096,10 @@ static int brcm_pcie_probe(struct platform_device *pdev)
if (ret)
return dev_err_probe(&pdev->dev, ret, "could not enable clock\n");
- pcie->cfg->bridge_sw_init_set(pcie, 0);
+ ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret,
+ "could not de-assert bridge reset\n");
if (pcie->swinit_reset) {
ret = reset_control_assert(pcie->swinit_reset);
@@ -1996,6 +2174,11 @@ static int brcm_pcie_probe(struct platform_device *pdev)
return ret;
}
+ if (pcie->cfg->has_err_report) {
+ spin_lock_init(&pcie->bridge_lock);
+ brcm_register_die_notifiers(pcie);
+ }
+
return 0;
fail:
diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c
index 24cc30a2ab6c..4b78b6528f9f 100644
--- a/drivers/pci/controller/pcie-mediatek.c
+++ b/drivers/pci/controller/pcie-mediatek.c
@@ -143,23 +143,33 @@
struct mtk_pcie_port;
/**
+ * enum mtk_pcie_quirks - MTK PCIe quirks
+ * @MTK_PCIE_FIX_CLASS_ID: host's class ID needed to be fixed
+ * @MTK_PCIE_FIX_DEVICE_ID: host's device ID needed to be fixed
+ * @MTK_PCIE_NO_MSI: Bridge has no MSI support, and relies on an external block
+ * @MTK_PCIE_SKIP_RSTB: Skip calling RSTB bits on PCIe probe
+ */
+enum mtk_pcie_quirks {
+ MTK_PCIE_FIX_CLASS_ID = BIT(0),
+ MTK_PCIE_FIX_DEVICE_ID = BIT(1),
+ MTK_PCIE_NO_MSI = BIT(2),
+ MTK_PCIE_SKIP_RSTB = BIT(3),
+};
+
+/**
* struct mtk_pcie_soc - differentiate between host generations
- * @need_fix_class_id: whether this host's class ID needed to be fixed or not
- * @need_fix_device_id: whether this host's device ID needed to be fixed or not
- * @no_msi: Bridge has no MSI support, and relies on an external block
* @device_id: device ID which this host need to be fixed
* @ops: pointer to configuration access functions
* @startup: pointer to controller setting functions
* @setup_irq: pointer to initialize IRQ functions
+ * @quirks: PCIe device quirks.
*/
struct mtk_pcie_soc {
- bool need_fix_class_id;
- bool need_fix_device_id;
- bool no_msi;
unsigned int device_id;
struct pci_ops *ops;
int (*startup)(struct mtk_pcie_port *port);
int (*setup_irq)(struct mtk_pcie_port *port, struct device_node *node);
+ enum mtk_pcie_quirks quirks;
};
/**
@@ -679,31 +689,28 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port)
regmap_update_bits(pcie->cfg, PCIE_SYS_CFG_V2, val, val);
}
- /* Assert all reset signals */
- writel(0, port->base + PCIE_RST_CTRL);
+ if (!(soc->quirks & MTK_PCIE_SKIP_RSTB)) {
+ /* Assert all reset signals */
+ writel(0, port->base + PCIE_RST_CTRL);
- /*
- * Enable PCIe link down reset, if link status changed from link up to
- * link down, this will reset MAC control registers and configuration
- * space.
- */
- writel(PCIE_LINKDOWN_RST_EN, port->base + PCIE_RST_CTRL);
+ /*
+ * Enable PCIe link down reset, if link status changed from
+ * link up to link down, this will reset MAC control registers
+ * and configuration space.
+ */
+ writel(PCIE_LINKDOWN_RST_EN, port->base + PCIE_RST_CTRL);
- /*
- * Described in PCIe CEM specification sections 2.2 (PERST# Signal) and
- * 2.2.1 (Initial Power-Up (G3 to S0)). The deassertion of PERST# should
- * be delayed 100ms (TPVPERL) for the power and clock to become stable.
- */
- msleep(100);
+ msleep(PCIE_T_PVPERL_MS);
- /* De-assert PHY, PE, PIPE, MAC and configuration reset */
- val = readl(port->base + PCIE_RST_CTRL);
- val |= PCIE_PHY_RSTB | PCIE_PERSTB | PCIE_PIPE_SRSTB |
- PCIE_MAC_SRSTB | PCIE_CRSTB;
- writel(val, port->base + PCIE_RST_CTRL);
+ /* De-assert PHY, PE, PIPE, MAC and configuration reset */
+ val = readl(port->base + PCIE_RST_CTRL);
+ val |= PCIE_PHY_RSTB | PCIE_PERSTB | PCIE_PIPE_SRSTB |
+ PCIE_MAC_SRSTB | PCIE_CRSTB;
+ writel(val, port->base + PCIE_RST_CTRL);
+ }
/* Set up vendor ID and class code */
- if (soc->need_fix_class_id) {
+ if (soc->quirks & MTK_PCIE_FIX_CLASS_ID) {
val = PCI_VENDOR_ID_MEDIATEK;
writew(val, port->base + PCIE_CONF_VEND_ID);
@@ -711,7 +718,7 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port)
writew(val, port->base + PCIE_CONF_CLASS_ID);
}
- if (soc->need_fix_device_id)
+ if (soc->quirks & MTK_PCIE_FIX_DEVICE_ID)
writew(soc->device_id, port->base + PCIE_CONF_DEVICE_ID);
/* 100ms timeout value should be enough for Gen1/2 training */
@@ -821,6 +828,41 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
return 0;
}
+static int mtk_pcie_startup_port_an7583(struct mtk_pcie_port *port)
+{
+ struct mtk_pcie *pcie = port->pcie;
+ struct device *dev = pcie->dev;
+ struct pci_host_bridge *host;
+ struct resource_entry *entry;
+ struct regmap *pbus_regmap;
+ resource_size_t addr;
+ u32 args[2], size;
+
+ /*
+ * Configure PBus base address and base address mask to allow
+ * the hw to detect if a given address is accessible on PCIe
+ * controller.
+ */
+ pbus_regmap = syscon_regmap_lookup_by_phandle_args(dev->of_node,
+ "mediatek,pbus-csr",
+ ARRAY_SIZE(args),
+ args);
+ if (IS_ERR(pbus_regmap))
+ return PTR_ERR(pbus_regmap);
+
+ host = pci_host_bridge_from_priv(pcie);
+ entry = resource_list_first_type(&host->windows, IORESOURCE_MEM);
+ if (!entry)
+ return -ENODEV;
+
+ addr = entry->res->start - entry->offset;
+ regmap_write(pbus_regmap, args[0], lower_32_bits(addr));
+ size = lower_32_bits(resource_size(entry->res));
+ regmap_write(pbus_regmap, args[1], GENMASK(31, __fls(size)));
+
+ return mtk_pcie_startup_port_v2(port);
+}
+
static void mtk_pcie_enable_port(struct mtk_pcie_port *port)
{
struct mtk_pcie *pcie = port->pcie;
@@ -1099,7 +1141,7 @@ static int mtk_pcie_probe(struct platform_device *pdev)
host->ops = pcie->soc->ops;
host->sysdata = pcie;
- host->msi_domain = pcie->soc->no_msi;
+ host->msi_domain = !!(pcie->soc->quirks & MTK_PCIE_NO_MSI);
err = pci_host_probe(host);
if (err)
@@ -1187,9 +1229,9 @@ static const struct dev_pm_ops mtk_pcie_pm_ops = {
};
static const struct mtk_pcie_soc mtk_pcie_soc_v1 = {
- .no_msi = true,
.ops = &mtk_pcie_ops,
.startup = mtk_pcie_startup_port,
+ .quirks = MTK_PCIE_NO_MSI,
};
static const struct mtk_pcie_soc mtk_pcie_soc_mt2712 = {
@@ -1199,22 +1241,29 @@ static const struct mtk_pcie_soc mtk_pcie_soc_mt2712 = {
};
static const struct mtk_pcie_soc mtk_pcie_soc_mt7622 = {
- .need_fix_class_id = true,
.ops = &mtk_pcie_ops_v2,
.startup = mtk_pcie_startup_port_v2,
.setup_irq = mtk_pcie_setup_irq,
+ .quirks = MTK_PCIE_FIX_CLASS_ID,
+};
+
+static const struct mtk_pcie_soc mtk_pcie_soc_an7583 = {
+ .ops = &mtk_pcie_ops_v2,
+ .startup = mtk_pcie_startup_port_an7583,
+ .setup_irq = mtk_pcie_setup_irq,
+ .quirks = MTK_PCIE_FIX_CLASS_ID | MTK_PCIE_SKIP_RSTB,
};
static const struct mtk_pcie_soc mtk_pcie_soc_mt7629 = {
- .need_fix_class_id = true,
- .need_fix_device_id = true,
.device_id = PCI_DEVICE_ID_MEDIATEK_7629,
.ops = &mtk_pcie_ops_v2,
.startup = mtk_pcie_startup_port_v2,
.setup_irq = mtk_pcie_setup_irq,
+ .quirks = MTK_PCIE_FIX_CLASS_ID | MTK_PCIE_FIX_DEVICE_ID,
};
static const struct of_device_id mtk_pcie_ids[] = {
+ { .compatible = "airoha,an7583-pcie", .data = &mtk_pcie_soc_an7583 },
{ .compatible = "mediatek,mt2701-pcie", .data = &mtk_pcie_soc_v1 },
{ .compatible = "mediatek,mt7623-pcie", .data = &mtk_pcie_soc_v1 },
{ .compatible = "mediatek,mt2712-pcie", .data = &mtk_pcie_soc_mt2712 },
diff --git a/drivers/pci/controller/pcie-rzg3s-host.c b/drivers/pci/controller/pcie-rzg3s-host.c
new file mode 100644
index 000000000000..667e6d629474
--- /dev/null
+++ b/drivers/pci/controller/pcie-rzg3s-host.c
@@ -0,0 +1,1761 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe driver for Renesas RZ/G3S SoCs
+ *
+ * Copyright (C) 2025 Renesas Electronics Corp.
+ *
+ * Based on:
+ * drivers/pci/controller/pcie-rcar-host.c
+ * Copyright (C) 2009 - 2011 Paul Mundt
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/cleanup.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/iopoll.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mutex.h>
+#include <linux/msi.h>
+#include <linux/of_irq.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/regmap.h>
+#include <linux/reset.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/units.h>
+
+#include "../pci.h"
+
+/* AXI registers */
+#define RZG3S_PCI_REQDATA(id) (0x80 + (id) * 0x4)
+#define RZG3S_PCI_REQRCVDAT 0x8c
+
+#define RZG3S_PCI_REQADR1 0x90
+#define RZG3S_PCI_REQADR1_BUS GENMASK(31, 24)
+#define RZG3S_PCI_REQADR1_DEV GENMASK(23, 19)
+#define RZG3S_PCI_REQADR1_FUNC GENMASK(18, 16)
+#define RZG3S_PCI_REQADR1_REG GENMASK(11, 0)
+
+#define RZG3S_PCI_REQBE 0x98
+#define RZG3S_PCI_REQBE_BYTE_EN GENMASK(3, 0)
+
+#define RZG3S_PCI_REQISS 0x9c
+#define RZG3S_PCI_REQISS_MOR_STATUS GENMASK(18, 16)
+#define RZG3S_PCI_REQISS_TR_TYPE GENMASK(11, 8)
+#define RZG3S_PCI_REQISS_TR_TP0_RD FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x4)
+#define RZG3S_PCI_REQISS_TR_TP0_WR FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x5)
+#define RZG3S_PCI_REQISS_TR_TP1_RD FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x6)
+#define RZG3S_PCI_REQISS_TR_TP1_WR FIELD_PREP(RZG3S_PCI_REQISS_TR_TYPE, 0x7)
+#define RZG3S_PCI_REQISS_REQ_ISSUE BIT(0)
+
+#define RZG3S_PCI_MSIRCVWADRL 0x100
+#define RZG3S_PCI_MSIRCVWADRL_MASK GENMASK(31, 3)
+#define RZG3S_PCI_MSIRCVWADRL_MSG_DATA_ENA BIT(1)
+#define RZG3S_PCI_MSIRCVWADRL_ENA BIT(0)
+
+#define RZG3S_PCI_MSIRCVWADRU 0x104
+
+#define RZG3S_PCI_MSIRCVWMSKL 0x108
+#define RZG3S_PCI_MSIRCVWMSKL_MASK GENMASK(31, 2)
+
+#define RZG3S_PCI_PINTRCVIE 0x110
+#define RZG3S_PCI_PINTRCVIE_INTX(i) BIT(i)
+#define RZG3S_PCI_PINTRCVIE_MSI BIT(4)
+
+#define RZG3S_PCI_PINTRCVIS 0x114
+#define RZG3S_PCI_PINTRCVIS_INTX(i) BIT(i)
+#define RZG3S_PCI_PINTRCVIS_MSI BIT(4)
+
+#define RZG3S_PCI_MSGRCVIE 0x120
+#define RZG3S_PCI_MSGRCVIE_MSG_RCV BIT(24)
+
+#define RZG3S_PCI_MSGRCVIS 0x124
+#define RZG3S_PCI_MSGRCVIS_MRI BIT(24)
+
+#define RZG3S_PCI_PEIE0 0x200
+
+#define RZG3S_PCI_PEIS0 0x204
+#define RZG3S_PCI_PEIS0_RX_DLLP_PM_ENTER BIT(12)
+#define RZG3S_PCI_PEIS0_DL_UPDOWN BIT(9)
+
+#define RZG3S_PCI_PEIE1 0x208
+#define RZG3S_PCI_PEIS1 0x20c
+#define RZG3S_PCI_AMEIS 0x214
+#define RZG3S_PCI_ASEIS1 0x224
+
+#define RZG3S_PCI_PCSTAT1 0x408
+#define RZG3S_PCI_PCSTAT1_LTSSM_STATE GENMASK(14, 10)
+#define RZG3S_PCI_PCSTAT1_DL_DOWN_STS BIT(0)
+
+#define RZG3S_PCI_PCCTRL2 0x410
+#define RZG3S_PCI_PCCTRL2_LS_CHG GENMASK(9, 8)
+#define RZG3S_PCI_PCCTRL2_LS_CHG_REQ BIT(0)
+
+#define RZG3S_PCI_PCSTAT2 0x414
+#define RZG3S_PCI_PCSTAT2_LS_CHG_DONE BIT(28)
+#define RZG3S_PCI_PCSTAT2_SDRIRE GENMASK(7, 1)
+
+#define RZG3S_PCI_PERM 0x300
+#define RZG3S_PCI_PERM_CFG_HWINIT_EN BIT(2)
+#define RZG3S_PCI_PERM_PIPE_PHY_REG_EN BIT(1)
+
+#define RZG3S_PCI_MSIRE(id) (0x600 + (id) * 0x10)
+#define RZG3S_PCI_MSIRE_ENA BIT(0)
+
+#define RZG3S_PCI_MSIRM(id) (0x608 + (id) * 0x10)
+#define RZG3S_PCI_MSIRS(id) (0x60c + (id) * 0x10)
+
+#define RZG3S_PCI_AWBASEL(id) (0x1000 + (id) * 0x20)
+#define RZG3S_PCI_AWBASEL_WIN_ENA BIT(0)
+
+#define RZG3S_PCI_AWBASEU(id) (0x1004 + (id) * 0x20)
+#define RZG3S_PCI_AWMASKL(id) (0x1008 + (id) * 0x20)
+#define RZG3S_PCI_AWMASKU(id) (0x100c + (id) * 0x20)
+#define RZG3S_PCI_ADESTL(id) (0x1010 + (id) * 0x20)
+#define RZG3S_PCI_ADESTU(id) (0x1014 + (id) * 0x20)
+
+#define RZG3S_PCI_PWBASEL(id) (0x1100 + (id) * 0x20)
+#define RZG3S_PCI_PWBASEL_ENA BIT(0)
+
+#define RZG3S_PCI_PWBASEU(id) (0x1104 + (id) * 0x20)
+#define RZG3S_PCI_PDESTL(id) (0x1110 + (id) * 0x20)
+#define RZG3S_PCI_PDESTU(id) (0x1114 + (id) * 0x20)
+#define RZG3S_PCI_PWMASKL(id) (0x1108 + (id) * 0x20)
+#define RZG3S_PCI_PWMASKU(id) (0x110c + (id) * 0x20)
+
+/* PHY control registers */
+#define RZG3S_PCI_PHY_XCFGD(id) (0x2000 + (id) * 0x10)
+#define RZG3S_PCI_PHY_XCFGD_NUM 39
+
+#define RZG3S_PCI_PHY_XCFGA_CMN(id) (0x2400 + (id) * 0x10)
+#define RZG3S_PCI_PHY_XCFGA_CMN_NUM 16
+
+#define RZG3S_PCI_PHY_XCFGA_RX(id) (0x2500 + (id) * 0x10)
+#define RZG3S_PCI_PHY_XCFGA_RX_NUM 13
+
+#define RZG3S_PCI_PHY_XCFGA_TX 0x25d0
+
+#define RZG3S_PCI_PHY_XCFG_CTRL 0x2a20
+#define RZG3S_PCI_PHY_XCFG_CTRL_PHYREG_SEL BIT(0)
+
+/* PCIe registers */
+#define RZG3S_PCI_CFG_BASE 0x6000
+#define RZG3S_PCI_CFG_BARMSK00L 0xa0
+#define RZG3S_PCI_CFG_BARMSK00U 0xa4
+
+#define RZG3S_PCI_CFG_PCIEC 0x60
+
+/* System controller registers */
+#define RZG3S_SYS_PCIE_RST_RSM_B 0xd74
+#define RZG3S_SYS_PCIE_RST_RSM_B_MASK BIT(0)
+
+/* Maximum number of windows */
+#define RZG3S_MAX_WINDOWS 8
+
+/* Number of MSI interrupts per register */
+#define RZG3S_PCI_MSI_INT_PER_REG 32
+/* The number of MSI interrupts */
+#define RZG3S_PCI_MSI_INT_NR RZG3S_PCI_MSI_INT_PER_REG
+
+/* Timeouts experimentally determined */
+#define RZG3S_REQ_ISSUE_TIMEOUT_US 2500
+
+/**
+ * struct rzg3s_pcie_msi - RZ/G3S PCIe MSI data structure
+ * @domain: IRQ domain
+ * @map: bitmap with the allocated MSIs
+ * @dma_addr: address of the allocated MSI window
+ * @window_base: base address of the MSI window
+ * @pages: allocated pages for MSI window mapping
+ * @map_lock: lock for bitmap with the allocated MSIs
+ * @irq: MSI interrupt
+ */
+struct rzg3s_pcie_msi {
+ struct irq_domain *domain;
+ DECLARE_BITMAP(map, RZG3S_PCI_MSI_INT_NR);
+ dma_addr_t dma_addr;
+ dma_addr_t window_base;
+ unsigned long pages;
+ struct mutex map_lock;
+ int irq;
+};
+
+struct rzg3s_pcie_host;
+
+/**
+ * struct rzg3s_pcie_soc_data - SoC specific data
+ * @init_phy: PHY initialization function
+ * @power_resets: array with the resets that need to be de-asserted after
+ * power-on
+ * @cfg_resets: array with the resets that need to be de-asserted after
+ * configuration
+ * @num_power_resets: number of power resets
+ * @num_cfg_resets: number of configuration resets
+ */
+struct rzg3s_pcie_soc_data {
+ int (*init_phy)(struct rzg3s_pcie_host *host);
+ const char * const *power_resets;
+ const char * const *cfg_resets;
+ u8 num_power_resets;
+ u8 num_cfg_resets;
+};
+
+/**
+ * struct rzg3s_pcie_port - RZ/G3S PCIe Root Port data structure
+ * @refclk: PCIe reference clock
+ * @vendor_id: Vendor ID
+ * @device_id: Device ID
+ */
+struct rzg3s_pcie_port {
+ struct clk *refclk;
+ u32 vendor_id;
+ u32 device_id;
+};
+
+/**
+ * struct rzg3s_pcie_host - RZ/G3S PCIe data structure
+ * @axi: base address for AXI registers
+ * @pcie: base address for PCIe registers
+ * @dev: struct device
+ * @power_resets: reset control signals that should be set after power up
+ * @cfg_resets: reset control signals that should be set after configuration
+ * @sysc: SYSC regmap
+ * @intx_domain: INTx IRQ domain
+ * @data: SoC specific data
+ * @msi: MSI data structure
+ * @port: PCIe Root Port
+ * @hw_lock: lock for access to the HW resources
+ * @intx_irqs: INTx interrupts
+ * @max_link_speed: maximum supported link speed
+ */
+struct rzg3s_pcie_host {
+ void __iomem *axi;
+ void __iomem *pcie;
+ struct device *dev;
+ struct reset_control_bulk_data *power_resets;
+ struct reset_control_bulk_data *cfg_resets;
+ struct regmap *sysc;
+ struct irq_domain *intx_domain;
+ const struct rzg3s_pcie_soc_data *data;
+ struct rzg3s_pcie_msi msi;
+ struct rzg3s_pcie_port port;
+ raw_spinlock_t hw_lock;
+ int intx_irqs[PCI_NUM_INTX];
+ int max_link_speed;
+};
+
+#define rzg3s_msi_to_host(_msi) container_of(_msi, struct rzg3s_pcie_host, msi)
+
+static void rzg3s_pcie_update_bits(void __iomem *base, u32 offset, u32 mask,
+ u32 val)
+{
+ u32 tmp;
+
+ tmp = readl_relaxed(base + offset);
+ tmp &= ~mask;
+ tmp |= val & mask;
+ writel_relaxed(tmp, base + offset);
+}
+
+static int rzg3s_pcie_child_issue_request(struct rzg3s_pcie_host *host)
+{
+ u32 val;
+ int ret;
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_REQISS,
+ RZG3S_PCI_REQISS_REQ_ISSUE,
+ RZG3S_PCI_REQISS_REQ_ISSUE);
+ ret = readl_poll_timeout_atomic(host->axi + RZG3S_PCI_REQISS, val,
+ !(val & RZG3S_PCI_REQISS_REQ_ISSUE),
+ 5, RZG3S_REQ_ISSUE_TIMEOUT_US);
+
+ if (val & RZG3S_PCI_REQISS_MOR_STATUS)
+ return -EIO;
+
+ return ret;
+}
+
+static void rzg3s_pcie_child_prepare_bus(struct pci_bus *bus,
+ unsigned int devfn, int where)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ unsigned int dev, func, reg;
+
+ dev = PCI_SLOT(devfn);
+ func = PCI_FUNC(devfn);
+ reg = where & ~0x3;
+
+ /* Set the destination */
+ writel_relaxed(FIELD_PREP(RZG3S_PCI_REQADR1_BUS, bus->number) |
+ FIELD_PREP(RZG3S_PCI_REQADR1_DEV, dev) |
+ FIELD_PREP(RZG3S_PCI_REQADR1_FUNC, func) |
+ FIELD_PREP(RZG3S_PCI_REQADR1_REG, reg),
+ host->axi + RZG3S_PCI_REQADR1);
+
+ /* Set byte enable */
+ writel_relaxed(RZG3S_PCI_REQBE_BYTE_EN, host->axi + RZG3S_PCI_REQBE);
+}
+
+static int rzg3s_pcie_child_read_conf(struct rzg3s_pcie_host *host,
+ struct pci_bus *bus, unsigned int devfn,
+ int where, u32 *data)
+{
+ bool type0 = pci_is_root_bus(bus->parent) ? true : false;
+ int ret;
+
+ rzg3s_pcie_child_prepare_bus(bus, devfn, where);
+
+ /* Set the type of request */
+ writel_relaxed(type0 ? RZG3S_PCI_REQISS_TR_TP0_RD :
+ RZG3S_PCI_REQISS_TR_TP1_RD,
+ host->axi + RZG3S_PCI_REQISS);
+
+ /* Issue the request and wait to finish */
+ ret = rzg3s_pcie_child_issue_request(host);
+ if (ret)
+ return PCIBIOS_SET_FAILED;
+
+ /* Read the data */
+ *data = readl_relaxed(host->axi + RZG3S_PCI_REQRCVDAT);
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/* Serialization is provided by 'pci_lock' in drivers/pci/access.c */
+static int rzg3s_pcie_child_read(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 *val)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ int ret;
+
+ ret = rzg3s_pcie_child_read_conf(host, bus, devfn, where, val);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return ret;
+
+ if (size <= 2)
+ *val = (*val >> (8 * (where & 3))) & ((1 << (size * 8)) - 1);
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int rzg3s_pcie_child_write_conf(struct rzg3s_pcie_host *host,
+ struct pci_bus *bus, unsigned int devfn,
+ int where, u32 data)
+{
+ bool type0 = pci_is_root_bus(bus->parent) ? true : false;
+ int ret;
+
+ rzg3s_pcie_child_prepare_bus(bus, devfn, where);
+
+ /* Set the write data */
+ writel_relaxed(0, host->axi + RZG3S_PCI_REQDATA(0));
+ writel_relaxed(0, host->axi + RZG3S_PCI_REQDATA(1));
+ writel_relaxed(data, host->axi + RZG3S_PCI_REQDATA(2));
+
+ /* Set the type of request */
+ writel_relaxed(type0 ? RZG3S_PCI_REQISS_TR_TP0_WR :
+ RZG3S_PCI_REQISS_TR_TP1_WR,
+ host->axi + RZG3S_PCI_REQISS);
+
+ /* Issue the request and wait to finish */
+ ret = rzg3s_pcie_child_issue_request(host);
+ if (ret)
+ return PCIBIOS_SET_FAILED;
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/* Serialization is provided by 'pci_lock' in drivers/pci/access.c */
+static int rzg3s_pcie_child_write(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ u32 data, shift;
+ int ret;
+
+ if (size == 4)
+ return rzg3s_pcie_child_write_conf(host, bus, devfn, where, val);
+
+ /*
+ * Controller does 32 bit accesses. To do byte accesses software need
+ * to do read/modify/write. This may have potential side effects. For
+ * example, software may perform a 16-bit write. If the hardware only
+ * supports 32-bit accesses, we must do a 32-bit read, merge in the 16
+ * bits we intend to write, followed by a 32-bit write. If the 16 bits
+ * we *don't* intend to write happen to have any RW1C
+ * (write-one-to-clear) bits set, we just inadvertently cleared
+ * something we shouldn't have.
+ */
+ if (!bus->unsafe_warn) {
+ dev_warn(&bus->dev, "%d-byte config write to %04x:%02x:%02x.%d offset %#x may corrupt adjacent RW1C bits\n",
+ size, pci_domain_nr(bus), bus->number,
+ PCI_SLOT(devfn), PCI_FUNC(devfn), where);
+ bus->unsafe_warn = 1;
+ }
+
+ ret = rzg3s_pcie_child_read_conf(host, bus, devfn, where, &data);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return ret;
+
+ if (size == 1) {
+ shift = BITS_PER_BYTE * (where & 3);
+ data &= ~(0xff << shift);
+ data |= ((val & 0xff) << shift);
+ } else if (size == 2) {
+ shift = BITS_PER_BYTE * (where & 2);
+ data &= ~(0xffff << shift);
+ data |= ((val & 0xffff) << shift);
+ } else {
+ data = val;
+ }
+
+ return rzg3s_pcie_child_write_conf(host, bus, devfn, where, data);
+}
+
+static struct pci_ops rzg3s_pcie_child_ops = {
+ .read = rzg3s_pcie_child_read,
+ .write = rzg3s_pcie_child_write,
+};
+
+static void __iomem *rzg3s_pcie_root_map_bus(struct pci_bus *bus,
+ unsigned int devfn, int where)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+
+ if (devfn)
+ return NULL;
+
+ return host->pcie + where;
+}
+
+/* Serialized by 'pci_lock' */
+static int rzg3s_pcie_root_write(struct pci_bus *bus, unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct rzg3s_pcie_host *host = bus->sysdata;
+ int ret;
+
+ /* Enable access control to the CFGU */
+ writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+ ret = pci_generic_config_write(bus, devfn, where, size, val);
+
+ /* Disable access control to the CFGU */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ return ret;
+}
+
+static struct pci_ops rzg3s_pcie_root_ops = {
+ .read = pci_generic_config_read,
+ .write = rzg3s_pcie_root_write,
+ .map_bus = rzg3s_pcie_root_map_bus,
+};
+
+static void rzg3s_pcie_intx_irq_handler(struct irq_desc *desc)
+{
+ struct rzg3s_pcie_host *host = irq_desc_get_handler_data(desc);
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ unsigned int irq = irq_desc_get_irq(desc);
+ u32 intx = irq - host->intx_irqs[0];
+
+ chained_irq_enter(chip, desc);
+ generic_handle_domain_irq(host->intx_domain, intx);
+ chained_irq_exit(chip, desc);
+}
+
+static irqreturn_t rzg3s_pcie_msi_irq(int irq, void *data)
+{
+ u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG;
+ DECLARE_BITMAP(bitmap, RZG3S_PCI_MSI_INT_NR);
+ struct rzg3s_pcie_host *host = data;
+ struct rzg3s_pcie_msi *msi = &host->msi;
+ unsigned long bit;
+ u32 status;
+
+ status = readl_relaxed(host->axi + RZG3S_PCI_PINTRCVIS);
+ if (!(status & RZG3S_PCI_PINTRCVIS_MSI))
+ return IRQ_NONE;
+
+ /* Clear the MSI */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIS,
+ RZG3S_PCI_PINTRCVIS_MSI,
+ RZG3S_PCI_PINTRCVIS_MSI);
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSGRCVIS,
+ RZG3S_PCI_MSGRCVIS_MRI, RZG3S_PCI_MSGRCVIS_MRI);
+
+ for (u8 reg_id = 0; reg_id < regs; reg_id++) {
+ status = readl_relaxed(host->axi + RZG3S_PCI_MSIRS(reg_id));
+ bitmap_write(bitmap, status, reg_id * RZG3S_PCI_MSI_INT_PER_REG,
+ RZG3S_PCI_MSI_INT_PER_REG);
+ }
+
+ for_each_set_bit(bit, bitmap, RZG3S_PCI_MSI_INT_NR) {
+ int ret;
+
+ ret = generic_handle_domain_irq(msi->domain, bit);
+ if (ret) {
+ u8 reg_bit = bit % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = bit / RZG3S_PCI_MSI_INT_PER_REG;
+
+ /* Unknown MSI, just clear it */
+ writel_relaxed(BIT(reg_bit),
+ host->axi + RZG3S_PCI_MSIRS(reg_id));
+ }
+ }
+
+ return IRQ_HANDLED;
+}
+
+static void rzg3s_pcie_msi_irq_ack(struct irq_data *d)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG;
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ writel_relaxed(BIT(reg_bit), host->axi + RZG3S_PCI_MSIRS(reg_id));
+}
+
+static void rzg3s_pcie_msi_irq_mask(struct irq_data *d)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG;
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSIRM(reg_id), BIT(reg_bit),
+ BIT(reg_bit));
+}
+
+static void rzg3s_pcie_msi_irq_unmask(struct irq_data *d)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(d);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u8 reg_bit = d->hwirq % RZG3S_PCI_MSI_INT_PER_REG;
+ u8 reg_id = d->hwirq / RZG3S_PCI_MSI_INT_PER_REG;
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSIRM(reg_id), BIT(reg_bit),
+ 0);
+}
+
+static void rzg3s_pcie_irq_compose_msi_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ struct rzg3s_pcie_msi *msi = irq_data_get_irq_chip_data(data);
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ u32 lo, hi;
+
+ /*
+ * Enable and msg data enable bits are part of the address lo. Drop
+ * them along with the unused bit.
+ */
+ lo = readl_relaxed(host->axi + RZG3S_PCI_MSIRCVWADRL) &
+ RZG3S_PCI_MSIRCVWADRL_MASK;
+ hi = readl_relaxed(host->axi + RZG3S_PCI_MSIRCVWADRU);
+
+ msg->address_lo = lo;
+ msg->address_hi = hi;
+ msg->data = data->hwirq;
+}
+
+static struct irq_chip rzg3s_pcie_msi_bottom_chip = {
+ .name = "rzg3s-pcie-msi",
+ .irq_ack = rzg3s_pcie_msi_irq_ack,
+ .irq_mask = rzg3s_pcie_msi_irq_mask,
+ .irq_unmask = rzg3s_pcie_msi_irq_unmask,
+ .irq_compose_msi_msg = rzg3s_pcie_irq_compose_msi_msg,
+};
+
+static int rzg3s_pcie_msi_domain_alloc(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs,
+ void *args)
+{
+ struct rzg3s_pcie_msi *msi = domain->host_data;
+ int hwirq;
+
+ scoped_guard(mutex, &msi->map_lock) {
+ hwirq = bitmap_find_free_region(msi->map, RZG3S_PCI_MSI_INT_NR,
+ order_base_2(nr_irqs));
+ }
+
+ if (hwirq < 0)
+ return -ENOSPC;
+
+ for (unsigned int i = 0; i < nr_irqs; i++) {
+ irq_domain_set_info(domain, virq + i, hwirq + i,
+ &rzg3s_pcie_msi_bottom_chip,
+ domain->host_data, handle_edge_irq, NULL,
+ NULL);
+ }
+
+ return 0;
+}
+
+static void rzg3s_pcie_msi_domain_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+ struct rzg3s_pcie_msi *msi = domain->host_data;
+
+ guard(mutex)(&msi->map_lock);
+
+ bitmap_release_region(msi->map, d->hwirq, order_base_2(nr_irqs));
+}
+
+static const struct irq_domain_ops rzg3s_pcie_msi_domain_ops = {
+ .alloc = rzg3s_pcie_msi_domain_alloc,
+ .free = rzg3s_pcie_msi_domain_free,
+};
+
+#define RZG3S_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS | \
+ MSI_FLAG_NO_AFFINITY | \
+ MSI_FLAG_PCI_MSI_MASK_PARENT)
+
+#define RZG3S_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI | \
+ MSI_GENERIC_FLAGS_MASK)
+
+static const struct msi_parent_ops rzg3s_pcie_msi_parent_ops = {
+ .required_flags = RZG3S_PCIE_MSI_FLAGS_REQUIRED,
+ .supported_flags = RZG3S_PCIE_MSI_FLAGS_SUPPORTED,
+ .bus_select_token = DOMAIN_BUS_PCI_MSI,
+ .chip_flags = MSI_CHIP_FLAG_SET_ACK,
+ .prefix = "RZG3S-",
+ .init_dev_msi_info = msi_lib_init_dev_msi_info,
+};
+
+static int rzg3s_pcie_msi_allocate_domains(struct rzg3s_pcie_msi *msi)
+{
+ struct rzg3s_pcie_host *host = rzg3s_msi_to_host(msi);
+ struct device *dev = host->dev;
+ struct irq_domain_info info = {
+ .fwnode = dev_fwnode(dev),
+ .ops = &rzg3s_pcie_msi_domain_ops,
+ .size = RZG3S_PCI_MSI_INT_NR,
+ .host_data = msi,
+ };
+
+ msi->domain = msi_create_parent_irq_domain(&info,
+ &rzg3s_pcie_msi_parent_ops);
+ if (!msi->domain)
+ return dev_err_probe(dev, -ENOMEM,
+ "failed to create IRQ domain\n");
+
+ return 0;
+}
+
+static int rzg3s_pcie_msi_hw_setup(struct rzg3s_pcie_host *host)
+{
+ u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG;
+ struct rzg3s_pcie_msi *msi = &host->msi;
+
+ /*
+ * Set MSI window size. HW will set the window to
+ * RZG3S_PCI_MSI_INT_NR * 4 bytes.
+ */
+ writel_relaxed(FIELD_PREP(RZG3S_PCI_MSIRCVWMSKL_MASK,
+ RZG3S_PCI_MSI_INT_NR - 1),
+ host->axi + RZG3S_PCI_MSIRCVWMSKL);
+
+ /* Set MSI window address and enable MSI window */
+ writel_relaxed(upper_32_bits(msi->window_base),
+ host->axi + RZG3S_PCI_MSIRCVWADRU);
+ writel_relaxed(lower_32_bits(msi->window_base) |
+ RZG3S_PCI_MSIRCVWADRL_ENA |
+ RZG3S_PCI_MSIRCVWADRL_MSG_DATA_ENA,
+ host->axi + RZG3S_PCI_MSIRCVWADRL);
+
+ /* Set MSI receive enable */
+ for (u8 reg_id = 0; reg_id < regs; reg_id++) {
+ writel_relaxed(RZG3S_PCI_MSIRE_ENA,
+ host->axi + RZG3S_PCI_MSIRE(reg_id));
+ }
+
+ /* Enable message receive interrupts */
+ writel_relaxed(RZG3S_PCI_MSGRCVIE_MSG_RCV,
+ host->axi + RZG3S_PCI_MSGRCVIE);
+
+ /* Enable MSI */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_MSI,
+ RZG3S_PCI_PINTRCVIE_MSI);
+
+ return 0;
+}
+
+static int rzg3s_pcie_msi_setup(struct rzg3s_pcie_host *host)
+{
+ size_t size = RZG3S_PCI_MSI_INT_NR * sizeof(u32);
+ struct rzg3s_pcie_msi *msi = &host->msi;
+ struct device *dev = host->dev;
+ int id, ret;
+
+ msi->pages = __get_free_pages(GFP_KERNEL | GFP_DMA, 0);
+ if (!msi->pages)
+ return -ENOMEM;
+
+ msi->dma_addr = dma_map_single(dev, (void *)msi->pages, size * 2,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, msi->dma_addr)) {
+ ret = -ENOMEM;
+ goto free_pages;
+ }
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.4.5.2 Setting
+ * the MSI Window) the MSI window needs to fall within one of the
+ * enabled AXI windows. Find an enabled AXI window to setup the MSI
+ * window.
+ */
+ for (id = 0; id < RZG3S_MAX_WINDOWS; id++) {
+ u64 base, basel, baseu;
+ u64 mask, maskl, masku;
+
+ basel = readl_relaxed(host->axi + RZG3S_PCI_AWBASEL(id));
+ /* Skip checking this AXI window if it's not enabled */
+ if (!(basel & RZG3S_PCI_AWBASEL_WIN_ENA))
+ continue;
+
+ baseu = readl_relaxed(host->axi + RZG3S_PCI_AWBASEU(id));
+ base = baseu << 32 | basel;
+
+ maskl = readl_relaxed(host->axi + RZG3S_PCI_AWMASKL(id));
+ masku = readl_relaxed(host->axi + RZG3S_PCI_AWMASKU(id));
+ mask = masku << 32 | maskl;
+
+ if (msi->dma_addr < base || msi->dma_addr > base + mask)
+ continue;
+
+ break;
+ }
+
+ if (id == RZG3S_MAX_WINDOWS) {
+ ret = -EINVAL;
+ goto dma_unmap;
+ }
+
+ /* The MSI base address must be aligned to the MSI size */
+ msi->window_base = ALIGN(msi->dma_addr, size);
+ if (msi->window_base < msi->dma_addr) {
+ ret = -EINVAL;
+ goto dma_unmap;
+ }
+
+ rzg3s_pcie_msi_hw_setup(host);
+
+ return 0;
+
+dma_unmap:
+ dma_unmap_single(dev, msi->dma_addr, size * 2, DMA_BIDIRECTIONAL);
+free_pages:
+ free_pages(msi->pages, 0);
+ return ret;
+}
+
+static void rzg3s_pcie_msi_hw_teardown(struct rzg3s_pcie_host *host)
+{
+ u8 regs = RZG3S_PCI_MSI_INT_NR / RZG3S_PCI_MSI_INT_PER_REG;
+
+ /* Disable MSI */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_MSI, 0);
+
+ /* Disable message receive interrupts */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_MSGRCVIE,
+ RZG3S_PCI_MSGRCVIE_MSG_RCV, 0);
+
+ /* Disable MSI receive enable */
+ for (u8 reg_id = 0; reg_id < regs; reg_id++)
+ writel_relaxed(0, host->axi + RZG3S_PCI_MSIRE(reg_id));
+
+ /* Disable MSI window */
+ writel_relaxed(0, host->axi + RZG3S_PCI_MSIRCVWADRL);
+}
+
+static void rzg3s_pcie_teardown_msi(struct rzg3s_pcie_host *host)
+{
+ size_t size = RZG3S_PCI_MSI_INT_NR * sizeof(u32);
+ struct rzg3s_pcie_msi *msi = &host->msi;
+
+ rzg3s_pcie_msi_hw_teardown(host);
+
+ free_irq(msi->irq, host);
+ irq_domain_remove(msi->domain);
+
+ /* Free unused memory */
+ dma_unmap_single(host->dev, msi->dma_addr, size * 2, DMA_BIDIRECTIONAL);
+ free_pages(msi->pages, 0);
+}
+
+static int rzg3s_pcie_init_msi(struct rzg3s_pcie_host *host)
+{
+ struct platform_device *pdev = to_platform_device(host->dev);
+ struct rzg3s_pcie_msi *msi = &host->msi;
+ struct device *dev = host->dev;
+ const char *devname;
+ int ret;
+
+ ret = devm_mutex_init(dev, &msi->map_lock);
+ if (ret)
+ return ret;
+
+ msi->irq = platform_get_irq_byname(pdev, "msi");
+ if (msi->irq < 0)
+ return dev_err_probe(dev, msi->irq, "Failed to get MSI IRQ!\n");
+
+ devname = devm_kasprintf(dev, GFP_KERNEL, "%s-msi", dev_name(dev));
+ if (!devname)
+ return -ENOMEM;
+
+ ret = rzg3s_pcie_msi_allocate_domains(msi);
+ if (ret)
+ return ret;
+
+ /*
+ * Don't use devm_request_irq() as the driver uses non-devm helpers
+ * to control clocks. Mixing them may lead to subtle bugs.
+ */
+ ret = request_irq(msi->irq, rzg3s_pcie_msi_irq, 0, devname, host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to request IRQ: %d\n", ret);
+ goto free_domains;
+ }
+
+ ret = rzg3s_pcie_msi_setup(host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to setup MSI!\n");
+ goto free_irq;
+ }
+
+ return 0;
+
+free_irq:
+ free_irq(msi->irq, host);
+free_domains:
+ irq_domain_remove(msi->domain);
+ return ret;
+}
+
+static void rzg3s_pcie_intx_irq_ack(struct irq_data *d)
+{
+ struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d);
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIS,
+ RZG3S_PCI_PINTRCVIS_INTX(d->hwirq),
+ RZG3S_PCI_PINTRCVIS_INTX(d->hwirq));
+}
+
+static void rzg3s_pcie_intx_irq_mask(struct irq_data *d)
+{
+ struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d);
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_INTX(d->hwirq), 0);
+}
+
+static void rzg3s_pcie_intx_irq_unmask(struct irq_data *d)
+{
+ struct rzg3s_pcie_host *host = irq_data_get_irq_chip_data(d);
+
+ guard(raw_spinlock_irqsave)(&host->hw_lock);
+
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PINTRCVIE,
+ RZG3S_PCI_PINTRCVIE_INTX(d->hwirq),
+ RZG3S_PCI_PINTRCVIE_INTX(d->hwirq));
+}
+
+static struct irq_chip rzg3s_pcie_intx_irq_chip = {
+ .name = "PCIe INTx",
+ .irq_ack = rzg3s_pcie_intx_irq_ack,
+ .irq_mask = rzg3s_pcie_intx_irq_mask,
+ .irq_unmask = rzg3s_pcie_intx_irq_unmask,
+};
+
+static int rzg3s_pcie_intx_map(struct irq_domain *domain, unsigned int irq,
+ irq_hw_number_t hwirq)
+{
+ irq_set_chip_and_handler(irq, &rzg3s_pcie_intx_irq_chip,
+ handle_level_irq);
+ irq_set_chip_data(irq, domain->host_data);
+
+ return 0;
+}
+
+static const struct irq_domain_ops rzg3s_pcie_intx_domain_ops = {
+ .map = rzg3s_pcie_intx_map,
+ .xlate = irq_domain_xlate_onetwocell,
+};
+
+static int rzg3s_pcie_init_irqdomain(struct rzg3s_pcie_host *host)
+{
+ struct device *dev = host->dev;
+ struct platform_device *pdev = to_platform_device(dev);
+
+ for (int i = 0; i < PCI_NUM_INTX; i++) {
+ char irq_name[5] = {0};
+ int irq;
+
+ scnprintf(irq_name, ARRAY_SIZE(irq_name), "int%c", 'a' + i);
+
+ irq = platform_get_irq_byname(pdev, irq_name);
+ if (irq < 0)
+ return dev_err_probe(dev, -EINVAL,
+ "Failed to parse and map INT%c IRQ\n",
+ 'A' + i);
+
+ host->intx_irqs[i] = irq;
+ irq_set_chained_handler_and_data(irq,
+ rzg3s_pcie_intx_irq_handler,
+ host);
+ }
+
+ host->intx_domain = irq_domain_create_linear(dev_fwnode(dev),
+ PCI_NUM_INTX,
+ &rzg3s_pcie_intx_domain_ops,
+ host);
+ if (!host->intx_domain)
+ return dev_err_probe(dev, -EINVAL,
+ "Failed to add irq domain for INTx IRQs\n");
+ irq_domain_update_bus_token(host->intx_domain, DOMAIN_BUS_WIRED);
+
+ if (IS_ENABLED(CONFIG_PCI_MSI)) {
+ int ret = rzg3s_pcie_init_msi(host);
+
+ if (ret) {
+ irq_domain_remove(host->intx_domain);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void rzg3s_pcie_teardown_irqdomain(struct rzg3s_pcie_host *host)
+{
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ rzg3s_pcie_teardown_msi(host);
+
+ irq_domain_remove(host->intx_domain);
+}
+
+static int rzg3s_pcie_set_max_link_speed(struct rzg3s_pcie_host *host)
+{
+ u32 remote_supported_link_speeds, max_supported_link_speeds;
+ u32 cs2, tmp, pcie_cap = RZG3S_PCI_CFG_PCIEC;
+ u32 cur_link_speed, link_speed;
+ u8 ltssm_state_l0 = 0xc;
+ int ret;
+ u16 ls;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.6.3 Caution
+ * when Changing the Speed Spontaneously) link speed change can be done
+ * only when the LTSSM is in L0.
+ */
+ ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT1, tmp,
+ FIELD_GET(RZG3S_PCI_PCSTAT1_LTSSM_STATE, tmp) == ltssm_state_l0,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI *
+ PCIE_LINK_WAIT_MAX_RETRIES);
+ if (ret)
+ return ret;
+
+ ls = readw_relaxed(host->pcie + pcie_cap + PCI_EXP_LNKSTA);
+ cs2 = readl_relaxed(host->axi + RZG3S_PCI_PCSTAT2);
+
+ switch (pcie_link_speed[host->max_link_speed]) {
+ case PCIE_SPEED_5_0GT:
+ max_supported_link_speeds = GENMASK(PCI_EXP_LNKSTA_CLS_5_0GB - 1, 0);
+ link_speed = PCI_EXP_LNKCTL2_TLS_5_0GT;
+ break;
+ default:
+ /* Should not happen */
+ return -EINVAL;
+ }
+
+ cur_link_speed = FIELD_GET(PCI_EXP_LNKSTA_CLS, ls);
+ remote_supported_link_speeds = FIELD_GET(RZG3S_PCI_PCSTAT2_SDRIRE, cs2);
+ /* Drop reserved bits */
+ remote_supported_link_speeds &= max_supported_link_speeds;
+
+ /*
+ * Return if max link speed is already set or the connected device
+ * doesn't support it.
+ */
+ if (cur_link_speed == host->max_link_speed ||
+ remote_supported_link_speeds != max_supported_link_speeds)
+ return 0;
+
+ /* Set target Link speed */
+ rzg3s_pcie_update_bits(host->pcie, pcie_cap + PCI_EXP_LNKCTL2,
+ PCI_EXP_LNKCTL2_TLS,
+ FIELD_PREP(PCI_EXP_LNKCTL2_TLS, link_speed));
+
+ /* Request link speed change */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PCCTRL2,
+ RZG3S_PCI_PCCTRL2_LS_CHG_REQ |
+ RZG3S_PCI_PCCTRL2_LS_CHG,
+ RZG3S_PCI_PCCTRL2_LS_CHG_REQ |
+ FIELD_PREP(RZG3S_PCI_PCCTRL2_LS_CHG,
+ link_speed - 1));
+
+ ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT2, cs2,
+ (cs2 & RZG3S_PCI_PCSTAT2_LS_CHG_DONE),
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI *
+ PCIE_LINK_WAIT_MAX_RETRIES);
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.6.3 Caution
+ * when Changing the Speed Spontaneously) the PCI_PCCTRL2_LS_CHG_REQ
+ * should be de-asserted after checking for PCI_PCSTAT2_LS_CHG_DONE.
+ */
+ rzg3s_pcie_update_bits(host->axi, RZG3S_PCI_PCCTRL2,
+ RZG3S_PCI_PCCTRL2_LS_CHG_REQ, 0);
+
+ return ret;
+}
+
+static int rzg3s_pcie_config_init(struct rzg3s_pcie_host *host)
+{
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host);
+ struct resource_entry *ft;
+ struct resource *bus;
+ u8 subordinate_bus;
+ u8 secondary_bus;
+ u8 primary_bus;
+
+ ft = resource_list_first_type(&bridge->windows, IORESOURCE_BUS);
+ if (!ft)
+ return -ENODEV;
+
+ bus = ft->res;
+ primary_bus = bus->start;
+ secondary_bus = bus->start + 1;
+ subordinate_bus = bus->end;
+
+ /* Enable access control to the CFGU */
+ writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+ /* HW manual recommends to write 0xffffffff on initialization */
+ writel_relaxed(0xffffffff, host->pcie + RZG3S_PCI_CFG_BARMSK00L);
+ writel_relaxed(0xffffffff, host->pcie + RZG3S_PCI_CFG_BARMSK00U);
+
+ /* Update bus info */
+ writeb_relaxed(primary_bus, host->pcie + PCI_PRIMARY_BUS);
+ writeb_relaxed(secondary_bus, host->pcie + PCI_SECONDARY_BUS);
+ writeb_relaxed(subordinate_bus, host->pcie + PCI_SUBORDINATE_BUS);
+
+ /* Disable access control to the CFGU */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ return 0;
+}
+
+static void rzg3s_pcie_irq_init(struct rzg3s_pcie_host *host)
+{
+ /*
+ * According to the HW manual of the RZ/G3S (Rev.1.10, sections
+ * corresponding to all registers written with ~0U), the hardware
+ * ignores value written to unused bits. Writing ~0U to these registers
+ * should be safe.
+ */
+
+ /* Clear the link state and PM transitions */
+ writel_relaxed(RZG3S_PCI_PEIS0_DL_UPDOWN |
+ RZG3S_PCI_PEIS0_RX_DLLP_PM_ENTER,
+ host->axi + RZG3S_PCI_PEIS0);
+
+ /* Disable all interrupts */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PEIE0);
+
+ /* Clear all parity and ecc error interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_PEIS1);
+
+ /* Disable all parity and ecc error interrupts */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PEIE1);
+
+ /* Clear all AXI master error interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_AMEIS);
+
+ /* Clear all AXI slave error interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_ASEIS1);
+
+ /* Clear all message receive interrupts */
+ writel_relaxed(~0U, host->axi + RZG3S_PCI_MSGRCVIS);
+}
+
+static int rzg3s_pcie_power_resets_deassert(struct rzg3s_pcie_host *host)
+{
+ const struct rzg3s_pcie_soc_data *data = host->data;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section
+ * 34.5.1.2 De-asserting the Reset) the PCIe IP needs to wait 5ms from
+ * power on to the de-assertion of reset.
+ */
+ fsleep(5000);
+ return reset_control_bulk_deassert(data->num_power_resets,
+ host->power_resets);
+}
+
+static int rzg3s_pcie_resets_prepare_and_get(struct rzg3s_pcie_host *host)
+{
+ const struct rzg3s_pcie_soc_data *data = host->data;
+ unsigned int i;
+ int ret;
+
+ host->power_resets = devm_kmalloc_array(host->dev,
+ data->num_power_resets,
+ sizeof(*host->power_resets),
+ GFP_KERNEL);
+ if (!host->power_resets)
+ return -ENOMEM;
+
+ for (i = 0; i < data->num_power_resets; i++)
+ host->power_resets[i].id = data->power_resets[i];
+
+ host->cfg_resets = devm_kmalloc_array(host->dev,
+ data->num_cfg_resets,
+ sizeof(*host->cfg_resets),
+ GFP_KERNEL);
+ if (!host->cfg_resets)
+ return -ENOMEM;
+
+ for (i = 0; i < data->num_cfg_resets; i++)
+ host->cfg_resets[i].id = data->cfg_resets[i];
+
+ ret = devm_reset_control_bulk_get_exclusive(host->dev,
+ data->num_power_resets,
+ host->power_resets);
+ if (ret)
+ return ret;
+
+ return devm_reset_control_bulk_get_exclusive(host->dev,
+ data->num_cfg_resets,
+ host->cfg_resets);
+}
+
+static int rzg3s_pcie_host_parse_port(struct rzg3s_pcie_host *host)
+{
+ struct device_node *of_port = of_get_next_child(host->dev->of_node, NULL);
+ struct rzg3s_pcie_port *port = &host->port;
+ int ret;
+
+ ret = of_property_read_u32(of_port, "vendor-id", &port->vendor_id);
+ if (ret)
+ return ret;
+
+ ret = of_property_read_u32(of_port, "device-id", &port->device_id);
+ if (ret)
+ return ret;
+
+ port->refclk = of_clk_get_by_name(of_port, "ref");
+ if (IS_ERR(port->refclk))
+ return PTR_ERR(port->refclk);
+
+ return 0;
+}
+
+static int rzg3s_pcie_host_init_port(struct rzg3s_pcie_host *host)
+{
+ struct rzg3s_pcie_port *port = &host->port;
+ struct device *dev = host->dev;
+ int ret;
+
+ /* Enable access control to the CFGU */
+ writel_relaxed(RZG3S_PCI_PERM_CFG_HWINIT_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+ /* Update vendor ID and device ID */
+ writew_relaxed(port->vendor_id, host->pcie + PCI_VENDOR_ID);
+ writew_relaxed(port->device_id, host->pcie + PCI_DEVICE_ID);
+
+ /* Disable access control to the CFGU */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ ret = clk_prepare_enable(port->refclk);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to enable refclk!\n");
+
+ /* Set the PHY, if any */
+ if (host->data->init_phy) {
+ ret = host->data->init_phy(host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to set the PHY!\n");
+ goto refclk_disable;
+ }
+ }
+
+ return 0;
+
+refclk_disable:
+ clk_disable_unprepare(port->refclk);
+ return ret;
+}
+
+static int rzg3s_pcie_host_init(struct rzg3s_pcie_host *host)
+{
+ u32 val;
+ int ret;
+
+ /* Initialize the PCIe related registers */
+ ret = rzg3s_pcie_config_init(host);
+ if (ret)
+ return ret;
+
+ ret = rzg3s_pcie_host_init_port(host);
+ if (ret)
+ return ret;
+
+ /* Initialize the interrupts */
+ rzg3s_pcie_irq_init(host);
+
+ ret = reset_control_bulk_deassert(host->data->num_cfg_resets,
+ host->cfg_resets);
+ if (ret)
+ goto disable_port_refclk;
+
+ /* Wait for link up */
+ ret = readl_poll_timeout(host->axi + RZG3S_PCI_PCSTAT1, val,
+ !(val & RZG3S_PCI_PCSTAT1_DL_DOWN_STS),
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI,
+ PCIE_LINK_WAIT_SLEEP_MS * MILLI *
+ PCIE_LINK_WAIT_MAX_RETRIES);
+ if (ret)
+ goto cfg_resets_deassert;
+
+ val = readl_relaxed(host->axi + RZG3S_PCI_PCSTAT2);
+ dev_info(host->dev, "PCIe link status [0x%x]\n", val);
+
+ return 0;
+
+cfg_resets_deassert:
+ reset_control_bulk_assert(host->data->num_cfg_resets,
+ host->cfg_resets);
+disable_port_refclk:
+ clk_disable_unprepare(host->port.refclk);
+ return ret;
+}
+
+static void rzg3s_pcie_set_inbound_window(struct rzg3s_pcie_host *host,
+ u64 cpu_addr, u64 pci_addr, u64 size,
+ int id)
+{
+ /* Set CPU window base address */
+ writel_relaxed(upper_32_bits(cpu_addr),
+ host->axi + RZG3S_PCI_ADESTU(id));
+ writel_relaxed(lower_32_bits(cpu_addr),
+ host->axi + RZG3S_PCI_ADESTL(id));
+
+ /* Set window size */
+ writel_relaxed(upper_32_bits(size), host->axi + RZG3S_PCI_AWMASKU(id));
+ writel_relaxed(lower_32_bits(size), host->axi + RZG3S_PCI_AWMASKL(id));
+
+ /* Set PCIe window base address and enable the window */
+ writel_relaxed(upper_32_bits(pci_addr),
+ host->axi + RZG3S_PCI_AWBASEU(id));
+ writel_relaxed(lower_32_bits(pci_addr) | RZG3S_PCI_AWBASEL_WIN_ENA,
+ host->axi + RZG3S_PCI_AWBASEL(id));
+}
+
+static int rzg3s_pcie_set_inbound_windows(struct rzg3s_pcie_host *host,
+ struct resource_entry *entry,
+ int *index)
+{
+ u64 pci_addr = entry->res->start - entry->offset;
+ u64 cpu_addr = entry->res->start;
+ u64 cpu_end = entry->res->end;
+ u64 size_id = 0;
+ int id = *index;
+ u64 size;
+
+ while (cpu_addr < cpu_end) {
+ if (id >= RZG3S_MAX_WINDOWS)
+ return dev_err_probe(host->dev, -ENOSPC,
+ "Failed to map inbound window for resource (%s)\n",
+ entry->res->name);
+
+ size = resource_size(entry->res) - size_id;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10,
+ * section 34.3.1.71 AXI Window Mask (Lower) Registers) the min
+ * size is 4K.
+ */
+ size = max(size, SZ_4K);
+
+ /*
+ * According the RZ/G3S HW manual (Rev.1.10, sections:
+ * - 34.3.1.69 AXI Window Base (Lower) Registers
+ * - 34.3.1.71 AXI Window Mask (Lower) Registers
+ * - 34.3.1.73 AXI Destination (Lower) Registers)
+ * the CPU addr, PCIe addr, size should be 4K aligned and be a
+ * power of 2.
+ */
+ size = ALIGN(size, SZ_4K);
+ size = roundup_pow_of_two(size);
+
+ cpu_addr = ALIGN(cpu_addr, SZ_4K);
+ pci_addr = ALIGN(pci_addr, SZ_4K);
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section
+ * 34.3.1.71 AXI Window Mask (Lower) Registers) HW expects first
+ * 12 LSB bits to be 0xfff. Subtract 1 from size for this.
+ */
+ rzg3s_pcie_set_inbound_window(host, cpu_addr, pci_addr,
+ size - 1, id);
+
+ pci_addr += size;
+ cpu_addr += size;
+ size_id = size;
+ id++;
+ }
+ *index = id;
+
+ return 0;
+}
+
+static int rzg3s_pcie_parse_map_dma_ranges(struct rzg3s_pcie_host *host)
+{
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host);
+ struct resource_entry *entry;
+ int i = 0, ret;
+
+ resource_list_for_each_entry(entry, &bridge->dma_ranges) {
+ ret = rzg3s_pcie_set_inbound_windows(host, entry, &i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void rzg3s_pcie_set_outbound_window(struct rzg3s_pcie_host *host,
+ struct resource_entry *win, int id)
+{
+ struct resource *res = win->res;
+ resource_size_t size = resource_size(res);
+ resource_size_t res_start;
+
+ if (res->flags & IORESOURCE_IO)
+ res_start = pci_pio_to_address(res->start) - win->offset;
+ else
+ res_start = res->start - win->offset;
+
+ /*
+ * According to the RZ/G3S HW manual (Rev.1.10, section 34.3.1.75 PCIe
+ * Window Base (Lower) Registers) the window base address need to be 4K
+ * aligned.
+ */
+ res_start = ALIGN(res_start, SZ_4K);
+
+ size = ALIGN(size, SZ_4K);
+ size = roundup_pow_of_two(size) - 1;
+
+ /* Set PCIe destination */
+ writel_relaxed(upper_32_bits(res_start),
+ host->axi + RZG3S_PCI_PDESTU(id));
+ writel_relaxed(lower_32_bits(res_start),
+ host->axi + RZG3S_PCI_PDESTL(id));
+
+ /* Set PCIe window mask */
+ writel_relaxed(upper_32_bits(size), host->axi + RZG3S_PCI_PWMASKU(id));
+ writel_relaxed(lower_32_bits(size), host->axi + RZG3S_PCI_PWMASKL(id));
+
+ /* Set PCIe window base and enable the window */
+ writel_relaxed(upper_32_bits(res_start),
+ host->axi + RZG3S_PCI_PWBASEU(id));
+ writel_relaxed(lower_32_bits(res_start) | RZG3S_PCI_PWBASEL_ENA,
+ host->axi + RZG3S_PCI_PWBASEL(id));
+}
+
+static int rzg3s_pcie_parse_map_ranges(struct rzg3s_pcie_host *host)
+{
+ struct pci_host_bridge *bridge = pci_host_bridge_from_priv(host);
+ struct resource_entry *win;
+ int i = 0;
+
+ resource_list_for_each_entry(win, &bridge->windows) {
+ struct resource *res = win->res;
+
+ if (i >= RZG3S_MAX_WINDOWS)
+ return dev_err_probe(host->dev, -ENOSPC,
+ "Failed to map outbound window for resource (%s)\n",
+ res->name);
+
+ if (!res->flags)
+ continue;
+
+ switch (resource_type(res)) {
+ case IORESOURCE_IO:
+ case IORESOURCE_MEM:
+ rzg3s_pcie_set_outbound_window(host, win, i);
+ i++;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int rzg3s_soc_pcie_init_phy(struct rzg3s_pcie_host *host)
+{
+ static const u32 xcfgd_settings[RZG3S_PCI_PHY_XCFGD_NUM] = {
+ [8] = 0xe0006801, 0x007f7e30, 0x183e0000, 0x978ff500,
+ 0xec000000, 0x009f1400, 0x0000d009,
+ [17] = 0x78000000,
+ [19] = 0x00880000, 0x000005c0, 0x07000000, 0x00780920,
+ 0xc9400ce2, 0x90000c0c, 0x000c1414, 0x00005034,
+ 0x00006000, 0x00000001,
+ };
+ static const u32 xcfga_cmn_settings[RZG3S_PCI_PHY_XCFGA_CMN_NUM] = {
+ 0x00000d10, 0x08310100, 0x00c21404, 0x013c0010, 0x01874440,
+ 0x1a216082, 0x00103440, 0x00000080, 0x00000010, 0x0c1000c1,
+ 0x1000c100, 0x0222000c, 0x00640019, 0x00a00028, 0x01d11228,
+ 0x0201001d,
+ };
+ static const u32 xcfga_rx_settings[RZG3S_PCI_PHY_XCFGA_RX_NUM] = {
+ 0x07d55000, 0x030e3f00, 0x00000288, 0x102c5880, 0x0000000b,
+ 0x04141441, 0x00641641, 0x00d63d63, 0x00641641, 0x01970377,
+ 0x00190287, 0x00190028, 0x00000028,
+ };
+ unsigned int i;
+
+ /*
+ * Enable access permission for physical layer control and status
+ * registers.
+ */
+ writel_relaxed(RZG3S_PCI_PERM_PIPE_PHY_REG_EN,
+ host->axi + RZG3S_PCI_PERM);
+
+ for (i = 0; i < RZG3S_PCI_PHY_XCFGD_NUM; i++) {
+ writel_relaxed(xcfgd_settings[i],
+ host->axi + RZG3S_PCI_PHY_XCFGD(i));
+ }
+
+ for (i = 0; i < RZG3S_PCI_PHY_XCFGA_CMN_NUM; i++) {
+ writel_relaxed(xcfga_cmn_settings[i],
+ host->axi + RZG3S_PCI_PHY_XCFGA_CMN(i));
+ }
+
+ for (i = 0; i < RZG3S_PCI_PHY_XCFGA_RX_NUM; i++) {
+ writel_relaxed(xcfga_rx_settings[i],
+ host->axi + RZG3S_PCI_PHY_XCFGA_RX(i));
+ }
+
+ writel_relaxed(0x107, host->axi + RZG3S_PCI_PHY_XCFGA_TX);
+
+ /* Select PHY settings values */
+ writel_relaxed(RZG3S_PCI_PHY_XCFG_CTRL_PHYREG_SEL,
+ host->axi + RZG3S_PCI_PHY_XCFG_CTRL);
+
+ /*
+ * Disable access permission for physical layer control and status
+ * registers.
+ */
+ writel_relaxed(0, host->axi + RZG3S_PCI_PERM);
+
+ return 0;
+}
+
+static int
+rzg3s_pcie_host_setup(struct rzg3s_pcie_host *host,
+ int (*init_irqdomain)(struct rzg3s_pcie_host *host),
+ void (*teardown_irqdomain)(struct rzg3s_pcie_host *host))
+{
+ struct device *dev = host->dev;
+ int ret;
+
+ /* Set inbound windows */
+ ret = rzg3s_pcie_parse_map_dma_ranges(host);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to set inbound windows!\n");
+
+ /* Set outbound windows */
+ ret = rzg3s_pcie_parse_map_ranges(host);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to set outbound windows!\n");
+
+ ret = init_irqdomain(host);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to init IRQ domain\n");
+
+ ret = rzg3s_pcie_host_init(host);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to initialize the HW!\n");
+ goto teardown_irqdomain;
+ }
+
+ ret = rzg3s_pcie_set_max_link_speed(host);
+ if (ret)
+ dev_info(dev, "Failed to set max link speed\n");
+
+ msleep(PCIE_RESET_CONFIG_WAIT_MS);
+
+ return 0;
+
+teardown_irqdomain:
+ teardown_irqdomain(host);
+
+ return ret;
+}
+
+static int rzg3s_pcie_probe(struct platform_device *pdev)
+{
+ struct pci_host_bridge *bridge;
+ struct device *dev = &pdev->dev;
+ struct device_node *np = dev->of_node;
+ struct device_node *sysc_np __free(device_node) =
+ of_parse_phandle(np, "renesas,sysc", 0);
+ struct rzg3s_pcie_host *host;
+ int ret;
+
+ bridge = devm_pci_alloc_host_bridge(dev, sizeof(*host));
+ if (!bridge)
+ return -ENOMEM;
+
+ host = pci_host_bridge_priv(bridge);
+ host->dev = dev;
+ host->data = device_get_match_data(dev);
+ platform_set_drvdata(pdev, host);
+
+ host->axi = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(host->axi))
+ return PTR_ERR(host->axi);
+ host->pcie = host->axi + RZG3S_PCI_CFG_BASE;
+
+ host->max_link_speed = of_pci_get_max_link_speed(np);
+ if (host->max_link_speed < 0)
+ host->max_link_speed = 2;
+
+ ret = rzg3s_pcie_host_parse_port(host);
+ if (ret)
+ return ret;
+
+ host->sysc = syscon_node_to_regmap(sysc_np);
+ if (IS_ERR(host->sysc)) {
+ ret = PTR_ERR(host->sysc);
+ goto port_refclk_put;
+ }
+
+ ret = regmap_update_bits(host->sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 1));
+ if (ret)
+ goto port_refclk_put;
+
+ ret = rzg3s_pcie_resets_prepare_and_get(host);
+ if (ret)
+ goto sysc_signal_restore;
+
+ ret = rzg3s_pcie_power_resets_deassert(host);
+ if (ret)
+ goto sysc_signal_restore;
+
+ pm_runtime_enable(dev);
+
+ /*
+ * Controller clocks are part of a clock power domain. Enable them
+ * through runtime PM.
+ */
+ ret = pm_runtime_resume_and_get(dev);
+ if (ret)
+ goto rpm_disable;
+
+ raw_spin_lock_init(&host->hw_lock);
+
+ ret = rzg3s_pcie_host_setup(host, rzg3s_pcie_init_irqdomain,
+ rzg3s_pcie_teardown_irqdomain);
+ if (ret)
+ goto rpm_put;
+
+ bridge->sysdata = host;
+ bridge->ops = &rzg3s_pcie_root_ops;
+ bridge->child_ops = &rzg3s_pcie_child_ops;
+ ret = pci_host_probe(bridge);
+ if (ret)
+ goto host_probe_teardown;
+
+ return 0;
+
+host_probe_teardown:
+ rzg3s_pcie_teardown_irqdomain(host);
+ reset_control_bulk_deassert(host->data->num_cfg_resets,
+ host->cfg_resets);
+rpm_put:
+ pm_runtime_put_sync(dev);
+rpm_disable:
+ pm_runtime_disable(dev);
+ reset_control_bulk_assert(host->data->num_power_resets,
+ host->power_resets);
+sysc_signal_restore:
+ /*
+ * SYSC RST_RSM_B signal need to be asserted before turning off the
+ * power to the PHY.
+ */
+ regmap_update_bits(host->sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0));
+port_refclk_put:
+ clk_put(host->port.refclk);
+
+ return ret;
+}
+
+static int rzg3s_pcie_suspend_noirq(struct device *dev)
+{
+ struct rzg3s_pcie_host *host = dev_get_drvdata(dev);
+ const struct rzg3s_pcie_soc_data *data = host->data;
+ struct rzg3s_pcie_port *port = &host->port;
+ struct regmap *sysc = host->sysc;
+ int ret;
+
+ ret = pm_runtime_put_sync(dev);
+ if (ret)
+ return ret;
+
+ clk_disable_unprepare(port->refclk);
+
+ ret = reset_control_bulk_assert(data->num_power_resets,
+ host->power_resets);
+ if (ret)
+ goto refclk_restore;
+
+ ret = reset_control_bulk_assert(data->num_cfg_resets,
+ host->cfg_resets);
+ if (ret)
+ goto power_resets_restore;
+
+ ret = regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0));
+ if (ret)
+ goto cfg_resets_restore;
+
+ return 0;
+
+ /* Restore the previous state if any error happens */
+cfg_resets_restore:
+ reset_control_bulk_deassert(data->num_cfg_resets,
+ host->cfg_resets);
+power_resets_restore:
+ reset_control_bulk_deassert(data->num_power_resets,
+ host->power_resets);
+refclk_restore:
+ clk_prepare_enable(port->refclk);
+ pm_runtime_resume_and_get(dev);
+ return ret;
+}
+
+static int rzg3s_pcie_resume_noirq(struct device *dev)
+{
+ struct rzg3s_pcie_host *host = dev_get_drvdata(dev);
+ const struct rzg3s_pcie_soc_data *data = host->data;
+ struct regmap *sysc = host->sysc;
+ int ret;
+
+ ret = regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 1));
+ if (ret)
+ return ret;
+
+ ret = rzg3s_pcie_power_resets_deassert(host);
+ if (ret)
+ goto assert_rst_rsm_b;
+
+ ret = pm_runtime_resume_and_get(dev);
+ if (ret)
+ goto assert_power_resets;
+
+ ret = rzg3s_pcie_host_setup(host, rzg3s_pcie_msi_hw_setup,
+ rzg3s_pcie_msi_hw_teardown);
+ if (ret)
+ goto rpm_put;
+
+ return 0;
+
+ /*
+ * If any error happens there is no way to recover the IP. Put it in the
+ * lowest possible power state.
+ */
+rpm_put:
+ pm_runtime_put_sync(dev);
+assert_power_resets:
+ reset_control_bulk_assert(data->num_power_resets,
+ host->power_resets);
+assert_rst_rsm_b:
+ regmap_update_bits(sysc, RZG3S_SYS_PCIE_RST_RSM_B,
+ RZG3S_SYS_PCIE_RST_RSM_B_MASK,
+ FIELD_PREP(RZG3S_SYS_PCIE_RST_RSM_B_MASK, 0));
+ return ret;
+}
+
+static const struct dev_pm_ops rzg3s_pcie_pm_ops = {
+ NOIRQ_SYSTEM_SLEEP_PM_OPS(rzg3s_pcie_suspend_noirq,
+ rzg3s_pcie_resume_noirq)
+};
+
+static const char * const rzg3s_soc_power_resets[] = {
+ "aresetn", "rst_cfg_b", "rst_load_b",
+};
+
+static const char * const rzg3s_soc_cfg_resets[] = {
+ "rst_b", "rst_ps_b", "rst_gp_b", "rst_rsm_b",
+};
+
+static const struct rzg3s_pcie_soc_data rzg3s_soc_data = {
+ .power_resets = rzg3s_soc_power_resets,
+ .num_power_resets = ARRAY_SIZE(rzg3s_soc_power_resets),
+ .cfg_resets = rzg3s_soc_cfg_resets,
+ .num_cfg_resets = ARRAY_SIZE(rzg3s_soc_cfg_resets),
+ .init_phy = rzg3s_soc_pcie_init_phy,
+};
+
+static const struct of_device_id rzg3s_pcie_of_match[] = {
+ {
+ .compatible = "renesas,r9a08g045-pcie",
+ .data = &rzg3s_soc_data,
+ },
+ {}
+};
+
+static struct platform_driver rzg3s_pcie_driver = {
+ .driver = {
+ .name = "rzg3s-pcie-host",
+ .of_match_table = rzg3s_pcie_of_match,
+ .pm = pm_ptr(&rzg3s_pcie_pm_ops),
+ .suppress_bind_attrs = true,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+ .probe = rzg3s_pcie_probe,
+};
+builtin_platform_driver(rzg3s_pcie_driver);
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index b4b62b9ccc45..ec6afc38e898 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -578,22 +578,6 @@ static void vmd_detach_resources(struct vmd_dev *vmd)
vmd->dev->resource[VMD_MEMBAR2].child = NULL;
}
-/*
- * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
- * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower
- * 16 bits are the PCI Segment Group (domain) number. Other bits are
- * currently reserved.
- */
-static int vmd_find_free_domain(void)
-{
- int domain = 0xffff;
- struct pci_bus *bus = NULL;
-
- while ((bus = pci_find_next_bus(bus)) != NULL)
- domain = max_t(int, domain, pci_domain_nr(bus));
- return domain + 1;
-}
-
static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint,
resource_size_t *offset1,
resource_size_t *offset2)
@@ -878,13 +862,6 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
.parent = res,
};
- sd->vmd_dev = vmd->dev;
- sd->domain = vmd_find_free_domain();
- if (sd->domain < 0)
- return sd->domain;
-
- sd->node = pcibus_to_node(vmd->dev->bus);
-
/*
* Currently MSI remapping must be enabled in guest passthrough mode
* due to some missing interrupt remapping plumbing. This is probably
@@ -910,9 +887,24 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]);
pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]);
+ sd->vmd_dev = vmd->dev;
+
+ /*
+ * Emulated domains start at 0x10000 to not clash with ACPI _SEG
+ * domains. Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of
+ * which the lower 16 bits are the PCI Segment Group (domain) number.
+ * Other bits are currently reserved.
+ */
+ sd->domain = pci_bus_find_emul_domain_nr(0, 0x10000, INT_MAX);
+ if (sd->domain < 0)
+ return sd->domain;
+
+ sd->node = pcibus_to_node(vmd->dev->bus);
+
vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start,
&vmd_ops, sd, &resources);
if (!vmd->bus) {
+ pci_bus_release_emul_domain_nr(sd->domain);
pci_free_resource_list(&resources);
vmd_remove_irq_domain(vmd);
return -ENODEV;
@@ -1005,6 +997,7 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
return -ENOMEM;
vmd->dev = dev;
+ vmd->sysdata.domain = PCI_DOMAIN_NR_NOT_SET;
vmd->instance = ida_alloc(&vmd_instance_ida, GFP_KERNEL);
if (vmd->instance < 0)
return vmd->instance;
@@ -1070,6 +1063,7 @@ static void vmd_remove(struct pci_dev *dev)
vmd_detach_resources(vmd);
vmd_remove_irq_domain(vmd);
ida_free(&vmd_instance_ida, vmd->instance);
+ pci_bus_release_emul_domain_nr(vmd->sysdata.domain);
}
static void vmd_shutdown(struct pci_dev *dev)
diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index 0686f7ecbe91..debd235253c5 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -729,8 +729,9 @@ static void pci_epf_test_enable_doorbell(struct pci_epf_test *epf_test,
if (bar < BAR_0)
goto err_doorbell_cleanup;
- ret = request_irq(epf->db_msg[0].virq, pci_epf_test_doorbell_handler, 0,
- "pci-ep-test-doorbell", epf_test);
+ ret = request_threaded_irq(epf->db_msg[0].virq, NULL,
+ pci_epf_test_doorbell_handler, IRQF_ONESHOT,
+ "pci-ep-test-doorbell", epf_test);
if (ret) {
dev_err(&epf->dev,
"Failed to request doorbell IRQ: %d\n",
diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c
index 83e9ab10f9c4..3ecc5059f92b 100644
--- a/drivers/pci/endpoint/functions/pci-epf-vntb.c
+++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c
@@ -36,11 +36,13 @@
* PCIe Root Port PCI EP
*/
+#include <linux/atomic.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/pci-ep-msi.h>
#include <linux/pci-epc.h>
#include <linux/pci-epf.h>
#include <linux/ntb.h>
@@ -126,12 +128,13 @@ struct epf_ntb {
u32 db_count;
u32 spad_count;
u64 mws_size[MAX_MW];
- u64 db;
+ atomic64_t db;
u32 vbus_number;
u16 vntb_pid;
u16 vntb_vid;
bool linkup;
+ bool msi_doorbell;
u32 spad_size;
enum pci_barno epf_ntb_bar[VNTB_BAR_NUM];
@@ -258,9 +261,9 @@ static void epf_ntb_cmd_handler(struct work_struct *work)
ntb = container_of(work, struct epf_ntb, cmd_handler.work);
- for (i = 1; i < ntb->db_count; i++) {
+ for (i = 1; i < ntb->db_count && !ntb->msi_doorbell; i++) {
if (ntb->epf_db[i]) {
- ntb->db |= 1 << (i - 1);
+ atomic64_or(1 << (i - 1), &ntb->db);
ntb_db_event(&ntb->ntb, i);
ntb->epf_db[i] = 0;
}
@@ -319,7 +322,21 @@ static void epf_ntb_cmd_handler(struct work_struct *work)
reset_handler:
queue_delayed_work(kpcintb_workqueue, &ntb->cmd_handler,
- msecs_to_jiffies(5));
+ ntb->msi_doorbell ? msecs_to_jiffies(500) : msecs_to_jiffies(5));
+}
+
+static irqreturn_t epf_ntb_doorbell_handler(int irq, void *data)
+{
+ struct epf_ntb *ntb = data;
+ int i;
+
+ for (i = 1; i < ntb->db_count; i++)
+ if (irq == ntb->epf->db_msg[i].virq) {
+ atomic64_or(1 << (i - 1), &ntb->db);
+ ntb_db_event(&ntb->ntb, i);
+ }
+
+ return IRQ_HANDLED;
}
/**
@@ -500,6 +517,94 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb)
return 0;
}
+static int epf_ntb_db_bar_init_msi_doorbell(struct epf_ntb *ntb,
+ struct pci_epf_bar *db_bar,
+ const struct pci_epc_features *epc_features,
+ enum pci_barno barno)
+{
+ struct pci_epf *epf = ntb->epf;
+ dma_addr_t low, high;
+ struct msi_msg *msg;
+ size_t sz;
+ int ret;
+ int i;
+
+ ret = pci_epf_alloc_doorbell(epf, ntb->db_count);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < ntb->db_count; i++) {
+ ret = request_irq(epf->db_msg[i].virq, epf_ntb_doorbell_handler,
+ 0, "pci_epf_vntb_db", ntb);
+
+ if (ret) {
+ dev_err(&epf->dev,
+ "Failed to request doorbell IRQ: %d\n",
+ epf->db_msg[i].virq);
+ goto err_free_irq;
+ }
+ }
+
+ msg = &epf->db_msg[0].msg;
+
+ high = 0;
+ low = (u64)msg->address_hi << 32 | msg->address_lo;
+
+ for (i = 0; i < ntb->db_count; i++) {
+ struct msi_msg *msg = &epf->db_msg[i].msg;
+ dma_addr_t addr = (u64)msg->address_hi << 32 | msg->address_lo;
+
+ low = min(low, addr);
+ high = max(high, addr);
+ }
+
+ sz = high - low + sizeof(u32);
+
+ ret = pci_epf_assign_bar_space(epf, sz, barno, epc_features, 0, low);
+ if (ret) {
+ dev_err(&epf->dev, "Failed to assign Doorbell BAR space\n");
+ goto err_free_irq;
+ }
+
+ ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no,
+ ntb->epf->vfunc_no, db_bar);
+ if (ret) {
+ dev_err(&epf->dev, "Failed to set Doorbell BAR\n");
+ goto err_free_irq;
+ }
+
+ for (i = 0; i < ntb->db_count; i++) {
+ struct msi_msg *msg = &epf->db_msg[i].msg;
+ dma_addr_t addr;
+ size_t offset;
+
+ ret = pci_epf_align_inbound_addr(epf, db_bar->barno,
+ ((u64)msg->address_hi << 32) | msg->address_lo,
+ &addr, &offset);
+
+ if (ret) {
+ ntb->msi_doorbell = false;
+ goto err_free_irq;
+ }
+
+ ntb->reg->db_data[i] = msg->data;
+ ntb->reg->db_offset[i] = offset;
+ }
+
+ ntb->reg->db_entry_size = 0;
+
+ ntb->msi_doorbell = true;
+
+ return 0;
+
+err_free_irq:
+ for (i--; i >= 0; i--)
+ free_irq(epf->db_msg[i].virq, ntb);
+
+ pci_epf_free_doorbell(ntb->epf);
+ return ret;
+}
+
/**
* epf_ntb_db_bar_init() - Configure Doorbell window BARs
* @ntb: NTB device that facilitates communication between HOST and VHOST
@@ -520,21 +625,25 @@ static int epf_ntb_db_bar_init(struct epf_ntb *ntb)
ntb->epf->func_no,
ntb->epf->vfunc_no);
barno = ntb->epf_ntb_bar[BAR_DB];
-
- mw_addr = pci_epf_alloc_space(ntb->epf, size, barno, epc_features, 0);
- if (!mw_addr) {
- dev_err(dev, "Failed to allocate OB address\n");
- return -ENOMEM;
- }
-
- ntb->epf_db = mw_addr;
-
epf_bar = &ntb->epf->bar[barno];
- ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no, ntb->epf->vfunc_no, epf_bar);
+ ret = epf_ntb_db_bar_init_msi_doorbell(ntb, epf_bar, epc_features, barno);
if (ret) {
- dev_err(dev, "Doorbell BAR set failed\n");
+ /* fall back to polling mode */
+ mw_addr = pci_epf_alloc_space(ntb->epf, size, barno, epc_features, 0);
+ if (!mw_addr) {
+ dev_err(dev, "Failed to allocate OB address\n");
+ return -ENOMEM;
+ }
+
+ ntb->epf_db = mw_addr;
+
+ ret = pci_epc_set_bar(ntb->epf->epc, ntb->epf->func_no,
+ ntb->epf->vfunc_no, epf_bar);
+ if (ret) {
+ dev_err(dev, "Doorbell BAR set failed\n");
goto err_alloc_peer_mem;
+ }
}
return ret;
@@ -554,6 +663,16 @@ static void epf_ntb_db_bar_clear(struct epf_ntb *ntb)
{
enum pci_barno barno;
+ if (ntb->msi_doorbell) {
+ int i;
+
+ for (i = 0; i < ntb->db_count; i++)
+ free_irq(ntb->epf->db_msg[i].virq, ntb);
+ }
+
+ if (ntb->epf->db_msg)
+ pci_epf_free_doorbell(ntb->epf);
+
barno = ntb->epf_ntb_bar[BAR_DB];
pci_epf_free_space(ntb->epf, ntb->epf_db, barno, 0);
pci_epc_clear_bar(ntb->epf->epc,
@@ -1268,7 +1387,7 @@ static u64 vntb_epf_db_read(struct ntb_dev *ndev)
{
struct epf_ntb *ntb = ntb_ndev(ndev);
- return ntb->db;
+ return atomic64_read(&ntb->db);
}
static int vntb_epf_mw_get_align(struct ntb_dev *ndev, int pidx, int idx,
@@ -1308,7 +1427,7 @@ static int vntb_epf_db_clear(struct ntb_dev *ndev, u64 db_bits)
{
struct epf_ntb *ntb = ntb_ndev(ndev);
- ntb->db &= ~db_bits;
+ atomic64_and(~db_bits, &ntb->db);
return 0;
}
diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index d54e18872aef..9a505c796370 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -208,6 +208,48 @@ void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf)
}
EXPORT_SYMBOL_GPL(pci_epf_remove_vepf);
+static int pci_epf_get_required_bar_size(struct pci_epf *epf, size_t *bar_size,
+ size_t *aligned_mem_size,
+ enum pci_barno bar,
+ const struct pci_epc_features *epc_features,
+ enum pci_epc_interface_type type)
+{
+ u64 bar_fixed_size = epc_features->bar[bar].fixed_size;
+ size_t align = epc_features->align;
+ size_t size = *bar_size;
+
+ if (size < 128)
+ size = 128;
+
+ /* According to PCIe base spec, min size for a resizable BAR is 1 MB. */
+ if (epc_features->bar[bar].type == BAR_RESIZABLE && size < SZ_1M)
+ size = SZ_1M;
+
+ if (epc_features->bar[bar].type == BAR_FIXED && bar_fixed_size) {
+ if (size > bar_fixed_size) {
+ dev_err(&epf->dev,
+ "requested BAR size is larger than fixed size\n");
+ return -ENOMEM;
+ }
+ size = bar_fixed_size;
+ } else {
+ /* BAR size must be power of two */
+ size = roundup_pow_of_two(size);
+ }
+
+ *bar_size = size;
+
+ /*
+ * The EPC's BAR start address must meet alignment requirements. In most
+ * cases, the alignment will match the BAR size. However, differences
+ * can occur—for example, when the fixed BAR size (e.g., 128 bytes) is
+ * smaller than the required alignment (e.g., 4 KB).
+ */
+ *aligned_mem_size = align ? ALIGN(size, align) : size;
+
+ return 0;
+}
+
/**
* pci_epf_free_space() - free the allocated PCI EPF register space
* @epf: the EPF device from whom to free the memory
@@ -236,13 +278,13 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
}
dev = epc->dev.parent;
- dma_free_coherent(dev, epf_bar[bar].aligned_size, addr,
+ dma_free_coherent(dev, epf_bar[bar].mem_size, addr,
epf_bar[bar].phys_addr);
epf_bar[bar].phys_addr = 0;
epf_bar[bar].addr = NULL;
epf_bar[bar].size = 0;
- epf_bar[bar].aligned_size = 0;
+ epf_bar[bar].mem_size = 0;
epf_bar[bar].barno = 0;
epf_bar[bar].flags = 0;
}
@@ -264,40 +306,16 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
const struct pci_epc_features *epc_features,
enum pci_epc_interface_type type)
{
- u64 bar_fixed_size = epc_features->bar[bar].fixed_size;
- size_t aligned_size, align = epc_features->align;
struct pci_epf_bar *epf_bar;
dma_addr_t phys_addr;
struct pci_epc *epc;
struct device *dev;
+ size_t mem_size;
void *space;
- if (size < 128)
- size = 128;
-
- /* According to PCIe base spec, min size for a resizable BAR is 1 MB. */
- if (epc_features->bar[bar].type == BAR_RESIZABLE && size < SZ_1M)
- size = SZ_1M;
-
- if (epc_features->bar[bar].type == BAR_FIXED && bar_fixed_size) {
- if (size > bar_fixed_size) {
- dev_err(&epf->dev,
- "requested BAR size is larger than fixed size\n");
- return NULL;
- }
- size = bar_fixed_size;
- } else {
- /* BAR size must be power of two */
- size = roundup_pow_of_two(size);
- }
-
- /*
- * Allocate enough memory to accommodate the iATU alignment
- * requirement. In most cases, this will be the same as .size but
- * it might be different if, for example, the fixed size of a BAR
- * is smaller than align.
- */
- aligned_size = align ? ALIGN(size, align) : size;
+ if (pci_epf_get_required_bar_size(epf, &size, &mem_size, bar,
+ epc_features, type))
+ return NULL;
if (type == PRIMARY_INTERFACE) {
epc = epf->epc;
@@ -308,7 +326,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
}
dev = epc->dev.parent;
- space = dma_alloc_coherent(dev, aligned_size, &phys_addr, GFP_KERNEL);
+ space = dma_alloc_coherent(dev, mem_size, &phys_addr, GFP_KERNEL);
if (!space) {
dev_err(dev, "failed to allocate mem space\n");
return NULL;
@@ -317,7 +335,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
epf_bar[bar].phys_addr = phys_addr;
epf_bar[bar].addr = space;
epf_bar[bar].size = size;
- epf_bar[bar].aligned_size = aligned_size;
+ epf_bar[bar].mem_size = mem_size;
epf_bar[bar].barno = bar;
if (upper_32_bits(size) || epc_features->bar[bar].only_64bit)
epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
@@ -328,6 +346,83 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
}
EXPORT_SYMBOL_GPL(pci_epf_alloc_space);
+/**
+ * pci_epf_assign_bar_space() - Assign PCI EPF BAR space
+ * @epf: EPF device to assign the BAR memory
+ * @size: Size of the memory that has to be assigned
+ * @bar: BAR number for which the memory is assigned
+ * @epc_features: Features provided by the EPC specific to this EPF
+ * @type: Identifies if the assignment is for primary EPC or secondary EPC
+ * @bar_addr: Address to be assigned for the @bar
+ *
+ * Invoke to assign memory for the PCI EPF BAR.
+ * Flag PCI_BASE_ADDRESS_MEM_TYPE_64 will automatically get set if the BAR
+ * can only be a 64-bit BAR, or if the requested size is larger than 2 GB.
+ */
+int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size,
+ enum pci_barno bar,
+ const struct pci_epc_features *epc_features,
+ enum pci_epc_interface_type type,
+ dma_addr_t bar_addr)
+{
+ size_t bar_size, aligned_mem_size;
+ struct pci_epf_bar *epf_bar;
+ dma_addr_t limit;
+ int pos;
+
+ if (!size)
+ return -EINVAL;
+
+ limit = bar_addr + size - 1;
+
+ /*
+ * Bits: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ * bar_addr: U U U U U U 0 X X X X X X X X X
+ * limit: U U U U U U 1 X X X X X X X X X
+ *
+ * bar_addr^limit 0 0 0 0 0 0 1 X X X X X X X X X
+ *
+ * U: unchanged address bits in range [bar_addr, limit]
+ * X: bit 0 or 1
+ *
+ * (bar_addr^limit) & BIT_ULL(pos) will find the first set bit from MSB
+ * (pos). And value of (2 ^ pos) should be able to cover the BAR range.
+ */
+ for (pos = 8 * sizeof(dma_addr_t) - 1; pos > 0; pos--)
+ if ((limit ^ bar_addr) & BIT_ULL(pos))
+ break;
+
+ if (pos == 8 * sizeof(dma_addr_t) - 1)
+ return -EINVAL;
+
+ bar_size = BIT_ULL(pos + 1);
+ if (pci_epf_get_required_bar_size(epf, &bar_size, &aligned_mem_size,
+ bar, epc_features, type))
+ return -ENOMEM;
+
+ if (type == PRIMARY_INTERFACE)
+ epf_bar = epf->bar;
+ else
+ epf_bar = epf->sec_epc_bar;
+
+ epf_bar[bar].phys_addr = ALIGN_DOWN(bar_addr, aligned_mem_size);
+
+ if (epf_bar[bar].phys_addr + bar_size < limit)
+ return -ENOMEM;
+
+ epf_bar[bar].addr = NULL;
+ epf_bar[bar].size = bar_size;
+ epf_bar[bar].mem_size = aligned_mem_size;
+ epf_bar[bar].barno = bar;
+ if (upper_32_bits(size) || epc_features->bar[bar].only_64bit)
+ epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+ else
+ epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_32;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_epf_assign_bar_space);
+
static void pci_epf_remove_cfs(struct pci_epf_driver *driver)
{
struct config_group *group, *tmp;
diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c
index afa50b446567..be5ef6516cff 100644
--- a/drivers/pci/host-bridge.c
+++ b/drivers/pci/host-bridge.c
@@ -33,6 +33,7 @@ struct device *pci_get_host_bridge_device(struct pci_dev *dev)
kobject_get(&bridge->kobj);
return bridge;
}
+EXPORT_SYMBOL_GPL(pci_get_host_bridge_device);
void pci_put_host_bridge_device(struct device *dev)
{
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 77dee43b7858..00784a60ba80 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -158,8 +158,7 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
return dev->sriov->barsz[pci_resource_num_to_vf_bar(resno)];
}
-void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
- resource_size_t size)
+void pci_iov_resource_set_size(struct pci_dev *dev, int resno, int size)
{
if (!pci_resource_is_iov(resno)) {
pci_warn(dev, "%s is not an IOV resource\n",
@@ -167,7 +166,8 @@ void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
return;
}
- dev->sriov->barsz[pci_resource_num_to_vf_bar(resno)] = size;
+ resno = pci_resource_num_to_vf_bar(resno);
+ dev->sriov->barsz[resno] = pci_rebar_size_to_bytes(size);
}
bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev)
@@ -1339,29 +1339,16 @@ EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
*/
int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size)
{
- u32 sizes;
- int ret;
-
if (!pci_resource_is_iov(resno))
return -EINVAL;
if (pci_iov_is_memory_decoding_enabled(dev))
return -EBUSY;
- sizes = pci_rebar_get_possible_sizes(dev, resno);
- if (!sizes)
- return -ENOTSUPP;
-
- if (!(sizes & BIT(size)))
+ if (!pci_rebar_size_supported(dev, resno, size))
return -EINVAL;
- ret = pci_rebar_set_size(dev, resno, size);
- if (ret)
- return ret;
-
- pci_iov_resource_set_size(dev, resno, pci_rebar_size_to_bytes(size));
-
- return 0;
+ return pci_rebar_set_size(dev, resno, size);
}
EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size);
@@ -1380,7 +1367,7 @@ EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size);
u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs)
{
u64 vf_len = pci_resource_len(dev, resno);
- u32 sizes;
+ u64 sizes;
if (!num_vfs)
return 0;
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 78e108e47254..981a76b6b7c0 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -25,12 +25,12 @@ struct pci_p2pdma {
struct gen_pool *pool;
bool p2pmem_published;
struct xarray map_types;
+ struct p2pdma_provider mem[PCI_STD_NUM_BARS];
};
struct pci_p2pdma_pagemap {
- struct pci_dev *provider;
- u64 bus_offset;
struct dev_pagemap pgmap;
+ struct p2pdma_provider *mem;
};
static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
@@ -204,8 +204,8 @@ static void p2pdma_page_free(struct page *page)
{
struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
/* safe to dereference while a reference is held to the percpu ref */
- struct pci_p2pdma *p2pdma =
- rcu_dereference_protected(pgmap->provider->p2pdma, 1);
+ struct pci_p2pdma *p2pdma = rcu_dereference_protected(
+ to_pci_dev(pgmap->mem->owner)->p2pdma, 1);
struct percpu_ref *ref;
gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
@@ -228,56 +228,136 @@ static void pci_p2pdma_release(void *data)
/* Flush and disable pci_alloc_p2p_mem() */
pdev->p2pdma = NULL;
- synchronize_rcu();
+ if (p2pdma->pool)
+ synchronize_rcu();
+ xa_destroy(&p2pdma->map_types);
+
+ if (!p2pdma->pool)
+ return;
gen_pool_destroy(p2pdma->pool);
sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
- xa_destroy(&p2pdma->map_types);
}
-static int pci_p2pdma_setup(struct pci_dev *pdev)
+/**
+ * pcim_p2pdma_init - Initialise peer-to-peer DMA providers
+ * @pdev: The PCI device to enable P2PDMA for
+ *
+ * This function initializes the peer-to-peer DMA infrastructure
+ * for a PCI device. It allocates and sets up the necessary data
+ * structures to support P2PDMA operations, including mapping type
+ * tracking.
+ */
+int pcim_p2pdma_init(struct pci_dev *pdev)
{
- int error = -ENOMEM;
struct pci_p2pdma *p2p;
+ int i, ret;
+
+ p2p = rcu_dereference_protected(pdev->p2pdma, 1);
+ if (p2p)
+ return 0;
p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
if (!p2p)
return -ENOMEM;
xa_init(&p2p->map_types);
+ /*
+ * Iterate over all standard PCI BARs and record only those that
+ * correspond to MMIO regions. Skip non-memory resources (e.g. I/O
+ * port BARs) since they cannot be used for peer-to-peer (P2P)
+ * transactions.
+ */
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+ continue;
- p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
- if (!p2p->pool)
- goto out;
+ p2p->mem[i].owner = &pdev->dev;
+ p2p->mem[i].bus_offset =
+ pci_bus_address(pdev, i) - pci_resource_start(pdev, i);
+ }
- error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
- if (error)
- goto out_pool_destroy;
+ ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
+ if (ret)
+ goto out_p2p;
- error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
- if (error)
+ rcu_assign_pointer(pdev->p2pdma, p2p);
+ return 0;
+
+out_p2p:
+ devm_kfree(&pdev->dev, p2p);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pcim_p2pdma_init);
+
+/**
+ * pcim_p2pdma_provider - Get peer-to-peer DMA provider
+ * @pdev: The PCI device to enable P2PDMA for
+ * @bar: BAR index to get provider
+ *
+ * This function gets peer-to-peer DMA provider for a PCI device. The lifetime
+ * of the provider (and of course the MMIO) is bound to the lifetime of the
+ * driver. A driver calling this function must ensure that all references to the
+ * provider, and any DMA mappings created for any MMIO, are all cleaned up
+ * before the driver remove() completes.
+ *
+ * Since P2P is almost always shared with a second driver this means some system
+ * to notify, invalidate and revoke the MMIO's DMA must be in place to use this
+ * function. For example a revoke can be built using DMABUF.
+ */
+struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar)
+{
+ struct pci_p2pdma *p2p;
+
+ if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
+ return NULL;
+
+ p2p = rcu_dereference_protected(pdev->p2pdma, 1);
+ if (WARN_ON(!p2p))
+ /* Someone forgot to call to pcim_p2pdma_init() before */
+ return NULL;
+
+ return &p2p->mem[bar];
+}
+EXPORT_SYMBOL_GPL(pcim_p2pdma_provider);
+
+static int pci_p2pdma_setup_pool(struct pci_dev *pdev)
+{
+ struct pci_p2pdma *p2pdma;
+ int ret;
+
+ p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
+ if (p2pdma->pool)
+ /* We already setup pools, do nothing, */
+ return 0;
+
+ p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
+ if (!p2pdma->pool)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
+ if (ret)
goto out_pool_destroy;
- rcu_assign_pointer(pdev->p2pdma, p2p);
return 0;
out_pool_destroy:
- gen_pool_destroy(p2p->pool);
-out:
- devm_kfree(&pdev->dev, p2p);
- return error;
+ gen_pool_destroy(p2pdma->pool);
+ p2pdma->pool = NULL;
+ return ret;
}
static void pci_p2pdma_unmap_mappings(void *data)
{
- struct pci_dev *pdev = data;
+ struct pci_p2pdma_pagemap *p2p_pgmap = data;
/*
* Removing the alloc attribute from sysfs will call
* unmap_mapping_range() on the inode, teardown any existing userspace
* mappings and prevent new ones from being created.
*/
- sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
+ sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj,
+ &p2pmem_alloc_attr.attr,
p2pmem_group.name);
}
@@ -295,6 +375,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
u64 offset)
{
struct pci_p2pdma_pagemap *p2p_pgmap;
+ struct p2pdma_provider *mem;
struct dev_pagemap *pgmap;
struct pci_p2pdma *p2pdma;
void *addr;
@@ -312,11 +393,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
if (size + offset > pci_resource_len(pdev, bar))
return -EINVAL;
- if (!pdev->p2pdma) {
- error = pci_p2pdma_setup(pdev);
- if (error)
- return error;
- }
+ error = pcim_p2pdma_init(pdev);
+ if (error)
+ return error;
+
+ error = pci_p2pdma_setup_pool(pdev);
+ if (error)
+ return error;
+
+ mem = pcim_p2pdma_provider(pdev, bar);
+ /*
+ * We checked validity of BAR prior to call
+ * to pcim_p2pdma_provider. It should never return NULL.
+ */
+ if (WARN_ON(!mem))
+ return -EINVAL;
p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
if (!p2p_pgmap)
@@ -328,10 +419,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
pgmap->ops = &p2pdma_pgmap_ops;
-
- p2p_pgmap->provider = pdev;
- p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
- pci_resource_start(pdev, bar);
+ p2p_pgmap->mem = mem;
addr = devm_memremap_pages(&pdev->dev, pgmap);
if (IS_ERR(addr)) {
@@ -340,7 +428,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
}
error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
- pdev);
+ p2p_pgmap);
if (error)
goto pages_free;
@@ -972,16 +1060,26 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
}
EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
-static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
- struct device *dev)
+/**
+ * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers
+ * @provider: P2PDMA provider structure
+ * @dev: Target device for the transfer
+ *
+ * Determines how peer-to-peer DMA transfers should be mapped between
+ * the provider and the target device. The mapping type indicates whether
+ * the transfer can be done directly through PCI switches or must go
+ * through the host bridge.
+ */
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+ struct device *dev)
{
enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
- struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider;
+ struct pci_dev *pdev = to_pci_dev(provider->owner);
struct pci_dev *client;
struct pci_p2pdma *p2pdma;
int dist;
- if (!provider->p2pdma)
+ if (!pdev->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
if (!dev_is_pci(dev))
@@ -990,7 +1088,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
client = to_pci_dev(dev);
rcu_read_lock();
- p2pdma = rcu_dereference(provider->p2pdma);
+ p2pdma = rcu_dereference(pdev->p2pdma);
if (p2pdma)
type = xa_to_value(xa_load(&p2pdma->map_types,
@@ -998,7 +1096,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
rcu_read_unlock();
if (type == PCI_P2PDMA_MAP_UNKNOWN)
- return calc_map_type_and_dist(provider, client, &dist, true);
+ return calc_map_type_and_dist(pdev, client, &dist, true);
return type;
}
@@ -1006,9 +1104,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
struct device *dev, struct page *page)
{
- state->pgmap = page_pgmap(page);
- state->map = pci_p2pdma_map_type(state->pgmap, dev);
- state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset;
+ struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page));
+
+ if (state->mem == p2p_pgmap->mem)
+ return;
+
+ state->mem = p2p_pgmap->mem;
+ state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev);
}
/**
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 302d61783f6c..7c2d9d596258 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -629,6 +629,8 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state)
struct pci_dev *pci_dev = to_pci_dev(dev);
struct pci_driver *drv = pci_dev->driver;
+ pci_dev->state_saved = false;
+
if (drv && drv->suspend) {
pci_power_t prev = pci_dev->current_state;
int error;
@@ -1036,6 +1038,8 @@ static int pci_pm_freeze(struct device *dev)
if (!pm) {
pci_pm_default_suspend(pci_dev);
+ if (!pm_runtime_suspended(dev))
+ pci_dev->state_saved = false;
return 0;
}
@@ -1129,8 +1133,6 @@ static int pci_pm_thaw(struct device *dev)
pci_pm_reenable_device(pci_dev);
}
- pci_dev->state_saved = false;
-
return error;
}
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 3881359440b1..80a7c4fe6b03 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1587,7 +1587,7 @@ static ssize_t __resource_resize_show(struct device *dev, int n, char *buf)
pci_config_pm_runtime_get(pdev);
ret = sysfs_emit(buf, "%016llx\n",
- (u64)pci_rebar_get_possible_sizes(pdev, n));
+ pci_rebar_get_possible_sizes(pdev, n));
pci_config_pm_runtime_put(pdev);
@@ -1599,18 +1599,13 @@ static ssize_t __resource_resize_store(struct device *dev, int n,
{
struct pci_dev *pdev = to_pci_dev(dev);
struct pci_bus *bus = pdev->bus;
- struct resource *b_win, *res;
unsigned long size;
- int ret, i;
+ int ret;
u16 cmd;
if (kstrtoul(buf, 0, &size) < 0)
return -EINVAL;
- b_win = pbus_select_window(bus, pci_resource_n(pdev, n));
- if (!b_win)
- return -EINVAL;
-
device_lock(dev);
if (dev->driver || pci_num_vf(pdev)) {
ret = -EBUSY;
@@ -1632,15 +1627,7 @@ static ssize_t __resource_resize_store(struct device *dev, int n,
pci_remove_resource_files(pdev);
- pci_dev_for_each_resource(pdev, res, i) {
- if (i >= PCI_BRIDGE_RESOURCES)
- break;
-
- if (b_win == pbus_select_window(bus, res))
- pci_release_resource(pdev, i);
- }
-
- ret = pci_resize_resource(pdev, n, size);
+ ret = pci_resize_resource(pdev, n, size, 0);
pci_assign_unassigned_bus_resources(bus);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b14dd064006c..13dbb405dc31 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1823,41 +1823,12 @@ static void pci_restore_config_space(struct pci_dev *pdev)
}
}
-static void pci_restore_rebar_state(struct pci_dev *pdev)
-{
- unsigned int pos, nbars, i;
- u32 ctrl;
-
- pos = pdev->rebar_cap;
- if (!pos)
- return;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
-
- for (i = 0; i < nbars; i++, pos += 8) {
- struct resource *res;
- int bar_idx, size;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX;
- res = pci_resource_n(pdev, bar_idx);
- size = pci_rebar_bytes_to_size(resource_size(res));
- ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
- ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
- pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
- }
-}
-
/**
* pci_restore_state - Restore the saved state of a PCI device
* @dev: PCI device that we're dealing with
*/
void pci_restore_state(struct pci_dev *dev)
{
- if (!dev->state_saved)
- return;
-
pci_restore_pcie_state(dev);
pci_restore_pasid_state(dev);
pci_restore_pri_state(dev);
@@ -3687,125 +3658,6 @@ void pci_acs_init(struct pci_dev *dev)
pci_enable_acs(dev);
}
-void pci_rebar_init(struct pci_dev *pdev)
-{
- pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR);
-}
-
-/**
- * pci_rebar_find_pos - find position of resize ctrl reg for BAR
- * @pdev: PCI device
- * @bar: BAR to find
- *
- * Helper to find the position of the ctrl register for a BAR.
- * Returns -ENOTSUPP if resizable BARs are not supported at all.
- * Returns -ENOENT if no ctrl register for the BAR could be found.
- */
-static int pci_rebar_find_pos(struct pci_dev *pdev, int bar)
-{
- unsigned int pos, nbars, i;
- u32 ctrl;
-
- if (pci_resource_is_iov(bar)) {
- pos = pci_iov_vf_rebar_cap(pdev);
- bar = pci_resource_num_to_vf_bar(bar);
- } else {
- pos = pdev->rebar_cap;
- }
-
- if (!pos)
- return -ENOTSUPP;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
-
- for (i = 0; i < nbars; i++, pos += 8) {
- int bar_idx;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- bar_idx = FIELD_GET(PCI_REBAR_CTRL_BAR_IDX, ctrl);
- if (bar_idx == bar)
- return pos;
- }
-
- return -ENOENT;
-}
-
-/**
- * pci_rebar_get_possible_sizes - get possible sizes for BAR
- * @pdev: PCI device
- * @bar: BAR to query
- *
- * Get the possible sizes of a resizable BAR as bitmask defined in the spec
- * (bit 0=1MB, bit 31=128TB). Returns 0 if BAR isn't resizable.
- */
-u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
-{
- int pos;
- u32 cap;
-
- pos = pci_rebar_find_pos(pdev, bar);
- if (pos < 0)
- return 0;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap);
- cap = FIELD_GET(PCI_REBAR_CAP_SIZES, cap);
-
- /* Sapphire RX 5600 XT Pulse has an invalid cap dword for BAR 0 */
- if (pdev->vendor == PCI_VENDOR_ID_ATI && pdev->device == 0x731f &&
- bar == 0 && cap == 0x700)
- return 0x3f00;
-
- return cap;
-}
-EXPORT_SYMBOL(pci_rebar_get_possible_sizes);
-
-/**
- * pci_rebar_get_current_size - get the current size of a BAR
- * @pdev: PCI device
- * @bar: BAR to set size to
- *
- * Read the size of a BAR from the resizable BAR config.
- * Returns size if found or negative error code.
- */
-int pci_rebar_get_current_size(struct pci_dev *pdev, int bar)
-{
- int pos;
- u32 ctrl;
-
- pos = pci_rebar_find_pos(pdev, bar);
- if (pos < 0)
- return pos;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- return FIELD_GET(PCI_REBAR_CTRL_BAR_SIZE, ctrl);
-}
-
-/**
- * pci_rebar_set_size - set a new size for a BAR
- * @pdev: PCI device
- * @bar: BAR to set size to
- * @size: new size as defined in the spec (0=1MB, 31=128TB)
- *
- * Set the new size of a BAR as defined in the spec.
- * Returns zero if resizing was successful, error code otherwise.
- */
-int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size)
-{
- int pos;
- u32 ctrl;
-
- pos = pci_rebar_find_pos(pdev, bar);
- if (pos < 0)
- return pos;
-
- pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
- ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
- ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
- pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
- return 0;
-}
-
/**
* pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port
* @dev: the PCI device
@@ -6656,9 +6508,31 @@ static void pci_no_domains(void)
#endif
}
+#ifdef CONFIG_PCI_DOMAINS
+static DEFINE_IDA(pci_domain_nr_dynamic_ida);
+
+/**
+ * pci_bus_find_emul_domain_nr() - allocate a PCI domain number per constraints
+ * @hint: desired domain, 0 if any ID in the range of @min to @max is acceptable
+ * @min: minimum allowable domain
+ * @max: maximum allowable domain, no IDs higher than INT_MAX will be returned
+ */
+int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max)
+{
+ return ida_alloc_range(&pci_domain_nr_dynamic_ida, max(hint, min), max,
+ GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(pci_bus_find_emul_domain_nr);
+
+void pci_bus_release_emul_domain_nr(int domain_nr)
+{
+ ida_free(&pci_domain_nr_dynamic_ida, domain_nr);
+}
+EXPORT_SYMBOL_GPL(pci_bus_release_emul_domain_nr);
+#endif
+
#ifdef CONFIG_PCI_DOMAINS_GENERIC
static DEFINE_IDA(pci_domain_nr_static_ida);
-static DEFINE_IDA(pci_domain_nr_dynamic_ida);
static void of_pci_reserve_static_domain_nr(void)
{
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 36f8c0985430..a33bc4e0bf34 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -421,8 +421,10 @@ enum pci_bar_type {
struct device *pci_get_host_bridge_device(struct pci_dev *dev);
void pci_put_host_bridge_device(struct device *dev);
+void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size);
+int pci_do_resource_release_and_resize(struct pci_dev *dev, int resno, int size,
+ int exclude_bars);
unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge);
-int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res);
int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
int pci_configure_extended_tags(struct pci_dev *dev, void *ign);
@@ -808,8 +810,7 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno);
resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno);
void pci_restore_iov_state(struct pci_dev *dev);
int pci_iov_bus_range(struct pci_bus *bus);
-void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
- resource_size_t size);
+void pci_iov_resource_set_size(struct pci_dev *dev, int resno, int size);
bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev);
static inline u16 pci_iov_vf_rebar_cap(struct pci_dev *dev)
{
@@ -851,7 +852,7 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
return 0;
}
static inline void pci_iov_resource_set_size(struct pci_dev *dev, int resno,
- resource_size_t size) { }
+ int size) { }
static inline bool pci_iov_is_memory_decoding_enabled(struct pci_dev *dev)
{
return false;
@@ -1022,12 +1023,9 @@ static inline int acpi_get_rc_resources(struct device *dev, const char *hid,
#endif
void pci_rebar_init(struct pci_dev *pdev);
+void pci_restore_rebar_state(struct pci_dev *pdev);
int pci_rebar_get_current_size(struct pci_dev *pdev, int bar);
int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size);
-static inline u64 pci_rebar_size_to_bytes(int size)
-{
- return 1ULL << (size + 20);
-}
struct device_node;
diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c
index d1b68c18444f..38a41ccf79b9 100644
--- a/drivers/pci/pcie/portdrv.c
+++ b/drivers/pci/pcie/portdrv.c
@@ -760,7 +760,6 @@ static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
device_for_each_child(&dev->dev, &off, pcie_port_device_iter);
pci_restore_state(dev);
- pci_save_state(dev);
return PCI_ERS_RESULT_RECOVERED;
}
diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c
index 65e4b008be00..ed0f9691e7d1 100644
--- a/drivers/pci/pcie/ptm.c
+++ b/drivers/pci/pcie/ptm.c
@@ -81,6 +81,11 @@ void pci_ptm_init(struct pci_dev *dev)
dev->ptm_granularity = 0;
}
+ if (cap & PCI_PTM_CAP_RES)
+ dev->ptm_responder = 1;
+ if (cap & PCI_PTM_CAP_REQ)
+ dev->ptm_requester = 1;
+
if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM)
pci_enable_ptm(dev, NULL);
@@ -144,6 +149,24 @@ static int __pci_enable_ptm(struct pci_dev *dev)
return -EINVAL;
}
+ switch (pci_pcie_type(dev)) {
+ case PCI_EXP_TYPE_ROOT_PORT:
+ if (!dev->ptm_root)
+ return -EINVAL;
+ break;
+ case PCI_EXP_TYPE_UPSTREAM:
+ if (!dev->ptm_responder)
+ return -EINVAL;
+ break;
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_LEG_END:
+ if (!dev->ptm_requester)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl);
ctrl |= PCI_PTM_CTRL_ENABLE;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 9cd032dff31e..124d2d309c58 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -650,6 +650,11 @@ static void pci_release_host_bridge_dev(struct device *dev)
pci_free_resource_list(&bridge->windows);
pci_free_resource_list(&bridge->dma_ranges);
+
+ /* Host bridges only have domain_nr set in the emulation case */
+ if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET)
+ pci_bus_release_emul_domain_nr(bridge->domain_nr);
+
kfree(bridge);
}
@@ -1130,7 +1135,8 @@ unregister:
device_del(&bridge->dev);
free:
#ifdef CONFIG_PCI_DOMAINS_GENERIC
- pci_bus_release_domain_nr(parent, bus->domain_nr);
+ if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET)
+ pci_bus_release_domain_nr(parent, bus->domain_nr);
#endif
if (bus_registered)
put_device(&bus->dev);
@@ -2747,8 +2753,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
pci_reassigndev_resource_alignment(dev);
- dev->state_saved = false;
-
pci_init_capabilities(dev);
/*
@@ -3170,8 +3174,7 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus,
* bus number if there is room.
*/
if (bus->self && bus->self->is_hotplug_bridge) {
- used_buses = max_t(unsigned int, available_buses,
- pci_hotplug_bus_size - 1);
+ used_buses = max(available_buses, pci_hotplug_bus_size - 1);
if (max - start < used_buses) {
max = start + used_buses;
diff --git a/drivers/pci/pwrctrl/Kconfig b/drivers/pci/pwrctrl/Kconfig
index 6956c1854811..e0f999f299bb 100644
--- a/drivers/pci/pwrctrl/Kconfig
+++ b/drivers/pci/pwrctrl/Kconfig
@@ -22,6 +22,21 @@ config PCI_PWRCTRL_SLOT
PCI slots. The voltage regulators powering the rails of the PCI slots
are expected to be defined in the devicetree node of the PCI bridge.
+config PCI_PWRCTRL_TC9563
+ tristate "PCI Power Control driver for TC9563 PCIe switch"
+ select PCI_PWRCTRL
+ default m if ARCH_QCOM
+ depends on I2C
+ help
+ Say Y here to enable the PCI Power Control driver of TC9563 PCIe
+ switch.
+
+ This driver enables power and configures the TC9563 PCIe switch
+ through i2c. TC9563 is a PCIe switch which has one upstream and three
+ downstream ports. To one of the downstream ports integrated ethernet
+ MAC is connected as endpoint device. Other two downstream ports are
+ supposed to connect to external device.
+
# deprecated
config HAVE_PWRCTL
bool
diff --git a/drivers/pci/pwrctrl/Makefile b/drivers/pci/pwrctrl/Makefile
index a4e5808d7850..13b02282106c 100644
--- a/drivers/pci/pwrctrl/Makefile
+++ b/drivers/pci/pwrctrl/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_PCI_PWRCTRL_PWRSEQ) += pci-pwrctrl-pwrseq.o
obj-$(CONFIG_PCI_PWRCTRL_SLOT) += pci-pwrctrl-slot.o
pci-pwrctrl-slot-y := slot.o
+
+obj-$(CONFIG_PCI_PWRCTRL_TC9563) += pci-pwrctrl-tc9563.o
diff --git a/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c b/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c
new file mode 100644
index 000000000000..ec423432ac65
--- /dev/null
+++ b/drivers/pci/pwrctrl/pci-pwrctrl-tc9563.c
@@ -0,0 +1,648 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#include <linux/array_size.h>
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/gpio/consumer.h>
+#include <linux/i2c.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/pci-pwrctrl.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/unaligned.h>
+
+#include "../pci.h"
+
+#define TC9563_GPIO_CONFIG 0x801208
+#define TC9563_RESET_GPIO 0x801210
+
+#define TC9563_PORT_L0S_DELAY 0x82496c
+#define TC9563_PORT_L1_DELAY 0x824970
+
+#define TC9563_EMBEDDED_ETH_DELAY 0x8200d8
+#define TC9563_ETH_L1_DELAY_MASK GENMASK(27, 18)
+#define TC9563_ETH_L1_DELAY_VALUE(x) FIELD_PREP(TC9563_ETH_L1_DELAY_MASK, x)
+#define TC9563_ETH_L0S_DELAY_MASK GENMASK(17, 13)
+#define TC9563_ETH_L0S_DELAY_VALUE(x) FIELD_PREP(TC9563_ETH_L0S_DELAY_MASK, x)
+
+#define TC9563_NFTS_2_5_GT 0x824978
+#define TC9563_NFTS_5_GT 0x82497c
+
+#define TC9563_PORT_LANE_ACCESS_ENABLE 0x828000
+
+#define TC9563_PHY_RATE_CHANGE_OVERRIDE 0x828040
+#define TC9563_PHY_RATE_CHANGE 0x828050
+
+#define TC9563_TX_MARGIN 0x828234
+
+#define TC9563_DFE_ENABLE 0x828a04
+#define TC9563_DFE_EQ0_MODE 0x828a08
+#define TC9563_DFE_EQ1_MODE 0x828a0c
+#define TC9563_DFE_EQ2_MODE 0x828a14
+#define TC9563_DFE_PD_MASK 0x828254
+
+#define TC9563_PORT_SELECT 0x82c02c
+#define TC9563_PORT_ACCESS_ENABLE 0x82c030
+
+#define TC9563_POWER_CONTROL 0x82b09c
+#define TC9563_POWER_CONTROL_OVREN 0x82b2c8
+
+#define TC9563_GPIO_MASK 0xfffffff3
+#define TC9563_GPIO_DEASSERT_BITS 0xc /* Bits to clear for GPIO deassert */
+
+#define TC9563_TX_MARGIN_MIN_UA 400000
+
+/*
+ * From TC9563 PORSYS rev 0.2, figure 1.1 POR boot sequence
+ * wait for 10ms for the internal osc frequency to stabilize.
+ */
+#define TC9563_OSC_STAB_DELAY_US (10 * USEC_PER_MSEC)
+
+#define TC9563_L0S_L1_DELAY_UNIT_NS 256 /* Each unit represents 256 nanoseconds */
+
+struct tc9563_pwrctrl_reg_setting {
+ unsigned int offset;
+ unsigned int val;
+};
+
+enum tc9563_pwrctrl_ports {
+ TC9563_USP,
+ TC9563_DSP1,
+ TC9563_DSP2,
+ TC9563_DSP3,
+ TC9563_ETHERNET,
+ TC9563_MAX
+};
+
+struct tc9563_pwrctrl_cfg {
+ u32 l0s_delay;
+ u32 l1_delay;
+ u32 tx_amp;
+ u8 nfts[2]; /* GEN1 & GEN2 */
+ bool disable_dfe;
+ bool disable_port;
+};
+
+#define TC9563_PWRCTL_MAX_SUPPLY 6
+
+static const char *const tc9563_supply_names[TC9563_PWRCTL_MAX_SUPPLY] = {
+ "vddc",
+ "vdd18",
+ "vdd09",
+ "vddio1",
+ "vddio2",
+ "vddio18",
+};
+
+struct tc9563_pwrctrl_ctx {
+ struct regulator_bulk_data supplies[TC9563_PWRCTL_MAX_SUPPLY];
+ struct tc9563_pwrctrl_cfg cfg[TC9563_MAX];
+ struct gpio_desc *reset_gpio;
+ struct i2c_adapter *adapter;
+ struct i2c_client *client;
+ struct pci_pwrctrl pwrctrl;
+};
+
+/*
+ * downstream port power off sequence, hardcoding the address
+ * as we don't know register names for these register offsets.
+ */
+static const struct tc9563_pwrctrl_reg_setting common_pwroff_seq[] = {
+ {0x82900c, 0x1},
+ {0x829010, 0x1},
+ {0x829018, 0x0},
+ {0x829020, 0x1},
+ {0x82902c, 0x1},
+ {0x829030, 0x1},
+ {0x82903c, 0x1},
+ {0x829058, 0x0},
+ {0x82905c, 0x1},
+ {0x829060, 0x1},
+ {0x8290cc, 0x1},
+ {0x8290d0, 0x1},
+ {0x8290d8, 0x1},
+ {0x8290e0, 0x1},
+ {0x8290e8, 0x1},
+ {0x8290ec, 0x1},
+ {0x8290f4, 0x1},
+ {0x82910c, 0x1},
+ {0x829110, 0x1},
+ {0x829114, 0x1},
+};
+
+static const struct tc9563_pwrctrl_reg_setting dsp1_pwroff_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, 0x2},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, 0x3},
+ {TC9563_POWER_CONTROL, 0x014f4804},
+ {TC9563_POWER_CONTROL_OVREN, 0x1},
+ {TC9563_PORT_ACCESS_ENABLE, 0x4},
+};
+
+static const struct tc9563_pwrctrl_reg_setting dsp2_pwroff_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, 0x8},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, 0x1},
+ {TC9563_POWER_CONTROL, 0x014f4804},
+ {TC9563_POWER_CONTROL_OVREN, 0x1},
+ {TC9563_PORT_ACCESS_ENABLE, 0x8},
+};
+
+/*
+ * Since all transfers are initiated by the probe, no locks are necessary,
+ * as there are no concurrent calls.
+ */
+static int tc9563_pwrctrl_i2c_write(struct i2c_client *client,
+ u32 reg_addr, u32 reg_val)
+{
+ struct i2c_msg msg;
+ u8 msg_buf[7];
+ int ret;
+
+ msg.addr = client->addr;
+ msg.len = 7;
+ msg.flags = 0;
+
+ /* Big Endian for reg addr */
+ put_unaligned_be24(reg_addr, &msg_buf[0]);
+
+ /* Little Endian for reg val */
+ put_unaligned_le32(reg_val, &msg_buf[3]);
+
+ msg.buf = msg_buf;
+ ret = i2c_transfer(client->adapter, &msg, 1);
+ return ret == 1 ? 0 : ret;
+}
+
+static int tc9563_pwrctrl_i2c_read(struct i2c_client *client,
+ u32 reg_addr, u32 *reg_val)
+{
+ struct i2c_msg msg[2];
+ u8 wr_data[3];
+ u32 rd_data;
+ int ret;
+
+ msg[0].addr = client->addr;
+ msg[0].len = 3;
+ msg[0].flags = 0;
+
+ /* Big Endian for reg addr */
+ put_unaligned_be24(reg_addr, &wr_data[0]);
+
+ msg[0].buf = wr_data;
+
+ msg[1].addr = client->addr;
+ msg[1].len = 4;
+ msg[1].flags = I2C_M_RD;
+
+ msg[1].buf = (u8 *)&rd_data;
+
+ ret = i2c_transfer(client->adapter, &msg[0], 2);
+ if (ret == 2) {
+ *reg_val = get_unaligned_le32(&rd_data);
+ return 0;
+ }
+
+ /* If only one message successfully completed, return -EIO */
+ return ret == 1 ? -EIO : ret;
+}
+
+static int tc9563_pwrctrl_i2c_bulk_write(struct i2c_client *client,
+ const struct tc9563_pwrctrl_reg_setting *seq, int len)
+{
+ int ret, i;
+
+ for (i = 0; i < len; i++) {
+ ret = tc9563_pwrctrl_i2c_write(client, seq[i].offset, seq[i].val);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int tc9563_pwrctrl_disable_port(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port];
+ const struct tc9563_pwrctrl_reg_setting *seq;
+ int ret, len;
+
+ if (!cfg->disable_port)
+ return 0;
+
+ if (port == TC9563_DSP1) {
+ seq = dsp1_pwroff_seq;
+ len = ARRAY_SIZE(dsp1_pwroff_seq);
+ } else {
+ seq = dsp2_pwroff_seq;
+ len = ARRAY_SIZE(dsp2_pwroff_seq);
+ }
+
+ ret = tc9563_pwrctrl_i2c_bulk_write(ctx->client, seq, len);
+ if (ret)
+ return ret;
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client,
+ common_pwroff_seq, ARRAY_SIZE(common_pwroff_seq));
+}
+
+static int tc9563_pwrctrl_set_l0s_l1_entry_delay(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port, bool is_l1, u32 ns)
+{
+ u32 rd_val, units;
+ int ret;
+
+ if (ns < TC9563_L0S_L1_DELAY_UNIT_NS)
+ return 0;
+
+ /* convert to units of 256ns */
+ units = ns / TC9563_L0S_L1_DELAY_UNIT_NS;
+
+ if (port == TC9563_ETHERNET) {
+ ret = tc9563_pwrctrl_i2c_read(ctx->client, TC9563_EMBEDDED_ETH_DELAY, &rd_val);
+ if (ret)
+ return ret;
+
+ if (is_l1)
+ rd_val = u32_replace_bits(rd_val, units, TC9563_ETH_L1_DELAY_MASK);
+ else
+ rd_val = u32_replace_bits(rd_val, units, TC9563_ETH_L0S_DELAY_MASK);
+
+ return tc9563_pwrctrl_i2c_write(ctx->client, TC9563_EMBEDDED_ETH_DELAY, rd_val);
+ }
+
+ ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_PORT_SELECT, BIT(port));
+ if (ret)
+ return ret;
+
+ return tc9563_pwrctrl_i2c_write(ctx->client,
+ is_l1 ? TC9563_PORT_L1_DELAY : TC9563_PORT_L0S_DELAY, units);
+}
+
+static int tc9563_pwrctrl_set_tx_amplitude(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ u32 amp = ctx->cfg[port].tx_amp;
+ int port_access;
+
+ if (amp < TC9563_TX_MARGIN_MIN_UA)
+ return 0;
+
+ /* txmargin = (Amp(uV) - 400000) / 3125 */
+ amp = (amp - TC9563_TX_MARGIN_MIN_UA) / 3125;
+
+ switch (port) {
+ case TC9563_USP:
+ port_access = 0x1;
+ break;
+ case TC9563_DSP1:
+ port_access = 0x2;
+ break;
+ case TC9563_DSP2:
+ port_access = 0x8;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ struct tc9563_pwrctrl_reg_setting tx_amp_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, port_access},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, 0x3},
+ {TC9563_TX_MARGIN, amp},
+ };
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client, tx_amp_seq, ARRAY_SIZE(tx_amp_seq));
+}
+
+static int tc9563_pwrctrl_disable_dfe(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port];
+ int port_access, lane_access = 0x3;
+ u32 phy_rate = 0x21;
+
+ if (!cfg->disable_dfe)
+ return 0;
+
+ switch (port) {
+ case TC9563_USP:
+ phy_rate = 0x1;
+ port_access = 0x1;
+ break;
+ case TC9563_DSP1:
+ port_access = 0x2;
+ break;
+ case TC9563_DSP2:
+ port_access = 0x8;
+ lane_access = 0x1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ struct tc9563_pwrctrl_reg_setting disable_dfe_seq[] = {
+ {TC9563_PORT_ACCESS_ENABLE, port_access},
+ {TC9563_PORT_LANE_ACCESS_ENABLE, lane_access},
+ {TC9563_DFE_ENABLE, 0x0},
+ {TC9563_DFE_EQ0_MODE, 0x411},
+ {TC9563_DFE_EQ1_MODE, 0x11},
+ {TC9563_DFE_EQ2_MODE, 0x11},
+ {TC9563_DFE_PD_MASK, 0x7},
+ {TC9563_PHY_RATE_CHANGE_OVERRIDE, 0x10},
+ {TC9563_PHY_RATE_CHANGE, phy_rate},
+ {TC9563_PHY_RATE_CHANGE, 0x0},
+ {TC9563_PHY_RATE_CHANGE_OVERRIDE, 0x0},
+ };
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client,
+ disable_dfe_seq, ARRAY_SIZE(disable_dfe_seq));
+}
+
+static int tc9563_pwrctrl_set_nfts(struct tc9563_pwrctrl_ctx *ctx,
+ enum tc9563_pwrctrl_ports port)
+{
+ u8 *nfts = ctx->cfg[port].nfts;
+ struct tc9563_pwrctrl_reg_setting nfts_seq[] = {
+ {TC9563_NFTS_2_5_GT, nfts[0]},
+ {TC9563_NFTS_5_GT, nfts[1]},
+ };
+ int ret;
+
+ if (!nfts[0])
+ return 0;
+
+ ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_PORT_SELECT, BIT(port));
+ if (ret)
+ return ret;
+
+ return tc9563_pwrctrl_i2c_bulk_write(ctx->client, nfts_seq, ARRAY_SIZE(nfts_seq));
+}
+
+static int tc9563_pwrctrl_assert_deassert_reset(struct tc9563_pwrctrl_ctx *ctx, bool deassert)
+{
+ int ret, val;
+
+ ret = tc9563_pwrctrl_i2c_write(ctx->client, TC9563_GPIO_CONFIG, TC9563_GPIO_MASK);
+ if (ret)
+ return ret;
+
+ val = deassert ? TC9563_GPIO_DEASSERT_BITS : 0;
+
+ return tc9563_pwrctrl_i2c_write(ctx->client, TC9563_RESET_GPIO, val);
+}
+
+static int tc9563_pwrctrl_parse_device_dt(struct tc9563_pwrctrl_ctx *ctx, struct device_node *node,
+ enum tc9563_pwrctrl_ports port)
+{
+ struct tc9563_pwrctrl_cfg *cfg = &ctx->cfg[port];
+ int ret;
+
+ /* Disable port if the status of the port is disabled. */
+ if (!of_device_is_available(node)) {
+ cfg->disable_port = true;
+ return 0;
+ }
+
+ ret = of_property_read_u32(node, "aspm-l0s-entry-delay-ns", &cfg->l0s_delay);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ ret = of_property_read_u32(node, "aspm-l1-entry-delay-ns", &cfg->l1_delay);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ ret = of_property_read_u32(node, "toshiba,tx-amplitude-microvolt", &cfg->tx_amp);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ ret = of_property_read_u8_array(node, "n-fts", cfg->nfts, ARRAY_SIZE(cfg->nfts));
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ cfg->disable_dfe = of_property_read_bool(node, "toshiba,no-dfe-support");
+
+ return 0;
+}
+
+static void tc9563_pwrctrl_power_off(struct tc9563_pwrctrl_ctx *ctx)
+{
+ gpiod_set_value(ctx->reset_gpio, 1);
+
+ regulator_bulk_disable(ARRAY_SIZE(ctx->supplies), ctx->supplies);
+}
+
+static int tc9563_pwrctrl_bring_up(struct tc9563_pwrctrl_ctx *ctx)
+{
+ struct tc9563_pwrctrl_cfg *cfg;
+ int ret, i;
+
+ ret = regulator_bulk_enable(ARRAY_SIZE(ctx->supplies), ctx->supplies);
+ if (ret < 0)
+ return dev_err_probe(ctx->pwrctrl.dev, ret, "cannot enable regulators\n");
+
+ gpiod_set_value(ctx->reset_gpio, 0);
+
+ fsleep(TC9563_OSC_STAB_DELAY_US);
+
+ ret = tc9563_pwrctrl_assert_deassert_reset(ctx, false);
+ if (ret)
+ goto power_off;
+
+ for (i = 0; i < TC9563_MAX; i++) {
+ cfg = &ctx->cfg[i];
+ ret = tc9563_pwrctrl_disable_port(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Disabling port failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_l0s_l1_entry_delay(ctx, i, false, cfg->l0s_delay);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting L0s entry delay failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_l0s_l1_entry_delay(ctx, i, true, cfg->l1_delay);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting L1 entry delay failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_tx_amplitude(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting Tx amplitude failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_set_nfts(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Setting N_FTS failed\n");
+ goto power_off;
+ }
+
+ ret = tc9563_pwrctrl_disable_dfe(ctx, i);
+ if (ret) {
+ dev_err(ctx->pwrctrl.dev, "Disabling DFE failed\n");
+ goto power_off;
+ }
+ }
+
+ ret = tc9563_pwrctrl_assert_deassert_reset(ctx, true);
+ if (!ret)
+ return 0;
+
+power_off:
+ tc9563_pwrctrl_power_off(ctx);
+ return ret;
+}
+
+static int tc9563_pwrctrl_probe(struct platform_device *pdev)
+{
+ struct pci_host_bridge *bridge = to_pci_host_bridge(pdev->dev.parent);
+ struct pci_bus *bus = bridge->bus;
+ struct device *dev = &pdev->dev;
+ enum tc9563_pwrctrl_ports port;
+ struct tc9563_pwrctrl_ctx *ctx;
+ struct device_node *i2c_node;
+ int ret, addr;
+
+ ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ret = of_property_read_u32_index(pdev->dev.of_node, "i2c-parent", 1, &addr);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to read i2c-parent property\n");
+
+ i2c_node = of_parse_phandle(dev->of_node, "i2c-parent", 0);
+ ctx->adapter = of_find_i2c_adapter_by_node(i2c_node);
+ of_node_put(i2c_node);
+ if (!ctx->adapter)
+ return dev_err_probe(dev, -EPROBE_DEFER, "Failed to find I2C adapter\n");
+
+ ctx->client = i2c_new_dummy_device(ctx->adapter, addr);
+ if (IS_ERR(ctx->client)) {
+ dev_err(dev, "Failed to create I2C client\n");
+ i2c_put_adapter(ctx->adapter);
+ return PTR_ERR(ctx->client);
+ }
+
+ for (int i = 0; i < ARRAY_SIZE(tc9563_supply_names); i++)
+ ctx->supplies[i].supply = tc9563_supply_names[i];
+
+ ret = devm_regulator_bulk_get(dev, TC9563_PWRCTL_MAX_SUPPLY, ctx->supplies);
+ if (ret) {
+ dev_err_probe(dev, ret, "failed to get supply regulator\n");
+ goto remove_i2c;
+ }
+
+ ctx->reset_gpio = devm_gpiod_get(dev, "resx", GPIOD_OUT_HIGH);
+ if (IS_ERR(ctx->reset_gpio)) {
+ ret = dev_err_probe(dev, PTR_ERR(ctx->reset_gpio), "failed to get resx GPIO\n");
+ goto remove_i2c;
+ }
+
+ pci_pwrctrl_init(&ctx->pwrctrl, dev);
+
+ port = TC9563_USP;
+ ret = tc9563_pwrctrl_parse_device_dt(ctx, pdev->dev.of_node, port);
+ if (ret) {
+ dev_err(dev, "failed to parse device tree properties: %d\n", ret);
+ goto remove_i2c;
+ }
+
+ /*
+ * Downstream ports are always children of the upstream port.
+ * The first node represents DSP1, the second node represents DSP2, and so on.
+ */
+ for_each_child_of_node_scoped(pdev->dev.of_node, child) {
+ port++;
+ ret = tc9563_pwrctrl_parse_device_dt(ctx, child, port);
+ if (ret)
+ break;
+ /* Embedded ethernet device are under DSP3 */
+ if (port == TC9563_DSP3) {
+ for_each_child_of_node_scoped(child, child1) {
+ port++;
+ ret = tc9563_pwrctrl_parse_device_dt(ctx, child1, port);
+ if (ret)
+ break;
+ }
+ }
+ }
+ if (ret) {
+ dev_err(dev, "failed to parse device tree properties: %d\n", ret);
+ goto remove_i2c;
+ }
+
+ if (bridge->ops->assert_perst) {
+ ret = bridge->ops->assert_perst(bus, true);
+ if (ret)
+ goto remove_i2c;
+ }
+
+ ret = tc9563_pwrctrl_bring_up(ctx);
+ if (ret)
+ goto remove_i2c;
+
+ if (bridge->ops->assert_perst) {
+ ret = bridge->ops->assert_perst(bus, false);
+ if (ret)
+ goto power_off;
+ }
+
+ ret = devm_pci_pwrctrl_device_set_ready(dev, &ctx->pwrctrl);
+ if (ret)
+ goto power_off;
+
+ platform_set_drvdata(pdev, ctx);
+
+ return 0;
+
+power_off:
+ tc9563_pwrctrl_power_off(ctx);
+remove_i2c:
+ i2c_unregister_device(ctx->client);
+ i2c_put_adapter(ctx->adapter);
+ return ret;
+}
+
+static void tc9563_pwrctrl_remove(struct platform_device *pdev)
+{
+ struct tc9563_pwrctrl_ctx *ctx = platform_get_drvdata(pdev);
+
+ tc9563_pwrctrl_power_off(ctx);
+ i2c_unregister_device(ctx->client);
+ i2c_put_adapter(ctx->adapter);
+}
+
+static const struct of_device_id tc9563_pwrctrl_of_match[] = {
+ { .compatible = "pci1179,0623"},
+ { }
+};
+MODULE_DEVICE_TABLE(of, tc9563_pwrctrl_of_match);
+
+static struct platform_driver tc9563_pwrctrl_driver = {
+ .driver = {
+ .name = "pwrctrl-tc9563",
+ .of_match_table = tc9563_pwrctrl_of_match,
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+ .probe = tc9563_pwrctrl_probe,
+ .remove = tc9563_pwrctrl_remove,
+};
+module_platform_driver(tc9563_pwrctrl_driver);
+
+MODULE_AUTHOR("Krishna chaitanya chundru <quic_krichai@quicinc.com>");
+MODULE_DESCRIPTION("TC956x power control driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c
new file mode 100644
index 000000000000..ecdebdeb2dff
--- /dev/null
+++ b/drivers/pci/rebar.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Resizable BAR Extended Capability handling.
+ */
+
+#include <linux/bits.h>
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/ioport.h>
+#include <linux/log2.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include "pci.h"
+
+#define PCI_REBAR_MIN_SIZE ((resource_size_t)SZ_1M)
+
+/**
+ * pci_rebar_bytes_to_size - Convert size in bytes to PCI BAR Size
+ * @bytes: size in bytes
+ *
+ * Convert size in bytes to encoded BAR Size in Resizable BAR Capability
+ * (PCIe r6.2, sec. 7.8.6.3).
+ *
+ * Return: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB)
+ */
+int pci_rebar_bytes_to_size(u64 bytes)
+{
+ int rebar_minsize = ilog2(PCI_REBAR_MIN_SIZE);
+
+ bytes = roundup_pow_of_two(bytes);
+
+ return max(ilog2(bytes), rebar_minsize) - rebar_minsize;
+}
+EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size);
+
+/**
+ * pci_rebar_size_to_bytes - Convert encoded BAR Size to size in bytes
+ * @size: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Return: BAR size in bytes
+ */
+resource_size_t pci_rebar_size_to_bytes(int size)
+{
+ return 1ULL << (size + ilog2(PCI_REBAR_MIN_SIZE));
+}
+EXPORT_SYMBOL_GPL(pci_rebar_size_to_bytes);
+
+void pci_rebar_init(struct pci_dev *pdev)
+{
+ pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR);
+}
+
+/**
+ * pci_rebar_find_pos - find position of resize control reg for BAR
+ * @pdev: PCI device
+ * @bar: BAR to find
+ *
+ * Helper to find the position of the control register for a BAR.
+ *
+ * Return:
+ * * %-ENOTSUPP if resizable BARs are not supported at all,
+ * * %-ENOENT if no control register for the BAR could be found.
+ */
+static int pci_rebar_find_pos(struct pci_dev *pdev, int bar)
+{
+ unsigned int pos, nbars, i;
+ u32 ctrl;
+
+ if (pci_resource_is_iov(bar)) {
+ pos = pci_iov_vf_rebar_cap(pdev);
+ bar = pci_resource_num_to_vf_bar(bar);
+ } else {
+ pos = pdev->rebar_cap;
+ }
+
+ if (!pos)
+ return -ENOTSUPP;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
+
+ for (i = 0; i < nbars; i++, pos += 8) {
+ int bar_idx;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ bar_idx = FIELD_GET(PCI_REBAR_CTRL_BAR_IDX, ctrl);
+ if (bar_idx == bar)
+ return pos;
+ }
+
+ return -ENOENT;
+}
+
+/**
+ * pci_rebar_get_possible_sizes - get possible sizes for Resizable BAR
+ * @pdev: PCI device
+ * @bar: BAR to query
+ *
+ * Get the possible sizes of a resizable BAR as bitmask.
+ *
+ * Return: A bitmask of possible sizes (bit 0=1MB, bit 31=128TB), or %0 if
+ * BAR isn't resizable.
+ */
+u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
+{
+ int pos;
+ u32 cap;
+
+ pos = pci_rebar_find_pos(pdev, bar);
+ if (pos < 0)
+ return 0;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap);
+ cap = FIELD_GET(PCI_REBAR_CAP_SIZES, cap);
+
+ /* Sapphire RX 5600 XT Pulse has an invalid cap dword for BAR 0 */
+ if (pdev->vendor == PCI_VENDOR_ID_ATI && pdev->device == 0x731f &&
+ bar == 0 && cap == 0x700)
+ return 0x3f00;
+
+ return cap;
+}
+EXPORT_SYMBOL(pci_rebar_get_possible_sizes);
+
+/**
+ * pci_rebar_size_supported - check if size is supported for BAR
+ * @pdev: PCI device
+ * @bar: BAR to check
+ * @size: encoded size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Return: %true if @bar is resizable and @size is supported, otherwise
+ * %false.
+ */
+bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size)
+{
+ u64 sizes = pci_rebar_get_possible_sizes(pdev, bar);
+
+ if (size < 0 || size > ilog2(SZ_128T) - ilog2(PCI_REBAR_MIN_SIZE))
+ return false;
+
+ return BIT(size) & sizes;
+}
+EXPORT_SYMBOL_GPL(pci_rebar_size_supported);
+
+/**
+ * pci_rebar_get_max_size - get the maximum supported size of a BAR
+ * @pdev: PCI device
+ * @bar: BAR to query
+ *
+ * Get the largest supported size of a resizable BAR as a size.
+ *
+ * Return: the encoded maximum BAR size as defined in the PCIe spec
+ * (0=1MB, 31=128TB), or %-NOENT on error.
+ */
+int pci_rebar_get_max_size(struct pci_dev *pdev, int bar)
+{
+ u64 sizes;
+
+ sizes = pci_rebar_get_possible_sizes(pdev, bar);
+ if (!sizes)
+ return -ENOENT;
+
+ return __fls(sizes);
+}
+EXPORT_SYMBOL_GPL(pci_rebar_get_max_size);
+
+/**
+ * pci_rebar_get_current_size - get the current size of a Resizable BAR
+ * @pdev: PCI device
+ * @bar: BAR to get the size from
+ *
+ * Read the current size of a BAR from the Resizable BAR config.
+ *
+ * Return: BAR Size if @bar is resizable (0=1MB, 31=128TB), or negative on
+ * error.
+ */
+int pci_rebar_get_current_size(struct pci_dev *pdev, int bar)
+{
+ int pos;
+ u32 ctrl;
+
+ pos = pci_rebar_find_pos(pdev, bar);
+ if (pos < 0)
+ return pos;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ return FIELD_GET(PCI_REBAR_CTRL_BAR_SIZE, ctrl);
+}
+
+/**
+ * pci_rebar_set_size - set a new size for a Resizable BAR
+ * @pdev: PCI device
+ * @bar: BAR to set size to
+ * @size: new size as defined in the PCIe spec (0=1MB, 31=128TB)
+ *
+ * Set the new size of a BAR as defined in the spec.
+ *
+ * Return: %0 if resizing was successful, or negative on error.
+ */
+int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size)
+{
+ int pos;
+ u32 ctrl;
+
+ pos = pci_rebar_find_pos(pdev, bar);
+ if (pos < 0)
+ return pos;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
+ ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
+ pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
+
+ if (pci_resource_is_iov(bar))
+ pci_iov_resource_set_size(pdev, bar, size);
+
+ return 0;
+}
+
+void pci_restore_rebar_state(struct pci_dev *pdev)
+{
+ unsigned int pos, nbars, i;
+ u32 ctrl;
+
+ pos = pdev->rebar_cap;
+ if (!pos)
+ return;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ nbars = FIELD_GET(PCI_REBAR_CTRL_NBAR_MASK, ctrl);
+
+ for (i = 0; i < nbars; i++, pos += 8) {
+ struct resource *res;
+ int bar_idx, size;
+
+ pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
+ bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX;
+ res = pci_resource_n(pdev, bar_idx);
+ size = pci_rebar_bytes_to_size(resource_size(res));
+ ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
+ ctrl |= FIELD_PREP(PCI_REBAR_CTRL_BAR_SIZE, size);
+ pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
+ }
+}
+
+static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev,
+ int resno)
+{
+ u16 cmd;
+
+ if (pci_resource_is_iov(resno))
+ return pci_iov_is_memory_decoding_enabled(dev);
+
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+
+ return cmd & PCI_COMMAND_MEMORY;
+}
+
+void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size)
+{
+ resource_size_t res_size = pci_rebar_size_to_bytes(size);
+ struct resource *res = pci_resource_n(dev, resno);
+
+ if (pci_resource_is_iov(resno))
+ res_size *= pci_sriov_get_totalvfs(dev);
+
+ resource_set_size(res, res_size);
+}
+
+/**
+ * pci_resize_resource - reconfigure a Resizable BAR and resources
+ * @dev: the PCI device
+ * @resno: index of the BAR to be resized
+ * @size: new size as defined in the spec (0=1MB, 31=128TB)
+ * @exclude_bars: a mask of BARs that should not be released
+ *
+ * Reconfigure @resno to @size and re-run resource assignment algorithm
+ * with the new size.
+ *
+ * Prior to resize, release @dev resources that share a bridge window with
+ * @resno. This unpins the bridge window resource to allow changing it.
+ *
+ * The caller may prevent releasing a particular BAR by providing
+ * @exclude_bars mask, but this may result in the resize operation failing
+ * due to insufficient space.
+ *
+ * Return: 0 on success, or negative on error. In case of an error, the
+ * resources are restored to their original places.
+ */
+int pci_resize_resource(struct pci_dev *dev, int resno, int size,
+ int exclude_bars)
+{
+ struct pci_host_bridge *host;
+ int old, ret;
+
+ /* Check if we must preserve the firmware's resource assignment */
+ host = pci_find_host_bridge(dev->bus);
+ if (host->preserve_config)
+ return -ENOTSUPP;
+
+ if (pci_resize_is_memory_decoding_enabled(dev, resno))
+ return -EBUSY;
+
+ if (!pci_rebar_size_supported(dev, resno, size))
+ return -EINVAL;
+
+ old = pci_rebar_get_current_size(dev, resno);
+ if (old < 0)
+ return old;
+
+ ret = pci_rebar_set_size(dev, resno, size);
+ if (ret)
+ return ret;
+
+ ret = pci_do_resource_release_and_resize(dev, resno, size, exclude_bars);
+ if (ret)
+ goto error_resize;
+ return 0;
+
+error_resize:
+ pci_rebar_set_size(dev, resno, old);
+ return ret;
+}
+EXPORT_SYMBOL(pci_resize_resource);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 3645f392a9fd..6e90f46f52af 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -15,6 +15,7 @@
*/
#include <linux/bitops.h>
+#include <linux/bug.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -135,6 +136,9 @@ static void restore_dev_resource(struct pci_dev_resource *dev_res)
{
struct resource *res = dev_res->res;
+ if (WARN_ON_ONCE(res->parent))
+ return;
+
res->start = dev_res->start;
res->end = dev_res->end;
res->flags = dev_res->flags;
@@ -2420,18 +2424,16 @@ EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources);
* release it when possible. If the bridge window contains assigned
* resources, it cannot be released.
*/
-int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
+static int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res,
+ struct list_head *saved)
{
unsigned long type = res->flags;
struct pci_dev_resource *dev_res;
- struct pci_dev *bridge;
- LIST_HEAD(saved);
+ struct pci_dev *bridge = NULL;
LIST_HEAD(added);
LIST_HEAD(failed);
unsigned int i;
- int ret;
-
- down_read(&pci_bus_sem);
+ int ret = 0;
while (!pci_is_root_bus(bus)) {
bridge = bus->self;
@@ -2443,9 +2445,9 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
/* Ignore BARs which are still in use */
if (!res->child) {
- ret = add_to_list(&saved, bridge, res, 0, 0);
+ ret = add_to_list(saved, bridge, res, 0, 0);
if (ret)
- goto cleanup;
+ return ret;
pci_release_resource(bridge, i);
} else {
@@ -2459,10 +2461,8 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
bus = bus->parent;
}
- if (list_empty(&saved)) {
- up_read(&pci_bus_sem);
+ if (!bridge)
return -ENOENT;
- }
__pci_bus_size_bridges(bridge->subordinate, &added);
__pci_bridge_assign_resources(bridge, &added, &failed);
@@ -2470,49 +2470,107 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res)
free_list(&added);
if (!list_empty(&failed)) {
- if (pci_required_resource_failed(&failed, type)) {
+ if (pci_required_resource_failed(&failed, type))
ret = -ENOSPC;
- goto cleanup;
- }
- /* Only resources with unrelated types failed (again) */
free_list(&failed);
+ if (ret)
+ return ret;
+
+ /* Only resources with unrelated types failed (again) */
}
- list_for_each_entry(dev_res, &saved, list) {
+ list_for_each_entry(dev_res, saved, list) {
+ struct pci_dev *dev = dev_res->dev;
+
/* Skip the bridge we just assigned resources for */
- if (bridge == dev_res->dev)
+ if (bridge == dev)
+ continue;
+
+ if (!dev->subordinate)
continue;
- bridge = dev_res->dev;
- pci_setup_bridge(bridge->subordinate);
+ pci_setup_bridge(dev->subordinate);
}
- free_list(&saved);
- up_read(&pci_bus_sem);
return 0;
+}
-cleanup:
- /* Restore size and flags */
- list_for_each_entry(dev_res, &failed, list)
- restore_dev_resource(dev_res);
- free_list(&failed);
+int pci_do_resource_release_and_resize(struct pci_dev *pdev, int resno, int size,
+ int exclude_bars)
+{
+ struct resource *res = pci_resource_n(pdev, resno);
+ struct pci_dev_resource *dev_res;
+ struct pci_bus *bus = pdev->bus;
+ struct resource *b_win, *r;
+ LIST_HEAD(saved);
+ unsigned int i;
+ int ret = 0;
+
+ b_win = pbus_select_window(bus, res);
+ if (!b_win)
+ return -EINVAL;
+ pci_dev_for_each_resource(pdev, r, i) {
+ if (i >= PCI_BRIDGE_RESOURCES)
+ break;
+
+ if (exclude_bars & BIT(i))
+ continue;
+
+ if (b_win != pbus_select_window(bus, r))
+ continue;
+
+ ret = add_to_list(&saved, pdev, r, 0, 0);
+ if (ret)
+ goto restore;
+ pci_release_resource(pdev, i);
+ }
+
+ pci_resize_resource_set_size(pdev, resno, size);
+
+ if (!bus->self)
+ goto out;
+
+ down_read(&pci_bus_sem);
+ ret = pbus_reassign_bridge_resources(bus, res, &saved);
+ if (ret)
+ goto restore;
+
+out:
+ up_read(&pci_bus_sem);
+ free_list(&saved);
+ return ret;
+
+restore:
/* Revert to the old configuration */
list_for_each_entry(dev_res, &saved, list) {
struct resource *res = dev_res->res;
+ struct pci_dev *dev = dev_res->dev;
- bridge = dev_res->dev;
- i = pci_resource_num(bridge, res);
+ i = pci_resource_num(dev, res);
+
+ if (res->parent) {
+ release_child_resources(res);
+ pci_release_resource(dev, i);
+ }
restore_dev_resource(dev_res);
- pci_claim_resource(bridge, i);
- pci_setup_bridge(bridge->subordinate);
- }
- free_list(&saved);
- up_read(&pci_bus_sem);
+ ret = pci_claim_resource(dev, i);
+ if (ret)
+ continue;
- return ret;
+ if (i < PCI_BRIDGE_RESOURCES) {
+ const char *res_name = pci_resource_name(dev, i);
+
+ pci_update_resource(dev, i);
+ pci_info(dev, "%s %pR: old value restored\n",
+ res_name, res);
+ }
+ if (dev->subordinate)
+ pci_setup_bridge(dev->subordinate);
+ }
+ goto out;
}
void pci_assign_unassigned_bus_resources(struct pci_bus *bus)
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index c3ba4ccecd43..e5fcadfc58b0 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -431,84 +431,6 @@ int pci_release_resource(struct pci_dev *dev, int resno)
}
EXPORT_SYMBOL(pci_release_resource);
-static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev,
- int resno)
-{
- u16 cmd;
-
- if (pci_resource_is_iov(resno))
- return pci_iov_is_memory_decoding_enabled(dev);
-
- pci_read_config_word(dev, PCI_COMMAND, &cmd);
-
- return cmd & PCI_COMMAND_MEMORY;
-}
-
-static void pci_resize_resource_set_size(struct pci_dev *dev, int resno,
- int size)
-{
- resource_size_t res_size = pci_rebar_size_to_bytes(size);
- struct resource *res = pci_resource_n(dev, resno);
-
- if (!pci_resource_is_iov(resno)) {
- resource_set_size(res, res_size);
- } else {
- resource_set_size(res, res_size * pci_sriov_get_totalvfs(dev));
- pci_iov_resource_set_size(dev, resno, res_size);
- }
-}
-
-int pci_resize_resource(struct pci_dev *dev, int resno, int size)
-{
- struct resource *res = pci_resource_n(dev, resno);
- struct pci_host_bridge *host;
- int old, ret;
- u32 sizes;
-
- /* Check if we must preserve the firmware's resource assignment */
- host = pci_find_host_bridge(dev->bus);
- if (host->preserve_config)
- return -ENOTSUPP;
-
- /* Make sure the resource isn't assigned before resizing it. */
- if (!(res->flags & IORESOURCE_UNSET))
- return -EBUSY;
-
- if (pci_resize_is_memory_decoding_enabled(dev, resno))
- return -EBUSY;
-
- sizes = pci_rebar_get_possible_sizes(dev, resno);
- if (!sizes)
- return -ENOTSUPP;
-
- if (!(sizes & BIT(size)))
- return -EINVAL;
-
- old = pci_rebar_get_current_size(dev, resno);
- if (old < 0)
- return old;
-
- ret = pci_rebar_set_size(dev, resno, size);
- if (ret)
- return ret;
-
- pci_resize_resource_set_size(dev, resno, size);
-
- /* Check if the new config works by trying to assign everything. */
- if (dev->bus->self) {
- ret = pbus_reassign_bridge_resources(dev->bus, res);
- if (ret)
- goto error_resize;
- }
- return 0;
-
-error_resize:
- pci_rebar_set_size(dev, resno, old);
- pci_resize_resource_set_size(dev, resno, old);
- return ret;
-}
-EXPORT_SYMBOL(pci_resize_resource);
-
int pci_enable_resources(struct pci_dev *dev, int mask)
{
u16 cmd, old_cmd;
diff --git a/drivers/power/reset/Kconfig b/drivers/power/reset/Kconfig
index 8248895ca903..f6c1bcbb57de 100644
--- a/drivers/power/reset/Kconfig
+++ b/drivers/power/reset/Kconfig
@@ -283,6 +283,15 @@ config POWER_RESET_KEYSTONE
help
Reboot support for the KEYSTONE SoCs.
+config POWER_RESET_SPACEMIT_P1
+ tristate "SpacemiT P1 poweroff and reset driver"
+ depends on ARCH_SPACEMIT || COMPILE_TEST
+ depends on MFD_SPACEMIT_P1
+ default MFD_SPACEMIT_P1
+ help
+ This driver supports power-off and reset operations for the SpacemiT
+ P1 PMIC.
+
config POWER_RESET_SYSCON
bool "Generic SYSCON regmap reset driver"
depends on OF
diff --git a/drivers/power/reset/Makefile b/drivers/power/reset/Makefile
index 51da87e05ce7..0e4ae6f6b5c5 100644
--- a/drivers/power/reset/Makefile
+++ b/drivers/power/reset/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_POWER_RESET_LTC2952) += ltc2952-poweroff.o
obj-$(CONFIG_POWER_RESET_QNAP) += qnap-poweroff.o
obj-$(CONFIG_POWER_RESET_REGULATOR) += regulator-poweroff.o
obj-$(CONFIG_POWER_RESET_RESTART) += restart-poweroff.o
+obj-$(CONFIG_POWER_RESET_SPACEMIT_P1) += spacemit-p1-reboot.o
obj-$(CONFIG_POWER_RESET_ST) += st-poweroff.o
obj-$(CONFIG_POWER_RESET_TH1520_AON) += th1520-aon-reboot.o
obj-$(CONFIG_POWER_RESET_TORADEX_EC) += tdx-ec-poweroff.o
diff --git a/drivers/power/reset/spacemit-p1-reboot.c b/drivers/power/reset/spacemit-p1-reboot.c
new file mode 100644
index 000000000000..9ec3d1fff8f3
--- /dev/null
+++ b/drivers/power/reset/spacemit-p1-reboot.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 by Aurelien Jarno
+ */
+
+#include <linux/bits.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/reboot.h>
+
+/* Power Control Register 2 */
+#define PWR_CTRL2 0x7e
+#define PWR_CTRL2_SHUTDOWN BIT(2) /* Shutdown request */
+#define PWR_CTRL2_RST BIT(1) /* Reset request */
+
+static int spacemit_p1_pwroff_handler(struct sys_off_data *data)
+{
+ struct regmap *regmap = data->cb_data;
+ int ret;
+
+ /* Put the PMIC into shutdown state */
+ ret = regmap_set_bits(regmap, PWR_CTRL2, PWR_CTRL2_SHUTDOWN);
+ if (ret) {
+ dev_err(data->dev, "shutdown failed: %d\n", ret);
+ return notifier_from_errno(ret);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int spacemit_p1_restart_handler(struct sys_off_data *data)
+{
+ struct regmap *regmap = data->cb_data;
+ int ret;
+
+ /* Put the PMIC into reset state */
+ ret = regmap_set_bits(regmap, PWR_CTRL2, PWR_CTRL2_RST);
+ if (ret) {
+ dev_err(data->dev, "restart failed: %d\n", ret);
+ return notifier_from_errno(ret);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int spacemit_p1_reboot_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct regmap *regmap;
+ int ret;
+
+ regmap = dev_get_regmap(dev->parent, NULL);
+ if (!regmap)
+ return -ENODEV;
+
+ ret = devm_register_power_off_handler(dev, &spacemit_p1_pwroff_handler,
+ regmap);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to register power off handler\n");
+
+ ret = devm_register_restart_handler(dev, spacemit_p1_restart_handler,
+ regmap);
+ if (ret)
+ return dev_err_probe(dev, ret,
+ "Failed to register restart handler\n");
+
+ return 0;
+}
+
+static const struct platform_device_id spacemit_p1_reboot_id_table[] = {
+ { "spacemit-p1-reboot", },
+ { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(platform, spacemit_p1_reboot_id_table);
+
+static struct platform_driver spacemit_p1_reboot_driver = {
+ .driver = {
+ .name = "spacemit-p1-reboot",
+ },
+ .probe = spacemit_p1_reboot_probe,
+ .id_table = spacemit_p1_reboot_id_table,
+};
+module_platform_driver(spacemit_p1_reboot_driver);
+
+MODULE_DESCRIPTION("SpacemiT P1 reboot/poweroff driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 03c8525b480f..92f9f7aae92f 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -942,6 +942,21 @@ config CHARGER_RT9471
This driver can also be built as a module. If so, the module will be
called rt9471.
+config CHARGER_RT9756
+ tristate "Richtek RT9756 smart cap divider charger driver"
+ depends on I2C
+ select REGMAP_I2C
+ select LINEAR_RANGES
+ help
+ This adds support for Richtek RT9756 smart cap divider charger driver.
+ It's a high efficiency and high charge current charger. the device
+ integrates smart cap divider topology with 9-channel high speed
+ ADCs that can provide input and output voltage, current and
+ temperature monitoring.
+
+ This driver can also be built as a module. If so, the module will be
+ called rt9756.
+
config CHARGER_CROS_USBPD
tristate "ChromeOS EC based USBPD charger"
depends on CROS_USBPD_NOTIFY
@@ -1007,6 +1022,15 @@ config CHARGER_UCS1002
Say Y to enable support for Microchip UCS1002 Programmable
USB Port Power Controller with Charger Emulation.
+config CHARGER_BD71828
+ tristate "Power-supply driver for ROHM BD71828 and BD71815 PMIC"
+ depends on MFD_ROHM_BD71828
+ help
+ Say Y here to enable support for charger and battery
+ in ROHM BD71815, BD71817, ROHM BD71828 power management
+ ICs. This driver gets various bits of information about battery
+ and charger states.
+
config CHARGER_BD99954
tristate "ROHM bd99954 charger driver"
depends on I2C
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index 6e37a3edf7e3..4b79d5abc49a 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_CHARGER_RT5033) += rt5033_charger.o
obj-$(CONFIG_CHARGER_RT9455) += rt9455_charger.o
obj-$(CONFIG_CHARGER_RT9467) += rt9467-charger.o
obj-$(CONFIG_CHARGER_RT9471) += rt9471.o
+obj-$(CONFIG_CHARGER_RT9756) += rt9756.o
obj-$(CONFIG_BATTERY_TWL4030_MADC) += twl4030_madc_battery.o
obj-$(CONFIG_CHARGER_88PM860X) += 88pm860x_charger.o
obj-$(CONFIG_CHARGER_PF1550) += pf1550-charger.o
@@ -117,6 +118,7 @@ obj-$(CONFIG_CHARGER_SC2731) += sc2731_charger.o
obj-$(CONFIG_FUEL_GAUGE_SC27XX) += sc27xx_fuel_gauge.o
obj-$(CONFIG_FUEL_GAUGE_STC3117) += stc3117_fuel_gauge.o
obj-$(CONFIG_CHARGER_UCS1002) += ucs1002_power.o
+obj-$(CONFIG_CHARGER_BD71828) += bd71828-power.o
obj-$(CONFIG_CHARGER_BD99954) += bd99954-charger.o
obj-$(CONFIG_CHARGER_WILCO) += wilco-charger.o
obj-$(CONFIG_RN5T618_POWER) += rn5t618_power.o
diff --git a/drivers/power/supply/apm_power.c b/drivers/power/supply/apm_power.c
index 9236e0078578..9933cdc5c387 100644
--- a/drivers/power/supply/apm_power.c
+++ b/drivers/power/supply/apm_power.c
@@ -364,7 +364,8 @@ static int __init apm_battery_init(void)
static void __exit apm_battery_exit(void)
{
- apm_get_power_status = NULL;
+ if (apm_get_power_status == apm_battery_apm_get_power_status)
+ apm_get_power_status = NULL;
}
module_init(apm_battery_init);
diff --git a/drivers/power/supply/bd71828-power.c b/drivers/power/supply/bd71828-power.c
new file mode 100644
index 000000000000..f667baedeb77
--- /dev/null
+++ b/drivers/power/supply/bd71828-power.c
@@ -0,0 +1,1049 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* ROHM BD71815, BD71828 and BD71878 Charger driver */
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mfd/rohm-bd71815.h>
+#include <linux/mfd/rohm-bd71828.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+#include <linux/power_supply.h>
+#include <linux/slab.h>
+
+/* common defines */
+#define BD7182x_MASK_VBAT_U 0x1f
+#define BD7182x_MASK_VDCIN_U 0x0f
+#define BD7182x_MASK_IBAT_U 0x3f
+#define BD7182x_MASK_CURDIR_DISCHG 0x80
+#define BD7182x_MASK_CHG_STATE 0x7f
+#define BD7182x_MASK_BAT_TEMP 0x07
+#define BD7182x_MASK_DCIN_DET BIT(0)
+#define BD7182x_MASK_CONF_PON BIT(0)
+#define BD71815_MASK_CONF_XSTB BIT(1)
+#define BD7182x_MASK_BAT_STAT 0x3f
+#define BD7182x_MASK_DCIN_STAT 0x07
+
+#define BD7182x_MASK_WDT_AUTO 0x40
+#define BD7182x_MASK_VBAT_ALM_LIMIT_U 0x01
+#define BD7182x_MASK_CHG_EN 0x01
+
+#define BD7182x_DCIN_COLLAPSE_DEFAULT 0x36
+
+#define MAX_CURRENT_DEFAULT 890000 /* uA */
+#define AC_NAME "bd71828_ac"
+#define BAT_NAME "bd71828_bat"
+
+#define BAT_OPEN 0x7
+
+/*
+ * VBAT Low voltage detection Threshold
+ * 0x00D4*16mV = 212*0.016 = 3.392v
+ */
+#define VBAT_LOW_TH 0x00D4
+
+struct pwr_regs {
+ u8 vbat_avg;
+ u8 ibat;
+ u8 ibat_avg;
+ u8 btemp_vth;
+ u8 chg_state;
+ u8 bat_temp;
+ u8 dcin_stat;
+ u8 dcin_collapse_limit;
+ u8 chg_set1;
+ u8 chg_en;
+ u8 vbat_alm_limit_u;
+ u8 conf;
+ u8 vdcin;
+};
+
+static const struct pwr_regs pwr_regs_bd71828 = {
+ .vbat_avg = BD71828_REG_VBAT_U,
+ .ibat = BD71828_REG_IBAT_U,
+ .ibat_avg = BD71828_REG_IBAT_AVG_U,
+ .btemp_vth = BD71828_REG_VM_BTMP_U,
+ .chg_state = BD71828_REG_CHG_STATE,
+ .bat_temp = BD71828_REG_BAT_TEMP,
+ .dcin_stat = BD71828_REG_DCIN_STAT,
+ .dcin_collapse_limit = BD71828_REG_DCIN_CLPS,
+ .chg_set1 = BD71828_REG_CHG_SET1,
+ .chg_en = BD71828_REG_CHG_EN,
+ .vbat_alm_limit_u = BD71828_REG_ALM_VBAT_LIMIT_U,
+ .conf = BD71828_REG_CONF,
+ .vdcin = BD71828_REG_VDCIN_U,
+};
+
+static const struct pwr_regs pwr_regs_bd71815 = {
+ .vbat_avg = BD71815_REG_VM_SA_VBAT_U,
+ /* BD71815 does not have separate current and current avg */
+ .ibat = BD71815_REG_CC_CURCD_U,
+ .ibat_avg = BD71815_REG_CC_CURCD_U,
+
+ .btemp_vth = BD71815_REG_VM_BTMP,
+ .chg_state = BD71815_REG_CHG_STATE,
+ .bat_temp = BD71815_REG_BAT_TEMP,
+ .dcin_stat = BD71815_REG_DCIN_STAT,
+ .dcin_collapse_limit = BD71815_REG_DCIN_CLPS,
+ .chg_set1 = BD71815_REG_CHG_SET1,
+ .chg_en = BD71815_REG_CHG_SET1,
+ .vbat_alm_limit_u = BD71815_REG_ALM_VBAT_TH_U,
+ .conf = BD71815_REG_CONF,
+
+ .vdcin = BD71815_REG_VM_DCIN_U,
+};
+
+struct bd71828_power {
+ struct regmap *regmap;
+ enum rohm_chip_type chip_type;
+ struct device *dev;
+ struct power_supply *ac;
+ struct power_supply *bat;
+
+ const struct pwr_regs *regs;
+ /* Reg val to uA */
+ int curr_factor;
+ int rsens;
+ int (*get_temp)(struct bd71828_power *pwr, int *temp);
+ int (*bat_inserted)(struct bd71828_power *pwr);
+};
+
+static int bd7182x_write16(struct bd71828_power *pwr, int reg, u16 val)
+{
+ __be16 tmp;
+
+ tmp = cpu_to_be16(val);
+
+ return regmap_bulk_write(pwr->regmap, reg, &tmp, sizeof(tmp));
+}
+
+static int bd7182x_read16_himask(struct bd71828_power *pwr, int reg, int himask,
+ u16 *val)
+{
+ struct regmap *regmap = pwr->regmap;
+ int ret;
+ __be16 rvals;
+ u8 *tmp = (u8 *)&rvals;
+
+ ret = regmap_bulk_read(regmap, reg, &rvals, sizeof(*val));
+ if (!ret) {
+ *tmp &= himask;
+ *val = be16_to_cpu(rvals);
+ }
+
+ return ret;
+}
+
+static int bd71828_get_vbat(struct bd71828_power *pwr, int *vcell)
+{
+ u16 tmp_vcell;
+ int ret;
+
+ ret = bd7182x_read16_himask(pwr, pwr->regs->vbat_avg,
+ BD7182x_MASK_VBAT_U, &tmp_vcell);
+ if (ret)
+ dev_err(pwr->dev, "Failed to read battery average voltage\n");
+ else
+ *vcell = ((int)tmp_vcell) * 1000;
+
+ return ret;
+}
+
+static int bd71828_get_current_ds_adc(struct bd71828_power *pwr, int *curr, int *curr_avg)
+{
+ __be16 tmp_curr;
+ char *tmp = (char *)&tmp_curr;
+ int dir = 1;
+ int regs[] = { pwr->regs->ibat, pwr->regs->ibat_avg };
+ int *vals[] = { curr, curr_avg };
+ int ret, i;
+
+ for (dir = 1, i = 0; i < ARRAY_SIZE(regs); i++) {
+ ret = regmap_bulk_read(pwr->regmap, regs[i], &tmp_curr,
+ sizeof(tmp_curr));
+ if (ret)
+ break;
+
+ if (*tmp & BD7182x_MASK_CURDIR_DISCHG)
+ dir = -1;
+
+ *tmp &= BD7182x_MASK_IBAT_U;
+
+ *vals[i] = dir * ((int)be16_to_cpu(tmp_curr)) * pwr->curr_factor;
+ }
+
+ return ret;
+}
+
+/* Unit is tenths of degree C */
+static int bd71815_get_temp(struct bd71828_power *pwr, int *temp)
+{
+ struct regmap *regmap = pwr->regmap;
+ int ret;
+ int t;
+
+ ret = regmap_read(regmap, pwr->regs->btemp_vth, &t);
+ if (ret)
+ return ret;
+
+ t = 200 - t;
+
+ if (t > 200) {
+ dev_err(pwr->dev, "Failed to read battery temperature\n");
+ return -ENODATA;
+ }
+
+ return 0;
+}
+
+/* Unit is tenths of degree C */
+static int bd71828_get_temp(struct bd71828_power *pwr, int *temp)
+{
+ u16 t;
+ int ret;
+ int tmp = 200 * 10000;
+
+ ret = bd7182x_read16_himask(pwr, pwr->regs->btemp_vth,
+ BD71828_MASK_VM_BTMP_U, &t);
+ if (ret)
+ return ret;
+
+ if (t > 3200) {
+ dev_err(pwr->dev,
+ "Failed to read battery temperature\n");
+ return -ENODATA;
+ }
+
+ tmp -= 625ULL * (unsigned int)t;
+ *temp = tmp / 1000;
+
+ return ret;
+}
+
+static int bd71828_charge_status(struct bd71828_power *pwr,
+ int *s, int *h)
+{
+ unsigned int state;
+ int status, health;
+ int ret = 1;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->chg_state, &state);
+ if (ret) {
+ dev_err(pwr->dev, "charger status reading failed (%d)\n", ret);
+ return ret;
+ }
+
+ state &= BD7182x_MASK_CHG_STATE;
+
+ dev_dbg(pwr->dev, "CHG_STATE %d\n", state);
+
+ switch (state) {
+ case 0x00:
+ status = POWER_SUPPLY_STATUS_DISCHARGING;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x01:
+ case 0x02:
+ case 0x03:
+ case 0x0E:
+ status = POWER_SUPPLY_STATUS_CHARGING;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x0F:
+ status = POWER_SUPPLY_STATUS_FULL;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x20:
+ case 0x21:
+ case 0x22:
+ case 0x23:
+ case 0x24:
+ status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+ health = POWER_SUPPLY_HEALTH_OVERHEAT;
+ break;
+ case 0x30:
+ case 0x31:
+ case 0x32:
+ case 0x40:
+ status = POWER_SUPPLY_STATUS_DISCHARGING;
+ health = POWER_SUPPLY_HEALTH_GOOD;
+ break;
+ case 0x7f:
+ default:
+ status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+ health = POWER_SUPPLY_HEALTH_DEAD;
+ break;
+ }
+
+ if (s)
+ *s = status;
+ if (h)
+ *h = health;
+
+ return ret;
+}
+
+static int get_chg_online(struct bd71828_power *pwr, int *chg_online)
+{
+ int r, ret;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->dcin_stat, &r);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read DCIN status\n");
+ return ret;
+ }
+ *chg_online = ((r & BD7182x_MASK_DCIN_DET) != 0);
+
+ return 0;
+}
+
+static int get_bat_online(struct bd71828_power *pwr, int *bat_online)
+{
+ int r, ret;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->bat_temp, &r);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read battery temperature\n");
+ return ret;
+ }
+ *bat_online = ((r & BD7182x_MASK_BAT_TEMP) != BAT_OPEN);
+
+ return 0;
+}
+
+static int bd71828_bat_inserted(struct bd71828_power *pwr)
+{
+ int ret, val;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->conf, &val);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read CONF register\n");
+ return 0;
+ }
+ ret = val & BD7182x_MASK_CONF_PON;
+
+ if (ret)
+ regmap_update_bits(pwr->regmap, pwr->regs->conf,
+ BD7182x_MASK_CONF_PON, 0);
+
+ return ret;
+}
+
+static int bd71815_bat_inserted(struct bd71828_power *pwr)
+{
+ int ret, val;
+
+ ret = regmap_read(pwr->regmap, pwr->regs->conf, &val);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to read CONF register\n");
+ return ret;
+ }
+
+ ret = !(val & BD71815_MASK_CONF_XSTB);
+ if (ret)
+ regmap_write(pwr->regmap, pwr->regs->conf, val |
+ BD71815_MASK_CONF_XSTB);
+
+ return ret;
+}
+
+static int bd71828_init_hardware(struct bd71828_power *pwr)
+{
+ int ret;
+
+ /* TODO: Collapse limit should come from device-tree ? */
+ ret = regmap_write(pwr->regmap, pwr->regs->dcin_collapse_limit,
+ BD7182x_DCIN_COLLAPSE_DEFAULT);
+ if (ret) {
+ dev_err(pwr->dev, "Failed to write DCIN collapse limit\n");
+ return ret;
+ }
+
+ ret = pwr->bat_inserted(pwr);
+ if (ret < 0)
+ return ret;
+
+ if (ret) {
+ /* WDT_FST auto set */
+ ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_set1,
+ BD7182x_MASK_WDT_AUTO,
+ BD7182x_MASK_WDT_AUTO);
+ if (ret)
+ return ret;
+
+ ret = bd7182x_write16(pwr, pwr->regs->vbat_alm_limit_u,
+ VBAT_LOW_TH);
+ if (ret)
+ return ret;
+
+ /*
+ * On BD71815 "we mask the power-state" from relax detection.
+ * I am unsure what the impact of the power-state would be if
+ * we didn't - but this is what the vendor driver did - and
+ * that driver has been used in few projects so I just assume
+ * this is needed.
+ */
+ if (pwr->chip_type == ROHM_CHIP_TYPE_BD71815) {
+ ret = regmap_set_bits(pwr->regmap,
+ BD71815_REG_REX_CTRL_1,
+ REX_PMU_STATE_MASK);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int bd71828_charger_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent);
+ u32 vot;
+ u16 tmp;
+ int online;
+ int ret;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_ONLINE:
+ ret = get_chg_online(pwr, &online);
+ if (!ret)
+ val->intval = online;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ ret = bd7182x_read16_himask(pwr, pwr->regs->vdcin,
+ BD7182x_MASK_VDCIN_U, &tmp);
+ if (ret)
+ return ret;
+
+ vot = tmp;
+ /* 5 milli volt steps */
+ val->intval = 5000 * vot;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int bd71828_battery_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent);
+ int ret = 0;
+ int status, health, tmp, curr, curr_avg, chg_en;
+
+ if (psp == POWER_SUPPLY_PROP_STATUS ||
+ psp == POWER_SUPPLY_PROP_HEALTH ||
+ psp == POWER_SUPPLY_PROP_CHARGE_TYPE)
+ ret = bd71828_charge_status(pwr, &status, &health);
+ else if (psp == POWER_SUPPLY_PROP_CURRENT_AVG ||
+ psp == POWER_SUPPLY_PROP_CURRENT_NOW)
+ ret = bd71828_get_current_ds_adc(pwr, &curr, &curr_avg);
+ if (ret)
+ return ret;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ val->intval = status;
+ break;
+ case POWER_SUPPLY_PROP_HEALTH:
+ val->intval = health;
+ break;
+ case POWER_SUPPLY_PROP_PRESENT:
+ ret = get_bat_online(pwr, &tmp);
+ if (!ret)
+ val->intval = tmp;
+ break;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ ret = bd71828_get_vbat(pwr, &tmp);
+ val->intval = tmp;
+ break;
+ case POWER_SUPPLY_PROP_TECHNOLOGY:
+ val->intval = POWER_SUPPLY_TECHNOLOGY_LION;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_AVG:
+ val->intval = curr_avg;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ val->intval = curr;
+ break;
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ val->intval = MAX_CURRENT_DEFAULT;
+ break;
+ case POWER_SUPPLY_PROP_TEMP:
+ ret = pwr->get_temp(pwr, &val->intval);
+ break;
+ case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR:
+ ret = regmap_read(pwr->regmap, pwr->regs->chg_en, &chg_en);
+ if (ret)
+ return ret;
+
+ val->intval = (chg_en & BD7182x_MASK_CHG_EN) ?
+ POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO :
+ POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int bd71828_battery_set_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ const union power_supply_propval *val)
+{
+ struct bd71828_power *pwr = dev_get_drvdata(psy->dev.parent);
+ int ret = 0;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR:
+ if (val->intval == POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO)
+ ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_en,
+ BD7182x_MASK_CHG_EN,
+ BD7182x_MASK_CHG_EN);
+ else
+ ret = regmap_update_bits(pwr->regmap, pwr->regs->chg_en,
+ BD7182x_MASK_CHG_EN,
+ 0);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static int bd71828_battery_property_is_writeable(struct power_supply *psy,
+ enum power_supply_property psp)
+{
+ switch (psp) {
+ case POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** @brief ac properties */
+static const enum power_supply_property bd71828_charger_props[] = {
+ POWER_SUPPLY_PROP_ONLINE,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+};
+
+static const enum power_supply_property bd71828_battery_props[] = {
+ POWER_SUPPLY_PROP_STATUS,
+ POWER_SUPPLY_PROP_HEALTH,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_HEALTH,
+ POWER_SUPPLY_PROP_PRESENT,
+ POWER_SUPPLY_PROP_TECHNOLOGY,
+ POWER_SUPPLY_PROP_TEMP,
+ POWER_SUPPLY_PROP_CURRENT_AVG,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+ POWER_SUPPLY_PROP_CURRENT_MAX,
+ POWER_SUPPLY_PROP_CHARGE_BEHAVIOUR,
+};
+
+/** @brief powers supplied by bd71828_ac */
+static char *bd71828_ac_supplied_to[] = {
+ BAT_NAME,
+};
+
+static const struct power_supply_desc bd71828_ac_desc = {
+ .name = AC_NAME,
+ .type = POWER_SUPPLY_TYPE_MAINS,
+ .properties = bd71828_charger_props,
+ .num_properties = ARRAY_SIZE(bd71828_charger_props),
+ .get_property = bd71828_charger_get_property,
+};
+
+static const struct power_supply_desc bd71828_bat_desc = {
+ .name = BAT_NAME,
+ .type = POWER_SUPPLY_TYPE_BATTERY,
+ .charge_behaviours = BIT(POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO) |
+ BIT(POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE),
+ .properties = bd71828_battery_props,
+ .num_properties = ARRAY_SIZE(bd71828_battery_props),
+ .get_property = bd71828_battery_get_property,
+ .set_property = bd71828_battery_set_property,
+ .property_is_writeable = bd71828_battery_property_is_writeable,
+};
+
+#define RSENS_CURR 10000000LLU
+
+#define BD_ISR_NAME(name) \
+bd7181x_##name##_isr
+
+#define BD_ISR_BAT(name, print, run_gauge) \
+static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \
+{ \
+ struct bd71828_power *pwr = (struct bd71828_power *)data; \
+ \
+ dev_dbg(pwr->dev, "%s\n", print); \
+ power_supply_changed(pwr->bat); \
+ \
+ return IRQ_HANDLED; \
+}
+
+#define BD_ISR_AC(name, print, run_gauge) \
+static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \
+{ \
+ struct bd71828_power *pwr = (struct bd71828_power *)data; \
+ \
+ power_supply_changed(pwr->ac); \
+ dev_dbg(pwr->dev, "%s\n", print); \
+ power_supply_changed(pwr->bat); \
+ \
+ return IRQ_HANDLED; \
+}
+
+#define BD_ISR_DUMMY(name, print) \
+static irqreturn_t BD_ISR_NAME(name)(int irq, void *data) \
+{ \
+ struct bd71828_power *pwr = (struct bd71828_power *)data; \
+ \
+ dev_dbg(pwr->dev, "%s\n", print); \
+ \
+ return IRQ_HANDLED; \
+}
+
+BD_ISR_BAT(chg_state_changed, "CHG state changed", true)
+/* DCIN voltage changes */
+BD_ISR_AC(dcin_removed, "DCIN removed", true)
+BD_ISR_AC(clps_out, "DCIN voltage back to normal", true)
+BD_ISR_AC(clps_in, "DCIN voltage collapsed", false)
+BD_ISR_AC(dcin_ovp_res, "DCIN voltage normal", true)
+BD_ISR_AC(dcin_ovp_det, "DCIN OVER VOLTAGE", true)
+
+BD_ISR_DUMMY(dcin_mon_det, "DCIN voltage below threshold")
+BD_ISR_DUMMY(dcin_mon_res, "DCIN voltage above threshold")
+
+BD_ISR_DUMMY(vsys_uv_res, "VSYS under-voltage cleared")
+BD_ISR_DUMMY(vsys_uv_det, "VSYS under-voltage")
+BD_ISR_DUMMY(vsys_low_res, "'VSYS low' cleared")
+BD_ISR_DUMMY(vsys_low_det, "VSYS low")
+BD_ISR_DUMMY(vsys_mon_res, "VSYS mon - resumed")
+BD_ISR_DUMMY(vsys_mon_det, "VSYS mon - detected")
+BD_ISR_BAT(chg_wdg_temp, "charger temperature watchdog triggered", true)
+BD_ISR_BAT(chg_wdg, "charging watchdog triggered", true)
+BD_ISR_BAT(bat_removed, "Battery removed", true)
+BD_ISR_BAT(bat_det, "Battery detected", true)
+/* TODO: Verify the meaning of these interrupts */
+BD_ISR_BAT(rechg_det, "Recharging", true)
+BD_ISR_BAT(rechg_res, "Recharge ending", true)
+BD_ISR_DUMMY(temp_transit, "Temperature transition")
+BD_ISR_BAT(therm_rmv, "bd71815-therm-rmv", false)
+BD_ISR_BAT(therm_det, "bd71815-therm-det", true)
+BD_ISR_BAT(bat_dead, "bd71815-bat-dead", false)
+BD_ISR_BAT(bat_short_res, "bd71815-bat-short-res", true)
+BD_ISR_BAT(bat_short, "bd71815-bat-short-det", false)
+BD_ISR_BAT(bat_low_res, "bd71815-bat-low-res", true)
+BD_ISR_BAT(bat_low, "bd71815-bat-low-det", true)
+BD_ISR_BAT(bat_ov_res, "bd71815-bat-over-res", true)
+/* What should we do here? */
+BD_ISR_BAT(bat_ov, "bd71815-bat-over-det", false)
+BD_ISR_BAT(bat_mon_res, "bd71815-bat-mon-res", true)
+BD_ISR_BAT(bat_mon, "bd71815-bat-mon-det", true)
+BD_ISR_BAT(bat_cc_mon, "bd71815-bat-cc-mon2", false)
+BD_ISR_BAT(bat_oc1_res, "bd71815-bat-oc1-res", true)
+BD_ISR_BAT(bat_oc1, "bd71815-bat-oc1-det", false)
+BD_ISR_BAT(bat_oc2_res, "bd71815-bat-oc2-res", true)
+BD_ISR_BAT(bat_oc2, "bd71815-bat-oc2-det", false)
+BD_ISR_BAT(bat_oc3_res, "bd71815-bat-oc3-res", true)
+BD_ISR_BAT(bat_oc3, "bd71815-bat-oc3-det", false)
+BD_ISR_BAT(temp_bat_low_res, "bd71815-temp-bat-low-res", true)
+BD_ISR_BAT(temp_bat_low, "bd71815-temp-bat-low-det", true)
+BD_ISR_BAT(temp_bat_hi_res, "bd71815-temp-bat-hi-res", true)
+BD_ISR_BAT(temp_bat_hi, "bd71815-temp-bat-hi-det", true)
+
+static irqreturn_t bd7182x_dcin_removed(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ power_supply_changed(pwr->ac);
+ dev_dbg(pwr->dev, "DCIN removed\n");
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd718x7_chg_done(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd7182x_dcin_detected(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "DCIN inserted\n");
+ power_supply_changed(pwr->ac);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_vbat_low_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VBAT LOW Resumed\n");
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_vbat_low_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VBAT LOW Detected\n");
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_hi_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_warn(pwr->dev, "Overtemp Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_hi_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "Overtemp Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_low_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "Lowtemp Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_bat_low_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "Lowtemp Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf125_det(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF125 Detected\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t bd71828_temp_vf125_res(int irq, void *data)
+{
+ struct bd71828_power *pwr = (struct bd71828_power *)data;
+
+ dev_dbg(pwr->dev, "VF125 Resumed\n");
+ power_supply_changed(pwr->bat);
+
+ return IRQ_HANDLED;
+}
+
+struct bd7182x_irq_res {
+ const char *name;
+ irq_handler_t handler;
+};
+
+#define BDIRQ(na, hn) { .name = (na), .handler = (hn) }
+
+static int bd7182x_get_irqs(struct platform_device *pdev,
+ struct bd71828_power *pwr)
+{
+ int i, irq, ret;
+ static const struct bd7182x_irq_res bd71815_irqs[] = {
+ BDIRQ("bd71815-dcin-rmv", BD_ISR_NAME(dcin_removed)),
+ BDIRQ("bd71815-dcin-clps-out", BD_ISR_NAME(clps_out)),
+ BDIRQ("bd71815-dcin-clps-in", BD_ISR_NAME(clps_in)),
+ BDIRQ("bd71815-dcin-ovp-res", BD_ISR_NAME(dcin_ovp_res)),
+ BDIRQ("bd71815-dcin-ovp-det", BD_ISR_NAME(dcin_ovp_det)),
+ BDIRQ("bd71815-dcin-mon-res", BD_ISR_NAME(dcin_mon_res)),
+ BDIRQ("bd71815-dcin-mon-det", BD_ISR_NAME(dcin_mon_det)),
+
+ BDIRQ("bd71815-vsys-uv-res", BD_ISR_NAME(vsys_uv_res)),
+ BDIRQ("bd71815-vsys-uv-det", BD_ISR_NAME(vsys_uv_det)),
+ BDIRQ("bd71815-vsys-low-res", BD_ISR_NAME(vsys_low_res)),
+ BDIRQ("bd71815-vsys-low-det", BD_ISR_NAME(vsys_low_det)),
+ BDIRQ("bd71815-vsys-mon-res", BD_ISR_NAME(vsys_mon_res)),
+ BDIRQ("bd71815-vsys-mon-det", BD_ISR_NAME(vsys_mon_det)),
+ BDIRQ("bd71815-chg-wdg-temp", BD_ISR_NAME(chg_wdg_temp)),
+ BDIRQ("bd71815-chg-wdg", BD_ISR_NAME(chg_wdg)),
+ BDIRQ("bd71815-rechg-det", BD_ISR_NAME(rechg_det)),
+ BDIRQ("bd71815-rechg-res", BD_ISR_NAME(rechg_res)),
+ BDIRQ("bd71815-ranged-temp-transit", BD_ISR_NAME(temp_transit)),
+ BDIRQ("bd71815-chg-state-change", BD_ISR_NAME(chg_state_changed)),
+ BDIRQ("bd71815-bat-temp-normal", bd71828_temp_bat_hi_res),
+ BDIRQ("bd71815-bat-temp-erange", bd71828_temp_bat_hi_det),
+ BDIRQ("bd71815-bat-rmv", BD_ISR_NAME(bat_removed)),
+ BDIRQ("bd71815-bat-det", BD_ISR_NAME(bat_det)),
+
+ /* Add ISRs for these */
+ BDIRQ("bd71815-therm-rmv", BD_ISR_NAME(therm_rmv)),
+ BDIRQ("bd71815-therm-det", BD_ISR_NAME(therm_det)),
+ BDIRQ("bd71815-bat-dead", BD_ISR_NAME(bat_dead)),
+ BDIRQ("bd71815-bat-short-res", BD_ISR_NAME(bat_short_res)),
+ BDIRQ("bd71815-bat-short-det", BD_ISR_NAME(bat_short)),
+ BDIRQ("bd71815-bat-low-res", BD_ISR_NAME(bat_low_res)),
+ BDIRQ("bd71815-bat-low-det", BD_ISR_NAME(bat_low)),
+ BDIRQ("bd71815-bat-over-res", BD_ISR_NAME(bat_ov_res)),
+ BDIRQ("bd71815-bat-over-det", BD_ISR_NAME(bat_ov)),
+ BDIRQ("bd71815-bat-mon-res", BD_ISR_NAME(bat_mon_res)),
+ BDIRQ("bd71815-bat-mon-det", BD_ISR_NAME(bat_mon)),
+ /* cc-mon 1 & 3 ? */
+ BDIRQ("bd71815-bat-cc-mon2", BD_ISR_NAME(bat_cc_mon)),
+ BDIRQ("bd71815-bat-oc1-res", BD_ISR_NAME(bat_oc1_res)),
+ BDIRQ("bd71815-bat-oc1-det", BD_ISR_NAME(bat_oc1)),
+ BDIRQ("bd71815-bat-oc2-res", BD_ISR_NAME(bat_oc2_res)),
+ BDIRQ("bd71815-bat-oc2-det", BD_ISR_NAME(bat_oc2)),
+ BDIRQ("bd71815-bat-oc3-res", BD_ISR_NAME(bat_oc3_res)),
+ BDIRQ("bd71815-bat-oc3-det", BD_ISR_NAME(bat_oc3)),
+ BDIRQ("bd71815-temp-bat-low-res", BD_ISR_NAME(temp_bat_low_res)),
+ BDIRQ("bd71815-temp-bat-low-det", BD_ISR_NAME(temp_bat_low)),
+ BDIRQ("bd71815-temp-bat-hi-res", BD_ISR_NAME(temp_bat_hi_res)),
+ BDIRQ("bd71815-temp-bat-hi-det", BD_ISR_NAME(temp_bat_hi)),
+ /*
+ * TODO: add rest of the IRQs and re-check the handling.
+ * Check the bd71815-bat-cc-mon1, bd71815-bat-cc-mon3,
+ * bd71815-bat-low-res, bd71815-bat-low-det,
+ * bd71815-bat-hi-res, bd71815-bat-hi-det.
+ */
+ };
+ static const struct bd7182x_irq_res bd71828_irqs[] = {
+ BDIRQ("bd71828-chg-done", bd718x7_chg_done),
+ BDIRQ("bd71828-pwr-dcin-in", bd7182x_dcin_detected),
+ BDIRQ("bd71828-pwr-dcin-out", bd7182x_dcin_removed),
+ BDIRQ("bd71828-vbat-normal", bd71828_vbat_low_res),
+ BDIRQ("bd71828-vbat-low", bd71828_vbat_low_det),
+ BDIRQ("bd71828-btemp-hi", bd71828_temp_bat_hi_det),
+ BDIRQ("bd71828-btemp-cool", bd71828_temp_bat_hi_res),
+ BDIRQ("bd71828-btemp-lo", bd71828_temp_bat_low_det),
+ BDIRQ("bd71828-btemp-warm", bd71828_temp_bat_low_res),
+ BDIRQ("bd71828-temp-hi", bd71828_temp_vf_det),
+ BDIRQ("bd71828-temp-norm", bd71828_temp_vf_res),
+ BDIRQ("bd71828-temp-125-over", bd71828_temp_vf125_det),
+ BDIRQ("bd71828-temp-125-under", bd71828_temp_vf125_res),
+ };
+ int num_irqs;
+ const struct bd7182x_irq_res *irqs;
+
+ switch (pwr->chip_type) {
+ case ROHM_CHIP_TYPE_BD71828:
+ irqs = &bd71828_irqs[0];
+ num_irqs = ARRAY_SIZE(bd71828_irqs);
+ break;
+ case ROHM_CHIP_TYPE_BD71815:
+ irqs = &bd71815_irqs[0];
+ num_irqs = ARRAY_SIZE(bd71815_irqs);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ for (i = 0; i < num_irqs; i++) {
+ irq = platform_get_irq_byname(pdev, irqs[i].name);
+
+ ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+ irqs[i].handler, 0,
+ irqs[i].name, pwr);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+#define RSENS_DEFAULT_30MOHM 30000 /* 30 mOhm in uOhms*/
+
+static int bd7182x_get_rsens(struct bd71828_power *pwr)
+{
+ u64 tmp = RSENS_CURR;
+ int rsens_ohm = RSENS_DEFAULT_30MOHM;
+ struct fwnode_handle *node = NULL;
+
+ if (pwr->dev->parent)
+ node = dev_fwnode(pwr->dev->parent);
+
+ if (node) {
+ int ret;
+ u32 rs;
+
+ ret = fwnode_property_read_u32(node,
+ "rohm,charger-sense-resistor-micro-ohms",
+ &rs);
+ if (ret) {
+ if (ret == -EINVAL) {
+ rs = RSENS_DEFAULT_30MOHM;
+ } else {
+ dev_err(pwr->dev, "Bad RSENS dt property\n");
+ return ret;
+ }
+ }
+ if (!rs) {
+ dev_err(pwr->dev, "Bad RSENS value\n");
+ return -EINVAL;
+ }
+
+ rsens_ohm = (int)rs;
+ }
+
+ /* Reg val to uA */
+ do_div(tmp, rsens_ohm);
+
+ pwr->curr_factor = tmp;
+ pwr->rsens = rsens_ohm;
+ dev_dbg(pwr->dev, "Setting rsens to %u micro ohm\n", pwr->rsens);
+ dev_dbg(pwr->dev, "Setting curr-factor to %u\n", pwr->curr_factor);
+
+ return 0;
+}
+
+static int bd71828_power_probe(struct platform_device *pdev)
+{
+ struct bd71828_power *pwr;
+ struct power_supply_config ac_cfg = {};
+ struct power_supply_config bat_cfg = {};
+ int ret;
+ struct regmap *regmap;
+
+ regmap = dev_get_regmap(pdev->dev.parent, NULL);
+ if (!regmap) {
+ dev_err(&pdev->dev, "No parent regmap\n");
+ return -EINVAL;
+ }
+
+ pwr = devm_kzalloc(&pdev->dev, sizeof(*pwr), GFP_KERNEL);
+ if (!pwr)
+ return -ENOMEM;
+
+ pwr->regmap = regmap;
+ pwr->dev = &pdev->dev;
+ pwr->chip_type = platform_get_device_id(pdev)->driver_data;
+
+ switch (pwr->chip_type) {
+ case ROHM_CHIP_TYPE_BD71828:
+ pwr->bat_inserted = bd71828_bat_inserted;
+ pwr->get_temp = bd71828_get_temp;
+ pwr->regs = &pwr_regs_bd71828;
+ break;
+ case ROHM_CHIP_TYPE_BD71815:
+ pwr->bat_inserted = bd71815_bat_inserted;
+ pwr->get_temp = bd71815_get_temp;
+ pwr->regs = &pwr_regs_bd71815;
+ break;
+ default:
+ dev_err(pwr->dev, "Unknown PMIC\n");
+ return -EINVAL;
+ }
+
+ ret = bd7182x_get_rsens(pwr);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret, "sense resistor missing\n");
+
+ dev_set_drvdata(&pdev->dev, pwr);
+ bd71828_init_hardware(pwr);
+
+ bat_cfg.drv_data = pwr;
+ bat_cfg.fwnode = dev_fwnode(&pdev->dev);
+
+ ac_cfg.supplied_to = bd71828_ac_supplied_to;
+ ac_cfg.num_supplicants = ARRAY_SIZE(bd71828_ac_supplied_to);
+ ac_cfg.drv_data = pwr;
+
+ pwr->ac = devm_power_supply_register(&pdev->dev, &bd71828_ac_desc,
+ &ac_cfg);
+ if (IS_ERR(pwr->ac))
+ return dev_err_probe(&pdev->dev, PTR_ERR(pwr->ac),
+ "failed to register ac\n");
+
+ pwr->bat = devm_power_supply_register(&pdev->dev, &bd71828_bat_desc,
+ &bat_cfg);
+ if (IS_ERR(pwr->bat))
+ return dev_err_probe(&pdev->dev, PTR_ERR(pwr->bat),
+ "failed to register bat\n");
+
+ ret = bd7182x_get_irqs(pdev, pwr);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret, "failed to request IRQs");
+
+ /* Configure wakeup capable */
+ device_set_wakeup_capable(pwr->dev, 1);
+ device_set_wakeup_enable(pwr->dev, 1);
+
+ return 0;
+}
+
+static const struct platform_device_id bd71828_charger_id[] = {
+ { "bd71815-power", ROHM_CHIP_TYPE_BD71815 },
+ { "bd71828-power", ROHM_CHIP_TYPE_BD71828 },
+ { },
+};
+MODULE_DEVICE_TABLE(platform, bd71828_charger_id);
+
+static struct platform_driver bd71828_power_driver = {
+ .driver = {
+ .name = "bd718xx-power",
+ },
+ .probe = bd71828_power_probe,
+ .id_table = bd71828_charger_id,
+};
+
+module_platform_driver(bd71828_power_driver);
+
+MODULE_AUTHOR("Cong Pham <cpham2403@gmail.com>");
+MODULE_DESCRIPTION("ROHM BD718(15/28/78) PMIC Battery Charger driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c
index 2263d5d3448f..0806abea2372 100644
--- a/drivers/power/supply/cw2015_battery.c
+++ b/drivers/power/supply/cw2015_battery.c
@@ -699,7 +699,13 @@ static int cw_bat_probe(struct i2c_client *client)
if (!cw_bat->battery_workqueue)
return -ENOMEM;
- devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work);
+ ret = devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work);
+ if (ret) {
+ dev_err_probe(&client->dev, ret,
+ "Failed to register delayed work\n");
+ return ret;
+ }
+
queue_delayed_work(cw_bat->battery_workqueue,
&cw_bat->battery_delay_work, msecs_to_jiffies(10));
return 0;
diff --git a/drivers/power/supply/max17040_battery.c b/drivers/power/supply/max17040_battery.c
index c1640bc6accd..48453508688a 100644
--- a/drivers/power/supply/max17040_battery.c
+++ b/drivers/power/supply/max17040_battery.c
@@ -388,6 +388,7 @@ static int max17040_get_property(struct power_supply *psy,
union power_supply_propval *val)
{
struct max17040_chip *chip = power_supply_get_drvdata(psy);
+ int ret;
switch (psp) {
case POWER_SUPPLY_PROP_ONLINE:
@@ -410,7 +411,10 @@ static int max17040_get_property(struct power_supply *psy,
if (!chip->channel_temp)
return -ENODATA;
- iio_read_channel_processed(chip->channel_temp, &val->intval);
+ ret = iio_read_channel_processed(chip->channel_temp, &val->intval);
+ if (ret)
+ return ret;
+
val->intval /= 100; /* Convert from milli- to deci-degree */
break;
diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c
index b1a227bf72e2..5dd02f658f5b 100644
--- a/drivers/power/supply/max77705_charger.c
+++ b/drivers/power/supply/max77705_charger.c
@@ -40,6 +40,39 @@ static enum power_supply_property max77705_charger_props[] = {
POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT,
};
+static irqreturn_t max77705_aicl_irq(int irq, void *irq_drv_data)
+{
+ struct max77705_charger_data *chg = irq_drv_data;
+ unsigned int regval, irq_status;
+ int err;
+
+ err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status);
+ if (err < 0)
+ return IRQ_HANDLED;
+
+ // irq is fiered at the end of current decrease sequence too
+ // early check AICL_I bit to guard against that excess irq call
+ while (!(irq_status & BIT(MAX77705_AICL_I))) {
+ err = regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], &regval);
+ if (err < 0)
+ return IRQ_HANDLED;
+
+ regval--;
+
+ err = regmap_field_write(chg->rfield[MAX77705_CHG_CHGIN_LIM], regval);
+ if (err < 0)
+ return IRQ_HANDLED;
+
+ msleep(AICL_WORK_DELAY_MS);
+
+ err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status);
+ if (err < 0)
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_HANDLED;
+}
+
static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data)
{
struct max77705_charger_data *chg = irq_drv_data;
@@ -60,7 +93,7 @@ static const struct regmap_irq max77705_charger_irqs[] = {
REGMAP_IRQ_REG_LINE(MAX77705_AICL_I, BITS_PER_BYTE),
};
-static struct regmap_irq_chip max77705_charger_irq_chip = {
+static const struct regmap_irq_chip max77705_charger_irq_chip = {
.name = "max77705-charger",
.status_base = MAX77705_CHG_REG_INT,
.mask_base = MAX77705_CHG_REG_INT_MASK,
@@ -567,6 +600,7 @@ static int max77705_charger_probe(struct i2c_client *i2c)
{
struct power_supply_config pscfg = {};
struct max77705_charger_data *chg;
+ struct regmap_irq_chip *chip_desc;
struct device *dev;
struct regmap_irq_chip_data *irq_data;
int ret;
@@ -580,6 +614,13 @@ static int max77705_charger_probe(struct i2c_client *i2c)
chg->dev = dev;
i2c_set_clientdata(i2c, chg);
+ chip_desc = devm_kmemdup(dev, &max77705_charger_irq_chip,
+ sizeof(max77705_charger_irq_chip),
+ GFP_KERNEL);
+ if (!chip_desc)
+ return -ENOMEM;
+ chip_desc->irq_drv_data = chg;
+
chg->regmap = devm_regmap_init_i2c(i2c, &max77705_chg_regmap_config);
if (IS_ERR(chg->regmap))
return PTR_ERR(chg->regmap);
@@ -599,11 +640,9 @@ static int max77705_charger_probe(struct i2c_client *i2c)
if (IS_ERR(chg->psy_chg))
return PTR_ERR(chg->psy_chg);
- max77705_charger_irq_chip.irq_drv_data = chg;
ret = devm_regmap_add_irq_chip(chg->dev, chg->regmap, i2c->irq,
IRQF_ONESHOT, 0,
- &max77705_charger_irq_chip,
- &irq_data);
+ chip_desc, &irq_data);
if (ret)
return dev_err_probe(dev, ret, "failed to add irq chip\n");
@@ -632,6 +671,15 @@ static int max77705_charger_probe(struct i2c_client *i2c)
goto destroy_wq;
}
+ ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_AICL_I),
+ NULL, max77705_aicl_irq,
+ IRQF_TRIGGER_NONE,
+ "aicl-irq", chg);
+ if (ret) {
+ dev_err_probe(dev, ret, "Failed to Request aicl IRQ\n");
+ goto destroy_wq;
+ }
+
ret = max77705_charger_enable(chg);
if (ret) {
dev_err_probe(dev, ret, "failed to enable charge\n");
diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c
index 3c2837ef3461..c8028606bba0 100644
--- a/drivers/power/supply/qcom_battmgr.c
+++ b/drivers/power/supply/qcom_battmgr.c
@@ -678,12 +678,7 @@ static int qcom_battmgr_set_charge_start_threshold(struct qcom_battmgr *battmgr,
u32 target_soc, delta_soc;
int ret;
- if (start_soc < CHARGE_CTRL_START_THR_MIN ||
- start_soc > CHARGE_CTRL_START_THR_MAX) {
- dev_err(battmgr->dev, "charge control start threshold exceed range: [%u - %u]\n",
- CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX);
- return -EINVAL;
- }
+ start_soc = clamp(start_soc, CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX);
/*
* If the new start threshold is larger than the old end threshold,
@@ -716,12 +711,7 @@ static int qcom_battmgr_set_charge_end_threshold(struct qcom_battmgr *battmgr, i
u32 delta_soc = CHARGE_CTRL_DELTA_SOC;
int ret;
- if (end_soc < CHARGE_CTRL_END_THR_MIN ||
- end_soc > CHARGE_CTRL_END_THR_MAX) {
- dev_err(battmgr->dev, "charge control end threshold exceed range: [%u - %u]\n",
- CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX);
- return -EINVAL;
- }
+ end_soc = clamp(end_soc, CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX);
if (battmgr->info.charge_ctrl_start && end_soc > battmgr->info.charge_ctrl_start)
delta_soc = end_soc - battmgr->info.charge_ctrl_start;
diff --git a/drivers/power/supply/rt5033_charger.c b/drivers/power/supply/rt5033_charger.c
index 2fdc58439707..de724f23e453 100644
--- a/drivers/power/supply/rt5033_charger.c
+++ b/drivers/power/supply/rt5033_charger.c
@@ -701,6 +701,8 @@ static int rt5033_charger_probe(struct platform_device *pdev)
np_conn = of_parse_phandle(pdev->dev.of_node, "richtek,usb-connector", 0);
np_edev = of_get_parent(np_conn);
charger->edev = extcon_find_edev_by_node(np_edev);
+ of_node_put(np_edev);
+ of_node_put(np_conn);
if (IS_ERR(charger->edev)) {
dev_warn(charger->dev, "no extcon device found in device-tree\n");
goto out;
diff --git a/drivers/power/supply/rt9467-charger.c b/drivers/power/supply/rt9467-charger.c
index fe773dd8b404..44c26fb37a77 100644
--- a/drivers/power/supply/rt9467-charger.c
+++ b/drivers/power/supply/rt9467-charger.c
@@ -376,7 +376,7 @@ static int rt9467_set_value_from_ranges(struct rt9467_chg_data *data,
if (rsel == RT9467_RANGE_VMIVR) {
ret = linear_range_get_selector_high(range, value, &sel, &found);
if (ret)
- value = range->max_sel;
+ sel = range->max_sel;
} else {
linear_range_get_selector_within(range, value, &sel);
}
@@ -588,6 +588,10 @@ static int rt9467_run_aicl(struct rt9467_chg_data *data)
aicl_vth = mivr_vth + RT9467_AICLVTH_GAP_uV;
ret = rt9467_set_value_from_ranges(data, F_AICL_VTH,
RT9467_RANGE_AICL_VTH, aicl_vth);
+ if (ret) {
+ dev_err(data->dev, "Failed to set AICL VTH\n");
+ return ret;
+ }
/* Trigger AICL function */
ret = regmap_field_write(data->rm_field[F_AICL_MEAS], 1);
diff --git a/drivers/power/supply/rt9756.c b/drivers/power/supply/rt9756.c
new file mode 100644
index 000000000000..f254527be653
--- /dev/null
+++ b/drivers/power/supply/rt9756.c
@@ -0,0 +1,955 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright (C) 2025 Richtek Technology Corp.
+//
+// Authors: ChiYuan Huang <cy_huang@richtek.com>
+
+#include <linux/atomic.h>
+#include <linux/cleanup.h>
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/linear_range.h>
+#include <linux/interrupt.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/power_supply.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/sysfs.h>
+#include <linux/util_macros.h>
+
+#define RT9756_REG_INTFLAG1 0x0B
+#define RT9756_REG_INTFLAG2 0x0D
+#define RT9756_REG_INTFLAG3 0x0F
+#define RT9756_REG_ADCCTL 0x11
+#define RT9756_REG_VBUSADC 0x12
+#define RT9756_REG_BC12FLAG 0x45
+#define RT9756_REG_INTFLAG4 0x49
+
+/* Flag1 */
+#define RT9756_EVT_BUSOVP BIT(3)
+#define RT9756_EVT_BUSOCP BIT(2)
+#define RT9756_EVT_BUSUCP BIT(0)
+/* Flag2 */
+#define RT9756_EVT_BATOVP BIT(7)
+#define RT9756_EVT_BATOCP BIT(6)
+#define RT9756_EVT_TDIEOTP BIT(3)
+#define RT9756_EVT_VBUSLOW_ERR BIT(2)
+#define RT9756_EVT_VAC_INSERT BIT(0)
+/* Flag3 */
+#define RT9756_EVT_WDT BIT(5)
+#define RT9756_EVT_VAC_UVLO BIT(4)
+/* ADCCTL */
+#define RT9756_ADCEN_MASK BIT(7)
+#define RT9756_ADCONCE_MASK BIT(6)
+/* Bc12_flag */
+#define RT9756_EVT_BC12_DONE BIT(3)
+/* Flag4 */
+#define RT9756_EVT_OUTOVP BIT(0)
+
+#define RICHTEK_DEVID 7
+#define RT9756_REVID 0
+#define RT9756A_REVID 1
+#define RT9757_REVID 2
+#define RT9757A_REVID 3
+#define RT9756_ADC_CONVTIME 1200
+#define RT9756_ADC_MAXWAIT 16000
+
+enum rt9756_model {
+ MODEL_RT9756 = 0,
+ MODEL_RT9757,
+ MODEL_RT9770,
+ MODEL_MAX
+};
+
+enum rt9756_adc_chan {
+ ADC_VBUS = 0,
+ ADC_IBUS,
+ ADC_VBAT,
+ ADC_IBAT,
+ ADC_TDIE,
+ ADC_MAX_CHANNEL
+};
+
+enum rt9756_usb_type {
+ USB_NO_VBUS = 0,
+ USB_SDP = 2,
+ USB_NSTD,
+ USB_DCP,
+ USB_CDP,
+ MAX_USB_TYPE
+};
+
+enum rt9756_fields {
+ F_VBATOVP = 0,
+ F_VBATOVP_EN,
+ F_IBATOCP,
+ F_IBATOCP_EN,
+ F_VBUSOVP,
+ F_VBUSOVP_EN,
+ F_IBUSOCP,
+ F_IBUSOCP_EN,
+ F_SWITCHING,
+ F_REG_RST,
+ F_CHG_EN,
+ F_OP_MODE,
+ F_WDT_DIS,
+ F_WDT_TMR,
+ F_DEV_ID,
+ F_BC12_EN,
+ F_USB_STATE,
+ F_VBUS_STATE,
+ F_IBAT_RSEN,
+ F_REVISION,
+ F_MAX_FIELD
+};
+
+enum rt9756_ranges {
+ R_VBATOVP = 0,
+ R_IBATOCP,
+ R_VBUSOVP,
+ R_IBUSOCP,
+ R_MAX_RANGE
+};
+
+static const struct reg_field rt9756_chg_fields[F_MAX_FIELD] = {
+ [F_VBATOVP] = REG_FIELD(0x08, 0, 4),
+ [F_VBATOVP_EN] = REG_FIELD(0x08, 7, 7),
+ [F_IBATOCP] = REG_FIELD(0x09, 0, 5),
+ [F_IBATOCP_EN] = REG_FIELD(0x09, 7, 7),
+ [F_VBUSOVP] = REG_FIELD(0x06, 0, 5),
+ [F_VBUSOVP_EN] = REG_FIELD(0x06, 7, 7),
+ [F_IBUSOCP] = REG_FIELD(0x07, 0, 4),
+ [F_IBUSOCP_EN] = REG_FIELD(0x07, 5, 5),
+ [F_SWITCHING] = REG_FIELD(0x5c, 7, 7),
+ [F_REG_RST] = REG_FIELD(0x00, 7, 7),
+ [F_CHG_EN] = REG_FIELD(0x00, 6, 6),
+ [F_OP_MODE] = REG_FIELD(0x00, 5, 5),
+ [F_WDT_DIS] = REG_FIELD(0x00, 3, 3),
+ [F_WDT_TMR] = REG_FIELD(0x00, 0, 2),
+ [F_DEV_ID] = REG_FIELD(0x03, 0, 3),
+ [F_BC12_EN] = REG_FIELD(0x44, 7, 7),
+ [F_USB_STATE] = REG_FIELD(0x46, 5, 7),
+ [F_VBUS_STATE] = REG_FIELD(0x4c, 0, 0),
+ [F_IBAT_RSEN] = REG_FIELD(0x5e, 0, 1),
+ [F_REVISION] = REG_FIELD(0x62, 0, 1),
+};
+
+static const struct reg_field rt9770_chg_fields[F_MAX_FIELD] = {
+ [F_VBATOVP] = REG_FIELD(0x08, 0, 4),
+ [F_VBATOVP_EN] = REG_FIELD(0x08, 7, 7),
+ [F_IBATOCP] = REG_FIELD(0x09, 0, 5),
+ [F_IBATOCP_EN] = REG_FIELD(0x09, 7, 7),
+ [F_VBUSOVP] = REG_FIELD(0x06, 0, 5),
+ [F_VBUSOVP_EN] = REG_FIELD(0x06, 7, 7),
+ [F_IBUSOCP] = REG_FIELD(0x07, 0, 4),
+ [F_IBUSOCP_EN] = REG_FIELD(0x07, 5, 5),
+ [F_SWITCHING] = REG_FIELD(0x5c, 7, 7),
+ [F_REG_RST] = REG_FIELD(0x00, 7, 7),
+ [F_CHG_EN] = REG_FIELD(0x00, 6, 6),
+ [F_OP_MODE] = REG_FIELD(0x00, 5, 5),
+ [F_WDT_DIS] = REG_FIELD(0x00, 3, 3),
+ [F_WDT_TMR] = REG_FIELD(0x00, 0, 2),
+ [F_DEV_ID] = REG_FIELD(0x60, 0, 3),
+ [F_BC12_EN] = REG_FIELD(0x03, 7, 7),
+ [F_USB_STATE] = REG_FIELD(0x02, 5, 7),
+ [F_VBUS_STATE] = REG_FIELD(0x4c, 0, 0),
+ [F_IBAT_RSEN] = REG_FIELD(0x5e, 0, 1),
+ [F_REVISION] = REG_FIELD(0x62, 3, 7),
+};
+
+/* All converted to microvolt or microamp */
+static const struct linear_range rt9756_chg_ranges[R_MAX_RANGE] = {
+ LINEAR_RANGE_IDX(R_VBATOVP, 4200000, 0, 31, 25000),
+ LINEAR_RANGE_IDX(R_IBATOCP, 2000000, 0, 63, 100000),
+ LINEAR_RANGE_IDX(R_VBUSOVP, 3000000, 0, 63, 50000),
+ LINEAR_RANGE_IDX(R_IBUSOCP, 1000000, 0, 31, 250000),
+};
+
+struct charger_event {
+ unsigned int flag1;
+ unsigned int flag2;
+ unsigned int flag3;
+ unsigned int flag4;
+};
+
+struct rt9756_data {
+ struct device *dev;
+ struct regmap *regmap;
+ struct regmap_field *rm_fields[F_MAX_FIELD];
+ struct power_supply *psy;
+ struct power_supply *bat_psy;
+ struct mutex adc_lock;
+ struct power_supply_desc psy_desc;
+ struct power_supply_desc bat_psy_desc;
+ struct charger_event chg_evt;
+ unsigned int rg_resistor;
+ unsigned int real_resistor;
+ enum rt9756_model model;
+ atomic_t usb_type;
+};
+
+struct rt975x_dev_data {
+ const struct regmap_config *regmap_config;
+ const struct reg_field *reg_fields;
+ const struct reg_sequence *init_regs;
+ size_t num_init_regs;
+ int (*check_device_model)(struct rt9756_data *data);
+};
+
+static int rt9756_get_value_field_range(struct rt9756_data *data, enum rt9756_fields en_field,
+ enum rt9756_fields field, enum rt9756_ranges rsel, int *val)
+{
+ const struct linear_range *range = rt9756_chg_ranges + rsel;
+ unsigned int enable, selector, value;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[en_field], &enable);
+ if (ret)
+ return ret;
+
+ if (!enable) {
+ *val = 0;
+ return 0;
+ }
+
+ ret = regmap_field_read(data->rm_fields[field], &selector);
+ if (ret)
+ return ret;
+
+ ret = linear_range_get_value(range, selector, &value);
+ if (ret)
+ return ret;
+
+ *val = (int)value;
+
+ return 0;
+}
+
+static int rt9756_set_value_field_range(struct rt9756_data *data, enum rt9756_fields en_field,
+ enum rt9756_fields field, enum rt9756_ranges rsel, int val)
+{
+ const struct linear_range *range = rt9756_chg_ranges + rsel;
+ unsigned int selector, value;
+ int ret;
+
+ if (!val)
+ return regmap_field_write(data->rm_fields[en_field], 0);
+
+ value = (unsigned int)val;
+ linear_range_get_selector_within(range, value, &selector);
+ ret = regmap_field_write(data->rm_fields[field], selector);
+ if (ret)
+ return ret;
+
+ return regmap_field_write(data->rm_fields[en_field], 1);
+}
+
+static int rt9756_get_adc(struct rt9756_data *data, enum rt9756_adc_chan chan,
+ int *val)
+{
+ struct regmap *regmap = data->regmap;
+ unsigned int reg_addr = RT9756_REG_VBUSADC + chan * 2;
+ unsigned int mask = RT9756_ADCEN_MASK | RT9756_ADCONCE_MASK;
+ unsigned int shift = 0, adc_cntl;
+ __be16 raws;
+ int scale, offset = 0, ret;
+
+ guard(mutex)(&data->adc_lock);
+
+ ret = regmap_update_bits(regmap, RT9756_REG_ADCCTL, mask, mask);
+ if (ret)
+ return ret;
+
+ ret = regmap_read_poll_timeout(regmap, RT9756_REG_ADCCTL, adc_cntl,
+ !(adc_cntl & RT9756_ADCEN_MASK),
+ RT9756_ADC_CONVTIME, RT9756_ADC_MAXWAIT);
+ if (ret && ret != -ETIMEDOUT)
+ return ret;
+
+ ret = regmap_raw_read(regmap, reg_addr, &raws, sizeof(raws));
+ if (ret)
+ return ret;
+
+ /*
+ * TDIE LSB 1'c, others LSB 1000uV or 1000uA.
+ * Rsense ratio is needed for IBAT channel
+ */
+ if (chan == ADC_TDIE) {
+ scale = 10;
+ shift = 8;
+ offset = -40;
+ } else if (chan == ADC_IBAT)
+ scale = 1000 * data->rg_resistor / data->real_resistor;
+ else
+ scale = 1000;
+
+ *val = ((be16_to_cpu(raws) >> shift) + offset) * scale;
+
+ return regmap_update_bits(regmap, RT9756_REG_ADCCTL, mask, 0);
+}
+
+static int rt9756_get_switching_state(struct rt9756_data *data, int *status)
+{
+ unsigned int switching_state;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_SWITCHING], &switching_state);
+ if (ret)
+ return ret;
+
+ if (switching_state)
+ *status = POWER_SUPPLY_STATUS_CHARGING;
+ else
+ *status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+
+ return 0;
+}
+
+static int rt9756_get_charger_health(struct rt9756_data *data)
+{
+ struct charger_event *evt = &data->chg_evt;
+
+ if (evt->flag2 & RT9756_EVT_VBUSLOW_ERR)
+ return POWER_SUPPLY_HEALTH_UNDERVOLTAGE;
+
+ if (evt->flag1 & RT9756_EVT_BUSOVP || evt->flag2 & RT9756_EVT_BATOVP ||
+ evt->flag4 & RT9756_EVT_OUTOVP)
+ return POWER_SUPPLY_HEALTH_OVERVOLTAGE;
+
+ if (evt->flag1 & RT9756_EVT_BUSOCP || evt->flag2 & RT9756_EVT_BATOCP)
+ return POWER_SUPPLY_HEALTH_OVERCURRENT;
+
+ if (evt->flag1 & RT9756_EVT_BUSUCP)
+ return POWER_SUPPLY_HEALTH_UNSPEC_FAILURE;
+
+ if (evt->flag2 & RT9756_EVT_TDIEOTP)
+ return POWER_SUPPLY_HEALTH_OVERHEAT;
+
+ if (evt->flag3 & RT9756_EVT_WDT)
+ return POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE;
+
+ return POWER_SUPPLY_HEALTH_GOOD;
+}
+
+static int rt9756_get_charger_online(struct rt9756_data *data, int *val)
+{
+ unsigned int online;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_VBUS_STATE], &online);
+ if (ret)
+ return ret;
+
+ *val = !!online;
+ return 0;
+}
+
+static int rt9756_get_vbus_ovp(struct rt9756_data *data, int *val)
+{
+ unsigned int opmode;
+ int ovpval, ret;
+
+ /* operating mode -> 0 bypass, 1 div2 */
+ ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode);
+ if (ret)
+ return ret;
+
+ ret = rt9756_get_value_field_range(data, F_VBUSOVP_EN, F_VBUSOVP, R_VBUSOVP, &ovpval);
+ if (ret)
+ return ret;
+
+ *val = opmode ? ovpval * 2 : ovpval;
+ return 0;
+}
+
+static int rt9756_set_vbus_ovp(struct rt9756_data *data, int val)
+{
+ unsigned int opmode;
+ int ret;
+
+ /* operating mode -> 0 bypass, 1 div2 */
+ ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode);
+ if (ret)
+ return ret;
+
+ return rt9756_set_value_field_range(data, F_VBUSOVP_EN, F_VBUSOVP, R_VBUSOVP,
+ opmode ? val / 2 : val);
+}
+
+static const char * const rt9756_manufacturer = "Richtek Technology Corp.";
+static const char * const rt9756_model[MODEL_MAX] = { "RT9756", "RT9757", "RT9770" };
+
+static int rt9756_psy_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int *pval = &val->intval;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ return rt9756_get_switching_state(data, pval);
+ case POWER_SUPPLY_PROP_HEALTH:
+ *pval = rt9756_get_charger_health(data);
+ return 0;
+ case POWER_SUPPLY_PROP_ONLINE:
+ return rt9756_get_charger_online(data, pval);
+ case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+ return rt9756_get_vbus_ovp(data, pval);
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ return rt9756_get_adc(data, ADC_VBUS, pval);
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ return rt9756_get_value_field_range(data, F_IBUSOCP_EN, F_IBUSOCP, R_IBUSOCP, pval);
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ return rt9756_get_adc(data, ADC_IBUS, pval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+ return rt9756_get_value_field_range(data, F_VBATOVP_EN, F_VBATOVP, R_VBATOVP, pval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+ return rt9756_get_value_field_range(data, F_IBATOCP_EN, F_IBATOCP, R_IBATOCP, pval);
+ case POWER_SUPPLY_PROP_TEMP:
+ return rt9756_get_adc(data, ADC_TDIE, pval);
+ case POWER_SUPPLY_PROP_USB_TYPE:
+ *pval = atomic_read(&data->usb_type);
+ return 0;
+ case POWER_SUPPLY_PROP_MODEL_NAME:
+ val->strval = rt9756_model[data->model];
+ return 0;
+ case POWER_SUPPLY_PROP_MANUFACTURER:
+ val->strval = rt9756_manufacturer;
+ return 0;
+ default:
+ return -ENODATA;
+ }
+}
+
+static int rt9756_psy_set_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ const union power_supply_propval *val)
+{
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int intval = val->intval;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ memset(&data->chg_evt, 0, sizeof(data->chg_evt));
+ return regmap_field_write(data->rm_fields[F_CHG_EN], !!intval);
+ case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+ return rt9756_set_vbus_ovp(data, intval);
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ return rt9756_set_value_field_range(data, F_IBUSOCP_EN, F_IBUSOCP, R_IBUSOCP,
+ intval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+ return rt9756_set_value_field_range(data, F_VBATOVP_EN, F_VBATOVP, R_VBATOVP,
+ intval);
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+ return rt9756_set_value_field_range(data, F_IBATOCP_EN, F_IBATOCP, R_IBATOCP,
+ intval);
+ case POWER_SUPPLY_PROP_USB_TYPE:
+ return regmap_field_write(data->rm_fields[F_BC12_EN], !!intval);
+ default:
+ return -EINVAL;
+ }
+}
+
+static const enum power_supply_property rt9756_psy_properties[] = {
+ POWER_SUPPLY_PROP_STATUS,
+ POWER_SUPPLY_PROP_ONLINE,
+ POWER_SUPPLY_PROP_HEALTH,
+ POWER_SUPPLY_PROP_ONLINE,
+ POWER_SUPPLY_PROP_VOLTAGE_MAX,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_CURRENT_MAX,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+ POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX,
+ POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX,
+ POWER_SUPPLY_PROP_TEMP,
+ POWER_SUPPLY_PROP_USB_TYPE,
+ POWER_SUPPLY_PROP_MODEL_NAME,
+ POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
+static int rt9756_bat_psy_get_property(struct power_supply *psy,
+ enum power_supply_property psp,
+ union power_supply_propval *val)
+{
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int *pval = &val->intval;
+
+ switch (psp) {
+ case POWER_SUPPLY_PROP_TECHNOLOGY:
+ *pval = POWER_SUPPLY_TECHNOLOGY_LION;
+ return 0;
+ case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+ return rt9756_get_adc(data, ADC_VBAT, pval);
+ case POWER_SUPPLY_PROP_CURRENT_NOW:
+ return rt9756_get_adc(data, ADC_IBAT, pval);
+ default:
+ return -ENODATA;
+ }
+}
+
+static const enum power_supply_property rt9756_bat_psy_properties[] = {
+ POWER_SUPPLY_PROP_TECHNOLOGY,
+ POWER_SUPPLY_PROP_VOLTAGE_NOW,
+ POWER_SUPPLY_PROP_CURRENT_NOW,
+};
+
+static int rt9756_psy_property_is_writeable(struct power_supply *psy,
+ enum power_supply_property psp)
+{
+ switch (psp) {
+ case POWER_SUPPLY_PROP_STATUS:
+ case POWER_SUPPLY_PROP_ONLINE:
+ case POWER_SUPPLY_PROP_VOLTAGE_MAX:
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+ case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+ case POWER_SUPPLY_PROP_USB_TYPE:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static const unsigned int rt9756_wdt_millisecond[] = {
+ 500, 1000, 5000, 30000, 40000, 80000, 128000, 255000
+};
+
+static ssize_t watchdog_timer_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ unsigned int wdt_tmr_now = 0, wdt_sel, wdt_dis;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_WDT_DIS], &wdt_dis);
+ if (ret)
+ return ret;
+
+ if (!wdt_dis) {
+ ret = regmap_field_read(data->rm_fields[F_WDT_TMR], &wdt_sel);
+ if (ret)
+ return ret;
+
+ wdt_tmr_now = rt9756_wdt_millisecond[wdt_sel];
+ }
+
+ return sysfs_emit(buf, "%d\n", wdt_tmr_now);
+}
+
+static ssize_t watchdog_timer_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ unsigned int wdt_set, wdt_sel;
+ int ret;
+
+ ret = kstrtouint(buf, 10, &wdt_set);
+ if (ret)
+ return ret;
+
+ ret = regmap_field_write(data->rm_fields[F_WDT_DIS], 1);
+ if (ret)
+ return ret;
+
+ wdt_sel = find_closest(wdt_set, rt9756_wdt_millisecond,
+ ARRAY_SIZE(rt9756_wdt_millisecond));
+
+ ret = regmap_field_write(data->rm_fields[F_WDT_TMR], wdt_sel);
+ if (ret)
+ return ret;
+
+ if (wdt_set) {
+ ret = regmap_field_write(data->rm_fields[F_WDT_DIS], 0);
+ if (ret)
+ return ret;
+ }
+
+ return count;
+}
+
+static const char * const rt9756_opmode_str[] = { "bypass", "div2" };
+
+static ssize_t operation_mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ unsigned int opmode;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_OP_MODE], &opmode);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%s\n", rt9756_opmode_str[opmode]);
+}
+
+static ssize_t operation_mode_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct power_supply *psy = to_power_supply(dev);
+ struct rt9756_data *data = power_supply_get_drvdata(psy);
+ int index, ret;
+
+ index = sysfs_match_string(rt9756_opmode_str, buf);
+ if (index < 0)
+ return index;
+
+ ret = regmap_field_write(data->rm_fields[F_OP_MODE], index);
+
+ return ret ?: count;
+}
+
+static DEVICE_ATTR_RW(watchdog_timer);
+static DEVICE_ATTR_RW(operation_mode);
+
+static struct attribute *rt9756_sysfs_attrs[] = {
+ &dev_attr_watchdog_timer.attr,
+ &dev_attr_operation_mode.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(rt9756_sysfs);
+
+static int rt9756_register_psy(struct rt9756_data *data)
+{
+ struct power_supply_desc *desc = &data->psy_desc;
+ struct power_supply_desc *bat_desc = &data->bat_psy_desc;
+ struct power_supply_config cfg = {}, bat_cfg = {};
+ struct device *dev = data->dev;
+ char *psy_name, *bat_psy_name, **supplied_to;
+
+ bat_cfg.drv_data = data;
+ bat_cfg.fwnode = dev_fwnode(dev);
+
+ bat_psy_name = devm_kasprintf(dev, GFP_KERNEL, "rt9756-%s-battery", dev_name(dev));
+ if (!bat_psy_name)
+ return -ENOMEM;
+
+ bat_desc->name = bat_psy_name;
+ bat_desc->type = POWER_SUPPLY_TYPE_BATTERY;
+ bat_desc->properties = rt9756_bat_psy_properties;
+ bat_desc->num_properties = ARRAY_SIZE(rt9756_bat_psy_properties);
+ bat_desc->get_property = rt9756_bat_psy_get_property;
+
+ data->bat_psy = devm_power_supply_register(dev, bat_desc, &bat_cfg);
+ if (IS_ERR(data->bat_psy))
+ return dev_err_probe(dev, PTR_ERR(data->bat_psy), "Failed to register battery\n");
+
+ supplied_to = devm_kzalloc(dev, sizeof(*supplied_to), GFP_KERNEL);
+ if (!supplied_to)
+ return -ENOMEM;
+
+ /* Link charger psy to battery psy */
+ supplied_to[0] = bat_psy_name;
+
+ cfg.drv_data = data;
+ cfg.fwnode = dev_fwnode(dev);
+ cfg.attr_grp = rt9756_sysfs_groups;
+ cfg.supplied_to = supplied_to;
+ cfg.num_supplicants = 1;
+
+ psy_name = devm_kasprintf(dev, GFP_KERNEL, "rt9756-%s", dev_name(dev));
+ if (!psy_name)
+ return -ENOMEM;
+
+ desc->name = psy_name;
+ desc->type = POWER_SUPPLY_TYPE_USB;
+ desc->usb_types = BIT(POWER_SUPPLY_USB_TYPE_UNKNOWN) | BIT(POWER_SUPPLY_USB_TYPE_SDP) |
+ BIT(POWER_SUPPLY_USB_TYPE_DCP) | BIT(POWER_SUPPLY_USB_TYPE_CDP);
+ desc->properties = rt9756_psy_properties;
+ desc->num_properties = ARRAY_SIZE(rt9756_psy_properties);
+ desc->property_is_writeable = rt9756_psy_property_is_writeable;
+ desc->get_property = rt9756_psy_get_property;
+ desc->set_property = rt9756_psy_set_property;
+
+ data->psy = devm_power_supply_register(dev, desc, &cfg);
+
+ return PTR_ERR_OR_ZERO(data->psy);
+}
+
+static int rt9756_get_usb_type(struct rt9756_data *data)
+{
+ unsigned int type;
+ int report_type, ret;
+
+ ret = regmap_field_read(data->rm_fields[F_USB_STATE], &type);
+ if (ret)
+ return ret;
+
+ switch (type) {
+ case USB_SDP:
+ case USB_NSTD:
+ report_type = POWER_SUPPLY_USB_TYPE_SDP;
+ break;
+ case USB_DCP:
+ report_type = POWER_SUPPLY_USB_TYPE_DCP;
+ break;
+ case USB_CDP:
+ report_type = POWER_SUPPLY_USB_TYPE_CDP;
+ break;
+ case USB_NO_VBUS:
+ default:
+ report_type = POWER_SUPPLY_USB_TYPE_UNKNOWN;
+ break;
+ }
+
+ atomic_set(&data->usb_type, report_type);
+ return 0;
+}
+
+static irqreturn_t rt9756_irq_handler(int irq, void *devid)
+{
+ struct rt9756_data *data = devid;
+ struct regmap *regmap = data->regmap;
+ struct charger_event *evt = &data->chg_evt;
+ unsigned int bc12_flag = 0;
+ int ret;
+
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG1, &evt->flag1);
+ if (ret)
+ return IRQ_NONE;
+
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG2, &evt->flag2);
+ if (ret)
+ return IRQ_NONE;
+
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG3, &evt->flag3);
+ if (ret)
+ return IRQ_NONE;
+
+ if (data->model != MODEL_RT9770) {
+ ret = regmap_read(regmap, RT9756_REG_INTFLAG4, &evt->flag4);
+ if (ret)
+ return IRQ_NONE;
+
+ ret = regmap_read(regmap, RT9756_REG_BC12FLAG, &bc12_flag);
+ if (ret)
+ return IRQ_NONE;
+ }
+
+ dev_dbg(data->dev, "events: 0x%02x,%02x,%02x,%02x,%02x\n", evt->flag1, evt->flag2,
+ evt->flag3, evt->flag4, bc12_flag);
+
+ if (evt->flag2 & RT9756_EVT_VAC_INSERT) {
+ ret = regmap_field_write(data->rm_fields[F_BC12_EN], 1);
+ if (ret)
+ return IRQ_NONE;
+ }
+
+ if (evt->flag3 & RT9756_EVT_VAC_UVLO)
+ atomic_set(&data->usb_type, POWER_SUPPLY_USB_TYPE_UNKNOWN);
+
+ if (bc12_flag & RT9756_EVT_BC12_DONE) {
+ ret = rt9756_get_usb_type(data);
+ if (ret)
+ return IRQ_NONE;
+ }
+
+ power_supply_changed(data->psy);
+
+ return IRQ_HANDLED;
+}
+
+static int rt9756_config_batsense_resistor(struct rt9756_data *data)
+{
+ unsigned int shunt_resistor_uohms = 2000, rsense_sel;
+
+ device_property_read_u32(data->dev, "shunt-resistor-micro-ohms", &shunt_resistor_uohms);
+
+ if (!shunt_resistor_uohms || shunt_resistor_uohms > 5000)
+ return -EINVAL;
+
+ data->real_resistor = shunt_resistor_uohms;
+
+ /* Always choose the larger or equal one to prevent false ocp alarm */
+ if (shunt_resistor_uohms <= 1000) {
+ rsense_sel = 0;
+ data->rg_resistor = 1000;
+ } else if (shunt_resistor_uohms <= 2000) {
+ rsense_sel = 1;
+ data->rg_resistor = 2000;
+ } else {
+ rsense_sel = 2;
+ data->rg_resistor = 5000;
+ }
+
+ return regmap_field_write(data->rm_fields[F_IBAT_RSEN], rsense_sel);
+}
+
+static const struct reg_sequence rt9756_init_regs[] = {
+ REG_SEQ(0x00, 0x80, 1000), /* REG_RESET */
+ REG_SEQ0(0x04, 0x13), /* VACOVP/OVPGATE 12V */
+ REG_SEQ0(0x00, 0x28), /* WDT_DIS = 1 */
+ REG_SEQ0(0x0c, 0x02), /* MASK FLAG1 */
+ REG_SEQ0(0x0e, 0x06), /* MASK FLAG2 */
+ REG_SEQ0(0x10, 0xca), /* MASK FLAG3 */
+ REG_SEQ0(0x44, 0xa0), /* BC12_EN */
+ REG_SEQ0(0x47, 0x07), /* MASK BC12FLAG */
+ REG_SEQ0(0x4a, 0xfe), /* MASK FLAG4 */
+ REG_SEQ0(0x5c, 0x40), /* MASK CON_SWITCHING */
+ REG_SEQ0(0x63, 0x01), /* MASK VDDA_UVLO */
+};
+
+static const struct reg_sequence rt9770_init_regs[] = {
+ REG_SEQ(0x00, 0x80, 1000), /* REG_RESET */
+ REG_SEQ0(0x04, 0x13), /* VACOVP/OVPGATE 12V */
+ REG_SEQ0(0x00, 0x28), /* WDT_DIS = 1 */
+ REG_SEQ0(0x0c, 0x02), /* MASK FLAG1 */
+ REG_SEQ0(0x0e, 0x06), /* MASK FLAG2 */
+ REG_SEQ0(0x10, 0xca), /* MASK FLAG3 */
+ REG_SEQ0(0x5c, 0x40), /* MASK CON_SWITCHING */
+ REG_SEQ0(0x63, 0x01), /* MASK VDDA_UVLO */
+};
+
+static const struct regmap_config rt9756_regmap_config = {
+ .name = "rt9756",
+ .reg_bits = 16,
+ .val_bits = 8,
+ .max_register = 0x1ff,
+};
+
+static const struct regmap_config rt9770_regmap_config = {
+ .name = "rt9770",
+ .reg_bits = 8,
+ .val_bits = 8,
+ .max_register = 0xff,
+};
+
+static int rt9756_check_device_model(struct rt9756_data *data)
+{
+ struct device *dev = data->dev;
+ unsigned int revid;
+ int ret;
+
+ ret = regmap_field_read(data->rm_fields[F_REVISION], &revid);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to read revid\n");
+
+ if (revid == RT9757_REVID || revid == RT9757A_REVID)
+ data->model = MODEL_RT9757;
+ else if (revid == RT9756_REVID || revid == RT9756A_REVID)
+ data->model = MODEL_RT9756;
+ else
+ return dev_err_probe(dev, -EINVAL, "Unknown revision %d\n", revid);
+
+ return 0;
+}
+
+static int rt9770_check_device_model(struct rt9756_data *data)
+{
+ data->model = MODEL_RT9770;
+ return 0;
+}
+
+static int rt9756_probe(struct i2c_client *i2c)
+{
+ const struct rt975x_dev_data *dev_data;
+ struct device *dev = &i2c->dev;
+ struct rt9756_data *data;
+ struct regmap *regmap;
+ unsigned int devid;
+ int ret;
+
+ dev_data = device_get_match_data(dev);
+ if (!dev_data)
+ return dev_err_probe(dev, -EINVAL, "No device data found\n");
+
+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->dev = dev;
+ mutex_init(&data->adc_lock);
+ atomic_set(&data->usb_type, POWER_SUPPLY_USB_TYPE_UNKNOWN);
+ i2c_set_clientdata(i2c, data);
+
+ regmap = devm_regmap_init_i2c(i2c, dev_data->regmap_config);
+ if (IS_ERR(regmap))
+ return dev_err_probe(dev, PTR_ERR(regmap), "Failed to init regmap\n");
+
+ data->regmap = regmap;
+
+ ret = devm_regmap_field_bulk_alloc(dev, regmap, data->rm_fields, dev_data->reg_fields,
+ F_MAX_FIELD);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to alloc regmap fields\n");
+
+ /* Richtek Device ID check */
+ ret = regmap_field_read(data->rm_fields[F_DEV_ID], &devid);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to read devid\n");
+
+ if (devid != RICHTEK_DEVID)
+ return dev_err_probe(dev, -ENODEV, "Incorrect VID 0x%02x\n", devid);
+
+ /* Get specific model */
+ ret = dev_data->check_device_model(data);
+ if (ret)
+ return ret;
+
+ ret = regmap_register_patch(regmap, dev_data->init_regs, dev_data->num_init_regs);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to init registers\n");
+
+ ret = rt9756_config_batsense_resistor(data);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to config batsense resistor\n");
+
+ ret = rt9756_register_psy(data);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to init power supply\n");
+
+ return devm_request_threaded_irq(dev, i2c->irq, NULL, rt9756_irq_handler, IRQF_ONESHOT,
+ dev_name(dev), data);
+}
+
+static void rt9756_shutdown(struct i2c_client *i2c)
+{
+ struct rt9756_data *data = i2c_get_clientdata(i2c);
+
+ regmap_field_write(data->rm_fields[F_REG_RST], 1);
+}
+
+static const struct rt975x_dev_data rt9756_dev_data = {
+ .regmap_config = &rt9756_regmap_config,
+ .reg_fields = rt9756_chg_fields,
+ .init_regs = rt9756_init_regs,
+ .num_init_regs = ARRAY_SIZE(rt9756_init_regs),
+ .check_device_model = rt9756_check_device_model,
+};
+
+static const struct rt975x_dev_data rt9770_dev_data = {
+ .regmap_config = &rt9770_regmap_config,
+ .reg_fields = rt9770_chg_fields,
+ .init_regs = rt9770_init_regs,
+ .num_init_regs = ARRAY_SIZE(rt9770_init_regs),
+ .check_device_model = rt9770_check_device_model,
+};
+
+static const struct of_device_id rt9756_device_match_table[] = {
+ { .compatible = "richtek,rt9756", .data = &rt9756_dev_data },
+ { .compatible = "richtek,rt9770", .data = &rt9770_dev_data },
+ {}
+};
+MODULE_DEVICE_TABLE(of, rt9756_device_match_table);
+
+static struct i2c_driver rt9756_charger_driver = {
+ .driver = {
+ .name = "rt9756",
+ .of_match_table = rt9756_device_match_table,
+ },
+ .probe = rt9756_probe,
+ .shutdown = rt9756_shutdown,
+};
+module_i2c_driver(rt9756_charger_driver);
+
+MODULE_DESCRIPTION("Richtek RT9756 charger driver");
+MODULE_AUTHOR("ChiYuan Huang <cy_huang@richtek.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/wm831x_power.c b/drivers/power/supply/wm831x_power.c
index 6acdba7885ca..78fa0573ef25 100644
--- a/drivers/power/supply/wm831x_power.c
+++ b/drivers/power/supply/wm831x_power.c
@@ -144,6 +144,7 @@ static int wm831x_usb_limit_change(struct notifier_block *nb,
struct wm831x_power,
usb_notify);
unsigned int i, best;
+ int ret;
/* Find the highest supported limit */
best = 0;
@@ -156,8 +157,13 @@ static int wm831x_usb_limit_change(struct notifier_block *nb,
dev_dbg(wm831x_power->wm831x->dev,
"Limiting USB current to %umA", wm831x_usb_limits[best]);
- wm831x_set_bits(wm831x_power->wm831x, WM831X_POWER_STATE,
- WM831X_USB_ILIM_MASK, best);
+ ret = wm831x_set_bits(wm831x_power->wm831x, WM831X_POWER_STATE,
+ WM831X_USB_ILIM_MASK, best);
+ if (ret < 0) {
+ dev_err(wm831x_power->wm831x->dev,
+ "Failed to set USB current limit: %d\n", ret);
+ return ret;
+ }
return 0;
}
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index ac0e132ccc3e..2a5b5a9fdcb3 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -53,9 +53,45 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
}
EXPORT_SYMBOL_GPL(log_non_standard_event);
-void log_arm_hw_error(struct cper_sec_proc_arm *err)
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
{
- trace_arm_event(err);
+ struct cper_arm_err_info *err_info;
+ struct cper_arm_ctx_info *ctx_info;
+ u8 *ven_err_data;
+ u32 ctx_len = 0;
+ int n, sz, cpu;
+ s32 vsei_len;
+ u32 pei_len;
+ u8 *pei_err, *ctx_err;
+
+ pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
+ pei_err = (u8 *)(err + 1);
+
+ err_info = (struct cper_arm_err_info *)(err + 1);
+ ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
+ ctx_err = (u8 *)ctx_info;
+
+ for (n = 0; n < err->context_info_num; n++) {
+ sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
+ ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
+ ctx_len += sz;
+ }
+
+ vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + pei_len + ctx_len);
+ if (vsei_len < 0) {
+ pr_warn(FW_BUG "section length: %d\n", err->section_length);
+ pr_warn(FW_BUG "section length is too small\n");
+ pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
+ vsei_len = 0;
+ }
+ ven_err_data = (u8 *)ctx_info;
+
+ cpu = GET_LOGICAL_INDEX(err->mpidr);
+ if (cpu < 0)
+ cpu = -1;
+
+ trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
+ ven_err_data, (u32)vsei_len, sev, cpu);
}
static int __init ras_init(void)
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index ea532a8a4a0c..a596f6013019 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -313,10 +313,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private,
return 0;
}
-static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
- struct vfio_region_info *info,
- unsigned long arg)
+static int vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
+ struct vfio_ccw_private *private =
+ container_of(vdev, struct vfio_ccw_private, vdev);
int i;
switch (info->index) {
@@ -328,7 +330,6 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
return 0;
default: /* all other regions are handled via capability chain */
{
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
struct vfio_region_info_cap_type cap_type = {
.header.id = VFIO_REGION_INFO_CAP_TYPE,
.header.version = 1 };
@@ -351,27 +352,10 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
cap_type.type = private->region[i].type;
cap_type.subtype = private->region[i].subtype;
- ret = vfio_info_add_capability(&caps, &cap_type.header,
+ ret = vfio_info_add_capability(caps, &cap_type.header,
sizeof(cap_type));
if (ret)
return ret;
-
- info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info->argsz < sizeof(*info) + caps.size) {
- info->argsz = sizeof(*info) + caps.size;
- info->cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(*info));
- if (copy_to_user((void __user *)arg + sizeof(*info),
- caps.buf, caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info->cap_offset = sizeof(*info);
- }
-
- kfree(caps.buf);
-
}
}
return 0;
@@ -532,24 +516,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev,
return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info info;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- ret = vfio_ccw_mdev_get_region_info(private, &info, arg);
- if (ret)
- return ret;
-
- return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
struct vfio_irq_info info;
@@ -627,6 +593,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = {
.read = vfio_ccw_mdev_read,
.write = vfio_ccw_mdev_write,
.ioctl = vfio_ccw_mdev_ioctl,
+ .get_region_info_caps = vfio_ccw_mdev_ioctl_get_region_info,
.request = vfio_ccw_mdev_request,
.dma_unmap = vfio_ccw_dma_unmap,
.bind_iommufd = vfio_iommufd_emulated_bind,
diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c
index ff9adfc0b332..bdfd06516671 100644
--- a/drivers/scsi/bfa/bfad.c
+++ b/drivers/scsi/bfa/bfad.c
@@ -1528,7 +1528,6 @@ bfad_pci_slot_reset(struct pci_dev *pdev)
goto out_disable_device;
}
- pci_save_state(pdev);
pci_set_master(pdev);
rc = dma_set_mask_and_coherent(&bfad->pcidev->dev, DMA_BIT_MASK(64));
diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c
index 79c8dafdd49e..db0c2174430a 100644
--- a/drivers/scsi/csiostor/csio_init.c
+++ b/drivers/scsi/csiostor/csio_init.c
@@ -1093,7 +1093,6 @@ csio_pci_slot_reset(struct pci_dev *pdev)
pci_set_master(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
/* Bring HW s/m to ready state.
* but don't resume IOs.
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 44214884deaf..95123689e9d1 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -7859,7 +7859,6 @@ static int ipr_reset_restore_cfg_space(struct ipr_cmnd *ipr_cmd)
struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg;
ENTER;
- ioa_cfg->pdev->state_saved = true;
pci_restore_state(ioa_cfg->pdev);
if (ipr_set_pcix_cmd_reg(ioa_cfg)) {
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index f206267d9ecd..065eb91de9c0 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -14434,12 +14434,6 @@ lpfc_io_slot_reset_s3(struct pci_dev *pdev)
pci_restore_state(pdev);
- /*
- * As the new kernel behavior of pci_restore_state() API call clears
- * device saved_state flag, need to save the restored state again.
- */
- pci_save_state(pdev);
-
if (pdev->is_busmaster)
pci_set_master(pdev);
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 5ffd94586652..9007533e36e0 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -7886,11 +7886,6 @@ qla2xxx_pci_slot_reset(struct pci_dev *pdev)
pci_restore_state(pdev);
- /* pci_restore_state() clears the saved_state flag of the device
- * save restored state which resets saved_state flag
- */
- pci_save_state(pdev);
-
if (ha->mem_only)
rc = pci_enable_device_mem(pdev);
else
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 83ff66f954e6..97329c97332f 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -9796,11 +9796,6 @@ qla4xxx_pci_slot_reset(struct pci_dev *pdev)
*/
pci_restore_state(pdev);
- /* pci_restore_state() clears the saved_state flag of the device
- * save restored state which resets saved_state flag
- */
- pci_save_state(pdev);
-
/* Initialize device or resume if in suspended state */
rc = pci_enable_device(pdev);
if (rc) {
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index 152f914c599d..65bd370f282a 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -6178,7 +6178,6 @@ static pci_ers_result_t serial8250_io_slot_reset(struct pci_dev *dev)
return PCI_ERS_RESULT_DISCONNECT;
pci_restore_state(dev);
- pci_save_state(dev);
return PCI_ERS_RESULT_RECOVERED;
}
diff --git a/drivers/tty/serial/jsm/jsm_driver.c b/drivers/tty/serial/jsm/jsm_driver.c
index 417a5b6bffc3..8d21373cae57 100644
--- a/drivers/tty/serial/jsm/jsm_driver.c
+++ b/drivers/tty/serial/jsm/jsm_driver.c
@@ -355,7 +355,6 @@ static void jsm_io_resume(struct pci_dev *pdev)
struct jsm_board *brd = pci_get_drvdata(pdev);
pci_restore_state(pdev);
- pci_save_state(pdev);
jsm_uart_port_init(brd);
}
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index a7936bd1aabe..ddaa1366704b 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1256,7 +1256,7 @@ static int query_virtqueues(struct mlx5_vdpa_net *ndev,
int vq_idx = start_vq + i;
if (cmd->err) {
- mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, err);
+ mlx5_vdpa_err(mvdev, "query vq %d failed, err: %d\n", vq_idx, cmd->err);
if (!err)
err = cmd->err;
continue;
diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
index 9e8d07078606..31a02e7fd7f2 100644
--- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c
+++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c
@@ -736,6 +736,7 @@ static int octep_sriov_enable(struct pci_dev *pdev, int num_vfs)
octep_vdpa_assign_barspace(vf_pdev, pdev, index);
if (++index == num_vfs) {
done = true;
+ pci_dev_put(vf_pdev);
break;
}
}
diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
index 36f61cc96e21..43426bd971ac 100644
--- a/drivers/vdpa/pds/vdpa_dev.c
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -51,7 +51,7 @@ static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv)
err = pdsc_register_notify(nb);
if (err) {
nb->notifier_call = NULL;
- dev_err(dev, "failed to register pds event handler: %ps\n",
+ dev_err(dev, "failed to register pds event handler: %pe\n",
ERR_PTR(err));
return -EINVAL;
}
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index e7bced0b5542..ae357d014564 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -2173,7 +2173,8 @@ static int vduse_init(void)
if (!vduse_irq_wq)
goto err_wq;
- vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0);
+ vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound",
+ WQ_HIGHPRI | WQ_PERCPU, 0);
if (!vduse_irq_bound_wq)
goto err_bound_wq;
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index 5dd5f5ad7686..253031b86b60 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -129,28 +129,22 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
-static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev,
- struct vfio_region_info __user *arg)
+static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_cdx_device *vdev =
+ container_of(core_vdev, struct vfio_cdx_device, vdev);
struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
- struct vfio_region_info info;
-
- if (copy_from_user(&info, arg, minsz))
- return -EFAULT;
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= cdx_dev->res_count)
+ if (info->index >= cdx_dev->res_count)
return -EINVAL;
/* map offset to the physical address */
- info.offset = vfio_cdx_index_to_offset(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+ info->offset = vfio_cdx_index_to_offset(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
}
static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev,
@@ -219,8 +213,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
switch (cmd) {
case VFIO_DEVICE_GET_INFO:
return vfio_cdx_ioctl_get_info(vdev, uarg);
- case VFIO_DEVICE_GET_REGION_INFO:
- return vfio_cdx_ioctl_get_region_info(vdev, uarg);
case VFIO_DEVICE_GET_IRQ_INFO:
return vfio_cdx_ioctl_get_irq_info(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
@@ -284,6 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = {
.open_device = vfio_cdx_open_device,
.close_device = vfio_cdx_close_device,
.ioctl = vfio_cdx_ioctl,
+ .get_region_info_caps = vfio_cdx_ioctl_get_region_info,
.device_feature = vfio_cdx_ioctl_feature,
.mmap = vfio_cdx_mmap,
.bind_iommufd = vfio_iommufd_physical_bind,
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 480cac3a0c27..8ceca24ac136 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
return ret;
if (user_size < minsz)
return -EINVAL;
- ret = copy_struct_from_user(&bind, minsz, arg, user_size);
+ ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size);
if (ret)
return ret;
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
index 76ccbab0e3d6..ba47100f28c1 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -117,6 +117,24 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev)
fsl_mc_cleanup_irq_pool(mc_cont);
}
+static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_fsl_mc_device *vdev =
+ container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+ if (info->index >= mc_dev->obj_desc.region_count)
+ return -EINVAL;
+
+ /* map offset to the physical address */
+ info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
+}
+
static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
@@ -149,30 +167,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info info;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= mc_dev->obj_desc.region_count)
- return -EINVAL;
-
- /* map offset to the physical address */
- info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- if (copy_to_user((void __user *)arg, &info, minsz))
- return -EFAULT;
- return 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
struct vfio_irq_info info;
@@ -589,6 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = {
.open_device = vfio_fsl_mc_open_device,
.close_device = vfio_fsl_mc_close_device,
.ioctl = vfio_fsl_mc_ioctl,
+ .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info,
.read = vfio_fsl_mc_read,
.write = vfio_fsl_mc_write,
.mmap = vfio_fsl_mc_mmap,
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f54665..1e82b44bda1a 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM
To enable s390x KVM vfio-pci extensions, say Y.
+config VFIO_PCI_DMABUF
+ def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER
+
source "drivers/vfio/pci/mlx5/Kconfig"
source "drivers/vfio/pci/hisilicon/Kconfig"
@@ -67,4 +70,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
source "drivers/vfio/pci/qat/Kconfig"
+source "drivers/vfio/pci/xe/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..e0a0757dd1d2 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,6 +2,7 @@
vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
vfio-pci-y := vfio_pci.o
@@ -19,3 +20,5 @@ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
obj-$(CONFIG_QAT_VFIO_PCI) += qat/
+
+obj-$(CONFIG_XE_VFIO_PCI) += xe/
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index fde33f54e99e..cf45f6370c36 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr)
return 0;
}
+static void qm_xqc_reg_offsets(struct hisi_qm *qm,
+ u32 *eqc_addr, u32 *aeqc_addr)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev =
+ container_of(qm, struct hisi_acc_vf_core_device, vf_qm);
+
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) {
+ *eqc_addr = QM_EQC_VF_DW0;
+ *aeqc_addr = QM_AEQC_VF_DW0;
+ } else {
+ *eqc_addr = QM_EQC_PF_DW0;
+ *aeqc_addr = QM_AEQC_PF_DW0;
+ }
+}
+
static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
{
struct device *dev = &qm->pdev->dev;
+ u32 eqc_addr, aeqc_addr;
int ret;
ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
@@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
return ret;
}
+ qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr);
/* QM_EQC_DW has 7 regs */
- ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7);
if (ret) {
dev_err(dev, "failed to read QM_EQC_DW\n");
return ret;
}
/* QM_AEQC_DW has 7 regs */
- ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7);
if (ret) {
dev_err(dev, "failed to read QM_AEQC_DW\n");
return ret;
@@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
{
struct device *dev = &qm->pdev->dev;
+ u32 eqc_addr, aeqc_addr;
int ret;
/* Check VF state */
@@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
return ret;
}
+ qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr);
/* QM_EQC_DW has 7 regs */
- ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7);
if (ret) {
dev_err(dev, "failed to write QM_EQC_DW\n");
return ret;
}
/* QM_AEQC_DW has 7 regs */
- ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7);
if (ret) {
dev_err(dev, "failed to write QM_AEQC_DW\n");
return ret;
@@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm;
struct pci_dev *vf_dev = vdev->pdev;
+ u32 val;
- /*
- * ACC VF dev BAR2 region consists of both functional register space
- * and migration control register space. For migration to work, we
- * need access to both. Hence, we map the entire BAR2 region here.
- * But unnecessarily exposing the migration BAR region to the Guest
- * has the potential to prevent/corrupt the Guest migration. Hence,
- * we restrict access to the migration control space from
- * Guest(Please see mmap/ioctl/read/write override functions).
- *
- * Please note that it is OK to expose the entire VF BAR if migration
- * is not supported or required as this cannot affect the ACC PF
- * configurations.
- *
- * Also the HiSilicon ACC VF devices supported by this driver on
- * HiSilicon hardware platforms are integrated end point devices
- * and the platform lacks the capability to perform any PCIe P2P
- * between these devices.
- */
+ val = readl(pf_qm->io_base + QM_MIG_REGION_SEL);
+ if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN))
+ hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL;
+ else
+ hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL;
- vf_qm->io_base =
- ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
- pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
- if (!vf_qm->io_base)
- return -EIO;
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) {
+ /*
+ * On hardware platforms greater than QM_HW_V3, the migration function
+ * register is placed in the BAR2 configuration region of the PF,
+ * and each VF device occupies 8KB of configuration space.
+ */
+ vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET +
+ hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE;
+ } else {
+ /*
+ * ACC VF dev BAR2 region consists of both functional register space
+ * and migration control register space. For migration to work, we
+ * need access to both. Hence, we map the entire BAR2 region here.
+ * But unnecessarily exposing the migration BAR region to the Guest
+ * has the potential to prevent/corrupt the Guest migration. Hence,
+ * we restrict access to the migration control space from
+ * Guest(Please see mmap/ioctl/read/write override functions).
+ *
+ * Please note that it is OK to expose the entire VF BAR if migration
+ * is not supported or required as this cannot affect the ACC PF
+ * configurations.
+ *
+ * Also the HiSilicon ACC VF devices supported by this driver on
+ * HiSilicon hardware platforms are integrated end point devices
+ * and the platform lacks the capability to perform any PCIe P2P
+ * between these devices.
+ */
+ vf_qm->io_base =
+ ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
+ pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
+ if (!vf_qm->io_base)
+ return -EIO;
+ }
vf_qm->fun_type = QM_HW_VF;
+ vf_qm->ver = pf_qm->ver;
vf_qm->pdev = vf_dev;
mutex_init(&vf_qm->mailbox_lock);
@@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev)
return !IS_ERR(pf_qm) ? pf_qm : NULL;
}
+static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev,
+ unsigned int index)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev =
+ hisi_acc_drvdata(vdev->pdev);
+
+ /*
+ * On the old HW_ACC_MIG_VF_CTRL mode device, the ACC VF device
+ * BAR2 region encompasses both functional register space
+ * and migration control register space.
+ * only the functional region should be report to Guest.
+ */
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL)
+ return (pci_resource_len(vdev->pdev, index) >> 1);
+ /*
+ * On the new HW device, the migration control register
+ * has been moved to the PF device BAR2 region.
+ * The VF device BAR2 is entirely functional register space.
+ */
+ return pci_resource_len(vdev->pdev, index);
+}
+
static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
size_t count, loff_t *ppos,
size_t *new_count)
@@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
if (index == VFIO_PCI_BAR2_REGION_INDEX) {
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+ resource_size_t end;
+ end = hisi_acc_get_resource_len(vdev, index);
/* Check if access is for migration control region */
if (pos >= end)
return -EINVAL;
@@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev,
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
if (index == VFIO_PCI_BAR2_REGION_INDEX) {
u64 req_len, pgoff, req_start;
- resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+ resource_size_t end;
+ end = hisi_acc_get_resource_len(vdev, index);
req_len = vma->vm_end - vma->vm_start;
pgoff = vma->vm_pgoff &
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
@@ -1324,43 +1385,23 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev,
return vfio_pci_core_read(core_vdev, buf, new_count, ppos);
}
-static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
+static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
- struct vfio_pci_core_device *vdev =
- container_of(core_vdev, struct vfio_pci_core_device, vdev);
- struct pci_dev *pdev = vdev->pdev;
- struct vfio_region_info info;
- unsigned long minsz;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
- if (info.index == VFIO_PCI_BAR2_REGION_INDEX) {
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ if (info->index != VFIO_PCI_BAR2_REGION_INDEX)
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
- /*
- * ACC VF dev BAR2 region consists of both functional
- * register space and migration control register space.
- * Report only the functional region to Guest.
- */
- info.size = pci_resource_len(pdev, info.index) / 2;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE |
- VFIO_REGION_INFO_FLAG_MMAP;
+ info->size = hisi_acc_get_resource_len(vdev, info->index);
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
- }
- }
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+ return 0;
}
static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev)
@@ -1521,7 +1562,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
hisi_acc_vf_disable_fds(hisi_acc_vdev);
mutex_lock(&hisi_acc_vdev->open_mutex);
hisi_acc_vdev->dev_opened = false;
- iounmap(vf_qm->io_base);
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL)
+ iounmap(vf_qm->io_base);
mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_close_device(core_vdev);
}
@@ -1557,13 +1599,15 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
.release = vfio_pci_core_release_dev,
.open_device = hisi_acc_vfio_pci_open_device,
.close_device = hisi_acc_vfio_pci_close_device,
- .ioctl = hisi_acc_vfio_pci_ioctl,
+ .ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = hisi_acc_vfio_ioctl_get_region,
.device_feature = vfio_pci_core_ioctl_feature,
.read = hisi_acc_vfio_pci_read,
.write = hisi_acc_vfio_pci_write,
.mmap = hisi_acc_vfio_pci_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -1577,6 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.open_device = hisi_acc_vfio_pci_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 91002ceeebc1..cd55eba64dfb 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -50,8 +50,10 @@
#define QM_QUE_ISO_CFG_V 0x0030
#define QM_PAGE_SIZE 0x0034
-#define QM_EQC_DW0 0X8000
-#define QM_AEQC_DW0 0X8020
+#define QM_EQC_VF_DW0 0X8000
+#define QM_AEQC_VF_DW0 0X8020
+#define QM_EQC_PF_DW0 0x1c00
+#define QM_AEQC_PF_DW0 0x1c20
#define ACC_DRV_MAJOR_VER 1
#define ACC_DRV_MINOR_VER 0
@@ -59,6 +61,22 @@
#define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC
#define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE
+#define QM_MIG_REGION_OFFSET 0x180000
+#define QM_MIG_REGION_SIZE 0x2000
+
+/**
+ * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live
+ * migration functionality is located in the latter 32KB of the VF's BAR2.
+ * The Guest is only provided with the first 32KB of the VF's BAR2.
+ * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live
+ * migration functionality is located in the PF's BAR2, and the entire 64KB
+ * of the VF's BAR2 is allocated to the Guest.
+ */
+enum hw_drv_mode {
+ HW_ACC_MIG_VF_CTRL = 0,
+ HW_ACC_MIG_PF_CTRL,
+};
+
struct acc_vf_data {
#define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state)
/* QM match information */
@@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device {
struct pci_dev *vf_dev;
struct hisi_qm *pf_qm;
struct hisi_qm vf_qm;
+ enum hw_drv_mode drv_mode;
/*
* vf_qm_state represents the QM_VF_STATE register value.
* It is set by Guest driver for the ACC VF dev indicating
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 7ec47e736a8e..9c5970411d07 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
.open_device = mlx5vf_pci_open_device,
.close_device = mlx5vf_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index e346392b72f6..84d142a47ec6 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -7,6 +7,8 @@
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/pm_runtime.h>
/*
* The device memory usable to the workloads running in the VM is cached
@@ -58,6 +60,8 @@ struct nvgrace_gpu_pci_core_device {
/* Lock to control device memory kernel mapping */
struct mutex remap_lock;
bool has_mig_hw_bug;
+ /* GPU has just been reset */
+ bool reset_done;
};
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -102,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
mutex_init(&nvdev->remap_lock);
}
+ /*
+ * GPU readiness is checked by reading the BAR0 registers.
+ *
+ * ioremap BAR0 to ensure that the BAR0 mapping is present before
+ * register reads on first fault before establishing any GPU
+ * memory mapping.
+ */
+ ret = vfio_pci_core_setup_barmap(vdev, 0);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+
vfio_pci_core_finish_enable(vdev);
return 0;
@@ -130,6 +147,106 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
vfio_pci_core_close_device(core_vdev);
}
+static int nvgrace_gpu_wait_device_ready(void __iomem *io)
+{
+ unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+
+ do {
+ if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+ (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY))
+ return 0;
+ msleep(POLL_QUANTUM_MS);
+ } while (!time_after(jiffies, timeout));
+
+ return -ETIME;
+}
+
+/*
+ * If the GPU memory is accessed by the CPU while the GPU is not ready
+ * after reset, it can cause harmless corrected RAS events to be logged.
+ * Make sure the GPU is ready before establishing the mappings.
+ */
+static int
+nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ int ret;
+
+ lockdep_assert_held_read(&vdev->memory_lock);
+
+ if (!nvdev->reset_done)
+ return 0;
+
+ if (!__vfio_pci_memory_enabled(vdev))
+ return -EIO;
+
+ ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+ if (ret)
+ return ret;
+
+ nvdev->reset_done = false;
+
+ return 0;
+}
+
+static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ u64 pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ unsigned int index =
+ vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+ struct mem_region *memregion;
+ unsigned long pfn, addr;
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return VM_FAULT_SIGBUS;
+
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
+
+ if (is_aligned_for_order(vma, addr, pfn, order)) {
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ if (vdev->pm_runtime_engaged ||
+ nvgrace_gpu_check_device_ready(nvdev))
+ return VM_FAULT_SIGBUS;
+
+ ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+ }
+ }
+
+ dev_dbg_ratelimited(&vdev->pdev->dev,
+ "%s order = %d pfn 0x%lx: 0x%x\n",
+ __func__, order, pfn,
+ (unsigned int)ret);
+
+ return ret;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+ return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+ .fault = nvgrace_gpu_vfio_pci_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+ .huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
+#endif
+};
+
static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
struct vm_area_struct *vma)
{
@@ -137,10 +254,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
struct mem_region *memregion;
- unsigned long start_pfn;
u64 req_len, pgoff, end;
unsigned int index;
- int ret = 0;
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
@@ -157,17 +272,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
- check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
return -EOVERFLOW;
/*
- * Check that the mapping request does not go beyond available device
- * memory size
+ * Check that the mapping request does not go beyond the exposed
+ * device memory size.
*/
if (end > memregion->memlength)
return -EINVAL;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+
/*
* The carved out region of the device memory needs the NORMAL_NC
* property. Communicate as such to the hypervisor.
@@ -184,56 +300,31 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
}
- /*
- * Perform a PFN map to the memory and back the device BAR by the
- * GPU memory.
- *
- * The available GPU memory size may not be power-of-2 aligned. The
- * remainder is only backed by vfio_device_ops read/write handlers.
- *
- * During device reset, the GPU is safely disconnected to the CPU
- * and access to the BAR will be immediately returned preventing
- * machine check.
- */
- ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
- req_len, vma->vm_page_prot);
- if (ret)
- return ret;
-
- vma->vm_pgoff = start_pfn;
+ vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
+ vma->vm_private_data = nvdev;
return 0;
}
-static long
-nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned long arg)
+static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
struct vfio_region_info_cap_sparse_mmap *sparse;
- struct vfio_region_info info;
struct mem_region *memregion;
u32 size;
int ret;
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
/*
* Request to determine the BAR region information. Send the
* GPU memory information.
*/
- memregion = nvgrace_gpu_memregion(info.index, nvdev);
+ memregion = nvgrace_gpu_memregion(info->index, nvdev);
if (!memregion)
- return vfio_pci_core_ioctl(core_vdev,
- VFIO_DEVICE_GET_REGION_INFO, arg);
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
size = struct_size(sparse, areas, 1);
@@ -252,49 +343,28 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
sparse->header.version = 1;
- ret = vfio_info_add_capability(&caps, &sparse->header, size);
+ ret = vfio_info_add_capability(caps, &sparse->header, size);
kfree(sparse);
if (ret)
return ret;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
/*
* The region memory size may not be power-of-2 aligned.
* Given that the memory is a BAR and may not be
* aligned, roundup to the next power-of-2.
*/
- info.size = memregion->bar_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
+ info->size = memregion->bar_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE |
VFIO_REGION_INFO_FLAG_MMAP;
-
- if (caps.size) {
- info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- info.cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user((void __user *)arg +
- sizeof(info), caps.buf,
- caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info.cap_offset = sizeof(info);
- }
- kfree(caps.buf);
- }
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
+ return 0;
}
static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
case VFIO_DEVICE_IOEVENTFD:
return -ENOTTY;
case VFIO_DEVICE_RESET:
@@ -510,6 +580,7 @@ static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
char __user *buf, size_t count, loff_t *ppos)
{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
struct mem_region *memregion;
@@ -536,9 +607,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
else
mem_count = min(count, memregion->memlength - (size_t)offset);
- ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
- if (ret)
- return ret;
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ ret = nvgrace_gpu_check_device_ready(nvdev);
+ if (ret)
+ return ret;
+
+ ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+ }
/*
* Only the device memory present on the hardware is mapped, which may
@@ -563,9 +640,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev,
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ int ret;
- if (nvgrace_gpu_memregion(index, nvdev))
- return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+ if (nvgrace_gpu_memregion(index, nvdev)) {
+ if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+ return -EIO;
+ ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+ pm_runtime_put(&vdev->pdev->dev);
+ return ret;
+ }
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
@@ -627,6 +711,7 @@ static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
size_t count, loff_t *ppos, const char __user *buf)
{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
struct mem_region *memregion;
@@ -656,9 +741,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
*/
mem_count = min(count, memregion->memlength - (size_t)offset);
- ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
- if (ret)
- return ret;
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ ret = nvgrace_gpu_check_device_ready(nvdev);
+ if (ret)
+ return ret;
+
+ ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+ }
exitfn:
*ppos += count;
@@ -672,10 +763,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev,
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ int ret;
- if (nvgrace_gpu_memregion(index, nvdev))
- return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+ if (nvgrace_gpu_memregion(index, nvdev)) {
+ if (pm_runtime_resume_and_get(&vdev->pdev->dev))
+ return -EIO;
+ ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+ pm_runtime_put(&vdev->pdev->dev);
+ return ret;
+ }
if (index == VFIO_PCI_CONFIG_REGION_INDEX)
return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
@@ -683,6 +781,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev,
return vfio_pci_core_write(core_vdev, buf, count, ppos);
}
+static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev = container_of(
+ core_vdev, struct nvgrace_gpu_pci_core_device, core_device);
+ struct pci_dev *pdev = core_vdev->pdev;
+ struct mem_region *mem_region;
+
+ /*
+ * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) {
+ * The P2P properties of the non-BAR memory is the same as the
+ * BAR memory, so just use the provider for index 0. Someday
+ * when CXL gets P2P support we could create CXLish providers
+ * for the non-BAR memory.
+ * } else if (region_index == USEMEM_REGION_INDEX) {
+ * This is actually cachable memory and isn't treated as P2P in
+ * the chip. For now we have no way to push cachable memory
+ * through everything and the Grace HW doesn't care what caching
+ * attribute is programmed into the SMMU. So use BAR 0.
+ * }
+ */
+ mem_region = nvgrace_gpu_memregion(region_index, nvdev);
+ if (mem_region) {
+ *provider = pcim_p2pdma_provider(pdev, 0);
+ if (!*provider)
+ return -EINVAL;
+ return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges,
+ nr_ranges,
+ mem_region->memphys,
+ mem_region->memlength);
+ }
+
+ return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index,
+ phys_vec, dma_ranges, nr_ranges);
+}
+
+static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = {
+ .get_dmabuf_phys = nvgrace_get_dmabuf_phys,
+};
+
static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
.name = "nvgrace-gpu-vfio-pci",
.init = vfio_pci_core_init_dev,
@@ -690,6 +832,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
.open_device = nvgrace_gpu_open_device,
.close_device = nvgrace_gpu_close_device,
.ioctl = nvgrace_gpu_ioctl,
+ .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = nvgrace_gpu_read,
.write = nvgrace_gpu_write,
@@ -703,6 +846,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = {
+ .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
+};
+
static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
.name = "nvgrace-gpu-vfio-pci-core",
.init = vfio_pci_core_init_dev,
@@ -710,6 +857,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
.open_device = nvgrace_gpu_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
@@ -893,11 +1041,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
* Ensure that the BAR0 region is enabled before accessing the
* registers.
*/
-static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
+static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
{
- unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
void __iomem *io;
- int ret = -ETIME;
+ int ret;
ret = pci_enable_device(pdev);
if (ret)
@@ -913,16 +1060,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
goto iomap_exit;
}
- do {
- if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
- (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
- ret = 0;
- goto reg_check_exit;
- }
- msleep(POLL_QUANTUM_MS);
- } while (!time_after(jiffies, timeout));
+ ret = nvgrace_gpu_wait_device_ready(io);
-reg_check_exit:
pci_iounmap(pdev, io);
iomap_exit:
pci_release_selected_regions(pdev, 1 << 0);
@@ -939,7 +1078,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
u64 memphys, memlength;
int ret;
- ret = nvgrace_gpu_wait_device_ready(pdev);
+ ret = nvgrace_gpu_probe_check_device_ready(pdev);
if (ret)
return ret;
@@ -965,6 +1104,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
memphys, memlength);
if (ret)
goto out_put_vdev;
+ nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops;
+ } else {
+ nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops;
}
ret = vfio_pci_core_register_device(&nvdev->core_device);
@@ -1002,12 +1144,38 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
+/*
+ * The GPU reset is required to be serialized against the *first* mapping
+ * faults and read/writes accesses to prevent potential RAS events logging.
+ *
+ * First fault or access after a reset needs to poll device readiness,
+ * flag that a reset has occurred. The readiness test is done by holding
+ * the memory_lock read lock and we expect all vfio-pci initiated resets to
+ * hold the memory_lock write lock to avoid races. However, .reset_done
+ * extends beyond the scope of vfio-pci initiated resets therefore we
+ * cannot assert this behavior and use lockdep_assert_held_write.
+ */
+static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_device, struct nvgrace_gpu_pci_core_device,
+ core_device);
+
+ nvdev->reset_done = true;
+}
+
+static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = {
+ .reset_done = nvgrace_gpu_vfio_pci_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = nvgrace_gpu_vfio_pci_table,
.probe = nvgrace_gpu_probe,
.remove = nvgrace_gpu_remove,
- .err_handler = &vfio_pci_core_err_handlers,
+ .err_handler = &nvgrace_gpu_vfio_pci_err_handlers,
.driver_managed_dma = true,
};
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index f3ccb0008f67..be103c74e969 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = {
.open_device = pds_vfio_open_device,
.close_device = pds_vfio_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
index a19b68043eb2..8fbdf7c6d666 100644
--- a/drivers/vfio/pci/qat/main.c
+++ b/drivers/vfio/pci/qat/main.c
@@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = {
.open_device = qat_vf_pci_open_device,
.close_device = qat_vf_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
.mmap = vfio_pci_core_mmap,
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index ac10f14417f2..0c771064c0b8 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -132,6 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
.open_device = vfio_pci_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
@@ -147,6 +148,10 @@ static const struct vfio_device_ops vfio_pci_ops = {
.pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas,
};
+static const struct vfio_pci_device_ops vfio_pci_dev_ops = {
+ .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
+};
+
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct vfio_pci_core_device *vdev;
@@ -161,6 +166,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return PTR_ERR(vdev);
dev_set_drvdata(&pdev->dev, vdev);
+ vdev->pci_ops = &vfio_pci_dev_ops;
ret = vfio_pci_core_register_device(vdev);
if (ret)
goto out_put_vdev;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 8f02f236b5b4..dc4e510e6e1b 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
return pdev->current_state < PCI_D3hot &&
(pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
}
+EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled);
/*
* Restore the *real* BARs after we detect a FLR or backdoor reset.
@@ -589,10 +590,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
- if (!new_mem)
+ if (!new_mem) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
+ vfio_pci_dma_buf_move(vdev, true);
+ } else {
down_write(&vdev->memory_lock);
+ }
/*
* If the user is writing mem/io enable (new_mem/io) and we
@@ -627,6 +630,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
*virt_cmd &= cpu_to_le16(~mask);
*virt_cmd |= cpu_to_le16(new_cmd & mask);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
@@ -707,12 +712,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
pci_power_t state)
{
- if (state >= PCI_D3hot)
+ if (state >= PCI_D3hot) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
+ vfio_pci_dma_buf_move(vdev, true);
+ } else {
down_write(&vdev->memory_lock);
+ }
vfio_pci_set_power_state(vdev, state);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
@@ -900,7 +909,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
}
@@ -982,7 +994,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 7dcf5439dedc..3a11e6f450f7 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -28,6 +28,7 @@
#include <linux/nospec.h>
#include <linux/sched/mm.h>
#include <linux/iommufd.h>
+#include <linux/pci-p2pdma.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif
@@ -41,6 +42,40 @@ static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;
+static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu)
+{
+ struct vfio_pci_eventfd *eventfd =
+ container_of(rcu, struct vfio_pci_eventfd, rcu);
+
+ eventfd_ctx_put(eventfd->ctx);
+ kfree(eventfd);
+}
+
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_eventfd __rcu **peventfd,
+ struct eventfd_ctx *ctx)
+{
+ struct vfio_pci_eventfd *new = NULL;
+ struct vfio_pci_eventfd *old;
+
+ lockdep_assert_held(&vdev->igate);
+
+ if (ctx) {
+ new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return -ENOMEM;
+
+ new->ctx = ctx;
+ }
+
+ old = rcu_replace_pointer(*peventfd, new,
+ lockdep_is_held(&vdev->igate));
+ if (old)
+ call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free);
+
+ return 0;
+}
+
/* List of PF's that vfio_pci_core_sriov_configure() has been called on */
static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
static LIST_HEAD(vfio_pci_sriov_pfs);
@@ -286,6 +321,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
* semaphore.
*/
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
+
if (vdev->pm_runtime_engaged) {
up_write(&vdev->memory_lock);
return -EINVAL;
@@ -299,11 +336,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
return 0;
}
-static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
+static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags,
void __user *arg, size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
int ret;
ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
@@ -320,12 +355,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
}
static int vfio_pci_core_pm_entry_with_wakeup(
- struct vfio_device *device, u32 flags,
+ struct vfio_pci_core_device *vdev, u32 flags,
struct vfio_device_low_power_entry_with_wakeup __user *arg,
size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
struct vfio_device_low_power_entry_with_wakeup entry;
struct eventfd_ctx *efdctx;
int ret;
@@ -373,14 +406,14 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
*/
down_write(&vdev->memory_lock);
__vfio_pci_runtime_pm_exit(vdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
-static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
+static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags,
void __user *arg, size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
int ret;
ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
@@ -695,15 +728,11 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
#endif
vfio_pci_core_disable(vdev);
+ vfio_pci_dma_buf_cleanup(vdev);
+
mutex_lock(&vdev->igate);
- if (vdev->err_trigger) {
- eventfd_ctx_put(vdev->err_trigger);
- vdev->err_trigger = NULL;
- }
- if (vdev->req_trigger) {
- eventfd_ctx_put(vdev->req_trigger);
- vdev->req_trigger = NULL;
- }
+ vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL);
+ vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL);
mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
@@ -996,42 +1025,36 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
-static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
- struct vfio_region_info __user *arg)
+int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
struct pci_dev *pdev = vdev->pdev;
- struct vfio_region_info info;
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
int i, ret;
- if (copy_from_user(&info, arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
+ switch (info->index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = pdev->cfg_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = pdev->cfg_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
break;
case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = pci_resource_len(pdev, info.index);
- if (!info.size) {
- info.flags = 0;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = pci_resource_len(pdev, info->index);
+ if (!info->size) {
+ info->flags = 0;
break;
}
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- if (vdev->bar_mmap_supported[info.index]) {
- info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
- if (info.index == vdev->msix_bar) {
- ret = msix_mmappable_cap(vdev, &caps);
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ if (vdev->bar_mmap_supported[info->index]) {
+ info->flags |= VFIO_REGION_INFO_FLAG_MMAP;
+ if (info->index == vdev->msix_bar) {
+ ret = msix_mmappable_cap(vdev, caps);
if (ret)
return ret;
}
@@ -1043,9 +1066,9 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
size_t size;
u16 cmd;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.flags = 0;
- info.size = 0;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->flags = 0;
+ info->size = 0;
if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
/*
@@ -1055,16 +1078,17 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
cmd = vfio_pci_memory_lock_and_enable(vdev);
io = pci_map_rom(pdev, &size);
if (io) {
- info.flags = VFIO_REGION_INFO_FLAG_READ;
+ info->flags = VFIO_REGION_INFO_FLAG_READ;
/* Report the BAR size, not the ROM size. */
- info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+ info->size = pci_resource_len(pdev,
+ PCI_ROM_RESOURCE);
pci_unmap_rom(pdev, io);
}
vfio_pci_memory_unlock_and_restore(vdev, cmd);
} else if (pdev->rom && pdev->romlen) {
- info.flags = VFIO_REGION_INFO_FLAG_READ;
+ info->flags = VFIO_REGION_INFO_FLAG_READ;
/* Report BAR size as power of two. */
- info.size = roundup_pow_of_two(pdev->romlen);
+ info->size = roundup_pow_of_two(pdev->romlen);
}
break;
@@ -1073,10 +1097,10 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
if (!vdev->has_vga)
return -EINVAL;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = 0xc0000;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = 0xc0000;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
break;
default: {
@@ -1085,53 +1109,36 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
.header.version = 1
};
- if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+ if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
return -EINVAL;
- info.index = array_index_nospec(
- info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
+ info->index = array_index_nospec(
+ info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
- i = info.index - VFIO_PCI_NUM_REGIONS;
+ i = info->index - VFIO_PCI_NUM_REGIONS;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = vdev->region[i].size;
- info.flags = vdev->region[i].flags;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = vdev->region[i].size;
+ info->flags = vdev->region[i].flags;
cap_type.type = vdev->region[i].type;
cap_type.subtype = vdev->region[i].subtype;
- ret = vfio_info_add_capability(&caps, &cap_type.header,
+ ret = vfio_info_add_capability(caps, &cap_type.header,
sizeof(cap_type));
if (ret)
return ret;
if (vdev->region[i].ops->add_capability) {
ret = vdev->region[i].ops->add_capability(
- vdev, &vdev->region[i], &caps);
+ vdev, &vdev->region[i], caps);
if (ret)
return ret;
}
}
}
-
- if (caps.size) {
- info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- info.cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user(arg + 1, caps.buf, caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info.cap_offset = sizeof(*arg);
- }
-
- kfree(caps.buf);
- }
-
- return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+ return 0;
}
+EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info);
static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
struct vfio_irq_info __user *arg)
@@ -1227,7 +1234,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
*/
vfio_pci_set_power_state(vdev, PCI_D0);
+ vfio_pci_dma_buf_move(vdev, true);
ret = pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
return ret;
@@ -1457,8 +1467,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
return vfio_pci_ioctl_get_irq_info(vdev, uarg);
case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
- case VFIO_DEVICE_GET_REGION_INFO:
- return vfio_pci_ioctl_get_region_info(vdev, uarg);
case VFIO_DEVICE_IOEVENTFD:
return vfio_pci_ioctl_ioeventfd(vdev, uarg);
case VFIO_DEVICE_PCI_HOT_RESET:
@@ -1473,11 +1481,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
}
EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
-static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
- uuid_t __user *arg, size_t argsz)
+static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev,
+ u32 flags, uuid_t __user *arg,
+ size_t argsz)
{
- struct vfio_pci_core_device *vdev =
- container_of(device, struct vfio_pci_core_device, vdev);
uuid_t uuid;
int ret;
@@ -1504,16 +1511,21 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
void __user *arg, size_t argsz)
{
+ struct vfio_pci_core_device *vdev =
+ container_of(device, struct vfio_pci_core_device, vdev);
+
switch (flags & VFIO_DEVICE_FEATURE_MASK) {
case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
- return vfio_pci_core_pm_entry(device, flags, arg, argsz);
+ return vfio_pci_core_pm_entry(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP:
- return vfio_pci_core_pm_entry_with_wakeup(device, flags,
+ return vfio_pci_core_pm_entry_with_wakeup(vdev, flags,
arg, argsz);
case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
- return vfio_pci_core_pm_exit(device, flags, arg, argsz);
+ return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
- return vfio_pci_core_feature_token(device, flags, arg, argsz);
+ return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_DMA_BUF:
+ return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
default:
return -ENOTTY;
}
@@ -1640,49 +1652,49 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma)
return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
}
-static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
- unsigned int order)
+vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev,
+ struct vm_fault *vmf,
+ unsigned long pfn,
+ unsigned int order)
{
- struct vm_area_struct *vma = vmf->vma;
- struct vfio_pci_core_device *vdev = vma->vm_private_data;
- unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
- unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
- unsigned long pfn = vma_to_pfn(vma) + pgoff;
- vm_fault_t ret = VM_FAULT_SIGBUS;
-
- if (order && (addr < vma->vm_start ||
- addr + (PAGE_SIZE << order) > vma->vm_end ||
- pfn & ((1 << order) - 1))) {
- ret = VM_FAULT_FALLBACK;
- goto out;
- }
-
- down_read(&vdev->memory_lock);
+ lockdep_assert_held_read(&vdev->memory_lock);
if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
- goto out_unlock;
+ return VM_FAULT_SIGBUS;
switch (order) {
case 0:
- ret = vmf_insert_pfn(vma, vmf->address, pfn);
- break;
+ return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
case PMD_ORDER:
- ret = vmf_insert_pfn_pmd(vmf, pfn, false);
- break;
+ return vmf_insert_pfn_pmd(vmf, pfn, false);
#endif
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
case PUD_ORDER:
- ret = vmf_insert_pfn_pud(vmf, pfn, false);
+ return vmf_insert_pfn_pud(vmf, pfn, false);
break;
#endif
default:
- ret = VM_FAULT_FALLBACK;
+ return VM_FAULT_FALLBACK;
+ }
+}
+EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn);
+
+static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
+ unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long pfn = vma_to_pfn(vma) + pgoff;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ if (is_aligned_for_order(vma, addr, pfn, order)) {
+ scoped_guard(rwsem_read, &vdev->memory_lock)
+ ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
}
-out_unlock:
- up_read(&vdev->memory_lock);
-out:
dev_dbg_ratelimited(&vdev->pdev->dev,
"%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
__func__, order,
@@ -1749,18 +1761,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
* Even though we don't make use of the barmap for the mmap,
* we need to request the region and the barmap tracks that.
*/
- if (!vdev->barmap[index]) {
- ret = pci_request_selected_regions(pdev,
- 1 << index, "vfio-pci");
- if (ret)
- return ret;
-
- vdev->barmap[index] = pci_iomap(pdev, index, 0);
- if (!vdev->barmap[index]) {
- pci_release_selected_regions(pdev, 1 << index);
- return -ENOMEM;
- }
- }
+ ret = vfio_pci_core_setup_barmap(vdev, index);
+ if (ret)
+ return ret;
vma->vm_private_data = vdev;
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
@@ -1800,21 +1803,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
struct vfio_pci_core_device *vdev =
container_of(core_vdev, struct vfio_pci_core_device, vdev);
struct pci_dev *pdev = vdev->pdev;
+ struct vfio_pci_eventfd *eventfd;
- mutex_lock(&vdev->igate);
-
- if (vdev->req_trigger) {
+ rcu_read_lock();
+ eventfd = rcu_dereference(vdev->req_trigger);
+ if (eventfd) {
if (!(count % 10))
pci_notice_ratelimited(pdev,
"Relaying device request to user (#%u)\n",
count);
- eventfd_signal(vdev->req_trigger);
+ eventfd_signal(eventfd->ctx);
} else if (count == 0) {
pci_warn(pdev,
"No device request channel registered, blocked until released by user\n");
}
-
- mutex_unlock(&vdev->igate);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);
@@ -2085,6 +2088,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
{
struct vfio_pci_core_device *vdev =
container_of(core_vdev, struct vfio_pci_core_device, vdev);
+ int ret;
vdev->pdev = to_pci_dev(core_vdev->dev);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
@@ -2094,6 +2098,10 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
INIT_LIST_HEAD(&vdev->dummy_resources_list);
INIT_LIST_HEAD(&vdev->ioeventfds_list);
INIT_LIST_HEAD(&vdev->sriov_pfs_item);
+ ret = pcim_p2pdma_init(vdev->pdev);
+ if (ret && ret != -EOPNOTSUPP)
+ return ret;
+ INIT_LIST_HEAD(&vdev->dmabufs);
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
@@ -2227,13 +2235,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+ struct vfio_pci_eventfd *eventfd;
- mutex_lock(&vdev->igate);
-
- if (vdev->err_trigger)
- eventfd_signal(vdev->err_trigger);
-
- mutex_unlock(&vdev->igate);
+ rcu_read_lock();
+ eventfd = rcu_dereference(vdev->err_trigger);
+ if (eventfd)
+ eventfd_signal(eventfd->ctx);
+ rcu_read_unlock();
return PCI_ERS_RESULT_CAN_RECOVER;
}
@@ -2458,6 +2466,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
break;
}
+ vfio_pci_dma_buf_move(vdev, true);
vfio_pci_zap_bars(vdev);
}
@@ -2486,8 +2495,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
err_undo:
list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
- vdev.dev_set_list)
+ vdev.dev_set_list) {
+ if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
+ }
list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
pm_runtime_put(&vdev->pdev->dev);
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
new file mode 100644
index 000000000000..d4d0f7d08c53
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ */
+#include <linux/dma-buf-mapping.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/dma-resv.h>
+
+#include "vfio_pci_priv.h"
+
+MODULE_IMPORT_NS("DMA_BUF");
+
+struct vfio_pci_dma_buf {
+ struct dma_buf *dmabuf;
+ struct vfio_pci_core_device *vdev;
+ struct list_head dmabufs_elm;
+ size_t size;
+ struct dma_buf_phys_vec *phys_vec;
+ struct p2pdma_provider *provider;
+ u32 nr_ranges;
+ u8 revoked : 1;
+};
+
+static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ if (!attachment->peer2peer)
+ return -EOPNOTSUPP;
+
+ if (priv->revoked)
+ return -ENODEV;
+
+ return 0;
+}
+
+static struct sg_table *
+vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+
+ dma_resv_assert_held(priv->dmabuf->resv);
+
+ if (priv->revoked)
+ return ERR_PTR(-ENODEV);
+
+ return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
+ priv->phys_vec, priv->nr_ranges,
+ priv->size, dir);
+}
+
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+ dma_buf_free_sgt(attachment, sgt, dir);
+}
+
+static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ /*
+ * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
+ * The refcount prevents both.
+ */
+ if (priv->vdev) {
+ down_write(&priv->vdev->memory_lock);
+ list_del_init(&priv->dmabufs_elm);
+ up_write(&priv->vdev->memory_lock);
+ vfio_device_put_registration(&priv->vdev->vdev);
+ }
+ kfree(priv->phys_vec);
+ kfree(priv);
+}
+
+static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
+ .attach = vfio_pci_dma_buf_attach,
+ .map_dma_buf = vfio_pci_dma_buf_map,
+ .unmap_dma_buf = vfio_pci_dma_buf_unmap,
+ .release = vfio_pci_dma_buf_release,
+};
+
+/*
+ * This is a temporary "private interconnect" between VFIO DMABUF and iommufd.
+ * It allows the two co-operating drivers to exchange the physical address of
+ * the BAR. This is to be replaced with a formal DMABUF system for negotiated
+ * interconnect types.
+ *
+ * If this function succeeds the following are true:
+ * - There is one physical range and it is pointing to MMIO
+ * - When move_notify is called it means revoke, not move, vfio_dma_buf_map
+ * will fail if it is currently revoked
+ */
+int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ struct vfio_pci_dma_buf *priv;
+
+ dma_resv_assert_held(attachment->dmabuf->resv);
+
+ if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops)
+ return -EOPNOTSUPP;
+
+ priv = attachment->dmabuf->priv;
+ if (priv->revoked)
+ return -ENODEV;
+
+ /* More than one range to iommufd will require proper DMABUF support */
+ if (priv->nr_ranges != 1)
+ return -EOPNOTSUPP;
+
+ *phys = priv->phys_vec[0];
+ return 0;
+}
+EXPORT_SYMBOL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd");
+
+int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges, phys_addr_t start,
+ phys_addr_t len)
+{
+ phys_addr_t max_addr;
+ unsigned int i;
+
+ max_addr = start + len;
+ for (i = 0; i < nr_ranges; i++) {
+ phys_addr_t end;
+
+ if (!dma_ranges[i].length)
+ return -EINVAL;
+
+ if (check_add_overflow(start, dma_ranges[i].offset,
+ &phys_vec[i].paddr) ||
+ check_add_overflow(phys_vec[i].paddr,
+ dma_ranges[i].length, &end))
+ return -EOVERFLOW;
+ if (end > max_addr)
+ return -EINVAL;
+
+ phys_vec[i].len = dma_ranges[i].length;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec);
+
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges)
+{
+ struct pci_dev *pdev = vdev->pdev;
+
+ *provider = pcim_p2pdma_provider(pdev, region_index);
+ if (!*provider)
+ return -EINVAL;
+
+ return vfio_pci_core_fill_phys_vec(
+ phys_vec, dma_ranges, nr_ranges,
+ pci_resource_start(pdev, region_index),
+ pci_resource_len(pdev, region_index));
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys);
+
+static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t *lengthp)
+{
+ size_t length = 0;
+ u32 i;
+
+ for (i = 0; i < dma_buf->nr_ranges; i++) {
+ u64 offset = dma_ranges[i].offset;
+ u64 len = dma_ranges[i].length;
+
+ if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+ return -EINVAL;
+
+ if (check_add_overflow(length, len, &length))
+ return -EINVAL;
+ }
+
+ /*
+ * dma_iova_try_alloc() will WARN on if userspace proposes a size that
+ * is too big, eg with lots of ranges.
+ */
+ if ((u64)(length) & DMA_IOVA_USE_SWIOTLB)
+ return -EINVAL;
+
+ *lengthp = length;
+ return 0;
+}
+
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_dma_buf get_dma_buf = {};
+ struct vfio_region_dma_range *dma_ranges;
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct vfio_pci_dma_buf *priv;
+ size_t length;
+ int ret;
+
+ if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys)
+ return -EOPNOTSUPP;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+ sizeof(get_dma_buf));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
+ return -EFAULT;
+
+ if (!get_dma_buf.nr_ranges || get_dma_buf.flags)
+ return -EINVAL;
+
+ /*
+ * For PCI the region_index is the BAR number like everything else.
+ */
+ if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX)
+ return -ENODEV;
+
+ dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
+ sizeof(*dma_ranges));
+ if (IS_ERR(dma_ranges))
+ return PTR_ERR(dma_ranges);
+
+ ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length);
+ if (ret)
+ goto err_free_ranges;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto err_free_ranges;
+ }
+ priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
+ GFP_KERNEL);
+ if (!priv->phys_vec) {
+ ret = -ENOMEM;
+ goto err_free_priv;
+ }
+
+ priv->vdev = vdev;
+ priv->nr_ranges = get_dma_buf.nr_ranges;
+ priv->size = length;
+ ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
+ get_dma_buf.region_index,
+ priv->phys_vec, dma_ranges,
+ priv->nr_ranges);
+ if (ret)
+ goto err_free_phys;
+
+ kfree(dma_ranges);
+ dma_ranges = NULL;
+
+ if (!vfio_device_try_get_registration(&vdev->vdev)) {
+ ret = -ENODEV;
+ goto err_free_phys;
+ }
+
+ exp_info.ops = &vfio_pci_dmabuf_ops;
+ exp_info.size = priv->size;
+ exp_info.flags = get_dma_buf.open_flags;
+ exp_info.priv = priv;
+
+ priv->dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(priv->dmabuf)) {
+ ret = PTR_ERR(priv->dmabuf);
+ goto err_dev_put;
+ }
+
+ /* dma_buf_put() now frees priv */
+ INIT_LIST_HEAD(&priv->dmabufs_elm);
+ down_write(&vdev->memory_lock);
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = !__vfio_pci_memory_enabled(vdev);
+ list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
+ dma_resv_unlock(priv->dmabuf->resv);
+ up_write(&vdev->memory_lock);
+
+ /*
+ * dma_buf_fd() consumes the reference, when the file closes the dmabuf
+ * will be released.
+ */
+ ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
+ if (ret < 0)
+ goto err_dma_buf;
+ return ret;
+
+err_dma_buf:
+ dma_buf_put(priv->dmabuf);
+err_dev_put:
+ vfio_device_put_registration(&vdev->vdev);
+err_free_phys:
+ kfree(priv->phys_vec);
+err_free_priv:
+ kfree(priv);
+err_free_ranges:
+ kfree(dma_ranges);
+ return ret;
+}
+
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!get_file_active(&priv->dmabuf->file))
+ continue;
+
+ if (priv->revoked != revoked) {
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ }
+ fput(priv->dmabuf->file);
+ }
+}
+
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ down_write(&vdev->memory_lock);
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!get_file_active(&priv->dmabuf->file))
+ continue;
+
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ list_del_init(&priv->dmabufs_elm);
+ priv->vdev = NULL;
+ priv->revoked = true;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ vfio_device_put_registration(&vdev->vdev);
+ fput(priv->dmabuf->file);
+ }
+ up_write(&vdev->memory_lock);
+}
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 30d3e921cb0d..c76e753b3cec 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
return 0;
}
-static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_eventfd __rcu **peventfd,
unsigned int count, uint32_t flags,
void *data)
{
/* DATA_NONE/DATA_BOOL enables loopback testing */
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- if (*ctx) {
- if (count) {
- eventfd_signal(*ctx);
- } else {
- eventfd_ctx_put(*ctx);
- *ctx = NULL;
- }
+ struct vfio_pci_eventfd *eventfd;
+
+ eventfd = rcu_dereference_protected(*peventfd,
+ lockdep_is_held(&vdev->igate));
+
+ if (!eventfd)
+ return -EINVAL;
+
+ if (count) {
+ eventfd_signal(eventfd->ctx);
return 0;
}
+
+ return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger;
@@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
return -EINVAL;
trigger = *(uint8_t *)data;
- if (trigger && *ctx)
- eventfd_signal(*ctx);
+
+ if (trigger) {
+ struct vfio_pci_eventfd *eventfd =
+ rcu_dereference_protected(*peventfd,
+ lockdep_is_held(&vdev->igate));
+
+ if (eventfd)
+ eventfd_signal(eventfd->ctx);
+ }
return 0;
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
@@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
fd = *(int32_t *)data;
if (fd == -1) {
- if (*ctx)
- eventfd_ctx_put(*ctx);
- *ctx = NULL;
+ return vfio_pci_eventfd_replace_locked(vdev,
+ peventfd, NULL);
} else if (fd >= 0) {
struct eventfd_ctx *efdctx;
+ int ret;
efdctx = eventfd_ctx_fdget(fd);
if (IS_ERR(efdctx))
return PTR_ERR(efdctx);
- if (*ctx)
- eventfd_ctx_put(*ctx);
+ ret = vfio_pci_eventfd_replace_locked(vdev,
+ peventfd, efdctx);
+ if (ret)
+ eventfd_ctx_put(efdctx);
- *ctx = efdctx;
+ return ret;
}
- return 0;
}
return -EINVAL;
@@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
return -EINVAL;
- return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger,
+ return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger,
count, flags, data);
}
@@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
return -EINVAL;
- return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger,
+ return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger,
count, flags, data);
}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a9972eacb293..27ac280f00b9 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd {
bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
+int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_eventfd __rcu **peventfd,
+ struct eventfd_ctx *ctx);
+
int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
unsigned index, unsigned start, unsigned count,
void *data);
@@ -60,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev);
int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
pci_power_t state);
-bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
@@ -107,4 +110,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
}
+#ifdef CONFIG_VFIO_PCI_DMABUF
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz);
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
+#else
+static inline int
+vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz)
+{
+ return -ENOTTY;
+}
+static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+}
+static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
+ bool revoked)
+{
+}
+#endif
+
#endif
diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
index c7d7e27af386..cb3d5e57d3a3 100644
--- a/drivers/vfio/pci/virtio/common.h
+++ b/drivers/vfio/pci/virtio/common.h
@@ -109,10 +109,9 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev);
#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev);
-long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg);
int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg);
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev,
const char __user *buf, size_t count,
loff_t *ppos);
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c
index 832af5ba267c..1ed349a55629 100644
--- a/drivers/vfio/pci/virtio/legacy_io.c
+++ b/drivers/vfio/pci/virtio/legacy_io.c
@@ -281,41 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user
}
int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg)
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
struct virtiovf_pci_core_device *virtvdev = container_of(
core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- void __user *uarg = (void __user *)arg;
- struct vfio_region_info info = {};
- if (copy_from_user(&info, uarg, minsz))
- return -EFAULT;
+ if (info->index != VFIO_PCI_BAR0_REGION_INDEX)
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
- case VFIO_PCI_BAR0_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = virtvdev->bar0_virtual_buf_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
-{
- switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = virtvdev->bar0_virtual_buf_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+ return 0;
}
static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index 8084f3e36a9f..d2e5cbca13c8 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
.open_device = virtiovf_pci_open_device,
.close_device = virtiovf_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
@@ -108,7 +109,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
.release = virtiovf_pci_core_release_dev,
.open_device = virtiovf_pci_open_device,
.close_device = virtiovf_pci_close_device,
- .ioctl = virtiovf_vfio_pci_core_ioctl,
+ .ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = virtiovf_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = virtiovf_pci_core_read,
.write = virtiovf_pci_core_write,
@@ -130,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.open_device = virtiovf_pci_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/xe/Kconfig b/drivers/vfio/pci/xe/Kconfig
new file mode 100644
index 000000000000..cc9b6dac6ed3
--- /dev/null
+++ b/drivers/vfio/pci/xe/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config XE_VFIO_PCI
+ tristate "VFIO support for Intel Graphics"
+ depends on DRM_XE && PCI_IOV
+ select VFIO_PCI_CORE
+ help
+ This option enables device specific VFIO driver variant for Intel Graphics.
+ In addition to generic VFIO PCI functionality, it implements VFIO
+ migration uAPI allowing userspace to enable migration for
+ Intel Graphics SR-IOV Virtual Functions supported by the Xe driver.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/xe/Makefile b/drivers/vfio/pci/xe/Makefile
new file mode 100644
index 000000000000..13aa0fd192cd
--- /dev/null
+++ b/drivers/vfio/pci/xe/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o
+xe-vfio-pci-y := main.o
diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c
new file mode 100644
index 000000000000..0156b53c678b
--- /dev/null
+++ b/drivers/vfio/pci/xe/main.c
@@ -0,0 +1,573 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include <drm/intel/xe_sriov_vfio.h>
+#include <drm/intel/pciids.h>
+
+struct xe_vfio_pci_migration_file {
+ struct file *filp;
+ /* serializes accesses to migration data */
+ struct mutex lock;
+ struct xe_vfio_pci_core_device *xe_vdev;
+ u8 disabled:1;
+};
+
+struct xe_vfio_pci_core_device {
+ struct vfio_pci_core_device core_device;
+ struct xe_device *xe;
+ /* PF internal control uses vfid index starting from 1 */
+ unsigned int vfid;
+ u8 deferred_reset:1;
+ /* protects migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ /* protects the reset_done flow */
+ spinlock_t reset_lock;
+ struct xe_vfio_pci_migration_file *migf;
+};
+
+#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev)
+
+static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->disabled = true;
+ mutex_unlock(&migf->lock);
+}
+
+static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ xe_vfio_pci_disable_file(xe_vdev->migf);
+ fput(xe_vdev->migf->filp);
+ xe_vdev->migf = NULL;
+}
+
+static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ if (xe_vdev->migf)
+ xe_vfio_pci_put_file(xe_vdev);
+
+ xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+}
+
+static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ mutex_lock(&xe_vdev->state_mutex);
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev)
+{
+again:
+ spin_lock(&xe_vdev->reset_lock);
+ if (xe_vdev->deferred_reset) {
+ xe_vdev->deferred_reset = false;
+ spin_unlock(&xe_vdev->reset_lock);
+ xe_vfio_pci_reset(xe_vdev);
+ goto again;
+ }
+ mutex_unlock(&xe_vdev->state_mutex);
+ spin_unlock(&xe_vdev->reset_lock);
+}
+
+static void xe_vfio_pci_reset_done(struct pci_dev *pdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
+ int ret;
+
+ if (!pdev->is_virtfn)
+ return;
+
+ /*
+ * VF FLR requires additional processing done by PF driver.
+ * The processing is done after FLR is already finished from PCIe
+ * perspective.
+ * In order to avoid a scenario where VF is used while PF processing
+ * is still in progress, additional synchronization point is needed.
+ */
+ ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);
+
+ if (!xe_vdev->vfid)
+ return;
+
+ /*
+ * As the higher VFIO layers are holding locks across reset and using
+ * those same locks with the mm_lock we need to prevent ABBA deadlock
+ * with the state_mutex and mm_lock.
+ * In case the state_mutex was taken already we defer the cleanup work
+ * to the unlock flow of the other running context.
+ */
+ spin_lock(&xe_vdev->reset_lock);
+ xe_vdev->deferred_reset = true;
+ if (!mutex_trylock(&xe_vdev->state_mutex)) {
+ spin_unlock(&xe_vdev->reset_lock);
+ return;
+ }
+ spin_unlock(&xe_vdev->reset_lock);
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ xe_vfio_pci_reset(xe_vdev);
+}
+
+static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
+ .reset_done = xe_vfio_pci_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static int xe_vfio_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+ struct vfio_pci_core_device *vdev = &xe_vdev->core_device;
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+
+ vfio_pci_core_finish_enable(vdev);
+
+ return 0;
+}
+
+static void xe_vfio_pci_close_device(struct vfio_device *core_vdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_state_mutex_lock(xe_vdev);
+ xe_vfio_pci_reset(xe_vdev);
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp)
+{
+ struct xe_vfio_pci_migration_file *migf = filp->private_data;
+
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+
+ return 0;
+}
+
+static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
+{
+ struct xe_vfio_pci_migration_file *migf = filp->private_data;
+ ssize_t ret;
+
+ if (pos)
+ return -ESPIPE;
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ mutex_unlock(&migf->lock);
+ return -ENODEV;
+ }
+
+ ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
+ mutex_unlock(&migf->lock);
+
+ return ret;
+}
+
+static const struct file_operations xe_vfio_pci_save_fops = {
+ .owner = THIS_MODULE,
+ .read = xe_vfio_pci_save_read,
+ .release = xe_vfio_pci_release_file,
+ .llseek = noop_llseek,
+};
+
+static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct xe_vfio_pci_migration_file *migf = filp->private_data;
+ ssize_t ret;
+
+ if (pos)
+ return -ESPIPE;
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ mutex_unlock(&migf->lock);
+ return -ENODEV;
+ }
+
+ ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len);
+ mutex_unlock(&migf->lock);
+
+ return ret;
+}
+
+static const struct file_operations xe_vfio_pci_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = xe_vfio_pci_resume_write,
+ .release = xe_vfio_pci_release_file,
+ .llseek = noop_llseek,
+};
+
+static const char *vfio_dev_state_str(u32 state)
+{
+ switch (state) {
+ case VFIO_DEVICE_STATE_RUNNING: return "running";
+ case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p";
+ case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy";
+ case VFIO_DEVICE_STATE_STOP: return "stop";
+ case VFIO_DEVICE_STATE_RESUMING: return "resuming";
+ case VFIO_DEVICE_STATE_ERROR: return "error";
+ default: return "";
+ }
+}
+
+enum xe_vfio_pci_file_type {
+ XE_VFIO_FILE_SAVE = 0,
+ XE_VFIO_FILE_RESUME,
+};
+
+static struct xe_vfio_pci_migration_file *
+xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev,
+ enum xe_vfio_pci_file_type type)
+{
+ struct xe_vfio_pci_migration_file *migf;
+ const struct file_operations *fops;
+ int flags;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops;
+ flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY;
+ migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags);
+ if (IS_ERR(migf->filp)) {
+ kfree(migf);
+ return ERR_CAST(migf->filp);
+ }
+
+ mutex_init(&migf->lock);
+ migf->xe_vdev = xe_vdev;
+ xe_vdev->migf = migf;
+
+ stream_open(migf->filp->f_inode, migf->filp);
+
+ return migf;
+}
+
+static struct file *
+xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new)
+{
+ u32 cur = xe_vdev->mig_state;
+ int ret;
+
+ dev_dbg(xe_vdev_to_dev(xe_vdev),
+ "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new));
+
+ /*
+ * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't
+ * have the capability to selectively block outgoing p2p DMA transfers.
+ * While the device is allowing BAR accesses when the VF is stopped, it
+ * is not processing any new workload requests, effectively stopping
+ * any outgoing DMA transfers (not just p2p).
+ * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and
+ * will be migrated to target VF during stop-copy.
+ */
+ if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
+ return NULL;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+ ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct xe_vfio_pci_migration_file *migf;
+
+ migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
+ if (IS_ERR(migf)) {
+ ret = PTR_ERR(migf);
+ goto err;
+ }
+ get_file(migf->filp);
+
+ ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid);
+ if (ret) {
+ fput(migf->filp);
+ goto err;
+ }
+
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+ if (xe_vdev->migf)
+ xe_vfio_pci_put_file(xe_vdev);
+
+ ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct xe_vfio_pci_migration_file *migf;
+
+ migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME);
+ if (IS_ERR(migf)) {
+ ret = PTR_ERR(migf);
+ goto err;
+ }
+ get_file(migf->filp);
+
+ ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid);
+ if (ret) {
+ fput(migf->filp);
+ goto err;
+ }
+
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ if (xe_vdev->migf)
+ xe_vfio_pci_put_file(xe_vdev);
+
+ ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid);
+ if (ret)
+ goto err;
+
+ return NULL;
+ }
+
+ WARN(true, "Unknown state transition %d->%d", cur, new);
+ return ERR_PTR(-EINVAL);
+
+err:
+ dev_dbg(xe_vdev_to_dev(xe_vdev),
+ "Failed to transition state: %s->%s err=%d\n",
+ vfio_dev_state_str(cur), vfio_dev_state_str(new), ret);
+ return ERR_PTR(ret);
+}
+
+static struct file *
+xe_vfio_pci_set_device_state(struct vfio_device *core_vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *f = NULL;
+ int ret;
+
+ xe_vfio_pci_state_mutex_lock(xe_vdev);
+ while (new_state != xe_vdev->mig_state) {
+ ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid);
+ f = ERR_PTR(ret);
+ break;
+ }
+ f = xe_vfio_set_state(xe_vdev, next_state);
+ if (IS_ERR(f))
+ break;
+
+ xe_vdev->mig_state = next_state;
+
+ /* Multiple state transitions with non-NULL file in the middle */
+ if (f && new_state != xe_vdev->mig_state) {
+ fput(f);
+ f = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ return f;
+}
+
+static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_state_mutex_lock(xe_vdev);
+ *curr_state = xe_vdev->mig_state;
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ return 0;
+}
+
+static int xe_vfio_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_state_mutex_lock(xe_vdev);
+ *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid);
+ xe_vfio_pci_state_mutex_unlock(xe_vdev);
+
+ return 0;
+}
+
+static const struct vfio_migration_ops xe_vfio_pci_migration_ops = {
+ .migration_set_state = xe_vfio_pci_set_device_state,
+ .migration_get_state = xe_vfio_pci_get_device_state,
+ .migration_get_data_size = xe_vfio_pci_get_data_size,
+};
+
+static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ struct vfio_device *core_vdev = &xe_vdev->core_device.vdev;
+ struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
+ struct xe_device *xe = xe_sriov_vfio_get_pf(pdev);
+
+ if (!xe)
+ return;
+ if (!xe_sriov_vfio_migration_supported(xe))
+ return;
+
+ mutex_init(&xe_vdev->state_mutex);
+ spin_lock_init(&xe_vdev->reset_lock);
+
+ /* PF internal control uses vfid index starting from 1 */
+ xe_vdev->vfid = pci_iov_vf_id(pdev) + 1;
+ xe_vdev->xe = xe;
+
+ core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+ core_vdev->mig_ops = &xe_vfio_pci_migration_ops;
+}
+
+static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev)
+{
+ if (!xe_vdev->vfid)
+ return;
+
+ mutex_destroy(&xe_vdev->state_mutex);
+}
+
+static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_migration_init(xe_vdev);
+
+ return vfio_pci_core_init_dev(core_vdev);
+}
+
+static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev =
+ container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
+
+ xe_vfio_pci_migration_fini(xe_vdev);
+}
+
+static const struct vfio_device_ops xe_vfio_pci_ops = {
+ .name = "xe-vfio-pci",
+ .init = xe_vfio_pci_init_dev,
+ .release = xe_vfio_pci_release_dev,
+ .open_device = xe_vfio_pci_open_device,
+ .close_device = xe_vfio_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct xe_vfio_pci_core_device *xe_vdev;
+ int ret;
+
+ xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev,
+ &xe_vfio_pci_ops);
+ if (IS_ERR(xe_vdev))
+ return PTR_ERR(xe_vdev);
+
+ dev_set_drvdata(&pdev->dev, &xe_vdev->core_device);
+
+ ret = vfio_pci_core_register_device(&xe_vdev->core_device);
+ if (ret) {
+ vfio_put_device(&xe_vdev->core_device.vdev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void xe_vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&xe_vdev->core_device);
+ vfio_put_device(&xe_vdev->core_device.vdev);
+}
+
+#define INTEL_PCI_VFIO_DEVICE(_id) { \
+ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \
+}
+
+static const struct pci_device_id xe_vfio_pci_table[] = {
+ INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE),
+ INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE),
+ INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE),
+ {}
+};
+MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table);
+
+static struct pci_driver xe_vfio_pci_driver = {
+ .name = "xe-vfio-pci",
+ .id_table = xe_vfio_pci_table,
+ .probe = xe_vfio_pci_probe,
+ .remove = xe_vfio_pci_remove,
+ .err_handler = &xe_vfio_pci_err_handlers,
+ .driver_managed_dma = true,
+};
+module_pci_driver(xe_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("MichaƂ Winiarski <michal.winiarski@intel.com>");
+MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics");
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
index 9f5c527baa8a..fa754f203b2d 100644
--- a/drivers/vfio/platform/vfio_amba.c
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -115,6 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = {
.open_device = vfio_platform_open_device,
.close_device = vfio_platform_close_device,
.ioctl = vfio_platform_ioctl,
+ .get_region_info_caps = vfio_platform_ioctl_get_region_info,
.read = vfio_platform_read,
.write = vfio_platform_write,
.mmap = vfio_platform_mmap,
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
index 512533501eb7..a4d3ace3e02d 100644
--- a/drivers/vfio/platform/vfio_platform.c
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = {
.open_device = vfio_platform_open_device,
.close_device = vfio_platform_close_device,
.ioctl = vfio_platform_ioctl,
+ .get_region_info_caps = vfio_platform_ioctl_get_region_info,
.read = vfio_platform_read,
.write = vfio_platform_write,
.mmap = vfio_platform_mmap,
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index 3bf1043cd795..c2990b7e900f 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -272,6 +272,24 @@ err_irq:
}
EXPORT_SYMBOL_GPL(vfio_platform_open_device);
+int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_platform_device *vdev =
+ container_of(core_vdev, struct vfio_platform_device, vdev);
+
+ if (info->index >= vdev->num_regions)
+ return -EINVAL;
+
+ /* map offset to the physical address */
+ info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info);
+
long vfio_platform_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
@@ -300,28 +318,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev,
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
- } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
- struct vfio_region_info info;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= vdev->num_regions)
- return -EINVAL;
-
- /* map offset to the physical address */
- info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
-
} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
struct vfio_irq_info info;
diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h
index 8d8fab516849..05084212a76e 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -85,6 +85,9 @@ int vfio_platform_open_device(struct vfio_device *core_vdev);
void vfio_platform_close_device(struct vfio_device *core_vdev);
long vfio_platform_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg);
+int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
ssize_t vfio_platform_read(struct vfio_device *core_vdev,
char __user *buf, size_t count,
loff_t *ppos);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 38c8e9350a60..f7df90c423b4 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device)
if (refcount_dec_and_test(&device->refcount))
complete(&device->comp);
}
+EXPORT_SYMBOL_GPL(vfio_device_put_registration);
bool vfio_device_try_get_registration(struct vfio_device *device)
{
return refcount_inc_not_zero(&device->refcount);
}
+EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);
/*
* VFIO driver API
@@ -1259,6 +1261,51 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
}
}
+static long vfio_get_region_info(struct vfio_device *device,
+ struct vfio_region_info __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_region_info info = {};
+ struct vfio_info_cap caps = {};
+ int ret;
+
+ if (unlikely(!device->ops->get_region_info_caps))
+ return -EINVAL;
+
+ if (copy_from_user(&info, arg, minsz))
+ return -EFAULT;
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = device->ops->get_region_info_caps(device, &info, &caps);
+ if (ret)
+ goto out_free;
+
+ if (caps.size) {
+ info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ info.cap_offset = 0;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user(arg + 1, caps.buf, caps.size)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+ info.cap_offset = sizeof(info);
+ }
+ }
+
+ if (copy_to_user(arg, &info, minsz)){
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ kfree(caps.buf);
+ return ret;
+}
+
static long vfio_device_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
@@ -1296,6 +1343,10 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
ret = vfio_ioctl_device_feature(device, uptr);
break;
+ case VFIO_DEVICE_GET_REGION_INFO:
+ ret = vfio_get_region_info(device, uptr);
+ break;
+
default:
if (unlikely(!device->ops->ioctl))
ret = -EINVAL;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8f7f50acb6d6..7f886d3dba7d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -69,15 +69,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
-static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = {
- VHOST_FEATURES |
- (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
- (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
- (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
- (1ULL << VIRTIO_F_RING_RESET) |
- (1ULL << VIRTIO_F_IN_ORDER),
- VIRTIO_BIT(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) |
- VIRTIO_BIT(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO),
+static const int vhost_net_bits[] = {
+ VHOST_FEATURES,
+ VHOST_NET_F_VIRTIO_NET_HDR,
+ VIRTIO_NET_F_MRG_RXBUF,
+ VIRTIO_F_ACCESS_PLATFORM,
+ VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
+ VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
+ VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO
};
enum {
@@ -1731,7 +1731,8 @@ out:
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
unsigned long arg)
{
- u64 all_features[VIRTIO_FEATURES_DWORDS];
+ const DEFINE_VHOST_FEATURES_ARRAY(vhost_net_features, vhost_net_bits);
+ u64 all_features[VIRTIO_FEATURES_U64S];
struct vhost_net *n = f->private_data;
void __user *argp = (void __user *)arg;
u64 __user *featurep = argp;
@@ -1763,7 +1764,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
/* Copy the net features, up to the user-provided buffer size */
argp += sizeof(u64);
- copied = min(count, VIRTIO_FEATURES_DWORDS);
+ copied = min(count, (u64)VIRTIO_FEATURES_U64S);
if (copy_to_user(argp, vhost_net_features,
copied * sizeof(u64)))
return -EFAULT;
@@ -1778,13 +1779,13 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
virtio_features_zero(all_features);
argp += sizeof(u64);
- copied = min(count, VIRTIO_FEATURES_DWORDS);
+ copied = min(count, (u64)VIRTIO_FEATURES_U64S);
if (copy_from_user(all_features, argp, copied * sizeof(u64)))
return -EFAULT;
/*
* Any feature specified by user-space above
- * VIRTIO_FEATURES_MAX is not supported by definition.
+ * VIRTIO_FEATURES_BITS is not supported by definition.
*/
for (i = copied; i < count; ++i) {
if (copy_from_user(&features, featurep + 1 + i,
@@ -1794,7 +1795,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
return -EOPNOTSUPP;
}
- for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++)
+ for (i = 0; i < VIRTIO_FEATURES_U64S; i++)
if (all_features[i] & ~vhost_net_features[i])
return -EOPNOTSUPP;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 98e4f68f4e3c..f43c1fe9fad9 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -197,11 +197,14 @@ enum {
};
/* Note: can't set VIRTIO_F_VERSION_1 yet, since that implies ANY_LAYOUT. */
-enum {
- VHOST_SCSI_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_SCSI_F_HOTPLUG) |
- (1ULL << VIRTIO_SCSI_F_T10_PI)
+static const int vhost_scsi_bits[] = {
+ VHOST_FEATURES,
+ VIRTIO_SCSI_F_HOTPLUG,
+ VIRTIO_SCSI_F_T10_PI
};
+#define VHOST_SCSI_FEATURES VHOST_FEATURES_U64(vhost_scsi_bits, 0)
+
#define VHOST_SCSI_MAX_TARGET 256
#define VHOST_SCSI_MAX_IO_VQ 1024
#define VHOST_SCSI_MAX_EVENT 128
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 42c955a5b211..1e4e36edbcd2 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -28,6 +28,12 @@
*/
#define VHOST_TEST_PKT_WEIGHT 256
+static const int vhost_test_bits[] = {
+ VHOST_FEATURES
+};
+
+#define VHOST_TEST_FEATURES VHOST_FEATURES_U64(vhost_test_bits, 0)
+
enum {
VHOST_TEST_VQ = 0,
VHOST_TEST_VQ_MAX = 1,
@@ -328,14 +334,14 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
return -EFAULT;
return vhost_test_set_backend(n, backend.index, backend.fd);
case VHOST_GET_FEATURES:
- features = VHOST_FEATURES;
+ features = VHOST_TEST_FEATURES;
if (copy_to_user(featurep, &features, sizeof features))
return -EFAULT;
return 0;
case VHOST_SET_FEATURES:
if (copy_from_user(&features, featurep, sizeof features))
return -EFAULT;
- if (features & ~VHOST_FEATURES)
+ if (features & ~VHOST_TEST_FEATURES)
return -EOPNOTSUPP;
return vhost_test_set_features(n, features);
case VHOST_RESET_OWNER:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a78226b37739..bccdc9eab267 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -804,11 +804,13 @@ static int vhost_kthread_worker_create(struct vhost_worker *worker,
ret = vhost_attach_task_to_cgroups(worker);
if (ret)
- goto stop_worker;
+ goto free_id;
worker->id = id;
return 0;
+free_id:
+ xa_erase(&dev->worker_xa, id);
stop_worker:
vhost_kthread_do_stop(worker);
return ret;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b49f08e4a1b4..4fe99765c5c7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -14,6 +14,7 @@
#include <linux/atomic.h>
#include <linux/vhost_iotlb.h>
#include <linux/irqbypass.h>
+#include <linux/unroll.h>
struct vhost_work;
struct vhost_task;
@@ -287,14 +288,39 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
eventfd_signal((vq)->error_ctx);\
} while (0)
-enum {
- VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
- (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
- (1ULL << VIRTIO_RING_F_EVENT_IDX) |
- (1ULL << VHOST_F_LOG_ALL) |
- (1ULL << VIRTIO_F_ANY_LAYOUT) |
- (1ULL << VIRTIO_F_VERSION_1)
-};
+#define VHOST_FEATURES \
+ VIRTIO_F_NOTIFY_ON_EMPTY, \
+ VIRTIO_RING_F_INDIRECT_DESC, \
+ VIRTIO_RING_F_EVENT_IDX, \
+ VHOST_F_LOG_ALL, \
+ VIRTIO_F_ANY_LAYOUT, \
+ VIRTIO_F_VERSION_1
+
+static inline u64 vhost_features_u64(const int *features, int size, int idx)
+{
+ u64 res = 0;
+
+ unrolled_count(VIRTIO_FEATURES_BITS)
+ for (int i = 0; i < size; ++i) {
+ int bit = features[i];
+
+ if (virtio_features_chk_bit(bit) && VIRTIO_U64(bit) == idx)
+ res |= VIRTIO_BIT(bit);
+ }
+ return res;
+}
+
+#define VHOST_FEATURES_U64(features, idx) \
+ vhost_features_u64(features, ARRAY_SIZE(features), idx)
+
+#define DEFINE_VHOST_FEATURES_ARRAY_ENTRY(idx, features) \
+ [idx] = VHOST_FEATURES_U64(features, idx),
+
+#define DEFINE_VHOST_FEATURES_ARRAY(array, features) \
+ u64 array[VIRTIO_FEATURES_U64S] = { \
+ UNROLL(VIRTIO_FEATURES_U64S, \
+ DEFINE_VHOST_FEATURES_ARRAY_ENTRY, features) \
+ }
/**
* vhost_vq_set_backend - Set backend.
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index ae01457ea2cd..0298ddc34824 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -29,12 +29,14 @@
*/
#define VHOST_VSOCK_PKT_WEIGHT 256
-enum {
- VHOST_VSOCK_FEATURES = VHOST_FEATURES |
- (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
- (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
+static const int vhost_vsock_bits[] = {
+ VHOST_FEATURES,
+ VIRTIO_F_ACCESS_PLATFORM,
+ VIRTIO_VSOCK_F_SEQPACKET
};
+#define VHOST_VSOCK_FEATURES VHOST_FEATURES_U64(vhost_vsock_bits, 0)
+
enum {
VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index a09eb4d62f82..5bdc6b82b30b 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -53,7 +53,7 @@ static ssize_t features_show(struct device *_d,
/* We actually represent this as a bitstring, as it could be
* arbitrary length in future. */
- for (i = 0; i < VIRTIO_FEATURES_MAX; i++)
+ for (i = 0; i < VIRTIO_FEATURES_BITS; i++)
len += sysfs_emit_at(buf, len, "%c",
__virtio_test_bit(dev, i) ? '1' : '0');
len += sysfs_emit_at(buf, len, "\n");
@@ -272,8 +272,8 @@ static int virtio_dev_probe(struct device *_d)
int err, i;
struct virtio_device *dev = dev_to_virtio(_d);
struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
- u64 device_features[VIRTIO_FEATURES_DWORDS];
- u64 driver_features[VIRTIO_FEATURES_DWORDS];
+ u64 device_features[VIRTIO_FEATURES_U64S];
+ u64 driver_features[VIRTIO_FEATURES_U64S];
u64 driver_features_legacy;
/* We have a driver! */
@@ -286,7 +286,7 @@ static int virtio_dev_probe(struct device *_d)
virtio_features_zero(driver_features);
for (i = 0; i < drv->feature_table_size; i++) {
unsigned int f = drv->feature_table[i];
- if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_MAX))
+ if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_BITS))
virtio_features_set_bit(driver_features, f);
}
@@ -303,7 +303,7 @@ static int virtio_dev_probe(struct device *_d)
}
if (virtio_features_test_bit(device_features, VIRTIO_F_VERSION_1)) {
- for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i)
+ for (i = 0; i < VIRTIO_FEATURES_U64S; ++i)
dev->features_array[i] = driver_features[i] &
device_features[i];
} else {
@@ -325,7 +325,7 @@ static int virtio_dev_probe(struct device *_d)
goto err;
if (drv->validate) {
- u64 features[VIRTIO_FEATURES_DWORDS];
+ u64 features[VIRTIO_FEATURES_U64S];
virtio_features_copy(features, dev->features_array);
err = drv->validate(dev);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1b93d8c64361..74fe59f5a78c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -983,7 +983,8 @@ static int virtballoon_probe(struct virtio_device *vdev)
goto out_del_vqs;
}
vb->balloon_wq = alloc_workqueue("balloon-wq",
- WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+ WQ_FREEZABLE | WQ_CPU_INTENSIVE | WQ_PERCPU,
+ 0);
if (!vb->balloon_wq) {
err = -ENOMEM;
goto out_del_vqs;
diff --git a/drivers/virtio/virtio_debug.c b/drivers/virtio/virtio_debug.c
index d58713ddf2e5..ccf1955a1183 100644
--- a/drivers/virtio/virtio_debug.c
+++ b/drivers/virtio/virtio_debug.c
@@ -8,12 +8,12 @@ static struct dentry *virtio_debugfs_dir;
static int virtio_debug_device_features_show(struct seq_file *s, void *data)
{
- u64 device_features[VIRTIO_FEATURES_DWORDS];
+ u64 device_features[VIRTIO_FEATURES_U64S];
struct virtio_device *dev = s->private;
unsigned int i;
virtio_get_features(dev, device_features);
- for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS; i++) {
if (virtio_features_test_bit(device_features, i))
seq_printf(s, "%u\n", i);
}
@@ -26,7 +26,7 @@ static int virtio_debug_filter_features_show(struct seq_file *s, void *data)
struct virtio_device *dev = s->private;
unsigned int i;
- for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS; i++) {
if (virtio_features_test_bit(dev->debugfs_filter_features, i))
seq_printf(s, "%u\n", i);
}
@@ -50,7 +50,7 @@ static int virtio_debug_filter_feature_add(void *data, u64 val)
{
struct virtio_device *dev = data;
- if (val >= VIRTIO_FEATURES_MAX)
+ if (val >= VIRTIO_FEATURES_BITS)
return -EINVAL;
virtio_features_set_bit(dev->debugfs_filter_features, val);
@@ -64,7 +64,7 @@ static int virtio_debug_filter_feature_del(void *data, u64 val)
{
struct virtio_device *dev = data;
- if (val >= VIRTIO_FEATURES_MAX)
+ if (val >= VIRTIO_FEATURES_BITS)
return -EINVAL;
virtio_features_clear_bit(dev->debugfs_filter_features, val);
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 9e503b7a58d8..413a8c353463 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -401,7 +401,7 @@ void vp_modern_get_extended_features(struct virtio_pci_modern_device *mdev,
int i;
virtio_features_zero(features);
- for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
u64 cur;
vp_iowrite32(i, &cfg->device_feature_select);
@@ -427,7 +427,7 @@ vp_modern_get_driver_extended_features(struct virtio_pci_modern_device *mdev,
int i;
virtio_features_zero(features);
- for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
u64 cur;
vp_iowrite32(i, &cfg->guest_feature_select);
@@ -448,7 +448,7 @@ void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev,
struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
int i;
- for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+ for (i = 0; i < VIRTIO_FEATURES_BITS / 32; i++) {
u32 cur = features[i >> 1] >> (32 * (i & 1));
vp_iowrite32(i, &cfg->guest_feature_select);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 7b6205253b46..ddab68959671 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -3166,6 +3166,7 @@ EXPORT_SYMBOL_GPL(virtqueue_map_alloc_coherent);
* @vdev: the virtio device we are talking to
* @map: metadata for performing mapping
* @size: the size of the buffer
+ * @vaddr: the virtual address that needs to be freed
* @map_handle: the mapped address that needs to be freed
*
*/
@@ -3190,7 +3191,7 @@ EXPORT_SYMBOL_GPL(virtqueue_map_free_coherent);
* @dir: mapping direction
* @attrs: mapping attributes
*
- * Returns mapped address. Caller should check that by virtqueue_mapping_error().
+ * Returns mapped address. Caller should check that by virtqueue_map_mapping_error().
*/
dma_addr_t virtqueue_map_page_attrs(const struct virtqueue *_vq,
struct page *page,
@@ -3249,7 +3250,7 @@ EXPORT_SYMBOL_GPL(virtqueue_unmap_page_attrs);
* The caller calls this to do dma mapping in advance. The DMA address can be
* passed to this _vq when it is in pre-mapped mode.
*
- * return mapped address. Caller should check that by virtqueue_mapping_error().
+ * return mapped address. Caller should check that by virtqueue_map_mapping_error().
*/
dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr,
size_t size,
@@ -3299,7 +3300,7 @@ void virtqueue_unmap_single_attrs(const struct virtqueue *_vq,
EXPORT_SYMBOL_GPL(virtqueue_unmap_single_attrs);
/**
- * virtqueue_mapping_error - check dma address
+ * virtqueue_map_mapping_error - check dma address
* @_vq: the struct virtqueue we're talking about.
* @addr: DMA address
*
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index f9a29045eca0..0a801f67b599 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -80,7 +80,7 @@ static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status)
{
struct vdpa_device *vdpa = vd_get_vdpa(vdev);
- return vdpa_set_status(vdpa, status);
+ vdpa_set_status(vdpa, status);
}
static void virtio_vdpa_reset(struct virtio_device *vdev)